This guide explains how to implement a custom diffusion pipeline in FastVideo, leveraging the framework's modular architecture for high-performance video generation.
## Implementation Process Overview
1. **Port Required Modules** - Identify and implement necessary model components
2. **Create Directory Structure** - Set up pipeline files and folders
3. **Implement Pipeline Class** - Build the pipeline using existing or custom stages
4. **Register Your Pipeline** - Make it discoverable by the framework
5. **Configure Your Pipeline** - (Coming soon)
Need help? Join our [Slack community](https://join.slack.com/t/fastvideo/shared_invite/zt-2zf6ru791-sRwI9lPIUJQq1mIeB_yjJg).
## Step 1: Pipeline Modules
### Identifying Required Modules
FastVideo uses the Hugging Face Diffusers format for model organization:
1. Examine the `model_index.json` in the HF model repository:
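As a rough, illustrative way to see which components a pipeline declares, you can inspect the file directly. In the sketch below, the path and the example values are placeholders, not a specific model repository:

```python
import json

# Illustrative only: list the components declared in a repo's model_index.json.
# The path and the example values below are placeholders.
with open("path/to/model_repo/model_index.json") as f:
    model_index = json.load(f)

for name, spec in model_index.items():
    if not name.startswith("_"):   # skip metadata keys such as "_class_name"
        print(name, spec)          # e.g. vae ['diffusers', 'AutoencoderKL']
```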
After SSH'ing into your pod, you'll find the `fastvideo-dev` Conda environment already activated.
To pull in the latest changes from the GitHub repo:
```bash
cd /FastVideo
git pull
```
If you have a persistent volume and want to keep your code changes, you can move `/FastVideo` to `/workspace/FastVideo`, or simply clone the repository there.
Thank you for your interest in contributing to FastVideo. We want to make the process as smooth as possible, and this guide will help you get started!
Our community is open to everyone and welcomes any contributions no matter how large or small.
# Developer Environment
Make sure you have CUDA 12.4 installed and supported by your GPU. FastVideo currently only supports Linux and CUDA GPUs, but we hope to support other platforms in the future.
We recommend using a fresh Python 3.10 Conda environment to develop FastVideo:
Create and activate a Conda environment for FastVideo:
```bash
conda create -n fastvideo python=3.10 -y
conda activate fastvideo
```
Clone the FastVideo repository and go to the FastVideo directory:
```bash
git clone https://github.com/hao-ai-lab/FastVideo.git && cd FastVideo
```
Now you can install FastVideo and set up the git hooks for linting. With `pre-commit`, the linters will run and must pass before you can make a commit.
This document outlines FastVideo's architecture for developers interested in framework internals or contributions. It serves as an onboarding guide for new contributors by providing an overview of the most important directories and files within the `fastvideo/v1/` codebase.
## V1 Directory Structure and Files
FastVideo separates model components from execution logic with these principles:
- **Component Isolation**: Models (encoders, VAEs, transformers) are isolated from execution (pipelines, stages, distributed processing)
- **Modular Design**: Components can be independently replaced
- **Distributed Execution**: Supports various parallelism strategies (Tensor, Sequence)
- **Custom Attention Backends**: Components can support and use different Attention implementations
- **Pipeline Abstraction**: Consistent interface across diffusion models
(design-fastvideo-args)=
## FastVideoArgs
The `FastVideoArgs` class in `fastvideo/v1/fastvideo_args.py` serves as the central configuration system for FastVideo. It contains all parameters needed to control model loading, inference configuration, performance optimization settings, and more.
Key features include:
- **Command-line Interface**: Automatic conversion between CLI arguments and dataclass fields
- **Configuration Groups**: Organized by functional areas (model loading, video params, optimization settings)
- **Context Management**: Global access to current settings via `get_current_fastvideo_args()`
- **Parameter Validation**: Ensures valid combinations of settings
Common configuration areas:
- **Model paths and loading options**: `model_path`, `trust_remote_code`, `revision`
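As a rough sketch of this pattern, a dataclass-backed configuration object with a global accessor can look like the following. The field set and the setter name below are illustrative placeholders, not the full `FastVideoArgs` definition:

```python
# Illustrative sketch only: a dataclass-backed config with a global accessor,
# as described above. Fields and the setter are placeholders.
from dataclasses import dataclass
from typing import Optional

@dataclass
class ToyFastVideoArgs:
    model_path: str
    trust_remote_code: bool = False
    revision: Optional[str] = None

_current_args: Optional[ToyFastVideoArgs] = None

def set_current_fastvideo_args(args: ToyFastVideoArgs) -> None:
    global _current_args
    _current_args = args

def get_current_fastvideo_args() -> ToyFastVideoArgs:
    assert _current_args is not None, "args not initialized"
    return _current_args

# Configure once at startup, then read the settings anywhere in the pipeline.
set_current_fastvideo_args(ToyFastVideoArgs(model_path="org/model"))
print(get_current_fastvideo_args().model_path)
```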
Defined in `fastvideo/v1/pipelines/pipeline_batch_info.py`, `ForwardBatch` encapsulates the data payload passed between pipeline stages. It typically holds:
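The exact fields are defined in that file; purely to illustrate the pattern of a payload handed from stage to stage, a minimal sketch (with placeholder field names) might look like:

```python
# Illustrative placeholder only: a dataclass payload of the kind ForwardBatch
# represents. The real fields live in fastvideo/v1/pipelines/pipeline_batch_info.py.
from dataclasses import dataclass, field
from typing import Any, Optional

@dataclass
class ToyForwardBatch:
    prompt: str
    num_inference_steps: int = 50
    prompt_embeds: Optional[Any] = None   # written by a text-encoding stage
    latents: Optional[Any] = None         # written by a latent-preparation stage
    extra: dict = field(default_factory=dict)

# Each stage reads the fields it needs and writes its outputs back onto the batch.
batch = ToyForwardBatch(prompt="a red panda surfing")
```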
Defined in `fastvideo/v1/forward_context.py`, `ForwardContext` manages execution-specific state *within* a forward pass, particularly for low-level optimizations. It is accessed via `get_forward_context()`.
- **Attention Metadata**: Configuration for optimized attention kernels (`attn_metadata`)
- **Profiling Data**: Potential hooks for performance metrics collection
This context-based approach enables:
- Dynamic optimization based on execution state (e.g., attention backend selection)
- Step-specific customizations within model components
```python
# During this forward pass, components can access context
# through get_forward_context()
output = model(inputs)
```
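For intuition, here is a minimal sketch of how a per-forward-pass context can be maintained. The class and setter names are illustrative placeholders, not FastVideo's actual implementation:

```python
# Illustrative sketch only: a context-manager-based forward context of the kind
# described above. ToyForwardContext and set_forward_context are placeholders.
from contextlib import contextmanager
from dataclasses import dataclass
from typing import Any, Optional

@dataclass
class ToyForwardContext:
    attn_metadata: Any = None
    current_timestep: Optional[int] = None

_forward_context: Optional[ToyForwardContext] = None

def get_forward_context() -> ToyForwardContext:
    assert _forward_context is not None, "not inside a forward pass"
    return _forward_context

@contextmanager
def set_forward_context(attn_metadata: Any, current_timestep: int):
    global _forward_context
    prev = _forward_context
    _forward_context = ToyForwardContext(attn_metadata, current_timestep)
    try:
        yield
    finally:
        _forward_context = prev
```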
(design-executor-and-worker-abstractions)=
## Executor and Worker System
The `fastvideo/v1/worker/` directory contains the distributed execution framework:
### Executor Abstraction
FastVideo implements a flexible execution model for distributed processing:
- **Executor Base Class**: An abstract base class defining the interface for all executors
- **MultiProcExecutor**: Primary implementation that spawns and manages worker processes
- **GPU Workers**: Handle actual model execution on individual GPUs
The MultiProcExecutor implementation:
1. Spawns worker processes for each GPU
2. Establishes communication channels via pipes
3. Coordinates distributed operations across workers
4. Handles graceful startup and shutdown of the process group
Each GPU worker:
1. Initializes the distributed environment
2. Builds the pipeline for the specified model
3. Executes requested operations on its assigned GPU
4. Manages local resources and communicates results back to the executor
This design allows FastVideo to efficiently utilize multiple GPUs while providing a simple, unified interface for model execution.
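As a minimal, self-contained sketch of this pattern (toy class and method names, not FastVideo's actual worker API), an executor that spawns one process per GPU and communicates over pipes might look like:

```python
# Illustrative sketch only: one worker process per GPU, driven over pipes.
import multiprocessing as mp

def _worker_loop(rank: int, conn) -> None:
    # In FastVideo, the worker would initialize the distributed environment
    # and build the pipeline for its assigned GPU here.
    while True:
        method, args = conn.recv()
        if method == "shutdown":
            conn.send("ok")
            break
        # Execute the requested operation and return the result.
        conn.send(f"rank {rank} ran {method} with {args}")

class ToyMultiProcExecutor:
    def __init__(self, num_gpus: int) -> None:
        self.conns, self.procs = [], []
        for rank in range(num_gpus):
            parent, child = mp.Pipe()
            proc = mp.Process(target=_worker_loop, args=(rank, child), daemon=True)
            proc.start()
            self.conns.append(parent)
            self.procs.append(proc)

    def collective_rpc(self, method: str, *args):
        # Broadcast the request to every worker, then gather the replies.
        for conn in self.conns:
            conn.send((method, args))
        return [conn.recv() for conn in self.conns]

    def shutdown(self) -> None:
        self.collective_rpc("shutdown")
        for proc in self.procs:
            proc.join()

if __name__ == "__main__":
    executor = ToyMultiProcExecutor(num_gpus=2)
    print(executor.collective_rpc("generate", "a cat playing piano"))
    executor.shutdown()
```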
(design-platforms)=
## Platforms
The `fastvideo/v1/platforms/` directory provides hardware platform abstractions that enable FastVideo to run efficiently on different hardware configurations:
### Platform Abstraction
FastVideo's platform abstraction layer enables:
- **Hardware Detection**: Automatic detection of available hardware
- **Backend Selection**: Appropriate selection of compute kernels
- **Memory Management**: Efficient utilization of hardware-specific memory features
The primary components include:
- **Platform Interface**: Defines the common API for all platform implementations
- **CUDA Platform**: Optimized implementation for NVIDIA GPUs
- **Backend Enum**: Used throughout the codebase for feature selection
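A minimal sketch of that abstraction follows; the enum values and method names are illustrative, not FastVideo's actual interface:

```python
# Illustrative sketch only: a platform interface plus a backend enum of the
# kind described above. Names are placeholders, not FastVideo's actual API.
import enum
from abc import ABC, abstractmethod

class AttentionBackend(enum.Enum):
    TORCH_SDPA = "torch_sdpa"
    FLASH_ATTN = "flash_attn"

class Platform(ABC):
    @abstractmethod
    def get_device_name(self, device_id: int = 0) -> str:
        ...

    @abstractmethod
    def get_attn_backend(self) -> AttentionBackend:
        ...

class CudaPlatform(Platform):
    def get_device_name(self, device_id: int = 0) -> str:
        import torch  # assumes PyTorch with CUDA support is installed
        return torch.cuda.get_device_name(device_id)

    def get_attn_backend(self) -> AttentionBackend:
        return AttentionBackend.FLASH_ATTN
```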
```python
    generate_main_index (bool): Whether to generate the main examples index.
        If False, only category-specific indices will be generated.
    """
    # Create empty indices with dynamic paths
    main_index_dir = ROOT_DIR / "docs/source/examples"
    if not main_index_dir.exists():
        main_index_dir.mkdir(parents=True)

    # Create the main examples index only if requested
    examples_index = None
    if generate_main_index:
        examples_index = Index(
            path=main_index_dir / "examples_index.md",
            title="💡 Examples",
            description=
            "A collection of examples demonstrating usage of FastVideo.\nAll documented examples are autogenerated using <gh-file:docs/source/generate_examples.py> from examples found in <gh-file:examples>.",  # noqa: E501
            caption="Examples",
            maxdepth=2)

    # Category indices with dynamic paths based on category names
    "Inference examples demonstrate how to use FastVideo in an offline setting, where the model is queried for predictions in batches. We recommend starting with <project:basic.md>.",  # noqa: E501
```
FastVideo currently only supports Linux and NVIDIA CUDA GPUs.
FastVideo has been tested on the following GPUs, but it should work on any GPU that supports CUDA 12.4+. Please create an issue if you run into problems:
- RTX 4090
- A40
- L40S
- A100
- H100
## Requirements
- OS: Linux
- Python: 3.10-3.12
- CUDA 12.4+
## Installation Options
### Option 1: Quick Install
```bash
pip install fastvideo
```
### Option 2: Installation from Source
We recommend using a Python environment such as Conda.
#### 1. [Optional] Install Miniconda (if not already installed)
Use the following scripts to run inference for StepVideo. When using STA for inference, the generated videos will have dimensions of 204×768×768 (currently, this is the only supported shape).
```bash
sh scripts/inference/inference_stepvideo_STA.sh # Inference stepvideo with STA
sh scripts/inference/inference_stepvideo.sh # Inference original stepvideo
```
## Inference HunyuanVideo with Sliding Tile Attention
We provide two examples in the following script to run inference with STA + [TeaCache](https://github.com/ali-vilab/TeaCache) and STA only.
```bash
sh scripts/inference/inference_hunyuan_STA.sh
```
## Video Demos using STA + TeaCache
Visit our [demo website](https://fast-video.github.io/) to explore our complete collection of examples. With STA + TeaCache, a single video generation is shortened from 945s to 317s on an H100.
## Inference FastHunyuan on single RTX4090
We now support NF4 and LLM-INT8 quantized inference using BitsAndBytes for FastHunyuan. With NF4 quantization, inference can be performed on a single RTX 4090 GPU, requiring just 20GB of VRAM.
For improved quality in generated videos, we recommend using a GPU with 80GB of memory to run the BF16 model with the original Hunyuan pipeline. To execute the inference, use the following section: