chore: release v0.3.0

e2116d7b · Muyang Li · GitHub · 6098c419 · d94c2078 · e2116d7b
Unverified Commit e2116d7b authored Jun 01, 2025 by Muyang Li Committed by GitHub Jun 01, 2025
20 changed files
--- a/.clang-format
+++ b/.clang-format
@@ -3,36 +3,26 @@ IndentWidth:   4              # 4‑space indents everywhere
 TabWidth: 4
 UseTab: Never # never convert to tabs
 ColumnLimit: 120
 AccessModifierOffset: -4
 BreakBeforeBraces: Attach # `void foo() {` — brace on same line
 BraceWrapping:
  AfterNamespace: false # `namespace x {` on same line
  SplitEmptyFunction: false
  SplitEmptyRecord: false
  SplitEmptyNamespace: false
 PointerAlignment: Right # `int *ptr`, `const Foo *bar`
 ReferenceAlignment: Pointer # `int &ref` -> same rule as pointers
 SortIncludes: false # keep the hand‑crafted include order
 IncludeBlocks: Preserve
 SortUsingDeclarations: false
 IndentPPDirectives: None # keep `#pragma` / `#if` at column 0
 AllowShortFunctionsOnASingleLine: Empty
 AllowShortIfStatementsOnASingleLine: false
 AllowShortBlocksOnASingleLine: false
 BinPackParameters: false # one parameter per line (as written)
 BinPackArguments: false
 AlignAfterOpenBracket: Align # preserve the current hanging‑indent style
 AlignConsecutiveAssignments: true
 AlignConsecutiveDeclarations: false
 SpaceAfterTemplateKeyword: false
 BreakTemplateDeclarations: Yes
--- a/.github/ISSUE_TEMPLATE/1-bug-report.yml
+++ b/.github/ISSUE_TEMPLATE/1-bug-report.yml
@@ -3,9 +3,8 @@ name: 🐞 Bug report
 description: Create a report to help us reproduce and fix the bug
 title: "[Bug] "
 labels: ['Bug']
 body:
- type: checkboxes
+  - type: checkboxes
    attributes:
      label: Checklist
      options:
@@ -15,13 +14,13 @@ body:
        - label: 4. If your report is a question rather than a bug, please submit it as a discussion at https://github.com/mit-han-lab/nunchaku/discussions/new/choose. Otherwise, this issue will be closed.
        - label: 5. If this is related to ComfyUI, please report it at https://github.com/mit-han-lab/ComfyUI-nunchaku/issues.
        - label: 6. I will do my best to describe the issue in English.
- type: textarea
+  - type: textarea
    attributes:
      label: Describe the Bug
      description: Provide a clear and concise explanation of the bug you encountered.
    validations:
      required: true
- type: textarea
+  - type: textarea
    attributes:
      label: Environment
      description: |
@@ -29,7 +28,7 @@ body:
      placeholder: "Example: Ubuntu 24.04, Python 3.11, PyTorch 2.6, CUDA 12.4"
    validations:
      required: true
- type: textarea
+  - type: textarea
    attributes:
      label: Reproduction Steps
      description: |

--- a/.github/ISSUE_TEMPLATE/2-feature-request.yml
+++ b/.github/ISSUE_TEMPLATE/2-feature-request.yml
@@ -2,22 +2,21 @@
 name: 🚀 Feature request
 description: Suggest an idea for this project
 title: "[Feature] "
 body:
- type: checkboxes
+  - type: checkboxes
    attributes:
      label: Checklist
      options:
        - label: 1. If the issue you raised is not a feature but a question, please raise a discussion at https://github.com/mit-han-lab/nunchaku/discussions/new/choose. Otherwise, it will be closed.
        - label: 2. I will do my best to describe the issue in English.
- type: textarea
+  - type: textarea
    attributes:
      label: Motivation
      description: |
        A clear and concise description of the motivation of the feature.
    validations:
      required: true
- type: textarea
+  - type: textarea
    attributes:
      label: Related resources
      description: |

--- a/.github/workflows/auto-merge-main-into-dev.yaml
+++ b/.github/workflows/auto-merge-main-into-dev.yaml
 name: Auto-merge main into dev
 on:
  workflow_dispatch:
  push:
    branches:
      - main
 permissions:
  contents: write
 jobs:
  merge-main-into-dev:
    runs-on: ubuntu-latest
    if: github.repository == 'mit-han-lab/nunchaku'
    steps:
      - name: Checkout the repository
        uses: actions/checkout@v4
        with:
          fetch-depth: 0
          token: ${{ secrets.GH_TOKEN }}
      - name: Check if main and dev are already in sync
        id: check_sync
        run: |
@@ -36,7 +31,6 @@ jobs:
            echo "Branches differ. Proceeding with merge."
            echo "skip_merge=false" >> "$GITHUB_OUTPUT"
          fi
      - name: Merge main into dev
        id: last_commit
        if: steps.check_sync.outputs.skip_merge == 'false'

--- a/.github/workflows/clean-nightly-releases.yaml
+++ b/.github/workflows/clean-nightly-releases.yaml
 name: Clean Old Nightly Releases
 on:
  schedule:
    # Once a day at 06:00 UTC. The minute field must be a fixed value:
    # `*` there would match every minute of the 06:00 hour.
    - cron: '0 6 * * *'
  workflow_dispatch:
 permissions:
  contents: write
 jobs:
  cleanup:
    name: Delete old nightly releases and tags
    runs-on: ubuntu-latest
    if: github.repository == 'mit-han-lab/nunchaku'
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4
      - name: List all nightly releases
        id: list
        run: |
@@ -26,14 +21,12 @@ jobs:
          echo "Found $(wc -l < nightly_tags.txt) nightly releases."
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Trim to old tags beyond latest 30
        id: filter
        run: |
          tail -n +31 nightly_tags.txt > to_delete.txt || true
          echo "Tags to delete:"
          cat to_delete.txt || echo "(none)"
      - name: Delete releases and tags
        run: |
          while read tag; do
@@ -43,6 +36,5 @@ jobs:
          done < to_delete.txt
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Done
        run: echo "Nightly cleanup completed."
--- a/.github/workflows/close-inactive-issues.yaml
+++ b/.github/workflows/close-inactive-issues.yaml
 # Borrowed from https://github.com/sgl-project/sglang/blob/main/.github/workflows/close-inactive-issues.yml
 name: Close Inactive Issues
 on:
  schedule:
    - cron: '0 0 * * *'
  workflow_dispatch:
 permissions:
  issues: write
  contents: read
 jobs:
  close-inactive-issues:
    if: github.repository == 'mit-han-lab/nunchaku'

--- a/.github/workflows/lint.yaml
+++ b/.github/workflows/lint.yaml
 # Runs the repository's pre-commit hooks against every file in CI.
 name: Lint
 on:
  # Pushes to main and dev, plus every pull request.
  push:
    branches:
      - main
      - dev
  pull_request:
 jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        # v5 runs on the Node 20 action runtime; v4 targets the deprecated Node 16 runtime.
        uses: actions/setup-python@v5
        with:
          # Quoted so the version stays a string rather than a float.
          python-version: '3.11'
      - name: Install pre-commit hook
        run: |
          python -m pip install pre-commit
          pre-commit install
      - name: Linting
        # --all-files checks the whole tree, not only the files changed by the triggering commit.
        run: pre-commit run --all-files
--- a/.github/workflows/nightly-build.yaml
+++ b/.github/workflows/nightly-build.yaml
 name: Nightly Build
 on:
  schedule:
    - cron: '0 8 * * *' # UTC time
  workflow_dispatch:
 permissions:
  contents: write
 jobs:
  tag:
    name: Tag dev branch if dev version
@@ -22,51 +19,38 @@ jobs:
        with:
          fetch-depth: 0
          ref: dev
      - name: Extract version from __version__.py
        id: version
        run: |
          version=$(grep '__version__' nunchaku/__version__.py | sed -E 's/.*"([^"]+)".*/\1/')
          echo "Extracted version: $version"
          echo "version=$version" >> "$GITHUB_OUTPUT"
+      - name: Determine if build is needed
-      - name: Check if version contains 'dev'
        id: check
        run: |
-          if [[ "${{ steps.version.outputs.version }}" == *dev* ]]; then
+          version="${{ steps.version.outputs.version }}"
-            echo "need_build=true" >> "$GITHUB_OUTPUT"
+          need_build=false
-          else
+          if [[ "$version" == *dev* ]]; then
-            echo "need_build=false" >> "$GITHUB_OUTPUT"
+            echo "Version contains 'dev'"
-          fi
+            prefix="v$version"
-      - name: Get latest tag with same version prefix
-        id: last_tag
-        if: steps.check.outputs.need_build == 'true'
-        run: |
-          prefix="v${{ steps.version.outputs.version }}"
            tag=$(git tag --list "${prefix}*" --sort=-creatordate | head -n 1 || echo "")
-          echo "latest_tag=$tag" >> "$GITHUB_OUTPUT"
-      - name: Check if current commit is new
-        id: check_commit_diff
-        if: steps.check.outputs.need_build == 'true'
-        run: |
-          tag=${{ steps.last_tag.outputs.latest_tag }}
            if [ -z "$tag" ]; then
              echo "No previous tag found."
-            echo "need_build=true" >> "$GITHUB_OUTPUT"
+              need_build=true
            else
              base=$(git rev-parse "$tag")
              head=$(git rev-parse HEAD)
-            if [ "$base" = "$head" ]; then
+              if [ "$base" != "$head" ]; then
-              echo "No new commits since $tag"
-              echo "need_build=false" >> "$GITHUB_OUTPUT"
-            else
                echo "New commits found since $tag"
-              echo "need_build=true" >> "$GITHUB_OUTPUT"
+                need_build=true
+              else
+                echo "No new commits since $tag"
              fi
            fi
+          else
+            echo "Version does not contain 'dev'"
+          fi
+          echo "need_build=$need_build" >> "$GITHUB_OUTPUT"
      - name: Set tag name
        id: tag
        if: steps.check.outputs.need_build == 'true'
@@ -75,7 +59,6 @@ jobs:
          tag_name="v${{ steps.version.outputs.version }}$today"
          echo "tag_name=$tag_name"
          echo "tag_name=$tag_name" >> "$GITHUB_OUTPUT"
      - name: Create and push tag
        if: steps.check.outputs.need_build == 'true'
        run: |
@@ -83,11 +66,9 @@ jobs:
          git config user.email "github-actions@users.noreply.github.com"
          git tag ${{ steps.tag.outputs.tag_name }}
          git push origin ${{ steps.tag.outputs.tag_name }}
      - name: Skip tagging (version is not dev or no new commits)
        if: steps.check.outputs.need_build == 'false'
-        run: echo "Version is not a dev version. Skipping tag."
+        run: echo "Version is not a dev version or no new commits. Skipping tag."
  linux-wheels:
    name: Build the linux nightly wheels
    runs-on: [self-hosted, linux-build]
@@ -97,7 +78,6 @@ jobs:
      matrix:
        python: ["3.10", "3.11", "3.12"]
        torch: ["2.5", "2.6", "2.7"]
    steps:
      - name: Checkout to the tag
        uses: actions/checkout@v4
@@ -105,10 +85,8 @@ jobs:
          fetch-depth: 0
          ref: ${{ needs.tag.outputs.tag_name }}
          submodules: true
      - name: Show current commit
        run: git log -1 --oneline
      - name: Build wheels
        run: |
          if [[ "${{ matrix.torch }}" == "2.7" ]]; then
@@ -117,7 +95,6 @@ jobs:
            cuda_version="12.4"
          fi
          bash scripts/build_linux_wheel.sh ${{ matrix.python }} ${{ matrix.torch }} $cuda_version
      - name: Upload wheels to GitHub Release
        uses: softprops/action-gh-release@v2
        with:
@@ -127,21 +104,18 @@ jobs:
          prerelease: true
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Clean up
        if: always() && github.repository == 'mit-han-lab/nunchaku'
        run: bash scripts/linux_cleanup.sh
  windows-wheels:
    name: Build the windows nightly wheels
-    runs-on: [ self-hosted, windows-build ]
+    runs-on: [self-hosted, windows-build]
    needs: tag
    if: needs.tag.outputs.need_build == 'true' && github.repository == 'mit-han-lab/nunchaku'
    strategy:
      matrix:
-        python: [ "3.10", "3.11", "3.12" ]
+        python: ["3.10", "3.11", "3.12"]
-        torch: [ "2.5", "2.6", "2.7" ]
+        torch: ["2.5", "2.6", "2.7"]
    steps:
      - name: Checkout to the tag
        uses: actions/checkout@v4
@@ -149,10 +123,8 @@ jobs:
          fetch-depth: 0
          ref: ${{ needs.tag.outputs.tag_name }}
          submodules: true
      - name: Show current commit
        run: git log -1 --oneline
      - name: Build wheels
        shell: cmd
        run: |
@@ -164,7 +136,6 @@ jobs:
          )
          call C:\Users\muyangl\miniconda3\condabin\activate.bat activate
          call scripts\build_windows_wheel.cmd ${{ matrix.python }} %TORCH_VERSION% %CUDA_VERSION%
      - name: Upload wheels to GitHub Release
        uses: softprops/action-gh-release@v2
        with:

--- a/.github/workflows/release-build.yaml
+++ b/.github/workflows/release-build.yaml
 name: Release Build
 on:
  workflow_dispatch:
 permissions:
  contents: write
 jobs:
  release:
    name: Tag Main Branch and Create Release
@@ -19,14 +16,12 @@ jobs:
        with:
          fetch-depth: 0
          ref: main
      - name: Extract version from __version__.py
        id: version
        run: |
          version=$(grep '__version__' nunchaku/__version__.py | sed -E 's/.*"([^"]+)".*/\1/')
          echo "Extracted version: $version"
          echo "version=$version" >> "$GITHUB_OUTPUT"
      - name: Create and push tag
        id: tag
        run: |
@@ -36,7 +31,6 @@ jobs:
          git tag $tag_name
          git push origin $tag_name
          echo "tag_name=$tag_name" >> "$GITHUB_OUTPUT"
  linux-wheels:
    name: Build the linux release wheels
    runs-on: [self-hosted, linux-build]
@@ -45,7 +39,6 @@ jobs:
      matrix:
        python: ["3.10", "3.11", "3.12"]
        torch: ["2.5", "2.6", "2.7"]
    steps:
      - name: Checkout to the tag
        uses: actions/checkout@v4
@@ -53,10 +46,8 @@ jobs:
          fetch-depth: 0
          ref: ${{ needs.release.outputs.tag_name }}
          submodules: true
      - name: Show current commit
        run: git log -1 --oneline
      - name: Build wheels
        run: |
          if [[ "${{ matrix.torch }}" == "2.7" ]]; then
@@ -65,7 +56,6 @@ jobs:
            cuda_version="12.4"
          fi
          bash scripts/build_linux_wheel.sh ${{ matrix.python }} ${{ matrix.torch }} $cuda_version
      - name: Upload wheels to GitHub Release
        uses: softprops/action-gh-release@v2
        with:
@@ -75,20 +65,17 @@ jobs:
          prerelease: false
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: Clean up
        if: always()
        run: bash scripts/linux_cleanup.sh
  windows-wheels:
    name: Build the windows release wheels
-    runs-on: [ self-hosted, windows-build ]
+    runs-on: [self-hosted, windows-build]
    needs: release
    strategy:
      matrix:
-        python: [ "3.10", "3.11", "3.12" ]
+        python: ["3.10", "3.11", "3.12"]
-        torch: [ "2.5", "2.6", "2.7" ]
+        torch: ["2.5", "2.6", "2.7"]
    steps:
      - name: Checkout to the tag
        uses: actions/checkout@v4
@@ -96,10 +83,8 @@ jobs:
          fetch-depth: 0
          ref: ${{ needs.release.outputs.tag_name }}
          submodules: true
      - name: Show current commit
        run: git log -1 --oneline
      - name: Build wheels
        shell: cmd
        run: |
@@ -111,7 +96,6 @@ jobs:
          )
          call C:\Users\muyangl\miniconda3\condabin\activate.bat activate
          call scripts\build_windows_wheel.cmd ${{ matrix.python }} %TORCH_VERSION% %CUDA_VERSION%
      - name: Upload wheels to GitHub Release
        uses: softprops/action-gh-release@v2
        with:

--- a/.github/workflows/sync-to-private.yml
+++ b/.github/workflows/sync-to-private.yml
 name: Synchronize to Private Repository
 on:
  workflow_dispatch:
  push:
    branches:
      - dev
 permissions:
  contents: write
 jobs:
  cherry-pick-commits:
    runs-on: ubuntu-latest
    if: github.repository == 'mit-han-lab/nunchaku'
    steps:
      - name: Clone private repository
        run: |
          git clone https://x-access-token:${{ secrets.GH_TOKEN }}@github.com/mit-han-lab/nunchaku-dev.git
      - name: Add public remote and fetch
        run: |
          cd nunchaku-dev
          git remote add public https://x-access-token:${{ secrets.GH_TOKEN }}@github.com/mit-han-lab/nunchaku.git
          git fetch public dev
      - name: Cherry-pick latest commit from public/dev
        run: |
          set -e
@@ -94,7 +88,6 @@ jobs:
          done
          git commit --amend --allow-empty -m "$NEW_MSG" --author="$GIT_AUTHOR_NAME <$GIT_AUTHOR_EMAIL>"
      - name: Push to the private main branch
        run: |
          cd nunchaku-dev

--- a/.github/workflows/test-ampere.yaml
+++ b/.github/workflows/test-ampere.yaml
 name: Ampere Tests
 on:
  workflow_dispatch:
    inputs:
@@ -10,11 +9,9 @@ on:
        options:
          - pr
          - branch
      pr_number:
        description: 'Pull Request Number (only if test_target == "pr")'
        required: false
      branch_name:
        description: 'Branch name (only if test_target == "branch")'
        default: 'main'
@@ -39,11 +36,10 @@ on:
 concurrency:
  group: ${{ github.repository }}-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
 jobs:
  check-comment:
    if: ${{ github.event_name == 'workflow_dispatch' || (github.event_name == 'issue_comment' && github.event.issue.pull_request && !github.event.pull_request.draft) }}
-    runs-on: [ self-hosted, ampere ]
+    runs-on: [self-hosted, ampere]
    outputs:
      should_run: ${{ steps.check.outputs.should_run }}
    steps:
@@ -56,12 +52,10 @@ jobs:
          else
            echo "should_run=false" >> $GITHUB_OUTPUT
          fi
  run-tests:
-    runs-on: [ self-hosted, ampere ]
+    runs-on: [self-hosted, ampere]
-    needs: [ check-comment ]
+    needs: [check-comment]
    if: ${{ github.event_name != 'issue_comment' || needs.check-comment.outputs.should_run == 'true' }}
    steps:
      - name: Determine ref
        id: set-ref
@@ -76,16 +70,13 @@ jobs:
        with:
          ref: ${{ steps.set-ref.outputs.ref }}
          submodules: true
      - name: Show current commit
        run: git log -1 --oneline
      - name: Set up Python
        run: |
          which python
          echo "Setting up Python with Conda"
          conda create -n test_env python=3.11 -y
      - name: Install dependencies
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
@@ -95,7 +86,6 @@ jobs:
          echo "Installing dependencies"
          pip install torch==2.7 torchvision==0.22 torchaudio==2.7 --index-url https://download.pytorch.org/whl/cu128
          pip install ninja wheel diffusers==0.33.1 transformers==4.51 accelerate==1.7 sentencepiece==0.2 protobuf==6.31 huggingface_hub==0.31
      - name: Build
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
@@ -103,7 +93,6 @@ jobs:
          which python
          NUNCHAKU_INSTALL_MODE=ALL python setup.py develop
          pip install -r tests/requirements.txt
      - name: Setup ComfyUI
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
@@ -127,7 +116,6 @@ jobs:
          pip install -r nunchaku_tests/requirements.txt
          HF_TOKEN=${{ secrets.HF_TOKEN }} python custom_nodes/ComfyUI-nunchaku/scripts/download_models.py
          HF_TOKEN=${{ secrets.HF_TOKEN }} python custom_nodes/ComfyUI-nunchaku/scripts/download_test_data.py
      - name: Run ComfyUI tests
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
@@ -136,7 +124,6 @@ jobs:
          cd ../ComfyUI
          python nunchaku_tests/scripts/nunchaku_flux1_dev.py
          pytest -v nunchaku_tests/
      - name: Nunchaku FLUX memory tests
        run: |
          pwd
@@ -144,28 +131,24 @@ jobs:
          conda activate test_env || { echo "Failed to activate conda env"; exit 1; }
          which python
          NUNCHAKU_TEST_CACHE_ROOT=${{ secrets.NUNCHAKU_TEST_CACHE_ROOT_AMPERE }} HF_TOKEN=${{ secrets.HF_TOKEN }} pytest -v tests/flux/test_flux_memory.py
      - name: Nunchaku FLUX example tests
        # Secrets are handed over through the step environment instead of being
        # spliced into the script text, so their contents are never parsed by the shell.
        env:
          NUNCHAKU_TEST_CACHE_ROOT: ${{ secrets.NUNCHAKU_TEST_CACHE_ROOT_AMPERE }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
          conda activate test_env || { echo "Failed to activate conda env"; exit 1; }
          which python
          pytest -v tests/flux/test_flux_examples.py
      - name: Nunchaku FLUX other tests
        # Secrets are handed over through the step environment instead of being
        # spliced into the script text, so their contents are never parsed by the shell.
        env:
          NUNCHAKU_TEST_CACHE_ROOT: ${{ secrets.NUNCHAKU_TEST_CACHE_ROOT_AMPERE }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
          conda activate test_env || { echo "Failed to activate conda env"; exit 1; }
          which python
          # Everything under tests/flux except the memory and example suites.
          pytest -v tests/flux --ignore=tests/flux/test_flux_memory.py --ignore=tests/flux/test_flux_examples.py
      - name: Nunchaku SANA tests
        # Secrets are handed over through the step environment instead of being
        # spliced into the script text, so their contents are never parsed by the shell.
        env:
          NUNCHAKU_TEST_CACHE_ROOT: ${{ secrets.NUNCHAKU_TEST_CACHE_ROOT_AMPERE }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
          conda activate test_env || { echo "Failed to activate conda env"; exit 1; }
          which python
          pytest -v tests/sana
      - name: clean up
        if: always() && (github.event_name != 'issue_comment' || needs.check-comment.outputs.should_run == 'true')
        run: |

--- a/.github/workflows/test-blackwell.yaml
+++ b/.github/workflows/test-blackwell.yaml
 name: Blackwell Tests
 on:
  workflow_dispatch:
    inputs:
@@ -10,11 +9,9 @@ on:
        options:
          - pr
          - branch
      pr_number:
        description: 'Pull Request Number (only if test_target == "pr")'
        required: false
      branch_name:
        description: 'Branch name (only if test_target == "branch")'
        default: 'main'
@@ -39,11 +36,10 @@ on:
 concurrency:
  group: ${{ github.repository }}-${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true
 jobs:
  check-comment:
    if: ${{ github.event_name == 'workflow_dispatch' || (github.event_name == 'issue_comment' && github.event.issue.pull_request && !github.event.pull_request.draft) }}
-    runs-on: [ self-hosted, blackwell ]
+    runs-on: [self-hosted, blackwell]
    outputs:
      should_run: ${{ steps.check.outputs.should_run }}
    steps:
@@ -56,12 +52,10 @@ jobs:
          else
            echo "should_run=false" >> $GITHUB_OUTPUT
          fi
  run-tests:
-    runs-on: [ self-hosted, blackwell ]
+    runs-on: [self-hosted, blackwell]
-    needs: [ check-comment ]
+    needs: [check-comment]
    if: ${{ github.event_name != 'issue_comment' || needs.check-comment.outputs.should_run == 'true' }}
    steps:
      - name: Determine ref
        id: set-ref
@@ -76,16 +70,13 @@ jobs:
        with:
          ref: ${{ steps.set-ref.outputs.ref }}
          submodules: true
      - name: Show current commit
        run: git log -1 --oneline
      - name: Set up Python
        run: |
          which python
          echo "Setting up Python with Conda"
          conda create -n test_env python=3.11 -y
      - name: Install dependencies
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
@@ -95,7 +86,6 @@ jobs:
          echo "Installing dependencies"
          pip install torch==2.7 torchvision==0.22 torchaudio==2.7 --index-url https://download.pytorch.org/whl/cu128
          pip install ninja wheel diffusers==0.33.1 transformers==4.51 accelerate==1.7 sentencepiece==0.2 protobuf==6.31 huggingface_hub==0.31
      - name: Build
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
@@ -103,7 +93,6 @@ jobs:
          which python
          NUNCHAKU_INSTALL_MODE=ALL python setup.py develop
          pip install -r tests/requirements.txt
      - name: Setup ComfyUI
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
@@ -127,7 +116,6 @@ jobs:
          pip install -r nunchaku_tests/requirements.txt
          HF_TOKEN=${{ secrets.HF_TOKEN }} python custom_nodes/ComfyUI-nunchaku/scripts/download_models.py
          HF_TOKEN=${{ secrets.HF_TOKEN }} python custom_nodes/ComfyUI-nunchaku/scripts/download_test_data.py
      - name: Run ComfyUI tests
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
@@ -136,7 +124,6 @@ jobs:
          cd ../ComfyUI
          python nunchaku_tests/scripts/nunchaku_flux1_dev.py
          pytest -v nunchaku_tests/
      - name: Nunchaku FLUX memory tests
        run: |
          pwd
@@ -144,28 +131,24 @@ jobs:
          conda activate test_env || { echo "Failed to activate conda env"; exit 1; }
          which python
          NUNCHAKU_TEST_CACHE_ROOT=${{ secrets.NUNCHAKU_TEST_CACHE_ROOT_BLACKWELL }} HF_TOKEN=${{ secrets.HF_TOKEN }} pytest -v tests/flux/test_flux_memory.py
      - name: Nunchaku FLUX example tests
        # Secrets are handed over through the step environment instead of being
        # spliced into the script text, so their contents are never parsed by the shell.
        env:
          NUNCHAKU_TEST_CACHE_ROOT: ${{ secrets.NUNCHAKU_TEST_CACHE_ROOT_BLACKWELL }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
          conda activate test_env || { echo "Failed to activate conda env"; exit 1; }
          which python
          pytest -v tests/flux/test_flux_examples.py
      - name: Nunchaku FLUX other tests
        # Secrets are handed over through the step environment instead of being
        # spliced into the script text, so their contents are never parsed by the shell.
        env:
          NUNCHAKU_TEST_CACHE_ROOT: ${{ secrets.NUNCHAKU_TEST_CACHE_ROOT_BLACKWELL }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
          conda activate test_env || { echo "Failed to activate conda env"; exit 1; }
          which python
          # Everything under tests/flux except the memory and example suites.
          pytest -v tests/flux --ignore=tests/flux/test_flux_memory.py --ignore=tests/flux/test_flux_examples.py
      - name: Nunchaku SANA tests
        # Secrets are handed over through the step environment instead of being
        # spliced into the script text, so their contents are never parsed by the shell.
        env:
          NUNCHAKU_TEST_CACHE_ROOT: ${{ secrets.NUNCHAKU_TEST_CACHE_ROOT_BLACKWELL }}
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          source $(conda info --base)/etc/profile.d/conda.sh
          conda activate test_env || { echo "Failed to activate conda env"; exit 1; }
          which python
          pytest -v tests/sana
      - name: clean up
        if: always() && (github.event_name != 'issue_comment' || needs.check-comment.outputs.should_run == 'true')
        run: |

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
 # Adapted from https://github.com/sgl-project/sglang/blob/main/.pre-commit-config.yaml
-default_stages: [ pre-commit, pre-push, manual ]
+default_stages: [pre-commit, pre-push, manual]
 repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v5.0.0
@@ -10,7 +9,7 @@ repos:
      - id: trailing-whitespace
      - id: end-of-file-fixer
      - id: check-yaml
-        args: [ --allow-multiple-documents ]
+        args: [--allow-multiple-documents]
      - id: check-toml
      - id: check-ast
      - id: check-added-large-files
@@ -27,7 +26,7 @@ repos:
    rev: v0.11.2
    hooks:
      - id: ruff
-        args: [ --fixable=F401 ]
+        args: [--fixable=F401]
        files: ^(nunchaku/|examples/|tests/|app/)
        exclude: \.ipynb$
  - repo: https://github.com/psf/black
@@ -35,14 +34,14 @@ repos:
    hooks:
      - id: black-jupyter
      - id: black
-        args: [ -l, "120" ]
+        args: [-l, "120"]
        files: ^(nunchaku/|examples/|tests/|app/)
  - repo: https://github.com/pre-commit/mirrors-clang-format
    rev: v20.1.3
    hooks:
      - id: clang-format
-        types_or: [ c++, cuda ]
+        types_or: [c++, cuda]
-        args: [ --style=file, --verbose ]
+        args: [--style=file, --verbose]
  - repo: https://github.com/kynan/nbstripout
    rev: 0.8.1
    hooks:
@@ -50,3 +49,12 @@ repos:
        args:
          - '--keep-output'
          - '--extra-keys=metadata.kernelspec metadata.language_info.version'
+  - repo: https://github.com/google/yamlfmt
+    rev: v0.17.0
+    hooks:
+      - id: yamlfmt
+  - repo: https://github.com/executablebooks/mdformat
+    rev: 0.7.22
+    hooks:
+      - id: mdformat
+        name: (Markdown) Format docs with mdformat
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
  <img src="https://raw.githubusercontent.com/mit-han-lab/nunchaku/477953fa1dd6f082fbec201cea7c7430117a810e/assets/nunchaku.svg" alt="logo" width="220"></img>
 </div>
 <h3 align="center">
-<a href="http://arxiv.org/abs/2411.05007"><b>Paper</b></a> | <a href="https://hanlab.mit.edu/projects/svdquant"><b>Website</b></a> | <a href="https://hanlab.mit.edu/blog/svdquant"><b>Blog</b></a> | <a href="https://svdquant.mit.edu"><b>Demo</b></a> | <a href="https://huggingface.co/collections/mit-han-lab/svdquant-67493c2c2e62a1fc6e93f45c"><b>HuggingFace</b></a> | <a href="https://modelscope.cn/collections/svdquant-468e8f780c2641"><b>ModelScope</b></a> | <a href="https://github.com/mit-han-lab/ComfyUI-nunchaku"><b>ComfyUI</b></a>
+<a href="http://arxiv.org/abs/2411.05007"><b>Paper</b></a> | <a href="https://hanlab.mit.edu/projects/svdquant"><b>Website</b></a> | <a href="https://hanlab.mit.edu/blog/svdquant"><b>Blog</b></a> | <a href="https://svdquant.mit.edu"><b>Demo</b></a> | <a href="https://huggingface.co/collections/mit-han-lab/nunchaku-6837e7498f680552f7bbb5ad"><b>HuggingFace</b></a> | <a href="https://modelscope.cn/collections/Nunchaku-519fed7f9de94e"><b>ModelScope</b></a> | <a href="https://github.com/mit-han-lab/ComfyUI-nunchaku"><b>ComfyUI</b></a>
 </h3>
 <h3 align="center">
@@ -15,23 +15,20 @@ Join our user groups on [**Slack**](https://join.slack.com/t/nunchaku/shared_inv
 ## News
+- **[2025-06-01]** 🚀 **Release v0.3.0!** Now supports [**ControlNet-Union-Pro 2.0**](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Union-Pro-2.0) and initial support for [**PuLID**](https://github.com/ToTheBeginning/PuLID). You can now load Nunchaku FLUX models as a single file, and our upgraded [**4-bit T5 encoder**](https://huggingface.co/mit-han-lab/nunchaku-t5) now matches **FP8 T5** in quality!
 - **[2025-04-16]** 🎥 Released tutorial videos in both [**English**](https://youtu.be/YHAVe-oM7U8?si=cM9zaby_aEHiFXk0) and [**Chinese**](https://www.bilibili.com/video/BV1BTocYjEk5/?share_source=copy_web&vd_source=8926212fef622f25cc95380515ac74ee) to assist installation and usage.
 - **[2025-04-09]** 📢 Published the [April roadmap](https://github.com/mit-han-lab/nunchaku/issues/266) and an [FAQ](https://github.com/mit-han-lab/nunchaku/discussions/262) to help the community get started and stay up to date with Nunchaku’s development.
 - **[2025-04-05]** 🚀 **Nunchaku v0.2.0 released!** This release brings [**multi-LoRA**](examples/flux.1-dev-multiple-lora.py) and [**ControlNet**](examples/flux.1-dev-controlnet-union-pro.py) support with even faster performance powered by [**FP16 attention**](#fp16-attention) and [**First-Block Cache**](#first-block-cache). We've also added compatibility for [**20-series GPUs**](examples/flux.1-dev-turing.py) — Nunchaku is now more accessible than ever!
- **[2025-03-17]** 🚀 Released NVFP4 4-bit [Shuttle-Jaguar](https://huggingface.co/mit-han-lab/svdq-int4-shuttle-jaguar) and FLUX.1-tools and also upgraded the INT4 FLUX.1-tool models. Download and update your models from our [HuggingFace](https://huggingface.co/collections/mit-han-lab/svdquant-67493c2c2e62a1fc6e93f45c) or [ModelScope](https://modelscope.cn/collections/svdquant-468e8f780c2641) collections!
- **[2025-03-13]** 📦 Separate the ComfyUI node into a [standalone repository](https://github.com/mit-han-lab/ComfyUI-nunchaku) for easier installation and release node v0.1.6! Plus, [4-bit Shuttle-Jaguar](https://huggingface.co/mit-han-lab/svdq-int4-shuttle-jaguar) is now fully supported!
 - **[2025-03-07]** 🚀 **Nunchaku v0.1.4 Released!** We've supported [4-bit text encoder and per-layer CPU offloading](#Low-Memory-Inference), reducing FLUX's minimum memory requirement to just **4 GiB** while maintaining a **2–3× speedup**. This update also fixes various issues related to resolution, LoRA, pin memory, and runtime stability. Check out the release notes for full details!
- **[2025-02-20]** 🚀 We release the [pre-built wheels](https://huggingface.co/mit-han-lab/nunchaku) to simplify installation! Check [here](#Installation) for the guidance!
- **[2025-02-20]** 🚀 **Support NVFP4 precision on NVIDIA RTX 5090!** NVFP4 delivers superior image quality compared to INT4, offering **~3× speedup** on the RTX 5090 over BF16. Learn more in our [blog](https://hanlab.mit.edu/blog/svdquant-nvfp4), checkout  [`examples`](./examples) for usage and try [our demo](https://svdquant.mit.edu/flux1-schnell/) online!
- **[2025-02-18]** 🔥 [**Customized LoRA conversion**](#Customized-LoRA) and [**model quantization**](#Customized-Model-Quantization) instructions are now available! **[ComfyUI](./comfyui)** workflows now support **customized LoRA**, along with **FLUX.1-Tools**!
- **[2025-02-11]** 🎉 **[SVDQuant](http://arxiv.org/abs/2411.05007) has been selected as a ICLR 2025 Spotlight! FLUX.1-tools Gradio demos are now available!** Check [here](#gradio-demos) for the usage details! Our new [depth-to-image demo](https://svdquant.mit.edu/flux1-depth-dev/) is also online—try it out!
 <details>
 <summary>More</summary>
+- **[2025-02-20]** 🚀 **Support NVFP4 precision on NVIDIA RTX 5090!** NVFP4 delivers superior image quality compared to INT4, offering **~3× speedup** on the RTX 5090 over BF16. Learn more in our [blog](https://hanlab.mit.edu/blog/svdquant-nvfp4), check out [`examples`](./examples) for usage and try [our demo](https://svdquant.mit.edu/flux1-schnell/) online!
+- **[2025-02-18]** 🔥 [**Customized LoRA conversion**](#Customized-LoRA) and [**model quantization**](#Customized-Model-Quantization) instructions are now available! **[ComfyUI](./comfyui)** workflows now support **customized LoRA**, along with **FLUX.1-Tools**!
+- **[2025-02-11]** 🎉 **[SVDQuant](http://arxiv.org/abs/2411.05007) has been selected as an ICLR 2025 Spotlight! FLUX.1-tools Gradio demos are now available!** Check [here](#gradio-demos) for the usage details! Our new [depth-to-image demo](https://svdquant.mit.edu/flux1-depth-dev/) is also online—try it out!
 - **[2025-02-04]** **🚀 4-bit [FLUX.1-tools](https://blackforestlabs.ai/flux-1-tools/) is here!** Enjoy a **2-3× speedup** over the original models. Check out the [examples](./examples) for usage. **ComfyUI integration is coming soon!**
- **[2025-01-23]** 🚀 **4-bit [SANA](https://nvlabs.github.io/Sana/) support is here!** Experience a 2-3× speedup compared to the 16-bit model. Check out the [usage example](./examples/sana_1600m_pag.py) and the [deployment guide](app/sana/t2i) for more details. Explore our live demo at [svdquant.mit.edu](https://svdquant.mit.edu)!
+- **[2025-01-23]** 🚀 **4-bit [SANA](https://nvlabs.github.io/Sana/) support is here!** Experience a 2-3× speedup compared to the 16-bit model. Check out the [usage example](examples/sana1.6b_pag.py) and the [deployment guide](app/sana/t2i) for more details. Explore our live demo at [svdquant.mit.edu](https://svdquant.mit.edu)!
 - **[2025-01-22]** 🎉 [**SVDQuant**](http://arxiv.org/abs/2411.05007) has been accepted to **ICLR 2025**!
 - **[2024-12-08]** Support [ComfyUI](https://github.com/comfyanonymous/ComfyUI). Please check [mit-han-lab/ComfyUI-nunchaku](https://github.com/mit-han-lab/ComfyUI-nunchaku) for the usage.
 - **[2024-11-07]** 🔥 Our latest **W4A4** Diffusion model quantization work [**SVDQuant**](https://hanlab.mit.edu/projects/svdquant) is publicly released! Check [**DeepCompressor**](https://github.com/mit-han-lab/deepcompressor) for the quantization library.
@@ -53,23 +50,24 @@ https://github.com/user-attachments/assets/fdd4ab68-6489-4c65-8768-259bd866e8f8
 #### Quantization Method -- SVDQuant
-![intuition](https://huggingface.co/mit-han-lab/nunchaku-artifacts/resolve/main/nunchaku/assets/intuition.gif)Overview of SVDQuant. Stage1: Originally, both the activation $\boldsymbol{X}$ and weights $\boldsymbol{W}$ contain outliers, making 4-bit quantization challenging.  Stage 2: We migrate the outliers from activations to weights, resulting in the updated activation $\hat{\boldsymbol{X}}$ and weights $\hat{\boldsymbol{W}}$. While $\hat{\boldsymbol{X}}$ becomes easier to quantize, $\hat{\boldsymbol{W}}$ now becomes more difficult. Stage 3: SVDQuant further decomposes $\hat{\boldsymbol{W}}$ into a low-rank component $\boldsymbol{L}_1\boldsymbol{L}_2$ and a residual $\hat{\boldsymbol{W}}-\boldsymbol{L}_1\boldsymbol{L}_2$ with SVD. Thus, the quantization difficulty is alleviated by the low-rank branch, which runs at 16-bit precision.
+![intuition](https://huggingface.co/mit-han-lab/nunchaku-artifacts/resolve/main/nunchaku/assets/intuition.gif)Overview of SVDQuant. Stage 1: Originally, both the activation $\boldsymbol{X}$ and weights $\boldsymbol{W}$ contain outliers, making 4-bit quantization challenging. Stage 2: We migrate the outliers from activations to weights, resulting in the updated activation $\hat{\boldsymbol{X}}$ and weights $\hat{\boldsymbol{W}}$. While $\hat{\boldsymbol{X}}$ becomes easier to quantize, $\hat{\boldsymbol{W}}$ now becomes more difficult. Stage 3: SVDQuant further decomposes $\hat{\boldsymbol{W}}$ into a low-rank component $\boldsymbol{L}_1\boldsymbol{L}_2$ and a residual $\hat{\boldsymbol{W}}-\boldsymbol{L}_1\boldsymbol{L}_2$ with SVD. Thus, the quantization difficulty is alleviated by the low-rank branch, which runs at 16-bit precision.
 #### Nunchaku Engine Design
 ![engine](https://huggingface.co/mit-han-lab/nunchaku-artifacts/resolve/main/nunchaku/assets/engine.jpg) (a) Naïvely running low-rank branch with rank 32 will introduce 57% latency overhead due to extra read of 16-bit inputs in *Down Projection* and extra write of 16-bit outputs in *Up Projection*. Nunchaku optimizes this overhead with kernel fusion. (b) *Down Projection* and *Quantize* kernels use the same input, while *Up Projection* and *4-Bit Compute* kernels share the same output. To reduce data movement overhead, we fuse the first two and the latter two kernels together.
 ## Performance
 ![efficiency](https://huggingface.co/mit-han-lab/nunchaku-artifacts/resolve/main/nunchaku/assets/efficiency.jpg)SVDQuant reduces the 12B FLUX.1 model size by 3.6× and cuts the 16-bit model's memory usage by 3.5×. With Nunchaku, our INT4 model runs 3.0× faster than the NF4 W4A16 baseline on both desktop and laptop NVIDIA RTX 4090 GPUs. Notably, on the laptop 4090, it achieves a total 10.1× speedup by eliminating CPU offloading. Our NVFP4 model is also 3.1× faster than both BF16 and NF4 on the RTX 5090 GPU.
 ## Installation
 We provide tutorial videos to help you install and use Nunchaku on Windows, available in both [**English**](https://youtu.be/YHAVe-oM7U8?si=cM9zaby_aEHiFXk0) and [**Chinese**](https://www.bilibili.com/video/BV1BTocYjEk5/?share_source=copy_web&vd_source=8926212fef622f25cc95380515ac74ee). You can also follow the corresponding step-by-step text guide at [`docs/setup_windows.md`](docs/setup_windows.md). If you run into issues, these resources are a good place to start.
 ### Wheels
 #### Prerequisites
 Before installation, ensure you have [PyTorch>=2.5](https://pytorch.org/) installed. For example, you can use the following command to install PyTorch 2.6:
 ```shell
@@ -77,6 +75,7 @@ pip install torch==2.6 torchvision==0.21 torchaudio==2.6
 ```
 #### Install nunchaku
 Once PyTorch is installed, you can directly install `nunchaku` from [Hugging Face](https://huggingface.co/mit-han-lab/nunchaku/tree/main), [ModelScope](https://modelscope.cn/models/Lmxyy1999/nunchaku) or [GitHub release](https://github.com/mit-han-lab/nunchaku/releases). Be sure to select the appropriate wheel for your Python and PyTorch version. For example, for Python 3.11 and PyTorch 2.6:
 ```shell
@@ -111,12 +110,11 @@ If you're using a Blackwell GPU (e.g., 50-series GPUs), install a wheel with PyT
 **Note**:
-*  Make sure your CUDA version is **at least 12.2 on Linux** and **at least 12.6 on Windows**. If you're using a Blackwell GPU (e.g., 50-series GPUs), CUDA **12.8 or higher is required**.
+- Make sure your CUDA version is **at least 12.2 on Linux** and **at least 12.6 on Windows**. If you're using a Blackwell GPU (e.g., 50-series GPUs), CUDA **12.8 or higher is required**.
-*  For Windows users, please refer to [this issue](https://github.com/mit-han-lab/nunchaku/issues/6) for the instruction. Please upgrade your MSVC compiler to the latest version.
-*  We currently support only NVIDIA GPUs with architectures sm_75 (Turing: RTX 2080), sm_86 (Ampere: RTX 3090, A6000), sm_89 (Ada: RTX 4090), and sm_80 (A100). See [this issue](https://github.com/mit-han-lab/nunchaku/issues/1) for more details.
+- For Windows users, please refer to [this issue](https://github.com/mit-han-lab/nunchaku/issues/6) for the instruction. Please upgrade your MSVC compiler to the latest version.
+- We currently support only NVIDIA GPUs with architectures sm_75 (Turing: RTX 2080), sm_86 (Ampere: RTX 3090, A6000), sm_89 (Ada: RTX 4090), and sm_80 (A100). See [this issue](https://github.com/mit-han-lab/nunchaku/issues/1) for more details.
 1. Install dependencies:
@@ -136,7 +134,7 @@ If you're using a Blackwell GPU (e.g., 50-series GPUs), install a wheel with PyT
   pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
   ```
-2. Install `nunchaku` package:
+1. Install `nunchaku` package:
   Make sure you have `gcc/g++>=11`. If you don't, you can install it via Conda on Linux:
   ```shell
@@ -175,7 +173,9 @@ from nunchaku import NunchakuFluxTransformer2dModel
 from nunchaku.utils import get_precision
 precision = get_precision()  # auto-detect whether your precision is 'int4' or 'fp4' based on your GPU
-transformer = NunchakuFluxTransformer2dModel.from_pretrained(f"mit-han-lab/svdq-{precision}-flux.1-dev")
+transformer = NunchakuFluxTransformer2dModel.from_pretrained(
+    f"mit-han-lab/nunchaku-flux.1-dev/svdq-{precision}_r32-flux.1-dev.safetensors"
+)
 pipeline = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", transformer=transformer, torch_dtype=torch.bfloat16
 ).to("cuda")
@@ -236,7 +236,9 @@ from nunchaku import NunchakuFluxTransformer2dModel
 from nunchaku.utils import get_precision
 precision = get_precision()  # auto-detect whether your precision is 'int4' or 'fp4' based on your GPU
-transformer = NunchakuFluxTransformer2dModel.from_pretrained(f"mit-han-lab/svdq-{precision}-flux.1-dev")
+transformer = NunchakuFluxTransformer2dModel.from_pretrained(
+    f"mit-han-lab/nunchaku-flux.1-dev/svdq-{precision}_r32-flux.1-dev.safetensors"
+)
 pipeline = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev", transformer=transformer, torch_dtype=torch.bfloat16
 ).to("cuda")
@@ -285,14 +287,14 @@ Please refer to [mit-han-lab/ComfyUI-nunchaku](https://github.com/mit-han-lab/Co
 ## Gradio Demos
-* FLUX.1 Models
+- FLUX.1 Models
-  * Text-to-image: see [`app/flux.1/t2i`](app/flux.1/t2i).
+  - Text-to-image: see [`app/flux.1/t2i`](app/flux.1/t2i).
-  * Sketch-to-Image ([pix2pix-Turbo](https://github.com/GaParmar/img2img-turbo)): see [`app/flux.1/sketch`](app/flux.1/sketch).
+  - Sketch-to-Image ([pix2pix-Turbo](https://github.com/GaParmar/img2img-turbo)): see [`app/flux.1/sketch`](app/flux.1/sketch).
-  * Depth/Canny-to-Image ([FLUX.1-tools](https://blackforestlabs.ai/flux-1-tools/)): see [`app/flux.1/depth_canny`](app/flux.1/depth_canny).
+  - Depth/Canny-to-Image ([FLUX.1-tools](https://blackforestlabs.ai/flux-1-tools/)): see [`app/flux.1/depth_canny`](app/flux.1/depth_canny).
-  * Inpainting ([FLUX.1-Fill-dev](https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev)): see [`app/flux.1/fill`](app/flux.1/fill).
+  - Inpainting ([FLUX.1-Fill-dev](https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev)): see [`app/flux.1/fill`](app/flux.1/fill).
-  * Redux ([FLUX.1-Redux-dev](https://huggingface.co/black-forest-labs/FLUX.1-Redux-dev)): see [`app/flux.1/redux`](app/flux.1/redux).
+  - Redux ([FLUX.1-Redux-dev](https://huggingface.co/black-forest-labs/FLUX.1-Redux-dev)): see [`app/flux.1/redux`](app/flux.1/redux).
-* SANA:
+- SANA:
-  * Text-to-image: see [`app/sana/t2i`](app/sana/t2i).
+  - Text-to-image: see [`app/sana/t2i`](app/sana/t2i).
 ## Customized Model Quantization
@@ -307,6 +309,7 @@ Please refer to [app/flux/t2i/README.md](app/flux/t2i/README.md) for instruction
 Please check [here](https://github.com/mit-han-lab/nunchaku/issues/266) for the roadmap for April.
 ## Contribution
 We warmly welcome contributions from the community! To get started, please refer to our [contribution guide](docs/contribution_guide.md) for instructions on how to contribute code to Nunchaku.
 ## Troubleshooting
@@ -319,13 +322,13 @@ For enterprises interested in adopting SVDQuant or Nunchaku, including technical
 ## Related Projects
-* [Efficient Spatially Sparse Inference for Conditional GANs and Diffusion Models](https://arxiv.org/abs/2211.02048), NeurIPS 2022 & T-PAMI 2023
+- [Efficient Spatially Sparse Inference for Conditional GANs and Diffusion Models](https://arxiv.org/abs/2211.02048), NeurIPS 2022 & T-PAMI 2023
-* [SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models](https://arxiv.org/abs/2211.10438), ICML 2023
+- [SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models](https://arxiv.org/abs/2211.10438), ICML 2023
-* [Q-Diffusion: Quantizing Diffusion Models](https://arxiv.org/abs/2302.04304), ICCV 2023
+- [Q-Diffusion: Quantizing Diffusion Models](https://arxiv.org/abs/2302.04304), ICCV 2023
-* [AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration](https://arxiv.org/abs/2306.00978), MLSys 2024
+- [AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration](https://arxiv.org/abs/2306.00978), MLSys 2024
-* [DistriFusion: Distributed Parallel Inference for High-Resolution Diffusion Models](https://arxiv.org/abs/2402.19481), CVPR 2024
+- [DistriFusion: Distributed Parallel Inference for High-Resolution Diffusion Models](https://arxiv.org/abs/2402.19481), CVPR 2024
-* [QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving](https://arxiv.org/abs/2405.04532), MLSys 2025
+- [QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving](https://arxiv.org/abs/2405.04532), MLSys 2025
-* [SANA: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformers](https://arxiv.org/abs/2410.10629), ICLR 2025
+- [SANA: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformers](https://arxiv.org/abs/2410.10629), ICLR 2025
 ## Citation

--- a/README_ZH.md
+++ b/README_ZH.md
@@ -2,7 +2,7 @@
  <img src="https://raw.githubusercontent.com/mit-han-lab/nunchaku/477953fa1dd6f082fbec201cea7c7430117a810e/assets/nunchaku.svg" alt="logo" width="220"></img>
 </div>
 <h3 align="center">
-<a href="http://arxiv.org/abs/2411.05007"><b>论文</b></a> | <a href="https://hanlab.mit.edu/projects/svdquant"><b>官网</b></a> | <a href="https://hanlab.mit.edu/blog/svdquant"><b>博客</b></a> | <a href="https://svdquant.mit.edu"><b>演示</b></a> | <a href="https://huggingface.co/collections/mit-han-lab/svdquant-67493c2c2e62a1fc6e93f45c"><b>HuggingFace</b></a> | <a href="https://modelscope.cn/collections/svdquant-468e8f780c2641"><b>ModelScope</b></a> | <a href="https://github.com/mit-han-lab/ComfyUI-nunchaku"><b>ComfyUI</b></a>
+<a href="http://arxiv.org/abs/2411.05007"><b>论文</b></a> | <a href="https://hanlab.mit.edu/projects/svdquant"><b>官网</b></a> | <a href="https://hanlab.mit.edu/blog/svdquant"><b>博客</b></a> | <a href="https://svdquant.mit.edu"><b>演示</b></a> | <a href="https://huggingface.co/collections/mit-han-lab/nunchaku-6837e7498f680552f7bbb5ad"><b>HuggingFace</b></a> | <a href="https://modelscope.cn/collections/Nunchaku-519fed7f9de94e"><b>ModelScope</b></a> | <a href="https://github.com/mit-han-lab/ComfyUI-nunchaku"><b>ComfyUI</b></a>
 </h3>
 <h3 align="center">
@@ -18,19 +18,16 @@
 - **[2025-04-09]** 🎥 发布了[**英文**](https://youtu.be/YHAVe-oM7U8?si=cM9zaby_aEHiFXk0)和[**中文**](https://www.bilibili.com/video/BV1BTocYjEk5/?share_source=copy_web&vd_source=8926212fef622f25cc95380515ac74ee)教程视频，协助安装和使用Nunchaku。
 - **[2025-04-09]** 📢 发布[四月开发路线图](https://github.com/mit-han-lab/nunchaku/issues/266)和[常见问题解答](https://github.com/mit-han-lab/nunchaku/discussions/262)，帮助社区快速上手并了解Nunchaku最新进展。
 - **[2025-04-05]** 🚀 **Nunchaku v0.2.0 发布！** 支持[**多LoRA融合**](examples/flux.1-dev-multiple-lora.py)和[**ControlNet**](examples/flux.1-dev-controlnet-union-pro.py)，通过[**FP16 attention**](#fp16-attention)和[**First-Block Cache**](#first-block-cache)实现更快的推理速度。新增[**20系显卡支持**](examples/flux.1-dev-turing.py)，覆盖更多用户！
- **[2025-03-17]** 🚀 发布NVFP4 4-bit量化版[Shuttle-Jaguar](https://huggingface.co/mit-han-lab/svdq-int4-shuttle-jaguar)和FLUX.1工具集，升级INT4 FLUX.1工具模型。从[HuggingFace](https://huggingface.co/collections/mit-han-lab/svdquant-67493c2c2e62a1fc6e93f45c)或[ModelScope](https://modelscope.cn/collections/svdquant-468e8f780c2641)下载更新！
- **[2025-03-13]** 📦 ComfyUI节点[独立仓库](https://github.com/mit-han-lab/ComfyUI-nunchaku)发布，安装更便捷！节点版本v0.1.6上线，全面支持[4-bit Shuttle-Jaguar](https://huggingface.co/mit-han-lab/svdq-int4-shuttle-jaguar)！
 - **[2025-03-07]** 🚀 **Nunchaku v0.1.4 发布！** 支持4-bit文本编码器和分层CPU offloading，FLUX最低显存需求降至**4 GiB**，同时保持**2–3倍加速**。修复分辨率、LoRA、内存锁定等稳定性问题，详情见更新日志！
- **[2025-02-20]** 🚀 发布[预编译wheel包](https://huggingface.co/mit-han-lab/nunchaku)，简化安装步骤！查看[安装指南](#安装指南)！
 - **[2025-02-20]** 🚀 **NVIDIA RTX 5090支持NVFP4精度！** 相比INT4，NVFP4画质更优，在RTX 5090上比BF16快**约3倍**。[博客详解](https://hanlab.mit.edu/blog/svdquant-nvfp4)，[示例代码](./examples)及[在线演示](https://svdquant.mit.edu/flux1-schnell/)已上线！
- **[2025-02-18]** 🔥 新增[自定义LoRA转换](#自定义lora)和[模型量化](#自定义模型量化)指南！[ComfyUI](./comfyui)工作流支持**自定义LoRA**及**FLUX.1工具集**！
+- **[2025-02-18]** 🔥 新增[自定义LoRA转换](#%E8%87%AA%E5%AE%9A%E4%B9%89lora)和[模型量化](#%E8%87%AA%E5%AE%9A%E4%B9%89%E6%A8%A1%E5%9E%8B%E9%87%8F%E5%8C%96)指南！[ComfyUI](./comfyui)工作流支持**自定义LoRA**及**FLUX.1工具集**！
- **[2025-02-11]** 🎉 **[SVDQuant](http://arxiv.org/abs/2411.05007)入选ICLR 2025 Spotlight！FLUX.1工具集使用演示上线！** [使用演示](#使用演示)已更新！[深度图生成演示](https://svdquant.mit.edu/flux1-depth-dev/)同步开放！
+- **[2025-02-11]** 🎉 **[SVDQuant](http://arxiv.org/abs/2411.05007)入选ICLR 2025 Spotlight！FLUX.1工具集使用演示上线！** [使用演示](#%E4%BD%BF%E7%94%A8%E6%BC%94%E7%A4%BA)已更新！[深度图生成演示](https://svdquant.mit.edu/flux1-depth-dev/)同步开放！
 <details>
 <summary>更多动态</summary>
 - **[2025-02-04]** **🚀 4-bit量化版[FLUX.1工具集](https://blackforestlabs.ai/flux-1-tools/)发布！** 相比原模型提速**2-3倍**。[示例代码](./examples)已更新，**ComfyUI支持即将到来！**
- **[2025-01-23]** 🚀 **支持4-bit量化[SANA](https://nvlabs.github.io/Sana/)！** 相比16位模型提速2-3倍。[使用示例](./examples/sana_1600m_pag.py)和[部署指南](app/sana/t2i)已发布，体验[在线演示](https://svdquant.mit.edu)！
+- **[2025-01-23]** 🚀 **支持4-bit量化[SANA](https://nvlabs.github.io/Sana/)！** 相比16位模型提速2-3倍。[使用示例](examples/sana1.6b_pag.py)和[部署指南](app/sana/t2i)已发布，体验[在线演示](https://svdquant.mit.edu)！
 - **[2025-01-22]** 🎉 [**SVDQuant**](http://arxiv.org/abs/2411.05007) 被 **ICLR 2025** 接收！
 - **[2024-12-08]** 支持 [ComfyUI](https://github.com/comfyanonymous/ComfyUI)，详情见 [mit-han-lab/ComfyUI-nunchaku](https://github.com/mit-han-lab/ComfyUI-nunchaku)。
 - **[2024-11-07]** 🔥 最新 **W4A4** 扩散模型量化工作 [**SVDQuant**](https://hanlab.mit.edu/projects/svdquant) 开源！量化库 [**DeepCompressor**](https://github.com/mit-han-lab/deepcompressor) 同步发布。
@@ -52,7 +49,7 @@ https://github.com/user-attachments/assets/fdd4ab68-6489-4c65-8768-259bd866e8f8
 #### 量化方法 -- SVDQuant
-![intuition](https://huggingface.co/mit-han-lab/nunchaku-artifacts/resolve/main/nunchaku/assets/intuition.gif)SVDQuant三阶段示意图。阶段1：原始激活 $\boldsymbol{X}$ 和权重 $\boldsymbol{W}$ 均含异常值，4-bit量化困难。阶段2：将激活异常值迁移至权重，得到更易量化的激活 $\hat{\boldsymbol{X}}$ 和更难量化的权重 $\hat{\boldsymbol{W}}$ 。阶段3：通过SVD将 $\hat{\boldsymbol{W}}$ 分解为低秩分量 $\boldsymbol{L}_1\boldsymbol{L}_2$ 和残差 $\hat{\boldsymbol{W}}-\boldsymbol{L}_1\boldsymbol{L}_2$ ，低秩分支以16位精度运行缓解量化难度。
+![intuition](https://huggingface.co/mit-han-lab/nunchaku-artifacts/resolve/main/nunchaku/assets/intuition.gif)SVDQuant三阶段示意图。阶段1：原始激活 $\boldsymbol{X}$ 和权重 $\boldsymbol{W}$ 均含异常值，4-bit量化困难。阶段2：将激活异常值迁移至权重，得到更易量化的激活 $\hat{\boldsymbol{X}}$ 和更难量化的权重 $\hat{\boldsymbol{W}}$ 。阶段3：通过SVD将 $\hat{\boldsymbol{W}}$ 分解为低秩分量 $\boldsymbol{L}_1\boldsymbol{L}_2$ 和残差 $\hat{\boldsymbol{W}}-\boldsymbol{L}_1\boldsymbol{L}_2$ ，低秩分支以16位精度运行缓解量化难度。
 #### Nunchaku引擎设计
@@ -69,6 +66,7 @@ https://github.com/user-attachments/assets/fdd4ab68-6489-4c65-8768-259bd866e8f8
 ### Wheel包安装
 #### 前置条件
 确保已安装 [PyTorch>=2.5](https://pytorch.org/)。例如：
 ```shell
@@ -76,6 +74,7 @@ pip install torch==2.6 torchvision==0.21 torchaudio==2.6
 ```
 #### 安装nunchaku
 从[Hugging Face](https://huggingface.co/mit-han-lab/nunchaku/tree/main)、[ModelScope](https://modelscope.cn/models/Lmxyy1999/nunchaku)或[GitHub release](https://github.com/mit-han-lab/nunchaku/releases)选择对应Python和PyTorch版本的wheel。例如Python 3.11和PyTorch 2.6：
 ```shell
@@ -110,9 +109,9 @@ pip install https://huggingface.co/mit-han-lab/nunchaku/resolve/main/nunchaku-0.
 **注意**：
-* Linux需CUDA≥12.2，Windows需CUDA≥12.6。Blackwell显卡需CUDA≥12.8。
+- Linux需CUDA≥12.2，Windows需CUDA≥12.6。Blackwell显卡需CUDA≥12.8。
-* Windows用户请参考[此问题](https://github.com/mit-han-lab/nunchaku/issues/6)升级MSVC编译器。
+- Windows用户请参考[此问题](https://github.com/mit-han-lab/nunchaku/issues/6)升级MSVC编译器。
-* 支持SM_75（Turing：RTX 2080）、SM_86（Ampere：RTX 3090）、SM_89（Ada：RTX 4090）、SM_80（A100）架构显卡，详见[此问题](https://github.com/mit-han-lab/nunchaku/issues/1)。
+- 支持SM_75（Turing：RTX 2080）、SM_86（Ampere：RTX 3090）、SM_89（Ada：RTX 4090）、SM_80（A100）架构显卡，详见[此问题](https://github.com/mit-han-lab/nunchaku/issues/1)。
 1. 安装依赖：
@@ -132,7 +131,7 @@ pip install https://huggingface.co/mit-han-lab/nunchaku/resolve/main/nunchaku-0.
   pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
   ```
-2. 编译安装：
+1. 编译安装：
   确保`gcc/g++≥11`。Linux用户可通过Conda安装：
   ```shell
@@ -179,7 +178,7 @@ image = pipeline("举着'Hello World'标牌的猫咪", num_inference_steps=50, g
 image.save(f"flux.1-dev-{precision}.png")
 ```
-**注意**：**Turing显卡用户（如20系列）**需设置`torch_dtype=torch.float16`并使用`nunchaku-fp16`注意力模块，完整示例见[`examples/flux.1-dev-turing.py`](examples/flux.1-dev-turing.py)。
+**注意**：**Turing显卡用户（如20系列）** 需设置`torch_dtype=torch.float16`并使用`nunchaku-fp16`注意力模块，完整示例见[`examples/flux.1-dev-turing.py`](examples/flux.1-dev-turing.py)。
 ### FP16 Attention
@@ -281,14 +280,14 @@ Nunchaku 支持 [FLUX.1-tools](https://blackforestlabs.ai/flux-1-tools/) 和 [FL
 ## 使用演示
-* FLUX.1 模型
+- FLUX.1 模型
-  * 文生图：见 [`app/flux.1/t2i`](app/flux.1/t2i)。
+  - 文生图：见 [`app/flux.1/t2i`](app/flux.1/t2i)。
-  * 草图生成图像 ([pix2pix-Turbo](https://github.com/GaParmar/img2img-turbo))：见 [`app/flux.1/sketch`](app/flux.1/sketch)。
+  - 草图生成图像 ([pix2pix-Turbo](https://github.com/GaParmar/img2img-turbo))：见 [`app/flux.1/sketch`](app/flux.1/sketch)。
-  * 深度/Canny 边缘生成图像 ([FLUX.1-tools](https://blackforestlabs.ai/flux-1-tools/))：见 [`app/flux.1/depth_canny`](app/flux.1/depth_canny)。
+  - 深度/Canny 边缘生成图像 ([FLUX.1-tools](https://blackforestlabs.ai/flux-1-tools/))：见 [`app/flux.1/depth_canny`](app/flux.1/depth_canny)。
-  * 修复 ([FLUX.1-Fill-dev](https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev))：见 [`app/flux.1/fill`](app/flux.1/fill)。
+  - 修复 ([FLUX.1-Fill-dev](https://huggingface.co/black-forest-labs/FLUX.1-Fill-dev))：见 [`app/flux.1/fill`](app/flux.1/fill)。
-  * Redux ([FLUX.1-Redux-dev](https://huggingface.co/black-forest-labs/FLUX.1-Redux-dev))：见 [`app/flux.1/redux`](app/flux.1/redux)。
+  - Redux ([FLUX.1-Redux-dev](https://huggingface.co/black-forest-labs/FLUX.1-Redux-dev))：见 [`app/flux.1/redux`](app/flux.1/redux)。
-* SANA：
+- SANA：
-  * 文生图：见 [`app/sana/t2i`](app/sana/t2i)。
+  - 文生图：见 [`app/sana/t2i`](app/sana/t2i)。
 ## 自定义模型量化
@@ -303,6 +302,7 @@ Nunchaku 支持 [FLUX.1-tools](https://blackforestlabs.ai/flux-1-tools/) 和 [FL
 请查看 [此处](https://github.com/mit-han-lab/nunchaku/issues/266) 获取四月的路线图。
 ## 贡献
 我们诚挚欢迎社区贡献！请参阅[贡献指南](docs/contribution_guide_ZH.md)了解如何为 Nunchaku 贡献代码。
 ## 问题排查
@@ -315,13 +315,13 @@ Nunchaku 支持 [FLUX.1-tools](https://blackforestlabs.ai/flux-1-tools/) 和 [FL
 ## 相关项目
-* [Efficient Spatially Sparse Inference for Conditional GANs and Diffusion Models](https://arxiv.org/abs/2211.02048), NeurIPS 2022 & T-PAMI 2023
+- [Efficient Spatially Sparse Inference for Conditional GANs and Diffusion Models](https://arxiv.org/abs/2211.02048), NeurIPS 2022 & T-PAMI 2023
-* [SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models](https://arxiv.org/abs/2211.10438), ICML 2023
+- [SmoothQuant: Accurate and Efficient Post-Training Quantization for Large Language Models](https://arxiv.org/abs/2211.10438), ICML 2023
-* [Q-Diffusion: Quantizing Diffusion Models](https://arxiv.org/abs/2302.04304), ICCV 2023
+- [Q-Diffusion: Quantizing Diffusion Models](https://arxiv.org/abs/2302.04304), ICCV 2023
-* [AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration](https://arxiv.org/abs/2306.00978), MLSys 2024
+- [AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration](https://arxiv.org/abs/2306.00978), MLSys 2024
-* [DistriFusion: Distributed Parallel Inference for High-Resolution Diffusion Models](https://arxiv.org/abs/2402.19481), CVPR 2024
+- [DistriFusion: Distributed Parallel Inference for High-Resolution Diffusion Models](https://arxiv.org/abs/2402.19481), CVPR 2024
-* [QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving](https://arxiv.org/abs/2405.04532), MLSys 2025
+- [QServe: W4A8KV4 Quantization and System Co-design for Efficient LLM Serving](https://arxiv.org/abs/2405.04532), MLSys 2025
-* [SANA: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformers](https://arxiv.org/abs/2410.10629), ICLR 2025
+- [SANA: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformers](https://arxiv.org/abs/2410.10629), ICLR 2025
 ## 引用

--- a/app/flux.1/depth_canny/README.md
+++ b/app/flux.1/depth_canny/README.md
@@ -6,8 +6,8 @@ This interactive Gradio application transforms your uploaded image into a differ
 The base models are:
-* [FLUX.1-Depth-dev](https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev) (preserves depth map)
+- [FLUX.1-Depth-dev](https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev) (preserves depth map)
-* [FLUX.1-Canny-dev](https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev) (preserves Canny edge)
+- [FLUX.1-Canny-dev](https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev) (preserves Canny edge)
 First you need to install some dependencies:
@@ -22,7 +22,7 @@ Then run:
 python run_gradio.py
 ```
-* By default, the model is `FLUX.1-Depth-dev`. You can add `-m canny` to switch to `FLUX.1-Canny-dev`.
+- By default, the model is `FLUX.1-Depth-dev`. You can add `-m canny` to switch to `FLUX.1-Canny-dev`.
-* The demo loads the Gemma-2B model as a safety checker by default. To disable this feature, use `--no-safety-checker`.
+- The demo loads the Gemma-2B model as a safety checker by default. To disable this feature, use `--no-safety-checker`.
-* To further reduce GPU memory usage, you can enable the W4A16 text encoder by specifying `--use-qencoder`.
+- To further reduce GPU memory usage, you can enable the W4A16 text encoder by specifying `--use-qencoder`.
-* By default, we use our INT4 model. Use  `-p bf16` to switch to the BF16 model.
+- By default, we use our INT4 model. Use `-p bf16` to switch to the BF16 model.
--- a/app/flux.1/fill/README.md
+++ b/app/flux.1/fill/README.md
@@ -8,6 +8,6 @@ This interactive Gradio application allows you to interactively inpaint an uploa
 python run_gradio.py
 ```
-* The demo loads the Gemma-2B model as a safety checker by default. To disable this feature, use `--no-safety-checker`.
+- The demo loads the Gemma-2B model as a safety checker by default. To disable this feature, use `--no-safety-checker`.
-* To further reduce GPU memory usage, you can enable the W4A16 text encoder by specifying `--use-qencoder`.
+- To further reduce GPU memory usage, you can enable the W4A16 text encoder by specifying `--use-qencoder`.
-* By default, we use our INT4 model. Use  `-p bf16` to switch to the BF16 model.
+- By default, we use our INT4 model. Use `-p bf16` to switch to the BF16 model.
--- a/app/flux.1/redux/README.md
+++ b/app/flux.1/redux/README.md
@@ -8,4 +8,4 @@ This interactive Gradio application allows you to interactively generate image v
 python run_gradio.py
 ```
-* By default, we use our INT4 model. Use  `-p bf16` to switch to the BF16 model.
+- By default, we use our INT4 model. Use `-p bf16` to switch to the BF16 model.
--- a/app/flux.1/sketch/README.md
+++ b/app/flux.1/sketch/README.md
@@ -10,6 +10,6 @@ To launch the application, simply run:
 python run_gradio.py
 ```
-* The demo loads the Gemma-2B model as a safety checker by default. To disable this feature, use `--no-safety-checker`.
+- The demo loads the Gemma-2B model as a safety checker by default. To disable this feature, use `--no-safety-checker`.
-* To further reduce GPU memory usage, you can enable the W4A16 text encoder by specifying `--use-qencoder`.
+- To further reduce GPU memory usage, you can enable the W4A16 text encoder by specifying `--use-qencoder`.
-* By default, we use our INT4 model. Use  `-p bf16` to switch to the BF16 model.
+- By default, we use our INT4 model. Use `-p bf16` to switch to the BF16 model.
--- a/app/flux.1/t2i/README.md
+++ b/app/flux.1/t2i/README.md
@@ -12,10 +12,10 @@ To launch the application, simply run:
 python run_gradio.py
 ```
-* The demo also defaults to the FLUX.1-schnell model. To switch to the FLUX.1-dev model, use `-m dev`.
+- The demo also defaults to the FLUX.1-schnell model. To switch to the FLUX.1-dev model, use `-m dev`.
-* By default, the Gemma-2B model is loaded as a safety checker. To disable this feature and save GPU memory, use `--no-safety-checker`.
+- By default, the Gemma-2B model is loaded as a safety checker. To disable this feature and save GPU memory, use `--no-safety-checker`.
-* To further reduce GPU memory usage, you can enable the W4A16 text encoder by specifying `--use-qencoder`.
+- To further reduce GPU memory usage, you can enable the W4A16 text encoder by specifying `--use-qencoder`.
-* By default, only the INT4 DiT is loaded. Use `-p int4 bf16` to add a BF16 DiT for side-by-side comparison, or `-p bf16` to load only the BF16 model.
+- By default, only the INT4 DiT is loaded. Use `-p int4 bf16` to add a BF16 DiT for side-by-side comparison, or `-p bf16` to load only the BF16 model.
 ## Command Line Inference
@@ -25,13 +25,17 @@ We provide a script, [generate.py](generate.py), that generates an image from a
 python generate.py --prompt "Your Text Prompt"
 ```
-* The generated image will be saved as `output.png` by default. You can specify a different path using the `-o` or `--output-path` options.
+- The generated image will be saved as `output.png` by default. You can specify a different path using the `-o` or `--output-path` options.
-* The script defaults to using the FLUX.1-schnell model. To switch to the FLUX.1-dev model, use `-m dev`.
-* By default, the script uses our INT4 model. To use the BF16 model instead, specify `-p bf16`.
-* You can specify `--use-qencoder` to use our W4A16 text encoder.
-* You can adjust the number of inference steps and guidance scale with `-t` and `-g`, respectively. For the FLUX.1-schnell model, the defaults are 4 steps and a guidance scale of 0; for the FLUX.1-dev model, the defaults are 50 steps and a guidance scale of 3.5.
-* When using the FLUX.1-dev model, you also have the option to load a LoRA adapter with `--lora-name`. Available choices are `None`, [`Anime`](https://huggingface.co/alvdansen/sonny-anime-fixed), [`GHIBSKY Illustration`](https://huggingface.co/aleksa-codes/flux-ghibsky-illustration), [`Realism`](https://huggingface.co/XLabs-AI/flux-RealismLora), [`Children Sketch`](https://huggingface.co/Shakker-Labs/FLUX.1-dev-LoRA-Children-Simple-Sketch), and [`Yarn Art`](https://huggingface.co/linoyts/yarn_art_Flux_LoRA), with the default set to `None`. You can also specify the LoRA weight with `--lora-weight`, which defaults to 1.
+- The script defaults to using the FLUX.1-schnell model. To switch to the FLUX.1-dev model, use `-m dev`.
+- By default, the script uses our INT4 model. To use the BF16 model instead, specify `-p bf16`.
+- You can specify `--use-qencoder` to use our W4A16 text encoder.
+- You can adjust the number of inference steps and guidance scale with `-t` and `-g`, respectively. For the FLUX.1-schnell model, the defaults are 4 steps and a guidance scale of 0; for the FLUX.1-dev model, the defaults are 50 steps and a guidance scale of 3.5.
+- When using the FLUX.1-dev model, you also have the option to load a LoRA adapter with `--lora-name`. Available choices are `None`, [`Anime`](https://huggingface.co/alvdansen/sonny-anime-fixed), [`GHIBSKY Illustration`](https://huggingface.co/aleksa-codes/flux-ghibsky-illustration), [`Realism`](https://huggingface.co/XLabs-AI/flux-RealismLora), [`Children Sketch`](https://huggingface.co/Shakker-Labs/FLUX.1-dev-LoRA-Children-Simple-Sketch), and [`Yarn Art`](https://huggingface.co/linoyts/yarn_art_Flux_LoRA), with the default set to `None`. You can also specify the LoRA weight with `--lora-weight`, which defaults to 1.
 ## Latency Benchmark
@@ -41,12 +45,12 @@ To measure the latency of our INT4 models, use the following command:
 python latency.py
 ```
-* The script defaults to the INT4 FLUX.1-schnell model. To switch to FLUX.1-dev, use the `-m dev` option. For BF16 precision, add `-p bf16`.
+- The script defaults to the INT4 FLUX.1-schnell model. To switch to FLUX.1-dev, use the `-m dev` option. For BF16 precision, add `-p bf16`.
-* Adjust the number of inference steps and the guidance scale using `-t` and `-g`, respectively.
+- Adjust the number of inference steps and the guidance scale using `-t` and `-g`, respectively.
  - For FLUX.1-schnell, the defaults are 4 steps and a guidance scale of 0.
  - For FLUX.1-dev, the defaults are 50 steps and a guidance scale of 3.5.
-* By default, the script measures the end-to-end latency for generating a single image. To measure the latency of a single DiT forward step instead, use the `--mode step` flag.
+- By default, the script measures the end-to-end latency for generating a single image. To measure the latency of a single DiT forward step instead, use the `--mode step` flag.
-* Specify the number of warmup and test runs using `--warmup-times` and `--test-times`. The defaults are 2 warmup runs and 10 test runs.
+- Specify the number of warmup and test runs using `--warmup-times` and `--test-times`. The defaults are 2 warmup runs and 10 test runs.
 ## Quality Results
@@ -63,12 +67,12 @@ python evaluate.py -p int4
 python evaluate.py -p bf16
 ```
-* The commands above will generate images from FLUX.1-schnell on both datasets. Use `-m dev` to switch to FLUX.1-dev, or specify a single dataset with `-d MJHQ` or `-d DCI`.
+- The commands above will generate images from FLUX.1-schnell on both datasets. Use `-m dev` to switch to FLUX.1-dev, or specify a single dataset with `-d MJHQ` or `-d DCI`.
-* By default, generated images are saved to `results/$MODEL/$PRECISION`. Customize the output path using the `-o` option if desired.
+- By default, generated images are saved to `results/$MODEL/$PRECISION`. Customize the output path using the `-o` option if desired.
-* You can also adjust the number of inference steps and the guidance scale using `-t` and `-g`, respectively.
+- You can also adjust the number of inference steps and the guidance scale using `-t` and `-g`, respectively.
  - For FLUX.1-schnell, the defaults are 4 steps and a guidance scale of 0.
  - For FLUX.1-dev, the defaults are 50 steps and a guidance scale of 3.5.
-* To accelerate the generation process, you can distribute the workload across multiple GPUs. For instance, if you have $N$ GPUs, on GPU $i (0 \le i < N)$ , you can add the options `--chunk-start $i --chunk-step $N`. This setup ensures each GPU handles a distinct portion of the workload, enhancing overall efficiency.
+- To accelerate the generation process, you can distribute the workload across multiple GPUs. For instance, if you have $N$ GPUs, on GPU $i (0 \le i < N)$, you can add the options `--chunk-start $i --chunk-step $N`. This setup ensures each GPU handles a distinct portion of the workload, enhancing overall efficiency.
 Finally you can compute the metrics for the images with