Commit ae089db4 authored by GoatWu

Merge branch 'main' of github.com:ModelTC/lightx2v into dev-debug-distill

parents 8b213df0 4796fc6e
#!/bin/bash
# set paths first
lightx2v_path=
model_path=

# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
    cuda_devices=0
    echo "Warning: CUDA_VISIBLE_DEVICES is not set; using default value ${cuda_devices}. Change it in this script or set the environment variable."
    export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi

if [ -z "${lightx2v_path}" ]; then
    echo "Error: lightx2v_path is not set. Please set this variable first."
    exit 1
fi

if [ -z "${model_path}" ]; then
    echo "Error: model_path is not set. Please set this variable first."
    exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
    --model_cls wan2.1 \
    --task i2v \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/caching/adacache/wan_i2v_ada.json \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
    --image_path ${lightx2v_path}/assets/inputs/imgs/img_0.jpg \
    --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_ada.mp4
#!/bin/bash
# set paths first
lightx2v_path=
model_path=

# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
    cuda_devices=0
    echo "Warning: CUDA_VISIBLE_DEVICES is not set; using default value ${cuda_devices}. Change it in this script or set the environment variable."
    export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi

if [ -z "${lightx2v_path}" ]; then
    echo "Error: lightx2v_path is not set. Please set this variable first."
    exit 1
fi

if [ -z "${model_path}" ]; then
    echo "Error: model_path is not set. Please set this variable first."
    exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
    --model_cls wan2.1 \
    --task i2v \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/caching/custom/wan_i2v_custom_480p.json \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
    --image_path ${lightx2v_path}/assets/inputs/imgs/img_0.jpg \
    --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_custom.mp4
#!/bin/bash
# set paths first
lightx2v_path=
model_path=

# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
    cuda_devices=0
    echo "Warning: CUDA_VISIBLE_DEVICES is not set; using default value ${cuda_devices}. Change it in this script or set the environment variable."
    export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi

if [ -z "${lightx2v_path}" ]; then
    echo "Error: lightx2v_path is not set. Please set this variable first."
    exit 1
fi

if [ -z "${model_path}" ]; then
    echo "Error: model_path is not set. Please set this variable first."
    exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
    --model_cls wan2.1 \
    --task i2v \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/caching/taylorseer/wan_i2v_tea_480p.json \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
    --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --image_path ${lightx2v_path}/assets/inputs/imgs/img_0.jpg \
    --save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_taylor.mp4
#!/bin/bash
# set paths first
lightx2v_path=
model_path=

# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
    cuda_devices=0
    echo "Warning: CUDA_VISIBLE_DEVICES is not set; using default value ${cuda_devices}. Change it in this script or set the environment variable."
    export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi

if [ -z "${lightx2v_path}" ]; then
    echo "Error: lightx2v_path is not set. Please set this variable first."
    exit 1
fi

if [ -z "${model_path}" ]; then
    echo "Error: model_path is not set. Please set this variable first."
    exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
    --model_cls wan2.1 \
    --task t2v \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/caching/adacache/wan_t2v_ada.json \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
    --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_t2v_ada.mp4
#!/bin/bash
# set paths first
lightx2v_path=
model_path=

# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
    cuda_devices=0
    echo "Warning: CUDA_VISIBLE_DEVICES is not set; using default value ${cuda_devices}. Change it in this script or set the environment variable."
    export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi

if [ -z "${lightx2v_path}" ]; then
    echo "Error: lightx2v_path is not set. Please set this variable first."
    exit 1
fi

if [ -z "${model_path}" ]; then
    echo "Error: model_path is not set. Please set this variable first."
    exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
    --model_cls wan2.1 \
    --task t2v \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/caching/custom/wan_t2v_custom_1_3b.json \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
    --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_t2v_custom.mp4
#!/bin/bash
# set paths first
lightx2v_path=
model_path=

# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
    cuda_devices=0
    echo "Warning: CUDA_VISIBLE_DEVICES is not set; using default value ${cuda_devices}. Change it in this script or set the environment variable."
    export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi

if [ -z "${lightx2v_path}" ]; then
    echo "Error: lightx2v_path is not set. Please set this variable first."
    exit 1
fi

if [ -z "${model_path}" ]; then
    echo "Error: model_path is not set. Please set this variable first."
    exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export DTYPE=BF16
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
    --model_cls wan2.1 \
    --task t2v \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/caching/taylorseer/wan_t2v_taylorseer.json \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
    --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
    --save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_t2v_taylor.mp4
@@ -32,7 +32,7 @@ python -m lightx2v.infer \
     --model_cls wan2.1 \
     --task t2v \
     --model_path $model_path \
-    --config_json ${lightx2v_path}/configs/caching/teacache/wan_t2v_1_3b.json \
+    --config_json ${lightx2v_path}/configs/caching/teacache/wan_t2v_1_3b_tea_480p.json \
     --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
     --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
     --save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_t2v_tea.mp4
@@ -10,6 +10,7 @@ from safetensors import safe_open, torch as st
 from loguru import logger
 from tqdm import tqdm
 from collections import defaultdict
+from qtorch.quant import float_quantize


 def get_key_mapping_rules(direction, model_type):
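For context, `qtorch.quant.float_quantize` simulates low-precision floating-point rounding on ordinary float tensors. A minimal sketch of what the new import does, assuming `qtorch` (QPyTorch) is installed:

```python
# Sketch: round a float32 tensor to the nearest value representable with
# 4 exponent bits and 3 mantissa bits, i.e. the float8_e4m3 grid.
import torch
from qtorch.quant import float_quantize

x = torch.randn(4, 4)
x_e4m3 = float_quantize(x, 4, 3, rounding="nearest")  # storage stays float32
print((x - x_e4m3).abs().max())  # error introduced by snapping to the fp8 grid
```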
@@ -314,7 +315,8 @@ def quantize_tensor(w, w_bit=8, dtype=torch.int8):
     max_val = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-5)
     if dtype == torch.float8_e4m3fn:
-        qmin, qmax = -448, 448
+        finfo = torch.finfo(dtype)
+        qmin, qmax = finfo.min, finfo.max
     elif dtype == torch.int8:
         qmin, qmax = -128, 127
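The hard-coded ±448 bounds were correct for `float8_e4m3fn`, but deriving them from `torch.finfo` documents where they come from and would adapt to other fp8 variants. A quick check, relying only on PyTorch's published fp8 ranges:

```python
import torch

# The finfo-based bounds match the old hard-coded constants for e4m3 ...
finfo = torch.finfo(torch.float8_e4m3fn)
assert (finfo.min, finfo.max) == (-448.0, 448.0)
# ... and would differ for other fp8 formats, e.g. e5m2.
print(torch.finfo(torch.float8_e5m2).max)  # 57344.0
```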
@@ -322,7 +324,9 @@ def quantize_tensor(w, w_bit=8, dtype=torch.int8):
     scales = max_val / qmax
     if dtype == torch.float8_e4m3fn:
-        w_q = torch.clamp(w / scales, qmin, qmax).to(dtype)
+        scaled_tensor = w / scales
+        scaled_tensor = torch.clip(scaled_tensor, qmin, qmax)
+        w_q = float_quantize(scaled_tensor.float(), 4, 3, rounding="nearest").to(dtype)
     else:
         w_q = torch.clamp(torch.round(w / scales), qmin, qmax).to(dtype)
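Taken together, these two hunks change the fp8 path from a plain cast to an explicit round-to-nearest on the e4m3 grid before the cast. A self-contained sketch of the resulting per-output-channel flow (function name is illustrative, not the repository's API):

```python
import torch
from qtorch.quant import float_quantize

def quantize_fp8_sketch(w: torch.Tensor):
    """Per-channel symmetric fp8 quantization, mirroring the diff above."""
    max_val = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-5)
    finfo = torch.finfo(torch.float8_e4m3fn)
    scales = max_val / finfo.max                         # one scale per row
    scaled = torch.clip(w / scales, finfo.min, finfo.max)
    # Round onto the e4m3 grid first, then reinterpret in the storage dtype.
    w_q = float_quantize(scaled.float(), 4, 3, rounding="nearest").to(torch.float8_e4m3fn)
    return w_q, scales                                   # dequant: w_q.float() * scales

w_q, scales = quantize_fp8_sketch(torch.randn(16, 32))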
@@ -341,7 +345,8 @@ def quantize_model(
     target_keys=["attn", "ffn"],
     key_idx=2,
     ignore_key=None,
-    dtype=torch.int8,
+    linear_dtype=torch.int8,
+    non_linear_dtype=torch.float,
 ):
     """
     Quantize model weights in-place
@@ -370,16 +375,20 @@ def quantize_model(
         # Skip non-tensors, small tensors, and non-2D tensors
         if not isinstance(tensor, torch.Tensor) or tensor.dim() != 2:
+            if tensor.dtype != non_linear_dtype:
+                weights[key] = tensor.to(non_linear_dtype)
             continue

         # Check if key matches target modules
         parts = key.split(".")
         if len(parts) < key_idx + 1 or parts[key_idx] not in target_keys:
+            if tensor.dtype != non_linear_dtype:
+                weights[key] = tensor.to(non_linear_dtype)
             continue

         try:
             # Quantize tensor and store results
-            w_q, scales = quantize_tensor(tensor, w_bit, dtype)
+            w_q, scales = quantize_tensor(tensor, w_bit, linear_dtype)

             # Replace original tensor and store scales
             weights[key] = w_q
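The effect of the new `non_linear_dtype` parameter: 2-D tensors under the target keys are quantized to `linear_dtype`, while everything that falls through the two early-`continue` branches (biases, norms, embeddings, non-matching keys) is now cast instead of being left untouched. A toy illustration of that split, with hypothetical keys not taken from the repository:

```python
import torch

weights = {
    "blocks.0.attn.q.weight": torch.randn(8, 8),  # 2-D, matches "attn": quantized
    "blocks.0.attn.q.bias": torch.randn(8),       # 1-D: only cast
    "blocks.0.norm.weight": torch.randn(8),       # 1-D: only cast
}
non_linear_dtype = torch.bfloat16
for key, tensor in list(weights.items()):
    if tensor.dim() != 2:
        weights[key] = tensor.to(non_linear_dtype)  # the new cast-through path
```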
@@ -500,7 +509,8 @@ def convert_weights(args):
         target_keys=args.target_keys,
         key_idx=args.key_idx,
         ignore_key=args.ignore_key,
-        dtype=args.dtype,
+        linear_dtype=args.linear_dtype,
+        non_linear_dtype=args.non_linear_dtype,
     )

     os.makedirs(args.output, exist_ok=True)
@@ -637,10 +647,17 @@ def main():
         help="Device to use for quantization (cpu/cuda)",
     )
     parser.add_argument(
-        "--dtype",
+        "--linear_dtype",
         type=str,
         choices=["torch.int8", "torch.float8_e4m3fn"],
-        help="Data type for quantization",
+        help="Data type for linear layers",
     )
+    parser.add_argument(
+        "--non_linear_dtype",
+        type=str,
+        default="torch.float32",
+        choices=["torch.float32", "torch.bfloat16", "torch.float16"],
+        help="Data type for non-linear layers",
+    )
     parser.add_argument("--lora_path", type=str, nargs="*", help="Path(s) to LoRA file(s). Can specify multiple paths separated by spaces.")
     parser.add_argument(
@@ -654,12 +671,8 @@ def main():
     args = parser.parse_args()

     if args.quantized:
-        if args.dtype == "torch.int8":
-            args.dtype = torch.int8
-        elif args.dtype == "torch.float8_e4m3fn":
-            args.dtype = torch.float8_e4m3fn
-        else:
-            raise ValueError(f"Not support dtype :{args.dtype}")
+        args.linear_dtype = eval(args.linear_dtype)
+        args.non_linear_dtype = eval(args.non_linear_dtype)

     model_type_keys_map = {
         "wan_dit": {
......
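One note on the new parsing: `eval("torch.float8_e4m3fn")` works here because `torch` is imported, but it executes arbitrary strings. A safer equivalent is plain attribute lookup; this is a sketch of an alternative, not the repository's code:

```python
import torch

def parse_torch_dtype(spec: str) -> torch.dtype:
    """Resolve strings like "torch.float8_e4m3fn" without eval()."""
    prefix, _, name = spec.partition(".")
    dtype = getattr(torch, name, None)
    if prefix != "torch" or not isinstance(dtype, torch.dtype):
        raise ValueError(f"Unknown torch dtype: {spec!r}")
    return dtype

assert parse_torch_dtype("torch.int8") is torch.int8
```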
@@ -36,7 +36,7 @@ python converter.py \
     --output /Path/To/output \
     --output_ext .safetensors \
     --output_name wan_int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
     --model_type wan_dit \
     --quantized \
     --save_by_block
@@ -48,7 +48,7 @@ python converter.py \
     --output /Path/To/output \
     --output_ext .safetensors \
     --output_name wan_fp8 \
-    --dtype torch.float8_e4m3fn \
+    --linear_dtype torch.float8_e4m3fn \
     --model_type wan_dit \
     --quantized \
     --save_by_block
@@ -62,7 +62,7 @@ python converter.py \
     --output /Path/To/output \
     --output_ext .safetensors \
     --output_name wan_int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
     --model_type wan_dit \
     --lora_path /Path/To/LoRA1/ /Path/To/LoRA2/ \
     --lora_alpha 1.0 1.0 \
@@ -78,7 +78,7 @@ python converter.py \
     --output /Path/To/output \
     --output_ext .safetensors \
     --output_name hunyuan_int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
     --model_type hunyuan_dit \
     --quantized
 ```
@@ -89,7 +89,7 @@ python converter.py \
     --output /Path/To/output \
     --output_ext .safetensors \
     --output_name hunyuan_fp8 \
-    --dtype torch.float8_e4m3fn \
+    --linear_dtype torch.float8_e4m3fn \
     --model_type hunyuan_dit \
     --quantized
 ```
@@ -103,7 +103,8 @@ python converter.py \
     --output /Path/To/output \
     --output_ext .pth \
     --output_name models_t5_umt5-xxl-enc-int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
+    --non_linear_dtype torch.bfloat16 \
     --model_type wan_t5 \
     --quantized
 ```
@@ -111,10 +112,11 @@ python converter.py \
 ```bash
 python converter.py \
     --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth \
-    --output /Path/To/output \
+    --output /Path/To/Wan-AI/Wan2.1-I2V-14B-480P/fp8 \
     --output_ext .pth \
     --output_name models_t5_umt5-xxl-enc-fp8 \
-    --dtype torch.float8_e4m3fn \
+    --linear_dtype torch.float8_e4m3fn \
+    --non_linear_dtype torch.bfloat16 \
     --model_type wan_t5 \
     --quantized
 ```
@@ -128,7 +130,8 @@ python converter.py \
     --output /Path/To/output \
     --output_ext .pth \
     --output_name clip-int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
+    --non_linear_dtype torch.float16 \
     --model_type wan_clip \
     --quantized
@@ -136,10 +139,11 @@ python converter.py \
 ```bash
 python converter.py \
     --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth \
-    --output /Path/To/output \
+    --output ./output \
     --output_ext .pth \
     --output_name clip-fp8 \
-    --dtype torch.float8_e4m3fn \
+    --linear_dtype torch.float8_e4m3fn \
+    --non_linear_dtype torch.float16 \
     --model_type wan_clip \
     --quantized
 ```