Commit e08c4f90 authored by sandy, committed by GitHub

Merge branch 'main' into audio_r2v

parents 12bfd120 6d07a72e
......@@ -46,12 +46,26 @@ def generate_task_id():
def post_all_tasks(urls, messages):
    msg_num = len(messages)
    msg_index = 0
+    available_urls = []
+    for url in urls:
+        try:
+            _ = requests.get(f"{url}/v1/service/status").json()
+        except Exception as e:
+            continue
+        available_urls.append(url)
+    if not available_urls:
+        logger.error("No available urls.")
+        return
+    logger.info(f"available_urls: {available_urls}")
    while True:
-        for url in urls:
-            response = requests.get(f"{url}/v1/local/video/generate/service_status").json()
+        for url in available_urls:
+            response = requests.get(f"{url}/v1/service/status").json()
            if response["service_status"] == "idle":
                logger.info(f"{url} service is idle, start task...")
-                response = requests.post(f"{url}/v1/local/video/generate", json=messages[msg_index])
+                response = requests.post(f"{url}/v1/tasks/", json=messages[msg_index])
                logger.info(f"response: {response.json()}")
                msg_index += 1
                if msg_index == msg_num:
......
from tqdm import tqdm
import argparse
import glob
import os
import requests
import time


def post_i2v(image_path, output_path):
    url = "http://localhost:8000"
    # The image file name (without extension) doubles as the prompt and the output name.
    file_name = os.path.basename(image_path)
    prompt = os.path.splitext(file_name)[0]
    save_video_path = os.path.join(output_path, f"{prompt}.mp4")
    message = {
        "prompt": prompt,
        "negative_prompt": "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
        "image_path": image_path,
        "save_video_path": save_video_path,
    }
    # Poll until the service reports idle, then submit the task.
    while True:
        response = requests.get(f"{url}/v1/service/status").json()
        if response["service_status"] == "idle":
            response = requests.post(f"{url}/v1/tasks/", json=message)
            return
        time.sleep(3)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_path", type=str, required=True, help="path to img files.")
    parser.add_argument("--output_path", type=str, default="./vbench_i2v", help="output video path.")
    args = parser.parse_args()
    if os.path.exists(args.data_path):
        img_files = glob.glob(os.path.join(args.data_path, "*.jpg"))
        print(f"Found {len(img_files)} image files.")
        with tqdm(total=len(img_files)) as progress_bar:
            for img_path in img_files:
                post_i2v(img_path, args.output_path)
                progress_bar.update()
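Note that `post_i2v` blocks indefinitely if the service never returns to idle. Below is a minimal sketch of a bounded variant of the same polling loop; `wait_until_idle`, its `timeout`, and the per-request timeout are illustrative additions, not part of this repository:

```python
import time

import requests


def wait_until_idle(url, poll_interval=3.0, timeout=600.0):
    """Illustrative helper: block until the service reports idle, or raise on timeout."""
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            status = requests.get(f"{url}/v1/service/status", timeout=5).json()
            if status.get("service_status") == "idle":
                return
        except requests.RequestException:
            pass  # transient network error: keep polling
        time.sleep(poll_interval)
    raise TimeoutError(f"{url} did not become idle within {timeout}s")
```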
......@@ -42,7 +42,6 @@ python -m lightx2v.api_server \
    --model_path $model_path \
    --config_json ${lightx2v_path}/configs/wan/wan_i2v_dist.json \
    --port 8000 \
-    --start_inference \
    --nproc_per_node 1
echo "Service stopped"
#!/bin/bash
# set paths first
-lightx2v_path="/mnt/Text2Video/wangshankun/lightx2v"
-model_path="/mnt/Text2Video/wangshankun/HF_Cache/Wan2.1-R2V-Audio-14B-720P/"
-#lora_path="/mnt/Text2Video/wuzhuguanyu/Wan21_I2V_14B_lightx2v_cfg_step_distill_lora_rank64.safetensors"
-#lora_path="/mnt/aigc/qiuzesong/Distill/DMD2/0716lightx2v/LightX2V/tools/extract/wan_r2v_V2_14B_lora_ran32.safetensors"
+lightx2v_path=
+model_path=
+lora_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
    cuda_devices=0
......
#!/bin/bash
# set paths first
-lightx2v_path="/mnt/Text2Video/wangshankun/lightx2v/"
-model_path="/mnt/Text2Video/wangshankun/HF_Cache/Wan2.1-I2V-14B-CausVid/"
+lightx2v_path=
+model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
    cuda_devices=0
......
#!/bin/bash
# set paths first
-lightx2v_path="/mnt/Text2Video/wangshankun/tmp_code/lightx2v/"
-model_path="/mnt/Text2Video/wangshankun/HF_Cache/hub/models--Skywork--SkyReels-V2-DF-14B-540P/snapshots/7ff972ba7b6a33d2f6e6c976dd3cf2d36984eee4/"
+lightx2v_path=
+model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
......
......@@ -32,7 +32,7 @@ python -m lightx2v.infer \
    --model_cls wan2.1 \
    --task t2v \
    --model_path $model_path \
-    --config_json ${lightx2v_path}/configs/caching/adacache/wan_t2v_ada.json \
+    --config_json ${lightx2v_path}/configs/changing_resolution/wan_t2v.json \
    --prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
    --negative_prompt "镜头晃动,色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走" \
-    --save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_t2v_ada.mp4
+    --save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_t2v_changing_resolution.mp4
......@@ -10,6 +10,7 @@ from safetensors import safe_open, torch as st
from loguru import logger
from tqdm import tqdm
from collections import defaultdict
+from qtorch.quant import float_quantize


def get_key_mapping_rules(direction, model_type):
......@@ -314,7 +315,8 @@ def quantize_tensor(w, w_bit=8, dtype=torch.int8):
    max_val = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-5)
    if dtype == torch.float8_e4m3fn:
-        qmin, qmax = -448, 448
+        finfo = torch.finfo(dtype)
+        qmin, qmax = finfo.min, finfo.max
    elif dtype == torch.int8:
        qmin, qmax = -128, 127
......@@ -322,7 +324,9 @@ def quantize_tensor(w, w_bit=8, dtype=torch.int8):
    scales = max_val / qmax
    if dtype == torch.float8_e4m3fn:
-        w_q = torch.clamp(w / scales, qmin, qmax).to(dtype)
+        scaled_tensor = w / scales
+        scaled_tensor = torch.clip(scaled_tensor, qmin, qmax)
+        w_q = float_quantize(scaled_tensor.float(), 4, 3, rounding="nearest").to(dtype)
    else:
        w_q = torch.clamp(torch.round(w / scales), qmin, qmax).to(dtype)
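For intuition, here is a minimal sketch of the symmetric per-channel scheme implemented above: scale each row by its absolute maximum, round into the integer grid, and dequantize by multiplying the scales back. The toy tensor values are illustrative:

```python
import torch

# Toy 2-D weight; one scale per output row, matching the dim=1 reduction above.
w = torch.tensor([[0.5, -1.0], [2.0, 0.25]])
max_val = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-5)   # [[1.0], [2.0]]
scales = max_val / 127                                        # int8 qmax
w_q = torch.clamp(torch.round(w / scales), -128, 127).to(torch.int8)
w_deq = w_q.float() * scales                                  # approximate reconstruction
print(w_q)    # [[64, -127], [127, 16]]
print(w_deq)  # close to w: [[0.5039, -1.0], [2.0, 0.2520]]
```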
......@@ -341,7 +345,8 @@ def quantize_model(
target_keys=["attn", "ffn"],
key_idx=2,
ignore_key=None,
dtype=torch.int8,
linear_dtype=torch.int8,
non_linear_dtype=torch.float,
):
"""
Quantize model weights in-place
......@@ -370,16 +375,20 @@ def quantize_model(
        # Skip non-tensors, small tensors, and non-2D tensors
        if not isinstance(tensor, torch.Tensor) or tensor.dim() != 2:
+            if isinstance(tensor, torch.Tensor) and tensor.dtype != non_linear_dtype:
+                weights[key] = tensor.to(non_linear_dtype)
            continue
        # Check if key matches target modules
        parts = key.split(".")
        if len(parts) < key_idx + 1 or parts[key_idx] not in target_keys:
+            if tensor.dtype != non_linear_dtype:
+                weights[key] = tensor.to(non_linear_dtype)
            continue
        try:
            # Quantize tensor and store results
-            w_q, scales = quantize_tensor(tensor, w_bit, dtype)
+            w_q, scales = quantize_tensor(tensor, w_bit, linear_dtype)
            # Replace original tensor and store scales
            weights[key] = w_q
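To see what the filter above does: a 2-D tensor is quantized only when the dot-separated path segment at `key_idx` names a target module; everything else is merely cast to `non_linear_dtype`. A toy check with illustrative key names (real checkpoint keys may differ):

```python
target_keys = ["attn", "ffn"]
key_idx = 2

# Illustrative parameter names, not verified checkpoint keys.
for key in ["blocks.0.attn.q.weight", "blocks.0.norm1.weight"]:
    parts = key.split(".")
    matches = len(parts) >= key_idx + 1 and parts[key_idx] in target_keys
    print(key, "-> quantize" if matches else "-> cast to non_linear_dtype")
```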
......@@ -440,9 +449,11 @@ def load_loras(lora_path, weight_dict, alpha):
        elif name in lora_diffs:
            name_diff = lora_diffs[name]
            lora_diff = lora_weights[name_diff].to(param.device, param.dtype)
-            param += lora_diff * alpha
-            applied_count += 1
+            try:
+                param += lora_diff * alpha
+                applied_count += 1
+            except Exception as e:
+                continue
    logger.info(f"Applied {applied_count} LoRA weight adjustments")
......@@ -500,7 +511,8 @@ def convert_weights(args):
        target_keys=args.target_keys,
        key_idx=args.key_idx,
        ignore_key=args.ignore_key,
-        dtype=args.dtype,
+        linear_dtype=args.linear_dtype,
+        non_linear_dtype=args.non_linear_dtype,
    )
    os.makedirs(args.output, exist_ok=True)
......@@ -637,10 +649,17 @@ def main():
help="Device to use for quantization (cpu/cuda)",
)
parser.add_argument(
"--dtype",
"--linear_dtype",
type=str,
choices=["torch.int8", "torch.float8_e4m3fn"],
help="Data type for quantization",
help="Data type for linear",
)
parser.add_argument(
"--non_linear_dtype",
type=str,
default="torch.float32",
choices=["torch.bfloat16", "torch.float16"],
help="Data type for non-linear",
)
parser.add_argument("--lora_path", type=str, nargs="*", help="Path(s) to LoRA file(s). Can specify multiple paths separated by spaces.")
parser.add_argument(
......@@ -654,12 +673,8 @@ def main():
    args = parser.parse_args()
    if args.quantized:
-        if args.dtype == "torch.int8":
-            args.dtype = torch.int8
-        elif args.dtype == "torch.float8_e4m3fn":
-            args.dtype = torch.float8_e4m3fn
-        else:
-            raise ValueError(f"Not support dtype :{args.dtype}")
+        args.linear_dtype = eval(args.linear_dtype)
+        args.non_linear_dtype = eval(args.non_linear_dtype)
    model_type_keys_map = {
        "wan_dit": {
......
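The `eval` on the CLI strings is safe here only because argparse `choices` restricts the input. A minimal eval-free sketch is shown below; the `DTYPE_MAP` and `parse_dtype` names are illustrative, not repo code:

```python
import torch

# Illustrative replacement for eval(): map the CLI strings accepted by
# --linear_dtype / --non_linear_dtype to their torch dtypes explicitly.
DTYPE_MAP = {
    "torch.int8": torch.int8,
    "torch.float8_e4m3fn": torch.float8_e4m3fn,
    "torch.float32": torch.float32,
    "torch.bfloat16": torch.bfloat16,
    "torch.float16": torch.float16,
}

def parse_dtype(name: str) -> torch.dtype:
    if name not in DTYPE_MAP:
        raise ValueError(f"Unsupported dtype: {name}")
    return DTYPE_MAP[name]
```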
-# 模型转换工具
+# Model Conversion Tool
-A powerful utility for converting model weights between different formats and performing quantization tasks.
+This converter tool can convert model weights between different formats.
-## Diffusers
-Facilitates mutual conversion between the Diffusers architecture and the LightX2V architecture.
-### Lightx2v->Diffusers
-```bash
-python converter.py \
-    --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P \
-    --output /Path/To/Wan2.1-I2V-14B-480P-Diffusers \
-    --direction forward \
-    --save_by_block
-```
-### Diffusers->Lightx2v
-```bash
-python converter.py \
-    --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P-Diffusers \
-    --output /Path/To/Wan2.1-I2V-14B-480P \
-    --direction backward \
-    --save_by_block
-```
-## Quantization
-This tool supports converting fp32/fp16/bf16 model weights to INT8, FP8 type.
+## Feature 1: Convert Quantized Models
+This tool supports converting **FP32/FP16/BF16** model weights to **INT8, FP8** types.
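The representable ranges the converter clamps to can be sanity-checked directly in PyTorch; this snippet is a quick check, not part of the converter:

```python
import torch

print(torch.iinfo(torch.int8).min, torch.iinfo(torch.int8).max)  # -128 127
finfo = torch.finfo(torch.float8_e4m3fn)
print(finfo.min, finfo.max)                                      # -448.0 448.0
```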
### Wan DIT
......@@ -36,7 +14,7 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .safetensors \
    --output_name wan_int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
    --model_type wan_dit \
    --quantized \
    --save_by_block
......@@ -48,7 +26,7 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .safetensors \
    --output_name wan_fp8 \
-    --dtype torch.float8_e4m3fn \
+    --linear_dtype torch.float8_e4m3fn \
    --model_type wan_dit \
    --quantized \
    --save_by_block
......@@ -62,7 +40,7 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .safetensors \
    --output_name wan_int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
    --model_type wan_dit \
    --lora_path /Path/To/LoRA1/ /Path/To/LoRA2/ \
    --lora_alpha 1.0 1.0 \
......@@ -78,7 +56,7 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .safetensors \
    --output_name hunyuan_int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
    --model_type hunyuan_dit \
    --quantized
```
......@@ -89,7 +67,7 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .safetensors \
    --output_name hunyuan_fp8 \
-    --dtype torch.float8_e4m3fn \
+    --linear_dtype torch.float8_e4m3fn \
    --model_type hunyuan_dit \
    --quantized
```
......@@ -103,7 +81,8 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .pth \
    --output_name models_t5_umt5-xxl-enc-int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
+    --non_linear_dtype torch.bfloat16 \
    --model_type wan_t5 \
    --quantized
```
......@@ -111,10 +90,11 @@ python converter.py \
```bash
python converter.py \
    --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth \
-    --output /Path/To/output \
+    --output /Path/To/Wan-AI/Wan2.1-I2V-14B-480P/fp8 \
    --output_ext .pth \
    --output_name models_t5_umt5-xxl-enc-fp8 \
-    --dtype torch.float8_e4m3fn \
+    --linear_dtype torch.float8_e4m3fn \
+    --non_linear_dtype torch.bfloat16 \
    --model_type wan_t5 \
    --quantized
```
......@@ -128,7 +108,8 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .pth \
    --output_name clip-int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
+    --non_linear_dtype torch.float16 \
    --model_type wan_clip \
    --quantized
......@@ -136,10 +117,33 @@ python converter.py \
```bash
python converter.py \
    --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth \
-    --output /Path/To/output \
+    --output ./output \
    --output_ext .pth \
    --output_name clip-fp8 \
-    --dtype torch.float8_e4m3fn \
+    --linear_dtype torch.float8_e4m3fn \
+    --non_linear_dtype torch.float16 \
    --model_type wan_clip \
    --quantized
```
+## Feature 2: Format Conversion Between Diffusers and Lightx2v
+Supports mutual conversion between the Diffusers architecture and the LightX2V architecture.
+### Lightx2v->Diffusers
+```bash
+python converter.py \
+    --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P \
+    --output /Path/To/Wan2.1-I2V-14B-480P-Diffusers \
+    --direction forward \
+    --save_by_block
+```
+### Diffusers->Lightx2v
+```bash
+python converter.py \
+    --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P-Diffusers \
+    --output /Path/To/Wan2.1-I2V-14B-480P \
+    --direction backward \
+    --save_by_block
+```
# Model Conversion Tool
A powerful utility for converting model weights between different formats and performing quantization tasks.
-## Diffusers
-Supports mutual conversion between the Diffusers architecture and the LightX2V architecture.
-### Lightx2v->Diffusers
-```bash
-python converter.py \
-    --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P \
-    --output /Path/To/Wan2.1-I2V-14B-480P-Diffusers \
-    --direction forward \
-    --save_by_block
-```
-### Diffusers->Lightx2v
-```bash
-python converter.py \
-    --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P-Diffusers \
-    --output /Path/To/Wan2.1-I2V-14B-480P \
-    --direction backward \
-    --save_by_block
-```
This converter tool can convert model weights between different formats.
-## Quantization
+## Feature 1: Convert Quantized Models
This tool supports converting **FP32/FP16/BF16** model weights to **INT8, FP8** types.
......@@ -36,7 +14,7 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .safetensors \
    --output_name wan_int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
    --model_type wan_dit \
    --quantized \
    --save_by_block
......@@ -48,7 +26,7 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .safetensors \
    --output_name wan_fp8 \
-    --dtype torch.float8_e4m3fn \
+    --linear_dtype torch.float8_e4m3fn \
    --model_type wan_dit \
    --quantized \
    --save_by_block
......@@ -62,7 +40,7 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .safetensors \
    --output_name wan_int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
    --model_type wan_dit \
    --lora_path /Path/To/LoRA1/ /Path/To/LoRA2/ \
    --lora_alpha 1.0 1.0 \
......@@ -78,7 +56,7 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .safetensors \
    --output_name hunyuan_int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
    --model_type hunyuan_dit \
    --quantized
```
......@@ -89,7 +67,7 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .safetensors \
    --output_name hunyuan_fp8 \
-    --dtype torch.float8_e4m3fn \
+    --linear_dtype torch.float8_e4m3fn \
    --model_type hunyuan_dit \
    --quantized
```
......@@ -103,7 +81,8 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .pth \
    --output_name models_t5_umt5-xxl-enc-int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
+    --non_linear_dtype torch.bfloat16 \
    --model_type wan_t5 \
    --quantized
```
......@@ -111,10 +90,11 @@ python converter.py \
```bash
python converter.py \
    --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P/models_t5_umt5-xxl-enc-bf16.pth \
-    --output /Path/To/output \
+    --output /Path/To/Wan-AI/Wan2.1-I2V-14B-480P/fp8 \
    --output_ext .pth \
    --output_name models_t5_umt5-xxl-enc-fp8 \
-    --dtype torch.float8_e4m3fn \
+    --linear_dtype torch.float8_e4m3fn \
+    --non_linear_dtype torch.bfloat16 \
    --model_type wan_t5 \
    --quantized
```
......@@ -128,7 +108,8 @@ python converter.py \
    --output /Path/To/output \
    --output_ext .pth \
    --output_name clip-int8 \
-    --dtype torch.int8 \
+    --linear_dtype torch.int8 \
+    --non_linear_dtype torch.float16 \
    --model_type wan_clip \
    --quantized
......@@ -136,10 +117,33 @@ python converter.py \
```bash
python converter.py \
    --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P/models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth \
-    --output /Path/To/output \
+    --output ./output \
    --output_ext .pth \
    --output_name clip-fp8 \
-    --dtype torch.float8_e4m3fn \
+    --linear_dtype torch.float8_e4m3fn \
+    --non_linear_dtype torch.float16 \
    --model_type wan_clip \
    --quantized
```
+## Feature 2: Format Conversion Between Diffusers and Lightx2v
+Supports mutual conversion between the Diffusers architecture and the LightX2V architecture.
+### Lightx2v->Diffusers
+```bash
+python converter.py \
+    --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P \
+    --output /Path/To/Wan2.1-I2V-14B-480P-Diffusers \
+    --direction forward \
+    --save_by_block
+```
+### Diffusers->Lightx2v
+```bash
+python converter.py \
+    --source /Path/To/Wan-AI/Wan2.1-I2V-14B-480P-Diffusers \
+    --output /Path/To/Wan2.1-I2V-14B-480P \
+    --direction backward \
+    --save_by_block
+```