Commit cfd0423f authored by TorynCurtis, committed by GitHub

function feature caching (#38)



* Add hunyuan_t2v_tea and hunyuan_t2v_taylorseer; modify the fresh_threshold of TaylorSeer

* hunyuan i2v/t2v with Tea and TaylorSeer caching; wan i2v/t2v with Tea caching; add log files

* Removed redundant attributes from the TeaCache scheduler

* Removed redundant directories

* Fixed bugs in the TeaCaching part; both t2v and i2v feature caching now run end to end

* Update attn_weight.py

---------
Co-authored-by: Yang Yong (雍洋) <yongyang1030@163.com>
parent 6333bca1
{
    "infer_steps": 20,
    "target_video_length": 33,
    "i2v_resolution": "720p",
    "attention_type": "flash_attn3",
    "seed": 0,
    "mm_config": {
        "mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm",
        "weight_auto_quant": true
    },
    "feature_caching": "TaylorSeer"
}
{
    "infer_steps": 20,
    "target_video_length": 33,
    "i2v_resolution": "720p",
    "attention_type": "flash_attn3",
    "seed": 0,
    "mm_config": {
        "mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm",
        "weight_auto_quant": true
    },
    "feature_caching": "Tea"
}
{
    "infer_steps": 20,
    "target_video_length": 33,
    "target_height": 720,
    "target_width": 1280,
    "attention_type": "flash_attn3",
    "seed": 42,
    "mm_config": {
        "mm_type": "W-int8-channel-sym-A-int8-channel-sym-dynamic-Vllm",
        "weight_auto_quant": true
    },
    "feature_caching": "Tea"
}
{
    "infer_steps": 40,
    "target_video_length": 81,
    "target_height": 480,
    "target_width": 832,
    "attention_type": "flash_attn3",
    "seed": 42,
    "sample_guide_scale": 5,
    "sample_shift": 5,
    "enable_cfg": true,
    "cpu_offload": false,
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl",
        "weight_auto_quant": true
    },
    "feature_caching": "Tea"
}
{
    "infer_steps": 50,
    "target_video_length": 81,
    "text_len": 512,
    "target_height": 480,
    "target_width": 832,
    "attention_type": "flash_attn3",
    "seed": 42,
    "sample_guide_scale": 6,
    "sample_shift": 8,
    "enable_cfg": true,
    "cpu_offload": false,
    "mm_config": {
        "mm_type": "W-fp8-channel-sym-A-fp8-channel-sym-dynamic-Sgl",
        "weight_auto_quant": true
    },
    "feature_caching": "Tea"
}
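
The configs above are the new caching presets; apart from resolution and sampler settings, the switch that matters here is "feature_caching", which selects either TeaCache ("Tea") or TaylorSeer ("TaylorSeer"). A minimal sketch of loading such a preset and branching on that field, with placeholder return values rather than the repo's real scheduler classes:

import json

def load_caching_mode(config_path):
    # Read one of the JSON presets above and report which caching strategy it requests.
    with open(config_path) as f:
        config = json.load(f)
    mode = config.get("feature_caching", "NoCaching")
    if mode == "Tea":
        # TeaCache: skip transformer steps whose modulated inputs barely change.
        return "Tea", config
    if mode == "TaylorSeer":
        # TaylorSeer: refresh features periodically, Taylor-extrapolate in between.
        return "TaylorSeer", config
    return "NoCaching", config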
@@ -53,10 +53,10 @@ class AttnWeightTemplate(metaclass=ABCMeta):
self.config = config
def to_cpu(self, non_blocking=False):
self.weight = self.weight.to("cpu", non_blocking=non_blocking)
pass
def to_cuda(self, non_blocking=False):
self.weight = self.weight.cuda(non_blocking=non_blocking)
pass
def state_dict(self, destination=None):
if destination is None:
......
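
The attn_weight.py change above turns to_cpu/to_cuda from no-ops into methods that actually move the weight tensor, which matters wherever per-block weights are shuttled between CPU and GPU, as in the to_cuda()/to_cpu_sync() calls visible in the next hunk. A small illustrative sketch of that offload pattern with a simplified weight holder; only the method names mirror the diff, the rest is made up for the example:

import torch

class BlockWeights:
    # Simplified stand-in for a per-block weight container.
    def __init__(self, weight):
        self.weight = weight

    def to_cpu(self, non_blocking=False):
        self.weight = self.weight.to("cpu", non_blocking=non_blocking)

    def to_cuda(self, non_blocking=False):
        self.weight = self.weight.cuda(non_blocking=non_blocking)

def run_block(block_weights, x):
    # Move weights onto the GPU only for the duration of this block's compute.
    block_weights.to_cuda()
    out = x @ block_weights.weight  # placeholder for the real block computation
    block_weights.to_cpu(non_blocking=True)
    return out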
@@ -25,9 +25,7 @@ class HunyuanTransformerInferTeaCaching(HunyuanTransformerInfer):
inp = img.clone()
vec_ = vec.clone()
weights.double_blocks_weights[0].to_cuda()
img_mod1_shift, img_mod1_scale, _, _, _, _ = weights.double_blocks_weights[0].img_mod.apply(vec_).chunk(6, dim=-1)
weights.double_blocks_weights[0].to_cpu_sync()
img_mod1_shift, img_mod1_scale, _, _, _, _ = weights.double_blocks[0].img_mod.apply(vec_).chunk(6, dim=-1)
normed_inp = torch.nn.functional.layer_norm(inp, (inp.shape[1],), None, None, 1e-6)
modulated_inp = normed_inp * (1 + img_mod1_scale) + img_mod1_shift
@@ -73,14 +71,14 @@ class HunyuanTransformerInferTaylorCaching(HunyuanTransformerInfer):
self.scheduler.current["stream"] = "double_stream"
for i in range(self.double_blocks_num):
self.scheduler.current["layer"] = i
img, txt = self.infer_double_block(weights.double_blocks_weights[i], img, txt, vec, cu_seqlens_qkv, max_seqlen_qkv, freqs_cis, token_replace_vec, frist_frame_token_num)
img, txt = self.infer_double_block(weights.double_blocks[i], img, txt, vec, cu_seqlens_qkv, max_seqlen_qkv, freqs_cis)
x = torch.cat((img, txt), 0)
self.scheduler.current["stream"] = "single_stream"
for i in range(self.single_blocks_num):
self.scheduler.current["layer"] = i
x = self.infer_single_block(weights.single_blocks_weights[i], x, vec, txt_seq_len, cu_seqlens_qkv, max_seqlen_qkv, freqs_cis, token_replace_vec, frist_frame_token_num)
x = self.infer_single_block(weights.single_blocks[i], x, vec, txt_seq_len, cu_seqlens_qkv, max_seqlen_qkv, freqs_cis)
img = x[:img_seq_len, ...]
return img, vec
@@ -109,7 +107,7 @@ class HunyuanTransformerInferTaylorCaching(HunyuanTransformerInfer):
) = txt_mod_out.chunk(6, dim=-1)
if self.scheduler.current["type"] == "full":
img_q, img_k, img_v = self.infer_double_block_img_pre_atten(weights, img, img_mod1_scale, img_mod1_shift, freqs_cis)
img_q, img_k, img_v = self.infer_double_block_img_pre_atten(weights, img, img_mod1_scale, img_mod1_shift, None, None, None, freqs_cis)
txt_q, txt_k, txt_v = self.infer_double_block_txt_pre_atten(weights, txt, txt_mod1_scale, txt_mod1_shift)
q = torch.cat((img_q, txt_q), dim=0)
......
@@ -7,8 +7,8 @@ class HunyuanSchedulerTeaCaching(HunyuanScheduler):
def __init__(self, config):
super().__init__(config)
self.cnt = 0
self.num_steps = self.args.infer_steps
self.teacache_thresh = self.args.teacache_thresh
self.num_steps = self.config.infer_steps
self.teacache_thresh = self.config.teacache_thresh
self.accumulated_rel_l1_distance = 0
self.previous_modulated_input = None
self.previous_residual = None
......
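
The scheduler state above (cnt, num_steps, teacache_thresh, accumulated_rel_l1_distance, previous_modulated_input, previous_residual) drives TeaCache's skip decision, fed by the modulated input computed in the TeaCaching transformer hunk earlier. A minimal sketch of that decision following the general TeaCache recipe, not this repo's exact code (the polynomial rescaling of the distance is omitted):

import torch

def should_skip_step(scheduler, modulated_inp):
    # Always compute the first and last denoising steps in full.
    if scheduler.cnt == 0 or scheduler.cnt == scheduler.num_steps - 1:
        skip = False
        scheduler.accumulated_rel_l1_distance = 0
    else:
        # Relative L1 change of the modulated input vs. the previous step.
        prev = scheduler.previous_modulated_input
        rel_l1 = ((modulated_inp - prev).abs().mean() / prev.abs().mean()).item()
        scheduler.accumulated_rel_l1_distance += rel_l1
        if scheduler.accumulated_rel_l1_distance < scheduler.teacache_thresh:
            skip = True  # reuse scheduler.previous_residual instead of running the blocks
        else:
            skip = False
            scheduler.accumulated_rel_l1_distance = 0
    scheduler.previous_modulated_input = modulated_inp
    scheduler.cnt = (scheduler.cnt + 1) % scheduler.num_steps
    return skip

When skip is True, the cached previous_residual is added to the input in place of the full double/single-stream pass; when False, the blocks run and the new residual is cached.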
@@ -109,7 +109,7 @@ def cache_init(num_steps, model_kwargs=None):
cache_dic["cache"] = cache
cache_dic["fresh_ratio_schedule"] = "ToCa"
cache_dic["fresh_ratio"] = 0.0
cache_dic["fresh_threshold"] = 5
cache_dic["fresh_threshold"] = 2
cache_dic["max_order"] = 1
cache_dic["force_fresh"] = "global"
cache_dic["soft_fresh_weight"] = 0.0
......
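
In cache_init, fresh_threshold sets how often TaylorSeer recomputes features in full; the diff lowers it from every 5 steps to every 2, trading some of the speedup for accuracy, and max_order = 1 means the in-between steps use a first-order prediction. A rough sketch of that schedule and prediction under those assumptions (names are illustrative, not the repo's API):

def step_type(step, cache_dic):
    # Full recompute every fresh_threshold steps; Taylor-predicted features otherwise.
    return "full" if step % cache_dic["fresh_threshold"] == 0 else "taylor"

def taylor_predict(last_full, prev_full, steps_since_full):
    # First-order Taylor/finite-difference extrapolation from the last two full features:
    # f(t + k) ~= f(t) + k * (f(t) - f(t-1))
    return last_full + steps_since_full * (last_full - prev_full)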
#!/bin/bash
# set paths first
lightx2v_path=
model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using defalt value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
--model_cls hunyuan \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/caching/hunyuan_i2v_TaylorSeer.json \
--prompt "An Asian man with short hair in black tactical uniform and white clothes waves a firework stick." \
--image_path ${lightx2v_path}/assets/inputs/imgs/img_1.jpg \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_hy_i2v_taylor.mp4
#!/bin/bash
# set paths first
lightx2v_path=
model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using defalt value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
--model_cls hunyuan \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/caching/hunyuan_i2v_Tea.json \
--prompt "An Asian man with short hair in black tactical uniform and white clothes waves a firework stick." \
--image_path ${lightx2v_path}/assets/inputs/imgs/img_1.jpg \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_hy_i2v_tea.mp4
@@ -33,4 +33,4 @@ python -m lightx2v.infer \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/caching/hunyuan_t2v_TaylorSeer.json \
--prompt "A cat walks on the grass, realistic style." \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_hy_t2v.mp4
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_hy_t2v_taylor.mp4
#!/bin/bash
# set paths first
lightx2v_path=
model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using defalt value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export ENABLE_PROFILING_DEBUG=true
python -m lightx2v.infer \
--model_cls hunyuan \
--task t2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/caching/hunyuan_t2v_Tea.json \
--prompt "A cat walks on the grass, realistic style." \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_hy_t2v_tea.mp4
#!/bin/bash
# set paths first
lightx2v_path=
model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using defalt value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
--model_cls wan2.1 \
--task i2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/caching/wan_i2v_Tea.json \
--prompt "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--image_path ${lightx2v_path}/assets/inputs/imgs/img_0.jpg \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_i2v_tea.mp4
#!/bin/bash
# set paths first
lightx2v_path=
model_path=
# check section
if [ -z "${CUDA_VISIBLE_DEVICES}" ]; then
cuda_devices=0
echo "Warn: CUDA_VISIBLE_DEVICES is not set, using defalt value: ${cuda_devices}, change at shell script or set env variable."
export CUDA_VISIBLE_DEVICES=${cuda_devices}
fi
if [ -z "${lightx2v_path}" ]; then
echo "Error: lightx2v_path is not set. Please set this variable first."
exit 1
fi
if [ -z "${model_path}" ]; then
echo "Error: model_path is not set. Please set this variable first."
exit 1
fi
export TOKENIZERS_PARALLELISM=false
export PYTHONPATH=${lightx2v_path}:$PYTHONPATH
export ENABLE_PROFILING_DEBUG=true
export ENABLE_GRAPH_MODE=false
python -m lightx2v.infer \
--model_cls wan2.1 \
--task t2v \
--model_path $model_path \
--config_json ${lightx2v_path}/configs/caching/wan_t2v_Tea.json \
--prompt "Two anthropomorphic cats in comfy boxing gear and bright gloves fight intensely on a spotlighted stage." \
--negative_prompt 色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走 \
--save_video_path ${lightx2v_path}/save_results/output_lightx2v_wan_t2v_tea.mp4