Commit e2778d0d authored by litzh's avatar litzh
Browse files

Initial commit

parents
Pipeline #3370 canceled with stages
{
"infer_steps": 50,
"target_video_length": 81,
"text_len": 512,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "flash_attn3",
"cross_attn_1_type": "flash_attn3",
"cross_attn_2_type": "flash_attn3",
"sample_guide_scale": 6,
"sample_shift": 8,
"enable_cfg": true,
"dit_quantized": true,
"dit_quant_scheme": "fp8-q8f",
"t5_quantized": true,
"t5_quant_scheme": "fp8-q8f",
"clip_quantized": true,
"clip_quant_scheme": "fp8-q8f",
"cpu_offload": true,
"offload_granularity": "block",
"t5_cpu_offload": false,
"vae_cpu_offload": false,
"clip_cpu_offload": false
}
{
"infer_steps": 40,
"target_video_length": 81,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"sample_guide_scale": 5,
"sample_shift": 5,
"enable_cfg": true,
"cpu_offload": true,
"offload_granularity": "phase",
"dit_quantized_ckpt": "/path/to/dit_quant_model",
"dit_quantized": true,
"dit_quant_scheme": "fp8-vllm",
"t5_cpu_offload": true,
"t5_quantized": true,
"t5_quantized_ckpt": "/path/to/models_t5_umt5-xxl-enc-fp8.pth",
"t5_quant_scheme": "fp8",
"clip_quantized": true,
"clip_quantized_ckpt": "/path/to/clip-fp8.pth",
"clip_quant_scheme": "fp8",
"use_tiling_vae": true,
"use_tae": true,
"tae_path": "/path/to/taew2_1.pth",
"lazy_load": true
}
{
"infer_steps": 40,
"target_video_length": 81,
"target_height": 1280,
"target_width": 720,
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"sample_guide_scale": 5,
"sample_shift": 5,
"enable_cfg": false,
"cpu_offload": true,
"offload_granularity": "phase",
"dit_quantized_ckpt": "/path/to/dit_quant_model",
"dit_quantized": true,
"dit_quant_scheme": "fp8-vllm",
"t5_cpu_offload": true,
"t5_quantized": true,
"t5_quantized_ckpt": "/path/to/models_t5_umt5-xxl-enc-fp8.pth",
"t5_quant_scheme": "fp8",
"clip_quantized": true,
"clip_quantized_ckpt": "/path/to/clip-fp8.pth",
"clip_quant_scheme": "fp8",
"use_tiling_vae": true,
"use_tae": true,
"tae_path": "/path/to/taew2_1.pth",
"lazy_load": true,
"rotary_chunk": true,
"clean_cuda_cache": true
}
{
"infer_steps": 40,
"target_video_length": 81,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"sample_guide_scale": 5,
"sample_shift": 5,
"enable_cfg": true,
"cpu_offload": true,
"offload_granularity": "phase",
"t5_cpu_offload": false,
"clip_cpu_offload": false,
"vae_cpu_offload": false,
"use_tiling_vae": false,
"dit_quantized": true,
"dit_quant_scheme": "fp8-q8f",
"t5_quantized": true,
"t5_quant_scheme": "fp8-q8f",
"clip_quantized": true,
"clip_quant_scheme": "fp8-q8f"
}
{
"infer_steps": 50,
"target_video_length": 81,
"text_len": 512,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "sage_attn2",
"cross_attn_1_type": "sage_attn2",
"cross_attn_2_type": "sage_attn2",
"sample_guide_scale": 6,
"sample_shift": 8,
"enable_cfg": true,
"cpu_offload": true,
"offload_granularity": "phase",
"t5_cpu_offload": false,
"clip_cpu_offload": false,
"vae_cpu_offload": false,
"use_tiling_vae": false,
"dit_quantized": true,
"dit_quant_scheme": "fp8-q8f",
"t5_quantized": true,
"t5_quant_scheme": "fp8-q8f",
"clip_quantized": true,
"clip_quant_scheme": "fp8-q8f"
}
{
"infer_steps": 40,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"resize_mode": "adaptive",
"attn_type": "aiter_attn",
"enable_cfg": true,
"sample_guide_scale": 4.0,
"CONDITION_IMAGE_SIZE": 147456,
"USE_IMAGE_ID_IN_PROMPT": true
}
{
"infer_steps": 50,
"target_video_length": 81,
"text_len": 512,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "aiter_attn",
"cross_attn_1_type": "aiter_attn",
"cross_attn_2_type": "aiter_attn",
"sample_guide_scale": 6,
"sample_shift": 8,
"enable_cfg": true,
"cpu_offload": false
}
{
"infer_steps": 40,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"resize_mode": "adaptive",
"attn_type": "npu_flash_attn",
"enable_cfg": true,
"sample_guide_scale": 4.0,
"CONDITION_IMAGE_SIZE": 147456,
"USE_IMAGE_ID_IN_PROMPT": true,
"cpu_offload": true,
"offload_granularity": "model",
"modulate_type": "torch",
"rope_type": "torch",
"layer_norm_type": "torch",
"rms_norm_type": "torch"
}
{
"infer_steps": 50,
"target_video_length": 81,
"text_len": 512,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "npu_flash_attn",
"cross_attn_1_type": "npu_flash_attn",
"cross_attn_2_type": "npu_flash_attn",
"sample_guide_scale": 6,
"sample_shift": 8,
"enable_cfg": true,
"cpu_offload": true,
"offload_granularity": "model",
"modulate_type": "torch",
"rope_type": "torch",
"layer_norm_type": "torch",
"rms_norm_type": "torch"
}
{
"vae_scale_factor": 8,
"infer_steps": 40,
"transformer_in_channels": 64,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"resize_mode": "adaptive",
"num_layers": 60,
"attention_out_dim": 3072,
"attention_dim_head": 128,
"attn_type": "flash_attn_enflame_gcu",
"enable_cfg": true,
"sample_guide_scale": 4.0,
"CONDITION_IMAGE_SIZE": 147456,
"USE_IMAGE_ID_IN_PROMPT": true,
"zero_cond_t": true,
"modulate_type": "torch",
"rope_type": "gcu_wan_rope",
"layer_norm_type": "gcu_layer_norm",
"rms_norm_type": "torch"
}
{
"comment": "Wan2.1 I2V 480P - Enflame GCU 8 GPUs with Ulysses + CFG parallelism",
"infer_steps": 40,
"target_video_length": 81,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "flash_attn_enflame_gcu",
"cross_attn_1_type": "flash_attn_enflame_gcu",
"cross_attn_2_type": "flash_attn_enflame_gcu",
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": true,
"offload_granularity": "model",
"modulate_type": "torch",
"rope_type": "gcu_wan_rope",
"layer_norm_type": "gcu_layer_norm",
"rms_norm_type": "torch",
"parallel": {
"seq_p_size": 4,
"seq_p_attn_type": "ulysses",
"cfg_p_size": 2
},
"comment_parallel": "Hybrid parallelism: 4-way Ulysses + 2-way CFG = 8 GPUs total"
}
{
"infer_steps": 50,
"target_video_length": 81,
"text_len": 512,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "flash_attn_enflame_gcu",
"cross_attn_1_type": "flash_attn_enflame_gcu",
"cross_attn_2_type": "flash_attn_enflame_gcu",
"sample_guide_scale": 6,
"sample_shift": 8,
"enable_cfg": true,
"cpu_offload": false,
"modulate_type": "torch",
"rope_type": "torch",
"layer_norm_type": "torch",
"rms_norm_type": "torch"
}
{
"infer_steps": 40,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"resize_mode": "adaptive",
"attn_type": "flash_attn_hygon_dcu",
"enable_cfg": true,
"sample_guide_scale": 4.0,
"CONDITION_IMAGE_SIZE": 147456,
"USE_IMAGE_ID_IN_PROMPT": true,
"modulate_type": "torch",
"rope_type": "torch",
"layer_norm_type": "torch",
"rms_norm_type": "torch"
}
{
"infer_steps": 40,
"target_video_length": 81,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "flash_attn_hygon_dcu",
"cross_attn_1_type": "flash_attn_hygon_dcu",
"cross_attn_2_type": "flash_attn_hygon_dcu",
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": false,
"modulate_type": "torch",
"rope_type": "torch",
"layer_norm_type": "torch",
"rms_norm_type": "torch"
}
{
"infer_steps": 40,
"target_video_length": 81,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "flash_attn_hygon_dcu",
"cross_attn_1_type": "flash_attn_hygon_dcu",
"cross_attn_2_type": "flash_attn_hygon_dcu",
"sample_guide_scale": 5.0,
"sample_shift": 5.0,
"enable_cfg": true,
"cpu_offload": false,
"modulate_type": "torch",
"rope_type": "torch",
"layer_norm_type": "torch",
"rms_norm_type": "torch",
"dit_quantized": true,
"dit_quant_scheme": "int8-vllm-hygon-dcu",
"dit_quantized_ckpt": ""
}
{
"infer_steps": 50,
"target_video_length": 81,
"text_len": 512,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "flash_attn_hygon_dcu",
"cross_attn_1_type": "flash_attn_hygon_dcu",
"cross_attn_2_type": "flash_attn_hygon_dcu",
"sample_guide_scale": 6,
"sample_shift": 8,
"enable_cfg": true,
"cpu_offload": false,
"modulate_type": "torch",
"rope_type": "torch",
"layer_norm_type": "torch",
"rms_norm_type": "torch"
}
{
"infer_steps": 40,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"resize_mode": "adaptive",
"attn_type": "flash_attn2",
"enable_cfg": true,
"sample_guide_scale": 4.0,
"CONDITION_IMAGE_SIZE": 147456,
"USE_IMAGE_ID_IN_PROMPT": true,
"modulate_type": "torch",
"rope_type": "torch",
"layer_norm_type": "torch",
"rms_norm_type": "torch"
}
{
"infer_steps": 50,
"target_video_length": 81,
"text_len": 512,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "flash_attn2",
"cross_attn_1_type": "flash_attn2",
"cross_attn_2_type": "flash_attn2",
"sample_guide_scale": 6,
"sample_shift": 8,
"enable_cfg": true,
"cpu_offload": false,
"modulate_type": "torch",
"rope_type": "torch",
"layer_norm_type": "torch",
"rms_norm_type": "torch"
}
{
"infer_steps": 40,
"prompt_template_encode": "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
"prompt_template_encode_start_idx": 64,
"resize_mode": "adaptive",
"attn_type": "mlu_flash_attn",
"enable_cfg": true,
"sample_guide_scale": 4.0,
"CONDITION_IMAGE_SIZE": 147456,
"USE_IMAGE_ID_IN_PROMPT": true,
"modulate_type": "torch",
"rope_type": "torch",
"layer_norm_type": "torch",
"rms_norm_type": "mlu_rms_norm"
}
{
"infer_steps": 50,
"target_video_length": 81,
"text_len": 512,
"target_height": 480,
"target_width": 832,
"self_attn_1_type": "mlu_sage_attn",
"cross_attn_1_type": "mlu_sage_attn",
"cross_attn_2_type": "mlu_sage_attn",
"sample_guide_scale": 6,
"sample_shift": 8,
"enable_cfg": true,
"cpu_offload": false,
"modulate_type": "torch",
"rope_type": "torch",
"layer_norm_type": "torch",
"rms_norm_type": "mlu_rms_norm"
}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment