Commit 1336a33d authored by zzg_666

wan2.2
#------------------------ File: __init__.py ------------------------#
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
import copy
import os
# Silence the HF tokenizers fork-parallelism warning in multi-process runs.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'
from .wan_i2v_A14B import i2v_A14B
from .wan_s2v_14B import s2v_14B
from .wan_t2v_A14B import t2v_A14B
from .wan_ti2v_5B import ti2v_5B
from .wan_animate_14B import animate_14B
WAN_CONFIGS = {
    't2v-A14B': t2v_A14B,
    'i2v-A14B': i2v_A14B,
    'ti2v-5B': ti2v_5B,
    'animate-14B': animate_14B,
    's2v-14B': s2v_14B,
}

SIZE_CONFIGS = {
    '720*1280': (720, 1280),
    '1280*720': (1280, 720),
    '480*832': (480, 832),
    '832*480': (832, 480),
    '704*1280': (704, 1280),
    '1280*704': (1280, 704),
    '1024*704': (1024, 704),
    '704*1024': (704, 1024),
}

MAX_AREA_CONFIGS = {
    '720*1280': 720 * 1280,
    '1280*720': 1280 * 720,
    '480*832': 480 * 832,
    '832*480': 832 * 480,
    '704*1280': 704 * 1280,
    '1280*704': 1280 * 704,
    '1024*704': 1024 * 704,
    '704*1024': 704 * 1024,
}

SUPPORTED_SIZES = {
    't2v-A14B': ('720*1280', '1280*720', '480*832', '832*480'),
    'i2v-A14B': ('720*1280', '1280*720', '480*832', '832*480'),
    'ti2v-5B': ('704*1280', '1280*704'),
    's2v-14B': ('720*1280', '1280*720', '480*832', '832*480', '1024*704',
                '704*1024', '704*1280', '1280*704'),
    'animate-14B': ('720*1280', '1280*720'),
}

#------------------------ File: shared_config.py ------------------------#
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
import torch
from easydict import EasyDict
#------------------------ Wan shared config ------------------------#
wan_shared_cfg = EasyDict()
# t5
wan_shared_cfg.t5_model = 'umt5_xxl'
wan_shared_cfg.t5_dtype = torch.bfloat16
wan_shared_cfg.text_len = 512
# transformer
wan_shared_cfg.param_dtype = torch.bfloat16
# inference
wan_shared_cfg.num_train_timesteps = 1000
wan_shared_cfg.sample_fps = 16
# Default negative prompt (in Chinese; roughly: "garish colors, overexposed,
# static, blurry details, subtitles, style, artwork, painting, frame, still,
# overall gray, worst quality, low quality, JPEG compression artifacts, ugly,
# mutilated, extra fingers, poorly drawn hands, poorly drawn face, deformed,
# disfigured, malformed limbs, fused fingers, motionless frame, cluttered
# background, three legs, crowded background, walking backwards").
wan_shared_cfg.sample_neg_prompt = '色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰,最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走'
wan_shared_cfg.frame_num = 81
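To make the inheritance pattern concrete: each per-model config below starts from wan_shared_cfg via EasyDict.update() and then overrides fields in place. A minimal sketch (the `shared`/`demo` names are hypothetical):

import torch
from easydict import EasyDict

shared = EasyDict(param_dtype=torch.bfloat16, frame_num=81, sample_fps=16)
demo = EasyDict(__name__='Config: demo')
demo.update(shared)   # copy the shared defaults into this config
demo.frame_num = 77   # local override, as animate_14B does below
print(demo.frame_num, demo.sample_fps)  # 77 16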

#------------------------ File: wan_animate_14B.py ------------------------#
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
from easydict import EasyDict
from .shared_config import wan_shared_cfg
#------------------------ Wan animate 14B ------------------------#
animate_14B = EasyDict(__name__='Config: Wan animate 14B')
animate_14B.update(wan_shared_cfg)
animate_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
animate_14B.t5_tokenizer = 'google/umt5-xxl'
animate_14B.clip_checkpoint = 'models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth'
animate_14B.clip_tokenizer = 'xlm-roberta-large'
animate_14B.lora_checkpoint = 'relighting_lora.ckpt'
# vae
animate_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
animate_14B.vae_stride = (4, 8, 8)
# transformer
animate_14B.patch_size = (1, 2, 2)
animate_14B.dim = 5120
animate_14B.ffn_dim = 13824
animate_14B.freq_dim = 256
animate_14B.num_heads = 40
animate_14B.num_layers = 40
animate_14B.window_size = (-1, -1)
animate_14B.qk_norm = True
animate_14B.cross_attn_norm = True
animate_14B.eps = 1e-6
animate_14B.use_face_encoder = True
animate_14B.motion_encoder_dim = 512
# inference
animate_14B.sample_shift = 5.0
animate_14B.sample_steps = 20
animate_14B.sample_guide_scale = 1.0
animate_14B.frame_num = 77
animate_14B.sample_fps = 30
# Default prompt (Chinese): "The person in the video is performing movements."
animate_14B.prompt = '视频中的人在做动作'
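A worked example of how these fields combine into a token count, assuming the usual Wan2.x relation that the causal VAE keeps (frame_num - 1) / stride_t + 1 latent frames and the transformer patchifies the latent grid; this derivation is not in the commit itself:

# animate-14B at 1280*720, assuming latent frames = (frame_num - 1) // 4 + 1.
frame_num, (st, sh, sw) = 77, (4, 8, 8)   # vae_stride
pt, ph, pw = 1, 2, 2                      # patch_size
h, w = 720, 1280
lat_t = (frame_num - 1) // st + 1         # 20 latent frames
lat_h, lat_w = h // sh, w // sw           # 90 x 160 latent grid
tokens = (lat_t // pt) * (lat_h // ph) * (lat_w // pw)
print(lat_t, lat_h, lat_w, tokens)        # 20 90 160 72000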

#------------------------ File: wan_i2v_A14B.py ------------------------#
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
import torch
from easydict import EasyDict
from .shared_config import wan_shared_cfg
#------------------------ Wan I2V A14B ------------------------#
i2v_A14B = EasyDict(__name__='Config: Wan I2V A14B')
i2v_A14B.update(wan_shared_cfg)
i2v_A14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
i2v_A14B.t5_tokenizer = 'google/umt5-xxl'
# vae
i2v_A14B.vae_checkpoint = 'Wan2.1_VAE.pth'
i2v_A14B.vae_stride = (4, 8, 8)
# transformer
i2v_A14B.patch_size = (1, 2, 2)
i2v_A14B.dim = 5120
i2v_A14B.ffn_dim = 13824
i2v_A14B.freq_dim = 256
i2v_A14B.num_heads = 40
i2v_A14B.num_layers = 40
i2v_A14B.window_size = (-1, -1)
i2v_A14B.qk_norm = True
i2v_A14B.cross_attn_norm = True
i2v_A14B.eps = 1e-6
i2v_A14B.low_noise_checkpoint = 'low_noise_model'
i2v_A14B.high_noise_checkpoint = 'high_noise_model'
# inference
i2v_A14B.sample_shift = 5.0
i2v_A14B.sample_steps = 40
i2v_A14B.boundary = 0.900
i2v_A14B.sample_guide_scale = (3.5, 3.5) # low noise, high noise
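The low/high-noise checkpoint pair plus the boundary field suggest a two-expert denoising split over the timestep range. A hedged sketch of how a sampler might consume them; the selection rule is an assumption inferred from the field names, not code from this commit:

def select_expert(t: float, cfg):
    # Early (high-noise) timesteps at or above boundary * num_train_timesteps
    # go to the high-noise expert, the rest to the low-noise expert; each
    # expert pairs with its own entry of sample_guide_scale.
    low_scale, high_scale = cfg.sample_guide_scale  # (low noise, high noise)
    if t >= cfg.boundary * cfg.num_train_timesteps:
        return cfg.high_noise_checkpoint, high_scale
    return cfg.low_noise_checkpoint, low_scale

print(select_expert(950, i2v_A14B))  # ('high_noise_model', 3.5)
print(select_expert(500, i2v_A14B))  # ('low_noise_model', 3.5)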

#------------------------ File: wan_s2v_14B.py ------------------------#
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
from easydict import EasyDict
from .shared_config import wan_shared_cfg
#------------------------ Wan S2V 14B ------------------------#
s2v_14B = EasyDict(__name__='Config: Wan S2V 14B')
s2v_14B.update(wan_shared_cfg)
# t5
s2v_14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
s2v_14B.t5_tokenizer = 'google/umt5-xxl'
# vae
s2v_14B.vae_checkpoint = 'Wan2.1_VAE.pth'
s2v_14B.vae_stride = (4, 8, 8)
# wav2vec
s2v_14B.wav2vec = "wav2vec2-large-xlsr-53-english"
s2v_14B.num_heads = 40
# transformer
s2v_14B.transformer = EasyDict(
    __name__="Config: Transformer config for WanModel_S2V")
s2v_14B.transformer.patch_size = (1, 2, 2)
s2v_14B.transformer.dim = 5120
s2v_14B.transformer.ffn_dim = 13824
s2v_14B.transformer.freq_dim = 256
s2v_14B.transformer.num_heads = 40
s2v_14B.transformer.num_layers = 40
s2v_14B.transformer.window_size = (-1, -1)
s2v_14B.transformer.qk_norm = True
s2v_14B.transformer.cross_attn_norm = True
s2v_14B.transformer.eps = 1e-6
s2v_14B.transformer.enable_adain = True
s2v_14B.transformer.adain_mode = "attn_norm"
s2v_14B.transformer.audio_inject_layers = [
    0, 4, 8, 12, 16, 20, 24, 27, 30, 33, 36, 39
]
s2v_14B.transformer.zero_init = True
s2v_14B.transformer.zero_timestep = True
s2v_14B.transformer.enable_motioner = False
s2v_14B.transformer.add_last_motion = True
s2v_14B.transformer.trainable_token = False
s2v_14B.transformer.enable_tsm = False
s2v_14B.transformer.enable_framepack = True
s2v_14B.transformer.framepack_drop_mode = 'padd'
s2v_14B.transformer.audio_dim = 1024
s2v_14B.transformer.motion_frames = 73
s2v_14B.transformer.cond_dim = 16
# inference
# Task-specific negative prompt (in Chinese; roughly: "blurry image, worst
# quality, blurry image, blurry details, intense emotional agitation, rapidly
# shaking hands, subtitles, ugly, mutilated, extra fingers, poorly drawn
# hands, poorly drawn face, deformed, disfigured, malformed limbs, fused
# fingers, motionless frame, cluttered background, three legs, crowded
# background, walking backwards"; "blurry image" appears twice in the source).
s2v_14B.sample_neg_prompt = "画面模糊,最差质量,画面模糊,细节模糊不清,情绪激动剧烈,手快速抖动,字幕,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走"
s2v_14B.drop_first_motion = True
s2v_14B.sample_shift = 3
s2v_14B.sample_steps = 40
s2v_14B.sample_guide_scale = 4.5
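As an illustration of the injection list (a hypothetical helper, not the repo's WanModel_S2V): blocks whose index appears in audio_inject_layers would carry an extra audio cross-attention input of width transformer.audio_dim (1024, a typical wav2vec2 hidden size).

def audio_layer_mask(tcfg):
    # True for the 12 blocks (of num_layers=40) that receive audio features;
    # the spacing tightens from every 4th layer to every 3rd near the output.
    inject = set(tcfg.audio_inject_layers)
    return [i in inject for i in range(tcfg.num_layers)]

mask = audio_layer_mask(s2v_14B.transformer)
print(sum(mask), len(mask))  # 12 40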

#------------------------ File: wan_t2v_A14B.py ------------------------#
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
from easydict import EasyDict
from .shared_config import wan_shared_cfg
#------------------------ Wan T2V A14B ------------------------#
t2v_A14B = EasyDict(__name__='Config: Wan T2V A14B')
t2v_A14B.update(wan_shared_cfg)
# t5
t2v_A14B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
t2v_A14B.t5_tokenizer = 'google/umt5-xxl'
# vae
t2v_A14B.vae_checkpoint = 'Wan2.1_VAE.pth'
t2v_A14B.vae_stride = (4, 8, 8)
# transformer
t2v_A14B.patch_size = (1, 2, 2)
t2v_A14B.dim = 5120
t2v_A14B.ffn_dim = 13824
t2v_A14B.freq_dim = 256
t2v_A14B.num_heads = 40
t2v_A14B.num_layers = 40
t2v_A14B.window_size = (-1, -1)
t2v_A14B.qk_norm = True
t2v_A14B.cross_attn_norm = True
t2v_A14B.eps = 1e-6
t2v_A14B.low_noise_checkpoint = 'low_noise_model'
t2v_A14B.high_noise_checkpoint = 'high_noise_model'
# inference
t2v_A14B.sample_shift = 12.0
t2v_A14B.sample_steps = 40
t2v_A14B.boundary = 0.875
t2v_A14B.sample_guide_scale = (3.0, 4.0) # low noise, high noise

#------------------------ File: wan_ti2v_5B.py ------------------------#
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
from easydict import EasyDict
from .shared_config import wan_shared_cfg
#------------------------ Wan TI2V 5B ------------------------#
ti2v_5B = EasyDict(__name__='Config: Wan TI2V 5B')
ti2v_5B.update(wan_shared_cfg)
# t5
ti2v_5B.t5_checkpoint = 'models_t5_umt5-xxl-enc-bf16.pth'
ti2v_5B.t5_tokenizer = 'google/umt5-xxl'
# vae
ti2v_5B.vae_checkpoint = 'Wan2.2_VAE.pth'
ti2v_5B.vae_stride = (4, 16, 16)
# transformer
ti2v_5B.patch_size = (1, 2, 2)
ti2v_5B.dim = 3072
ti2v_5B.ffn_dim = 14336
ti2v_5B.freq_dim = 256
ti2v_5B.num_heads = 24
ti2v_5B.num_layers = 30
ti2v_5B.window_size = (-1, -1)
ti2v_5B.qk_norm = True
ti2v_5B.cross_attn_norm = True
ti2v_5B.eps = 1e-6
# inference
ti2v_5B.sample_fps = 24
ti2v_5B.sample_shift = 5.0
ti2v_5B.sample_steps = 50
ti2v_5B.sample_guide_scale = 5.0
ti2v_5B.frame_num = 121
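A quick arithmetic check on the timing fields across the configs above: frame_num / sample_fps gives the clip duration each config targets.

print(81 / 16)   # 5.0625 s -- shared default (t2v-A14B, i2v-A14B, s2v-14B)
print(121 / 24)  # ~5.04 s  -- ti2v-5B
print(77 / 30)   # ~2.57 s  -- animate-14B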