# wan_s2v_14B.py (2.19 KB) -- commit "wan2.2" by zzg_666
# Copyright 2024-2025 The Alibaba Wan Team Authors. All rights reserved.
from easydict import EasyDict

from .shared_config import wan_shared_cfg

#------------------------ Wan S2V 14B ------------------------#

# Root config object: seeded with the project-wide shared settings and then
# extended with the S2V-14B specific entries below.
s2v_14B = EasyDict(__name__='Config: Wan S2V 14B')
s2v_14B.update(wan_shared_cfg)

# Settings for the auxiliary models, applied in a single call. Keywords are
# assigned as attributes in the order written.
s2v_14B.update(
    # t5
    t5_checkpoint='models_t5_umt5-xxl-enc-bf16.pth',
    t5_tokenizer='google/umt5-xxl',
    # vae
    vae_checkpoint='Wan2.1_VAE.pth',
    vae_stride=(4, 8, 8),
    # wav2vec
    wav2vec="wav2vec2-large-xlsr-53-english",
    # Same value as transformer.num_heads; also kept on the root config
    # (presumably for callers that read it from there -- TODO confirm).
    num_heads=40,
)
# transformer
# Sub-config for the S2V transformer, built in one constructor call. Keyword
# order matches the order in which the fields are meant to appear in the dict.
s2v_14B.transformer = EasyDict(
    __name__="Config: Transformer config for WanModel_S2V",
    # Core architecture hyper-parameters.
    patch_size=(1, 2, 2),
    dim=5120,
    ffn_dim=13824,
    freq_dim=256,
    num_heads=40,
    num_layers=40,
    window_size=(-1, -1),
    qk_norm=True,
    cross_attn_norm=True,
    eps=1e-6,
    # Audio conditioning options.
    enable_adain=True,
    adain_mode="attn_norm",
    # 12 entries, all within 0..39 (num_layers is 40); presumably the indices
    # of the layers that receive audio features -- TODO confirm against model.
    audio_inject_layers=[0, 4, 8, 12, 16, 20, 24, 27, 30, 33, 36, 39],
    zero_init=True,
    zero_timestep=True,
    # Motion / frame-packing feature switches.
    enable_motioner=False,
    add_last_motion=True,
    trainable_token=False,
    enable_tsm=False,
    enable_framepack=True,
    framepack_drop_mode='padd',
    audio_dim=1024,
    motion_frames=73,
    cond_dim=16,
)

# inference
# Sampling-related settings, applied in a single call (presumably defaults
# read by the generation pipeline -- TODO confirm against the caller).
s2v_14B.update(
    # Chinese-language negative prompt listing artifacts to avoid (blurry
    # frames, worst quality, subtitles, malformed hands/faces, extra fingers,
    # static frames, cluttered background, ...). Runtime text: keep verbatim.
    sample_neg_prompt="画面模糊,最差质量,画面模糊,细节模糊不清,情绪激动剧烈,手快速抖动,字幕,丑陋的,残缺的,多余的手指,画得不好的手部,画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面,杂乱的背景,三条腿,背景人很多,倒着走",
    drop_first_motion=True,
    sample_shift=3,
    sample_steps=40,
    sample_guide_scale=4.5,
)