============================================================ model ===================================================== HYVideoDiffusionTransformer( (img_in): PatchEmbed( (proj): Conv3d(16, 3072, kernel_size=(1, 2, 2), stride=(1, 2, 2)) (norm): Identity() ) (txt_in): SingleTokenRefiner( (input_embedder): Linear(in_features=4096, out_features=3072, bias=True) (t_embedder): TimestepEmbedder( (mlp): Sequential( (0): Linear(in_features=256, out_features=3072, bias=True) (1): SiLU() (2): Linear(in_features=3072, out_features=3072, bias=True) ) ) (c_embedder): TextProjection( (linear_1): Linear(in_features=4096, out_features=3072, bias=True) (act_1): SiLU() (linear_2): Linear(in_features=3072, out_features=3072, bias=True) ) (individual_token_refiner): IndividualTokenRefiner( (blocks): ModuleList( (0-1): 2 x IndividualTokenRefinerBlock( (norm1): LayerNorm((3072,), eps=1e-06, elementwise_affine=True) (self_attn_qkv): Linear(in_features=3072, out_features=9216, bias=True) (self_attn_q_norm): Identity() (self_attn_k_norm): Identity() (self_attn_proj): Linear(in_features=3072, out_features=3072, bias=True) (norm2): LayerNorm((3072,), eps=1e-06, elementwise_affine=True) (mlp): MLP( (fc1): Linear(in_features=3072, out_features=12288, bias=True) (act): SiLU() (drop1): Dropout(p=0.0, inplace=False) (norm): Identity() (fc2): Linear(in_features=12288, out_features=3072, bias=True) (drop2): Dropout(p=0.0, inplace=False) ) (adaLN_modulation): Sequential( (0): SiLU() (1): Linear(in_features=3072, out_features=6144, bias=True) ) ) ) ) ) (time_in): TimestepEmbedder( (mlp): Sequential( (0): Linear(in_features=256, out_features=3072, bias=True) (1): SiLU() (2): Linear(in_features=3072, out_features=3072, bias=True) ) ) (vector_in): MLPEmbedder( (in_layer): Linear(in_features=768, out_features=3072, bias=True) (silu): SiLU() (out_layer): Linear(in_features=3072, out_features=3072, bias=True) ) (guidance_in): TimestepEmbedder( (mlp): Sequential( (0): Linear(in_features=256, out_features=3072, bias=True) (1): SiLU() (2): Linear(in_features=3072, out_features=3072, bias=True) ) ) (double_blocks): ModuleList( (0-19): 20 x MMDoubleStreamBlock( (img_mod): ModulateDiT( (act): SiLU() (linear): Linear(in_features=3072, out_features=18432, bias=True) ) (img_norm1): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) (img_attn_qkv): Linear(in_features=3072, out_features=9216, bias=True) (img_attn_q_norm): RMSNorm(eps=0.00000) (img_attn_k_norm): RMSNorm(eps=0.00000) (img_attn_proj): Linear(in_features=3072, out_features=3072, bias=True) (img_norm2): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) (img_mlp): MLP( (fc1): Linear(in_features=3072, out_features=12288, bias=True) (act): GELU(approximate='tanh') (drop1): Dropout(p=0.0, inplace=False) (norm): Identity() (fc2): Linear(in_features=12288, out_features=3072, bias=True) (drop2): Dropout(p=0.0, inplace=False) ) (txt_mod): ModulateDiT( (act): SiLU() (linear): Linear(in_features=3072, out_features=18432, bias=True) ) (txt_norm1): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) (txt_attn_qkv): Linear(in_features=3072, out_features=9216, bias=True) (txt_attn_q_norm): RMSNorm(eps=0.00000) (txt_attn_k_norm): RMSNorm(eps=0.00000) (txt_attn_proj): Linear(in_features=3072, out_features=3072, bias=True) (txt_norm2): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) (txt_mlp): MLP( (fc1): Linear(in_features=3072, out_features=12288, bias=True) (act): GELU(approximate='tanh') (drop1): Dropout(p=0.0, inplace=False) (norm): Identity() (fc2): Linear(in_features=12288, out_features=3072, bias=True) (drop2): Dropout(p=0.0, inplace=False) ) ) ) (single_blocks): ModuleList( (0-39): 40 x MMSingleStreamBlock( (linear1): Linear(in_features=3072, out_features=21504, bias=True) (linear2): Linear(in_features=15360, out_features=3072, bias=True) (q_norm): RMSNorm(eps=0.00000) (k_norm): RMSNorm(eps=0.00000) (pre_norm): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) (mlp_act): GELU(approximate='tanh') (modulation): ModulateDiT( (act): SiLU() (linear): Linear(in_features=3072, out_features=9216, bias=True) ) ) ) (final_layer): FinalLayer( (norm_final): LayerNorm((3072,), eps=1e-06, elementwise_affine=False) (linear): Linear(in_features=3072, out_features=64, bias=True) (adaLN_modulation): Sequential( (0): SiLU() (1): Linear(in_features=3072, out_features=6144, bias=True) ) ) ) ============================================================ text_encoder ===================================================== LlamaModel( (embed_tokens): Embedding(128320, 4096) (layers): ModuleList( (0-31): 32 x LlamaDecoderLayer( (self_attn): LlamaSdpaAttention( (q_proj): Linear(in_features=4096, out_features=4096, bias=False) (k_proj): Linear(in_features=4096, out_features=1024, bias=False) (v_proj): Linear(in_features=4096, out_features=1024, bias=False) (o_proj): Linear(in_features=4096, out_features=4096, bias=False) (rotary_emb): LlamaRotaryEmbedding() ) (mlp): LlamaMLP( (gate_proj): Linear(in_features=4096, out_features=14336, bias=False) (up_proj): Linear(in_features=4096, out_features=14336, bias=False) (down_proj): Linear(in_features=14336, out_features=4096, bias=False) (act_fn): SiLU() ) (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05) (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05) ) ) (norm): LlamaRMSNorm((4096,), eps=1e-05) (rotary_emb): LlamaRotaryEmbedding() (final_layer_norm): LlamaRMSNorm((4096,), eps=1e-05) ) ============================================================ text_encoder_2 ===================================================== CLIPTextModel( (text_model): CLIPTextTransformer( (embeddings): CLIPTextEmbeddings( (token_embedding): Embedding(49408, 768) (position_embedding): Embedding(77, 768) ) (encoder): CLIPEncoder( (layers): ModuleList( (0-11): 12 x CLIPEncoderLayer( (self_attn): CLIPSdpaAttention( (k_proj): Linear(in_features=768, out_features=768, bias=True) (v_proj): Linear(in_features=768, out_features=768, bias=True) (q_proj): Linear(in_features=768, out_features=768, bias=True) (out_proj): Linear(in_features=768, out_features=768, bias=True) ) (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True) (mlp): CLIPMLP( (activation_fn): QuickGELUActivation() (fc1): Linear(in_features=768, out_features=3072, bias=True) (fc2): Linear(in_features=3072, out_features=768, bias=True) ) (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ) ) (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True) ) ============================================================ vae ===================================================== AutoencoderKLCausal3D( (encoder): EncoderCausal3D( (conv_in): CausalConv3d( (conv): Conv3d(3, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (down_blocks): ModuleList( (0): DownEncoderBlockCausal3D( (resnets): ModuleList( (0-1): 2 x ResnetBlockCausal3D( (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() ) ) (downsamplers): ModuleList( (0): DownsampleCausal3D( (conv): CausalConv3d( (conv): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 2, 2)) ) ) ) ) (1): DownEncoderBlockCausal3D( (resnets): ModuleList( (0): ResnetBlockCausal3D( (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() (conv_shortcut): CausalConv3d( (conv): Conv3d(128, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1)) ) ) (1): ResnetBlockCausal3D( (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() ) ) (downsamplers): ModuleList( (0): DownsampleCausal3D( (conv): CausalConv3d( (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(2, 2, 2)) ) ) ) ) (2): DownEncoderBlockCausal3D( (resnets): ModuleList( (0): ResnetBlockCausal3D( (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() (conv_shortcut): CausalConv3d( (conv): Conv3d(256, 512, kernel_size=(1, 1, 1), stride=(1, 1, 1)) ) ) (1): ResnetBlockCausal3D( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() ) ) (downsamplers): ModuleList( (0): DownsampleCausal3D( (conv): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(2, 2, 2)) ) ) ) ) (3): DownEncoderBlockCausal3D( (resnets): ModuleList( (0-1): 2 x ResnetBlockCausal3D( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() ) ) ) ) (mid_block): UNetMidBlockCausal3D( (attentions): ModuleList( (0): Attention( (group_norm): GroupNorm(32, 512, eps=1e-06, affine=True) (to_q): Linear(in_features=512, out_features=512, bias=True) (to_k): Linear(in_features=512, out_features=512, bias=True) (to_v): Linear(in_features=512, out_features=512, bias=True) (to_out): ModuleList( (0): Linear(in_features=512, out_features=512, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) ) (resnets): ModuleList( (0-1): 2 x ResnetBlockCausal3D( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() ) ) ) (conv_norm_out): GroupNorm(32, 512, eps=1e-06, affine=True) (conv_act): SiLU() (conv_out): CausalConv3d( (conv): Conv3d(512, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) ) (decoder): DecoderCausal3D( (conv_in): CausalConv3d( (conv): Conv3d(16, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (up_blocks): ModuleList( (0-1): 2 x UpDecoderBlockCausal3D( (resnets): ModuleList( (0-2): 3 x ResnetBlockCausal3D( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() ) ) (upsamplers): ModuleList( (0): UpsampleCausal3D( (conv): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) ) ) ) (2): UpDecoderBlockCausal3D( (resnets): ModuleList( (0): ResnetBlockCausal3D( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(512, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() (conv_shortcut): CausalConv3d( (conv): Conv3d(512, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1)) ) ) (1-2): 2 x ResnetBlockCausal3D( (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 256, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() ) ) (upsamplers): ModuleList( (0): UpsampleCausal3D( (conv): CausalConv3d( (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) ) ) ) (3): UpDecoderBlockCausal3D( (resnets): ModuleList( (0): ResnetBlockCausal3D( (norm1): GroupNorm(32, 256, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(256, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() (conv_shortcut): CausalConv3d( (conv): Conv3d(256, 128, kernel_size=(1, 1, 1), stride=(1, 1, 1)) ) ) (1-2): 2 x ResnetBlockCausal3D( (norm1): GroupNorm(32, 128, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 128, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() ) ) ) ) (mid_block): UNetMidBlockCausal3D( (attentions): ModuleList( (0): Attention( (group_norm): GroupNorm(32, 512, eps=1e-06, affine=True) (to_q): Linear(in_features=512, out_features=512, bias=True) (to_k): Linear(in_features=512, out_features=512, bias=True) (to_v): Linear(in_features=512, out_features=512, bias=True) (to_out): ModuleList( (0): Linear(in_features=512, out_features=512, bias=True) (1): Dropout(p=0.0, inplace=False) ) ) ) (resnets): ModuleList( (0-1): 2 x ResnetBlockCausal3D( (norm1): GroupNorm(32, 512, eps=1e-06, affine=True) (conv1): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (norm2): GroupNorm(32, 512, eps=1e-06, affine=True) (dropout): Dropout(p=0.0, inplace=False) (conv2): CausalConv3d( (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) (nonlinearity): SiLU() ) ) ) (conv_norm_out): GroupNorm(32, 128, eps=1e-06, affine=True) (conv_act): SiLU() (conv_out): CausalConv3d( (conv): Conv3d(128, 3, kernel_size=(3, 3, 3), stride=(1, 1, 1)) ) ) (quant_conv): Conv3d(32, 32, kernel_size=(1, 1, 1), stride=(1, 1, 1)) (post_quant_conv): Conv3d(16, 16, kernel_size=(1, 1, 1), stride=(1, 1, 1)) )