video-model.log 18.5 KB
Newer Older
jerrrrry's avatar
jerrrrry committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
============================================================  model  =====================================================
HYVideoDiffusionTransformer(
  (img_in): PatchEmbed(
    (proj): Conv3d(16, 3072, kernel_size=(1, 2, 2), stride=(1, 2, 2))
    (norm): Identity()
  )
  (txt_in): SingleTokenRefiner(
    (input_embedder): Linear(in_features=4096, out_features=3072, bias=True)
    (t_embedder): TimestepEmbedder(
      (mlp): Sequential(
        (0): Linear(in_features=256, out_features=3072, bias=True)
        (1): SiLU()
        (2): Linear(in_features=3072, out_features=3072, bias=True)
      )
    )
    (c_embedder): TextProjection(
      (linear_1): Linear(in_features=4096, out_features=3072, bias=True)
      (act_1): SiLU()
      (linear_2): Linear(in_features=3072, out_features=3072, bias=True)
    )
    (individual_token_refiner): IndividualTokenRefiner(
      (blocks): ModuleList(
        (0-1): 2 x IndividualTokenRefinerBlock(
          (norm1): LayerNorm((3072,), eps=1e-06, elementwise_affine=True)
          (self_attn_qkv): Linear(in_features=3072, out_features=9216, bias=True)
          (self_attn_q_norm): Identity()
          (self_attn_k_norm): Identity()
          (self_attn_proj): Linear(in_features=3072, out_features=3072, bias=True)
          (norm2): LayerNorm((3072,), eps=1e-06, elementwise_affine=True)
          (mlp): MLP(
            (fc1): Linear(in_features=3072, out_features=12288, bias=True)
            (act): SiLU()
            (drop1): Dropout(p=0.0, inplace=False)
            (norm): Identity()
            (fc2): Linear(in_features=12288, out_features=3072, bias=True)
            (drop2): Dropout(p=0.0, inplace=False)
          )
          (adaLN_modulation): Sequential(
            (0): SiLU()
            (1): Linear(in_features=3072, out_features=6144, bias=True)
          )
        )
      )
    )
  )
  (time_in): TimestepEmbedder(
    (mlp): Sequential(
      (0): Linear(in_features=256, out_features=3072, bias=True)
      (1): SiLU()
      (2): Linear(in_features=3072, out_features=3072, bias=True)
    )
  )
  (vector_in): MLPEmbedder(
    (in_layer): Linear(in_features=768, out_features=3072, bias=True)
    (silu): SiLU()
    (out_layer): Linear(in_features=3072, out_features=3072, bias=True)
  )
  (guidance_in): TimestepEmbedder(
    (mlp): Sequential(
      (0): Linear(in_features=256, out_features=3072, bias=True)
      (1): SiLU()
      (2): Linear(in_features=3072, out_features=3072, bias=True)
    )
  )
  (double_blocks): ModuleList(
    (0-19): 20 x MMDoubleStreamBlock(
      (img_mod): ModulateDiT(
        (act): SiLU()
        (linear): Linear(in_features=3072, out_features=18432, bias=True)
      )
      (img_norm1): LayerNorm((3072,), eps=1e-06, elementwise_affine=False)
      (img_attn_qkv): Linear(in_features=3072, out_features=9216, bias=True)
      (img_attn_q_norm): RMSNorm(eps=0.00000)
      (img_attn_k_norm): RMSNorm(eps=0.00000)
      (img_attn_proj): Linear(in_features=3072, out_features=3072, bias=True)
      (img_norm2): LayerNorm((3072,), eps=1e-06, elementwise_affine=False)
      (img_mlp): MLP(
        (fc1): Linear(in_features=3072, out_features=12288, bias=True)
        (act): GELU(approximate='tanh')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=12288, out_features=3072, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
      (txt_mod): ModulateDiT(
        (act): SiLU()
        (linear): Linear(in_features=3072, out_features=18432, bias=True)
      )
      (txt_norm1): LayerNorm((3072,), eps=1e-06, elementwise_affine=False)
      (txt_attn_qkv): Linear(in_features=3072, out_features=9216, bias=True)
      (txt_attn_q_norm): RMSNorm(eps=0.00000)
      (txt_attn_k_norm): RMSNorm(eps=0.00000)
      (txt_attn_proj): Linear(in_features=3072, out_features=3072, bias=True)
      (txt_norm2): LayerNorm((3072,), eps=1e-06, elementwise_affine=False)
      (txt_mlp): MLP(
        (fc1): Linear(in_features=3072, out_features=12288, bias=True)
        (act): GELU(approximate='tanh')
        (drop1): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (fc2): Linear(in_features=12288, out_features=3072, bias=True)
        (drop2): Dropout(p=0.0, inplace=False)
      )
    )
  )
  (single_blocks): ModuleList(
    (0-39): 40 x MMSingleStreamBlock(
      (linear1): Linear(in_features=3072, out_features=21504, bias=True)
      (linear2): Linear(in_features=15360, out_features=3072, bias=True)
      (q_norm): RMSNorm(eps=0.00000)
      (k_norm): RMSNorm(eps=0.00000)
      (pre_norm): LayerNorm((3072,), eps=1e-06, elementwise_affine=False)
      (mlp_act): GELU(approximate='tanh')
      (modulation): ModulateDiT(
        (act): SiLU()
        (linear): Linear(in_features=3072, out_features=9216, bias=True)
      )
    )
  )
  (final_layer): FinalLayer(
    (norm_final): LayerNorm((3072,), eps=1e-06, elementwise_affine=False)
    (linear): Linear(in_features=3072, out_features=64, bias=True)
    (adaLN_modulation): Sequential(
      (0): SiLU()
      (1): Linear(in_features=3072, out_features=6144, bias=True)
    )
  )
)

============================================================  text_encoder  =====================================================
LlamaModel(
  (embed_tokens): Embedding(128320, 4096)
  (layers): ModuleList(
    (0-31): 32 x LlamaDecoderLayer(
      (self_attn): LlamaSdpaAttention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
        (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
        (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
    )
  )
  (norm): LlamaRMSNorm((4096,), eps=1e-05)
  (rotary_emb): LlamaRotaryEmbedding()
  (final_layer_norm): LlamaRMSNorm((4096,), eps=1e-05)
)


============================================================  text_encoder_2  =====================================================
CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        )
      )
    )
    (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
)



============================================================  vae  =====================================================
AutoencoderKLCausal3D(
  (encoder): EncoderCausal3D(
    (conv_in): CausalConv3d(
      (conv): Conv3d(3, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1))
    )
    (down_blocks): ModuleList(
      (0): DownEncoderBlockCausal3D(
        (resnets): ModuleList(
          (0-1): 2 x ResnetBlockCausal3D(
            (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
            (conv1): CausalConv3d(
              (conv): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): CausalConv3d(
              (conv): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (nonlinearity): SiLU()
          )
        )
        (downsamplers): ModuleList(
          (0): DownsampleCausal3D(
            (conv): CausalConv3d(
              (conv): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 2, 2))
            )
          )
        )
      )
      (1): DownEncoderBlockCausal3D(
        (resnets): ModuleList(
          (0): ResnetBlockCausal3D(
            (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
            (conv1): CausalConv3d(
              (conv): Conv3d(128, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): CausalConv3d(
              (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (nonlinearity): SiLU()
            (conv_shortcut): CausalConv3d(
              (conv): Conv3d(128, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1))
            )
          )
          (1): ResnetBlockCausal3D(
            (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
            (conv1): CausalConv3d(
              (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): CausalConv3d(
              (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (nonlinearity): SiLU()
          )
        )
        (downsamplers): ModuleList(
          (0): DownsampleCausal3D(
            (conv): CausalConv3d(
              (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(2, 2, 2))
            )
          )
        )
      )
      (2): DownEncoderBlockCausal3D(
        (resnets): ModuleList(
          (0): ResnetBlockCausal3D(
            (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
            (conv1): CausalConv3d(
              (conv): Conv3d(256, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): CausalConv3d(
              (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (nonlinearity): SiLU()
            (conv_shortcut): CausalConv3d(
              (conv): Conv3d(256, 512, kernel_size=(1, 1, 1), stride=(1, 1, 1))
            )
          )
          (1): ResnetBlockCausal3D(
            (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
            (conv1): CausalConv3d(
              (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): CausalConv3d(
              (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (nonlinearity): SiLU()
          )
        )
        (downsamplers): ModuleList(
          (0): DownsampleCausal3D(
            (conv): CausalConv3d(
              (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(2, 2, 2))
            )
          )
        )
      )
      (3): DownEncoderBlockCausal3D(
        (resnets): ModuleList(
          (0-1): 2 x ResnetBlockCausal3D(
            (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
            (conv1): CausalConv3d(
              (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): CausalConv3d(
              (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (nonlinearity): SiLU()
          )
        )
      )
    )
    (mid_block): UNetMidBlockCausal3D(
      (attentions): ModuleList(
        (0): Attention(
          (group_norm): GroupNorm(32, 512, eps=1e-06, affine=True)
          (to_q): Linear(in_features=512, out_features=512, bias=True)
          (to_k): Linear(in_features=512, out_features=512, bias=True)
          (to_v): Linear(in_features=512, out_features=512, bias=True)
          (to_out): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Dropout(p=0.0, inplace=False)
          )
        )
      )
      (resnets): ModuleList(
        (0-1): 2 x ResnetBlockCausal3D(
          (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
          (conv1): CausalConv3d(
            (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
          )
          (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (conv2): CausalConv3d(
            (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
          )
          (nonlinearity): SiLU()
        )
      )
    )
    (conv_norm_out): GroupNorm(32, 512, eps=1e-06, affine=True)
    (conv_act): SiLU()
    (conv_out): CausalConv3d(
      (conv): Conv3d(512, 32, kernel_size=(3, 3, 3), stride=(1, 1, 1))
    )
  )
  (decoder): DecoderCausal3D(
    (conv_in): CausalConv3d(
      (conv): Conv3d(16, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
    )
    (up_blocks): ModuleList(
      (0-1): 2 x UpDecoderBlockCausal3D(
        (resnets): ModuleList(
          (0-2): 3 x ResnetBlockCausal3D(
            (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
            (conv1): CausalConv3d(
              (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): CausalConv3d(
              (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (nonlinearity): SiLU()
          )
        )
        (upsamplers): ModuleList(
          (0): UpsampleCausal3D(
            (conv): CausalConv3d(
              (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
          )
        )
      )
      (2): UpDecoderBlockCausal3D(
        (resnets): ModuleList(
          (0): ResnetBlockCausal3D(
            (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
            (conv1): CausalConv3d(
              (conv): Conv3d(512, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): CausalConv3d(
              (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (nonlinearity): SiLU()
            (conv_shortcut): CausalConv3d(
              (conv): Conv3d(512, 256, kernel_size=(1, 1, 1), stride=(1, 1, 1))
            )
          )
          (1-2): 2 x ResnetBlockCausal3D(
            (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
            (conv1): CausalConv3d(
              (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (norm2): GroupNorm(32, 256, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): CausalConv3d(
              (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (nonlinearity): SiLU()
          )
        )
        (upsamplers): ModuleList(
          (0): UpsampleCausal3D(
            (conv): CausalConv3d(
              (conv): Conv3d(256, 256, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
          )
        )
      )
      (3): UpDecoderBlockCausal3D(
        (resnets): ModuleList(
          (0): ResnetBlockCausal3D(
            (norm1): GroupNorm(32, 256, eps=1e-06, affine=True)
            (conv1): CausalConv3d(
              (conv): Conv3d(256, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): CausalConv3d(
              (conv): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (nonlinearity): SiLU()
            (conv_shortcut): CausalConv3d(
              (conv): Conv3d(256, 128, kernel_size=(1, 1, 1), stride=(1, 1, 1))
            )
          )
          (1-2): 2 x ResnetBlockCausal3D(
            (norm1): GroupNorm(32, 128, eps=1e-06, affine=True)
            (conv1): CausalConv3d(
              (conv): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (norm2): GroupNorm(32, 128, eps=1e-06, affine=True)
            (dropout): Dropout(p=0.0, inplace=False)
            (conv2): CausalConv3d(
              (conv): Conv3d(128, 128, kernel_size=(3, 3, 3), stride=(1, 1, 1))
            )
            (nonlinearity): SiLU()
          )
        )
      )
    )
    (mid_block): UNetMidBlockCausal3D(
      (attentions): ModuleList(
        (0): Attention(
          (group_norm): GroupNorm(32, 512, eps=1e-06, affine=True)
          (to_q): Linear(in_features=512, out_features=512, bias=True)
          (to_k): Linear(in_features=512, out_features=512, bias=True)
          (to_v): Linear(in_features=512, out_features=512, bias=True)
          (to_out): ModuleList(
            (0): Linear(in_features=512, out_features=512, bias=True)
            (1): Dropout(p=0.0, inplace=False)
          )
        )
      )
      (resnets): ModuleList(
        (0-1): 2 x ResnetBlockCausal3D(
          (norm1): GroupNorm(32, 512, eps=1e-06, affine=True)
          (conv1): CausalConv3d(
            (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
          )
          (norm2): GroupNorm(32, 512, eps=1e-06, affine=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (conv2): CausalConv3d(
            (conv): Conv3d(512, 512, kernel_size=(3, 3, 3), stride=(1, 1, 1))
          )
          (nonlinearity): SiLU()
        )
      )
    )
    (conv_norm_out): GroupNorm(32, 128, eps=1e-06, affine=True)
    (conv_act): SiLU()
    (conv_out): CausalConv3d(
      (conv): Conv3d(128, 3, kernel_size=(3, 3, 3), stride=(1, 1, 1))
    )
  )
  (quant_conv): Conv3d(32, 32, kernel_size=(1, 1, 1), stride=(1, 1, 1))
  (post_quant_conv): Conv3d(16, 16, kernel_size=(1, 1, 1), stride=(1, 1, 1))
)