wan_causvid_runner.py 6.12 KB
Newer Older
Zhuguanyu Wu's avatar
Zhuguanyu Wu committed
1
import gc
PengGao's avatar
PengGao committed
2
3
import os

4
5
import numpy as np
import torch
PengGao's avatar
PengGao committed
6
import torch.distributed as dist
7
8
import torchvision.transforms.functional as TF
from PIL import Image
PengGao's avatar
PengGao committed
9
10
from loguru import logger

11
12
from lightx2v.models.input_encoders.hf.t5.model import T5EncoderModel
from lightx2v.models.input_encoders.hf.xlm_roberta.model import CLIPModel
Zhuguanyu Wu's avatar
Zhuguanyu Wu committed
13
from lightx2v.models.networks.wan.causvid_model import WanCausVidModel
14
from lightx2v.models.networks.wan.lora_adapter import WanLoraWrapper
PengGao's avatar
PengGao committed
15
16
17
18
19
from lightx2v.models.networks.wan.model import WanModel
from lightx2v.models.runners.default_runner import DefaultRunner
from lightx2v.models.runners.wan.wan_runner import WanRunner
from lightx2v.models.schedulers.wan.scheduler import WanScheduler
from lightx2v.models.schedulers.wan.step_distill.scheduler import WanStepDistillScheduler
20
from lightx2v.models.video_encoders.hf.wan.vae import WanVAE
PengGao's avatar
PengGao committed
21
22
from lightx2v.utils.profiler import ProfilingContext, ProfilingContext4Debug
from lightx2v.utils.registry_factory import RUNNER_REGISTER
23
24


Zhuguanyu Wu's avatar
Zhuguanyu Wu committed
25
26
@RUNNER_REGISTER("wan2.1_causvid")
class WanCausVidRunner(WanRunner):
    """Runner registered as ``wan2.1_causvid``.

    Latents are produced one block of ``num_frame_per_block`` frames at a
    time, over ``num_fragments`` fragments, while the transformer's KV and
    cross-attention caches are addressed through a sliding
    ``[kv_start, kv_end)`` window (see :meth:`run`).
    """

    def __init__(self, config):
        """Initialise the parent runner and cache block/fragment settings.

        Args:
            config: Runner configuration; must expose ``num_frame_per_block``,
                ``num_frames``, ``frame_seq_length``, ``num_blocks`` and
                ``num_fragments``.
        """
        super().__init__(config)
        cfg = self.config
        self.num_frame_per_block = cfg.num_frame_per_block
        self.num_frames = cfg.num_frames
        self.frame_seq_length = cfg.frame_seq_length
        # The config calls it ``num_blocks``; the runner calls it ``infer_blocks``.
        self.infer_blocks = cfg.num_blocks
        self.num_fragments = cfg.num_fragments

    def load_transformer(self):
        """Build and return the transformer model.

        Without ``lora_configs`` a ``WanCausVidModel`` is returned. With a
        non-empty ``lora_configs`` a ``WanModel`` is built and every listed
        LoRA is loaded and applied through ``WanLoraWrapper``; each entry
        needs a ``"path"`` and may carry a ``"strength"`` (default ``1.0``).

        Returns:
            The constructed model.
        """
        lora_configs = self.config.get("lora_configs")
        if not lora_configs:
            return WanCausVidModel(self.config.model_path, self.config, self.init_device)

        # NOTE(review): run() calls transformer_infer._init_kv_cache and
        # model.infer(inputs, kv_start, kv_end); confirm that WanModel (rather
        # than WanCausVidModel) supports these on the LoRA path.
        model = WanModel(
            self.config.model_path,
            self.config,
            self.init_device,
        )
        lora_wrapper = WanLoraWrapper(model)
        for lora_config in lora_configs:
            lora_path = lora_config["path"]
            strength = lora_config.get("strength", 1.0)
            lora_name = lora_wrapper.load_lora(lora_path)
            lora_wrapper.apply_lora(lora_name, strength)
            logger.info(f"Loaded LoRA: {lora_name} with strength: {strength}")
        return model

    def set_inputs(self, inputs):
        """Forward ``inputs`` to the parent and record the fragment count.

        ``inputs["num_fragments"]`` (default ``1``) is written to the config
        and then mirrored onto ``self.num_fragments``.
        """
        super().set_inputs(inputs)
        self.config["num_fragments"] = inputs.get("num_fragments", 1)
        self.num_fragments = self.config["num_fragments"]

    def init_scheduler(self):
        """Create a ``WanStepDistillScheduler`` from the config and attach it to the model."""
        self.model.set_scheduler(WanStepDistillScheduler(self.config))

    def set_target_shape(self):
        """Set ``config.target_shape`` to the latent shape of a single block.

        * ``i2v``: ``(16, num_frame_per_block, lat_h, lat_w)``; additionally
          ``frame_seq_length`` is recomputed as ``(lat_h // 2) * (lat_w // 2)``
          and stored on both the runner and ``model.transformer_infer``.
        * ``t2v``: ``(16, num_frame_per_block, target_height // vae_stride[1],
          target_width // vae_stride[2])``.

        Any other task leaves the config untouched.
        """
        cfg = self.config
        if cfg.task == "i2v":
            cfg.target_shape = (16, cfg.num_frame_per_block, cfg.lat_h, cfg.lat_w)
            # i2v must reset frame_seq_length according to the input shape.
            frame_seq_length = (cfg.lat_h // 2) * (cfg.lat_w // 2)
            self.model.transformer_infer.frame_seq_length = frame_seq_length
            self.frame_seq_length = frame_seq_length
        elif cfg.task == "t2v":
            cfg.target_shape = (
                16,
                cfg.num_frame_per_block,
                int(cfg.target_height) // cfg.vae_stride[1],
                int(cfg.target_width) // cfg.vae_stride[2],
            )

    def run(self):
        """Generate latents for all fragments, block by block.

        Both transformer caches are initialised (bfloat16, on ``"cuda"``) and
        an output buffer of
        ``num_frames + (num_fragments - 1) * (num_frames - num_frame_per_block)``
        frames is allocated. For every block the scheduler is reset and all
        ``infer_steps`` steps are run against the current cache window; the
        window then advances by one block and the scheduler's latents are
        copied into the next ``num_frame_per_block`` output frames.

        Every fragment after the first starts with one extra inference pass
        on the scheduler's ``last_sample`` at the final step index (logged as
        recomputing the kv_cache) and then generates one block fewer.

        Returns:
            tuple: ``(output_latents, scheduler.generator)``.
        """
        transformer_infer = self.model.transformer_infer
        transformer_infer._init_kv_cache(dtype=torch.bfloat16, device="cuda")
        transformer_infer._init_crossattn_cache(dtype=torch.bfloat16, device="cuda")

        scheduler = self.model.scheduler
        frames_per_block = self.num_frame_per_block
        # Amount by which the cache window advances per block.
        block_tokens = frames_per_block * self.frame_seq_length

        latent_shape = self.model.config.target_shape
        total_frames = self.num_frames + (self.num_fragments - 1) * (self.num_frames - frames_per_block)
        output_latents = torch.zeros(
            (latent_shape[0], total_frames, *latent_shape[2:]),
            device="cuda",
            dtype=torch.bfloat16,
        )

        # Index of the first output frame the next finished block is written to.
        frame_start = 0

        for fragment_idx in range(self.num_fragments):
            logger.info(f"========> fragment_idx: {fragment_idx + 1} / {self.num_fragments}")

            kv_start = 0
            kv_end = block_tokens
            infer_blocks = self.infer_blocks

            if fragment_idx > 0:
                logger.info("recompute the kv_cache ...")

                with ProfilingContext4Debug("step_pre"):
                    scheduler.latents = scheduler.last_sample
                    scheduler.step_pre(step_index=scheduler.infer_steps - 1)

                with ProfilingContext4Debug("infer"):
                    self.model.infer(self.inputs, kv_start, kv_end)

                kv_start += block_tokens
                kv_end += block_tokens
                # The first window was consumed by the pass above, so this
                # fragment generates one block fewer.
                infer_blocks -= 1

            for block_idx in range(infer_blocks):
                logger.info(f"=====> block_idx: {block_idx + 1} / {infer_blocks}")
                logger.info(f"=====> kv_start: {kv_start}, kv_end: {kv_end}")
                scheduler.reset()

                for step_index in range(scheduler.infer_steps):
                    logger.info(f"==> step_index: {step_index + 1} / {scheduler.infer_steps}")

                    with ProfilingContext4Debug("step_pre"):
                        scheduler.step_pre(step_index=step_index)

                    with ProfilingContext4Debug("infer"):
                        self.model.infer(self.inputs, kv_start, kv_end)

                    with ProfilingContext4Debug("step_post"):
                        scheduler.step_post()

                kv_start += block_tokens
                kv_end += block_tokens

                frame_end = frame_start + frames_per_block
                output_latents[:, frame_start:frame_end] = scheduler.latents
                frame_start = frame_end

        return output_latents, scheduler.generator

    def end_run(self):
        """Release per-run state.

        Clears the scheduler, drops the references to the inputs, the
        scheduler and both transformer caches, then runs the garbage
        collector and empties the CUDA cache.
        """
        self.model.scheduler.clear()
        del self.inputs, self.model.scheduler, self.model.transformer_infer.kv_cache, self.model.transformer_infer.crossattn_cache
        gc.collect()
        torch.cuda.empty_cache()