wan_runner.py 7.45 KB
Newer Older
helloyongyang's avatar
helloyongyang committed
1
2
3
4
5
6
7
8
import os
import numpy as np
import torch
import torchvision.transforms.functional as TF
from PIL import Image
from lightx2v.utils.registry_factory import RUNNER_REGISTER
from lightx2v.models.runners.default_runner import DefaultRunner
from lightx2v.models.schedulers.wan.scheduler import WanScheduler
9
10
11
from lightx2v.models.schedulers.wan.feature_caching.scheduler import (
    WanSchedulerTeaCaching,
)
helloyongyang's avatar
helloyongyang committed
12
13
14
15
16
17
from lightx2v.utils.profiler import ProfilingContext
from lightx2v.models.input_encoders.hf.t5.model import T5EncoderModel
from lightx2v.models.input_encoders.hf.xlm_roberta.model import CLIPModel
from lightx2v.models.networks.wan.model import WanModel
from lightx2v.models.networks.wan.lora_adapter import WanLoraWrapper
from lightx2v.models.video_encoders.hf.wan.vae import WanVAE
18
from lightx2v.models.video_encoders.hf.wan.vae_tiny import WanVAE_tiny
19
from lightx2v.utils.utils import cache_video
root's avatar
root committed
20
from loguru import logger
helloyongyang's avatar
helloyongyang committed
21
22
23
24
25
26
27


@RUNNER_REGISTER("wan2.1")
class WanRunner(DefaultRunner):
    def __init__(self, config):
        """Initialize the runner by forwarding ``config`` to the parent class.

        Args:
            config: Runner configuration object, passed unchanged to
                ``DefaultRunner.__init__``.
        """
        super().__init__(config)

28
    def load_transformer(self, init_device):
        """Create the Wan transformer and apply a LoRA when one is configured.

        Args:
            init_device: Device handed to ``WanModel`` at construction time.

        Returns:
            The ``WanModel`` instance; if ``config.lora_path`` is truthy the
            LoRA found there has been loaded and applied with
            ``config.strength_model`` before returning.
        """
        cfg = self.config
        transformer = WanModel(cfg.model_path, cfg, init_device)

        lora_path = cfg.lora_path
        if not lora_path:
            # No LoRA requested: hand back the plain model.
            return transformer

        wrapper = WanLoraWrapper(transformer)
        loaded_name = wrapper.load_lora(lora_path)
        wrapper.apply_lora(loaded_name, cfg.strength_model)
        logger.info(f"Loaded LoRA: {loaded_name}")
        return transformer

37
    def load_image_encoder(self, init_device):
        """Build the CLIP image encoder for image-to-video runs.

        Args:
            init_device: Device handed to ``CLIPModel`` at construction time.

        Returns:
            A ``CLIPModel`` (float16) when ``config.task`` is ``"i2v"``,
            otherwise ``None``.
        """
        if self.config.task != "i2v":
            # Only the i2v task consumes an image encoder.
            return None

        model_dir = self.config.model_path
        clip_checkpoint = os.path.join(
            model_dir,
            "models_clip_open-clip-xlm-roberta-large-vit-huge-14.pth",
        )
        clip_tokenizer = os.path.join(model_dir, "xlm-roberta-large")
        return CLIPModel(
            dtype=torch.float16,
            device=init_device,
            checkpoint_path=clip_checkpoint,
            tokenizer_path=clip_tokenizer,
        )
helloyongyang's avatar
helloyongyang committed
50

51
    def load_text_encoder(self, init_device):
        """Build the T5 text encoder used for prompt conditioning.

        Args:
            init_device: Device handed to ``T5EncoderModel`` at construction
                time.

        Returns:
            list: A single-element list holding the ``T5EncoderModel``.
        """
        cfg = self.config
        t5_checkpoint = os.path.join(cfg.model_path, "models_t5_umt5-xxl-enc-bf16.pth")
        t5_tokenizer = os.path.join(cfg.model_path, "google/umt5-xxl")

        t5_encoder = T5EncoderModel(
            text_len=cfg["text_len"],
            dtype=torch.bfloat16,
            device=init_device,
            checkpoint_path=t5_checkpoint,
            tokenizer_path=t5_tokenizer,
            shard_fn=None,
            cpu_offload=cfg.cpu_offload,
            # Offload granularity falls back to "model" when not configured.
            offload_granularity=cfg.get("text_encoder_offload_granularity", "model"),
        )
        return [t5_encoder]
helloyongyang's avatar
helloyongyang committed
64

65
66
67
68
69
70
71
72
73
74
75
    def load_vae(self, init_device):
        """Build the VAE encoder/decoder pair.

        With ``config.tiny_vae`` enabled the decoder is a ``WanVAE_tiny``
        (moved to ``"cuda"``) and a separate full ``WanVAE`` is created as the
        encoder for i2v. Otherwise one full ``WanVAE`` serves as decoder and,
        for i2v, as encoder too.

        Args:
            init_device: Device handed to the VAE constructors.

        Returns:
            tuple: ``(vae_encoder, vae_decoder)``; ``vae_encoder`` is ``None``
            unless ``config.task`` is ``"i2v"``.
        """
        cfg = self.config
        full_vae_kwargs = dict(
            vae_pth=os.path.join(cfg.model_path, "Wan2.1_VAE.pth"),
            device=init_device,
            parallel=cfg.parallel_vae,
            use_tiling=cfg.get("use_tiling_vae", False),
        )
        tiny_decoder_requested = cfg.get("tiny_vae", False)
        needs_encoder = cfg.task == "i2v"

        if not tiny_decoder_requested:
            # A single full VAE doubles as encoder (i2v only) and decoder.
            shared_vae = WanVAE(**full_vae_kwargs)
            return (shared_vae if needs_encoder else None), shared_vae

        tiny_decoder = WanVAE_tiny(
            vae_pth=cfg.tiny_vae_path,
            device=init_device,
        ).to("cuda")
        full_encoder = WanVAE(**full_vae_kwargs) if needs_encoder else None
        return full_encoder, tiny_decoder
helloyongyang's avatar
helloyongyang committed
85
86
87
88
89
90
91
92
93
94

    def init_scheduler(self):
        """Create the scheduler selected by ``config.feature_caching``.

        ``"NoCaching"`` selects ``WanScheduler`` and ``"Tea"`` selects
        ``WanSchedulerTeaCaching``; the instance is attached to ``self.model``
        through ``set_scheduler``.

        Raises:
            NotImplementedError: If ``config.feature_caching`` is any other
                value.
        """
        caching_mode = self.config.feature_caching
        if caching_mode not in ("NoCaching", "Tea"):
            raise NotImplementedError(f"Unsupported feature_caching type: {self.config.feature_caching}")

        scheduler_cls = WanScheduler if caching_mode == "NoCaching" else WanSchedulerTeaCaching
        self.model.set_scheduler(scheduler_cls(self.config))

95
    def run_text_encoder(self, text, img):
        """Encode the prompt and the negative prompt with the first text encoder.

        Args:
            text: Positive prompt.
            img: Unused by this implementation.

        Returns:
            dict: ``{"context": ..., "context_null": ...}`` holding the encoder
            output for ``text`` and for ``config["negative_prompt"]``
            respectively; a missing or falsy negative prompt is encoded as
            the empty string.
        """
        negative_prompt = self.config.get("negative_prompt", "") or ""
        encoder = self.text_encoders[0]
        return {
            "context": encoder.infer([text]),
            "context_null": encoder.infer([negative_prompt]),
        }

104
105
106
107
108
109
110
    def run_image_encoder(self, img):
        """Run the CLIP image encoder on the conditioning image.

        Args:
            img: Image accepted by ``torchvision``'s ``to_tensor``.

        Returns:
            The result of ``self.image_encoder.visual`` with its leading
            dimension squeezed, cast to bfloat16.
        """
        # Rescale in place with (x - 0.5) / 0.5 before moving to the GPU.
        normalized = TF.to_tensor(img).sub_(0.5).div_(0.5)
        # Insert a singleton axis after the first (channel) axis.
        frame = normalized.cuda()[:, None, :, :]
        clip_features = self.image_encoder.visual([frame], self.config)
        return clip_features.squeeze(0).to(torch.bfloat16)

    def run_vae_encoder(self, img):
        """Encode the conditioning image into the VAE latent used for i2v.

        The image is rescaled with ``(x - 0.5) / 0.5``, resized (bicubic) to a
        resolution derived from the configured target area and the image's
        aspect ratio, padded with zero frames up to
        ``config.target_video_length`` and passed through
        ``self.vae_encoder.encode``. A first-frame mask is concatenated in
        front of the encoded latent.

        Side effects:
            Writes ``lat_h`` and ``lat_w`` onto ``self.config``.

        Args:
            img: Image accepted by ``torchvision``'s ``to_tensor``.
                NOTE(review): the zero padding hard-codes 3 channels, so an
                RGB image is presumably expected -- confirm with callers.

        Returns:
            tuple: ``(vae_encode_out, kwargs)`` where ``vae_encode_out`` is the
            mask concatenated with the encoder output, cast to bfloat16, and
            ``kwargs`` is ``{"lat_h": lat_h, "lat_w": lat_w}``.
        """
        cfg = self.config

        # Stay on the CPU here: only the shape and the CPU-side bicubic resize
        # below use this tensor, so copying it to the GPU first (and back with
        # .cpu() later) was a redundant host->device->host round trip.
        img = TF.to_tensor(img).sub_(0.5).div_(0.5)
        src_h, src_w = img.shape[1:]
        aspect_ratio = src_h / src_w

        stride_h, stride_w = cfg.vae_stride[1], cfg.vae_stride[2]
        patch_h, patch_w = cfg.patch_size[1], cfg.patch_size[2]
        max_area = cfg.target_height * cfg.target_width
        # Latent size: preserve the source aspect ratio within the target
        # area, floored to a whole number of patches.
        lat_h = round(np.sqrt(max_area * aspect_ratio) // stride_h // patch_h * patch_h)
        lat_w = round(np.sqrt(max_area / aspect_ratio) // stride_w // patch_w * patch_w)
        # Pixel resolution that maps exactly onto the latent grid.
        h = lat_h * stride_h
        w = lat_w * stride_w

        cfg.lat_h = lat_h
        cfg.lat_w = lat_w
        kwargs = {"lat_h": lat_h, "lat_w": lat_w}

        num_frames = cfg.target_video_length

        # Mask is 1 for the first (conditioning) frame and 0 for every other.
        msk = torch.ones(1, num_frames, lat_h, lat_w, device=torch.device("cuda"))
        msk[:, 1:] = 0
        # The first frame's entry is repeated 4 times so the frame axis can be
        # folded into groups of 4 (presumably the VAE temporal stride --
        # TODO confirm).
        msk = torch.concat([torch.repeat_interleave(msk[:, 0:1], repeats=4, dim=1), msk[:, 1:]], dim=1)
        msk = msk.view(1, msk.shape[1] // 4, 4, lat_h, lat_w)
        msk = msk.transpose(1, 2)[0]

        # Conditioning clip: the resized image as frame 0, zeros afterwards.
        first_frame = torch.nn.functional.interpolate(img[None], size=(h, w), mode="bicubic").transpose(0, 1)
        zero_frames = torch.zeros(3, num_frames - 1, h, w)
        video = torch.concat([first_frame, zero_frames], dim=1).cuda()

        vae_encode_out = self.vae_encoder.encode([video], cfg)[0]
        vae_encode_out = torch.concat([msk, vae_encode_out]).to(torch.bfloat16)
        return vae_encode_out, kwargs

    def get_encoder_output_i2v(self, clip_encoder_out, vae_encode_out, text_encoder_output, img):
        """Bundle the i2v encoder results into one nested dictionary.

        Args:
            clip_encoder_out: Stored under ``image_encoder_output["clip_encoder_out"]``.
            vae_encode_out: Stored under ``image_encoder_output["vae_encode_out"]``.
            text_encoder_output: Stored under ``"text_encoder_output"``.
            img: Unused by this implementation.

        Returns:
            dict: ``{"text_encoder_output": ..., "image_encoder_output": {...}}``.
        """
        return {
            "text_encoder_output": text_encoder_output,
            "image_encoder_output": {
                "clip_encoder_out": clip_encoder_out,
                "vae_encode_out": vae_encode_out,
            },
        }
helloyongyang's avatar
helloyongyang committed
146
147

    def set_target_shape(self):
        """Compute the latent target shape and store it on ``self.config``.

        The shape is ``(channels, latent_frames, latent_h, latent_w)`` where
        ``channels`` is ``config["num_channels_latents"]`` (default 16) and
        ``latent_frames`` is
        ``(target_video_length - 1) // vae_stride[0] + 1``.

        * ``"i2v"``: the spatial size is read from ``config.lat_h`` /
          ``config.lat_w`` (written by ``run_vae_encoder`` in this class) and
          also echoed in the returned dict.
        * ``"t2v"``: the spatial size is ``target_height // vae_stride[1]`` by
          ``target_width // vae_stride[2]``.

        Returns:
            dict: Always contains ``"target_shape"``; for i2v it additionally
            contains ``"lat_h"`` and ``"lat_w"``.

        Raises:
            NotImplementedError: If ``config.task`` is neither ``"i2v"`` nor
                ``"t2v"``. Previously such a task fell through and either
                failed on a missing ``target_shape`` attribute or silently
                returned a stale shape.
        """
        cfg = self.config
        task = cfg.task
        if task not in ("i2v", "t2v"):
            raise NotImplementedError(f"Unsupported task: {task}")

        latent_channels = cfg.get("num_channels_latents", 16)
        latent_frames = (cfg.target_video_length - 1) // cfg.vae_stride[0] + 1

        ret = {}
        if task == "i2v":
            latent_h, latent_w = cfg.lat_h, cfg.lat_w
            ret["lat_h"] = latent_h
            ret["lat_w"] = latent_w
        else:
            latent_h = int(cfg.target_height) // cfg.vae_stride[1]
            latent_w = int(cfg.target_width) // cfg.vae_stride[2]

        cfg.target_shape = (latent_channels, latent_frames, latent_h, latent_w)
        # Read back from the config so the returned value is exactly what the
        # config object stored.
        ret["target_shape"] = cfg.target_shape
        return ret

    def save_video_func(self, images):
        """Write the generated frames to ``config.save_video_path``.

        Args:
            images: Tensor forwarded to ``cache_video``; it is normalised from
                the value range ``(-1, 1)``.
        """
        cfg = self.config
        cache_video(
            tensor=images,
            save_file=cfg.save_video_path,
            # Frame rate falls back to 16 when not configured.
            fps=cfg.get("fps", 16),
            nrow=1,
            normalize=True,
            value_range=(-1, 1),
        )