model_base.py 7.45 KB
Newer Older
comfyanonymous's avatar
comfyanonymous committed
1
2
3
4
import torch
from comfy.ldm.modules.diffusionmodules.openaimodel import UNetModel
from comfy.ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugmentation
from comfy.ldm.modules.diffusionmodules.util import make_beta_schedule
5
from comfy.ldm.modules.diffusionmodules.openaimodel import Timestep
comfyanonymous's avatar
comfyanonymous committed
6
7
8
import numpy as np

class BaseModel(torch.nn.Module):
9
    def __init__(self, model_config, v_prediction=False):
comfyanonymous's avatar
comfyanonymous committed
10
11
        super().__init__()

12
13
        unet_config = model_config.unet_config
        self.latent_format = model_config.latent_format
comfyanonymous's avatar
comfyanonymous committed
14
15
16
17
18
19
20
        self.register_schedule(given_betas=None, beta_schedule="linear", timesteps=1000, linear_start=0.00085, linear_end=0.012, cosine_s=8e-3)
        self.diffusion_model = UNetModel(**unet_config)
        self.v_prediction = v_prediction
        if self.v_prediction:
            self.parameterization = "v"
        else:
            self.parameterization = "eps"
21
22
23

        self.adm_channels = unet_config.get("adm_in_channels", None)
        if self.adm_channels is None:
comfyanonymous's avatar
comfyanonymous committed
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
            self.adm_channels = 0
        print("v_prediction", v_prediction)
        print("adm", self.adm_channels)

    def register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
        if given_betas is not None:
            betas = given_betas
        else:
            betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
        alphas = 1. - betas
        alphas_cumprod = np.cumprod(alphas, axis=0)
        alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1])

        timesteps, = betas.shape
        self.num_timesteps = int(timesteps)
        self.linear_start = linear_start
        self.linear_end = linear_end

        self.register_buffer('betas', torch.tensor(betas, dtype=torch.float32))
        self.register_buffer('alphas_cumprod', torch.tensor(alphas_cumprod, dtype=torch.float32))
        self.register_buffer('alphas_cumprod_prev', torch.tensor(alphas_cumprod_prev, dtype=torch.float32))

    def apply_model(self, x, t, c_concat=None, c_crossattn=None, c_adm=None, control=None, transformer_options={}):
        if c_concat is not None:
            xc = torch.cat([x] + c_concat, dim=1)
        else:
            xc = x
        context = torch.cat(c_crossattn, 1)
        return self.diffusion_model(xc, t, context=context, y=c_adm, control=control, transformer_options=transformer_options)

    def get_dtype(self):
        return self.diffusion_model.dtype

    def is_adm(self):
        return self.adm_channels > 0

61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
    def encode_adm(self, **kwargs):
        return None

    def load_model_weights(self, sd, unet_prefix=""):
        to_load = {}
        keys = list(sd.keys())
        for k in keys:
            if k.startswith(unet_prefix):
                to_load[k[len(unet_prefix):]] = sd.pop(k)

        m, u = self.diffusion_model.load_state_dict(to_load, strict=False)
        if len(m) > 0:
            print("unet missing:", m)

        if len(u) > 0:
            print("unet unexpected:", u)
        del to_load
        return self

80
81
82
83
84
85
86
    def process_latent_in(self, latent):
        return self.latent_format.process_in(latent)

    def process_latent_out(self, latent):
        return self.latent_format.process_out(latent)


comfyanonymous's avatar
comfyanonymous committed
87
class SD21UNCLIP(BaseModel):
88
89
    def __init__(self, model_config, noise_aug_config, v_prediction=True):
        super().__init__(model_config, v_prediction)
comfyanonymous's avatar
comfyanonymous committed
90
91
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(**noise_aug_config)

92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
    def encode_adm(self, **kwargs):
        unclip_conditioning = kwargs.get("unclip_conditioning", None)
        device = kwargs["device"]

        if unclip_conditioning is not None:
            adm_inputs = []
            weights = []
            noise_aug = []
            for unclip_cond in unclip_conditioning:
                adm_cond = unclip_cond["clip_vision_output"].image_embeds
                weight = unclip_cond["strength"]
                noise_augment = unclip_cond["noise_augmentation"]
                noise_level = round((self.noise_augmentor.max_noise_level - 1) * noise_augment)
                c_adm, noise_level_emb = self.noise_augmentor(adm_cond.to(device), noise_level=torch.tensor([noise_level], device=device))
                adm_out = torch.cat((c_adm, noise_level_emb), 1) * weight
                weights.append(weight)
                noise_aug.append(noise_augment)
                adm_inputs.append(adm_out)

            if len(noise_aug) > 1:
                adm_out = torch.stack(adm_inputs).sum(0)
                #TODO: add a way to control this
                noise_augment = 0.05
                noise_level = round((self.noise_augmentor.max_noise_level - 1) * noise_augment)
                c_adm, noise_level_emb = self.noise_augmentor(adm_out[:, :self.noise_augmentor.time_embed.dim], noise_level=torch.tensor([noise_level], device=device))
                adm_out = torch.cat((c_adm, noise_level_emb), 1)
        else:
            adm_out = torch.zeros((1, self.adm_channels))

        return adm_out

comfyanonymous's avatar
comfyanonymous committed
123
class SDInpaint(BaseModel):
124
125
    def __init__(self, model_config, v_prediction=False):
        super().__init__(model_config, v_prediction)
comfyanonymous's avatar
comfyanonymous committed
126
        self.concat_keys = ("mask", "masked_image")
127
128

class SDXLRefiner(BaseModel):
129
130
    def __init__(self, model_config, v_prediction=False):
        super().__init__(model_config, v_prediction)
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
        self.embedder = Timestep(256)

    def encode_adm(self, **kwargs):
        clip_pooled = kwargs["pooled_output"]
        width = kwargs.get("width", 768)
        height = kwargs.get("height", 768)
        crop_w = kwargs.get("crop_w", 0)
        crop_h = kwargs.get("crop_h", 0)

        if kwargs.get("prompt_type", "") == "negative":
            aesthetic_score = kwargs.get("aesthetic_score", 2.5)
        else:
            aesthetic_score = kwargs.get("aesthetic_score", 6)

        print(clip_pooled.shape, width, height, crop_w, crop_h, aesthetic_score)
        out = []
        out.append(self.embedder(torch.Tensor([width])))
        out.append(self.embedder(torch.Tensor([height])))
        out.append(self.embedder(torch.Tensor([crop_w])))
        out.append(self.embedder(torch.Tensor([crop_h])))
        out.append(self.embedder(torch.Tensor([aesthetic_score])))
        flat = torch.flatten(torch.cat(out))[None, ]
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)

class SDXL(BaseModel):
156
157
    def __init__(self, model_config, v_prediction=False):
        super().__init__(model_config, v_prediction)
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
        self.embedder = Timestep(256)

    def encode_adm(self, **kwargs):
        clip_pooled = kwargs["pooled_output"]
        width = kwargs.get("width", 768)
        height = kwargs.get("height", 768)
        crop_w = kwargs.get("crop_w", 0)
        crop_h = kwargs.get("crop_h", 0)
        target_width = kwargs.get("target_width", width)
        target_height = kwargs.get("target_height", height)

        print(clip_pooled.shape, width, height, crop_w, crop_h, target_width, target_height)
        out = []
        out.append(self.embedder(torch.Tensor([width])))
        out.append(self.embedder(torch.Tensor([height])))
        out.append(self.embedder(torch.Tensor([crop_w])))
        out.append(self.embedder(torch.Tensor([crop_h])))
        out.append(self.embedder(torch.Tensor([target_width])))
        out.append(self.embedder(torch.Tensor([target_height])))
        flat = torch.flatten(torch.cat(out))[None, ]
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)