"benchmark/vscode:/vscode.git/clone" did not exist on "a0592c059f14e9aaa73432497fa8b9f01399cfc7"
model_base.py 27.3 KB
Newer Older
comfyanonymous's avatar
comfyanonymous committed
1
import torch
2
import logging
3
from comfy.ldm.modules.diffusionmodules.openaimodel import UNetModel, Timestep
comfyanonymous's avatar
comfyanonymous committed
4
from comfy.ldm.cascade.stage_c import StageC
comfyanonymous's avatar
comfyanonymous committed
5
from comfy.ldm.cascade.stage_b import StageB
comfyanonymous's avatar
comfyanonymous committed
6
from comfy.ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugmentation
7
from comfy.ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation
comfyanonymous's avatar
comfyanonymous committed
8
from comfy.ldm.modules.diffusionmodules.mmdit import OpenAISignatureMMDITWrapper
9
10
import comfy.ldm.audio.dit
import comfy.ldm.audio.embedders
comfyanonymous's avatar
comfyanonymous committed
11
import comfy.model_management
12
import comfy.conds
13
import comfy.ops
14
from enum import Enum
15
from . import utils
comfyanonymous's avatar
comfyanonymous committed
16
import comfy.latent_formats
17
import math
comfyanonymous's avatar
comfyanonymous committed
18

19
20
21
class ModelType(Enum):
    EPS = 1
    V_PREDICTION = 2
comfyanonymous's avatar
comfyanonymous committed
22
    V_PREDICTION_EDM = 3
comfyanonymous's avatar
comfyanonymous committed
23
    STABLE_CASCADE = 4
24
    EDM = 5
comfyanonymous's avatar
comfyanonymous committed
25
    FLOW = 6
26
    V_PREDICTION_CONTINUOUS = 7
27

comfyanonymous's avatar
comfyanonymous committed
28

29
from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling, ModelSamplingContinuousV
comfyanonymous's avatar
comfyanonymous committed
30

31

comfyanonymous's avatar
comfyanonymous committed
32
def model_sampling(model_config, model_type):
comfyanonymous's avatar
comfyanonymous committed
33
34
    s = ModelSamplingDiscrete

comfyanonymous's avatar
comfyanonymous committed
35
36
37
38
    if model_type == ModelType.EPS:
        c = EPS
    elif model_type == ModelType.V_PREDICTION:
        c = V_PREDICTION
comfyanonymous's avatar
comfyanonymous committed
39
40
41
    elif model_type == ModelType.V_PREDICTION_EDM:
        c = V_PREDICTION
        s = ModelSamplingContinuousEDM
comfyanonymous's avatar
comfyanonymous committed
42
43
44
    elif model_type == ModelType.FLOW:
        c = comfy.model_sampling.CONST
        s = comfy.model_sampling.ModelSamplingDiscreteFlow
comfyanonymous's avatar
comfyanonymous committed
45
46
47
    elif model_type == ModelType.STABLE_CASCADE:
        c = EPS
        s = StableCascadeSampling
48
49
50
    elif model_type == ModelType.EDM:
        c = EDM
        s = ModelSamplingContinuousEDM
51
52
53
    elif model_type == ModelType.V_PREDICTION_CONTINUOUS:
        c = V_PREDICTION
        s = ModelSamplingContinuousV
comfyanonymous's avatar
comfyanonymous committed
54
55
56
57
58
59
60

    class ModelSampling(s, c):
        pass

    return ModelSampling(model_config)


comfyanonymous's avatar
comfyanonymous committed
61
class BaseModel(torch.nn.Module):
comfyanonymous's avatar
comfyanonymous committed
62
    def __init__(self, model_config, model_type=ModelType.EPS, device=None, unet_model=UNetModel):
comfyanonymous's avatar
comfyanonymous committed
63
64
        super().__init__()

65
66
        unet_config = model_config.unet_config
        self.latent_format = model_config.latent_format
67
        self.model_config = model_config
68
        self.manual_cast_dtype = model_config.manual_cast_dtype
comfyanonymous's avatar
comfyanonymous committed
69

70
        if not unet_config.get("disable_unet_model_creation", False):
71
72
73
            if self.manual_cast_dtype is not None:
                operations = comfy.ops.manual_cast
            else:
comfyanonymous's avatar
comfyanonymous committed
74
                operations = comfy.ops.disable_weight_init
comfyanonymous's avatar
comfyanonymous committed
75
            self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
76
77
78
            if comfy.model_management.force_channels_last():
                self.diffusion_model.to(memory_format=torch.channels_last)
                logging.debug("using channels last mode for diffusion model")
79
        self.model_type = model_type
comfyanonymous's avatar
comfyanonymous committed
80
81
        self.model_sampling = model_sampling(model_config, model_type)

82
83
        self.adm_channels = unet_config.get("adm_in_channels", None)
        if self.adm_channels is None:
comfyanonymous's avatar
comfyanonymous committed
84
            self.adm_channels = 0
85
86

        self.concat_keys = ()
comfyanonymous's avatar
comfyanonymous committed
87
88
        logging.info("model_type {}".format(model_type.name))
        logging.debug("adm {}".format(self.adm_channels))
comfyanonymous's avatar
comfyanonymous committed
89

90
    def apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
comfyanonymous's avatar
comfyanonymous committed
91
92
        sigma = t
        xc = self.model_sampling.calculate_input(sigma, x)
comfyanonymous's avatar
comfyanonymous committed
93
        if c_concat is not None:
comfyanonymous's avatar
comfyanonymous committed
94
95
            xc = torch.cat([xc] + [c_concat], dim=1)

96
        context = c_crossattn
97
        dtype = self.get_dtype()
98

99
100
        if self.manual_cast_dtype is not None:
            dtype = self.manual_cast_dtype
101

102
        xc = xc.to(dtype)
103
        t = self.model_sampling.timestep(t).float()
104
        context = context.to(dtype)
105
106
        extra_conds = {}
        for o in kwargs:
107
            extra = kwargs[o]
108
109
110
            if hasattr(extra, "dtype"):
                if extra.dtype != torch.int and extra.dtype != torch.long:
                    extra = extra.to(dtype)
111
            extra_conds[o] = extra
112

113
        model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float()
comfyanonymous's avatar
comfyanonymous committed
114
        return self.model_sampling.calculate_denoised(sigma, model_output, x)
comfyanonymous's avatar
comfyanonymous committed
115
116
117
118
119
120
121

    def get_dtype(self):
        return self.diffusion_model.dtype

    def is_adm(self):
        return self.adm_channels > 0

122
123
124
    def encode_adm(self, **kwargs):
        return None

125
126
    def extra_conds(self, **kwargs):
        out = {}
127
        if len(self.concat_keys) > 0:
128
            cond_concat = []
129
130
131
132
133
134
135
            denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
            concat_latent_image = kwargs.get("concat_latent_image", None)
            if concat_latent_image is None:
                concat_latent_image = kwargs.get("latent_image", None)
            else:
                concat_latent_image = self.process_latent_in(concat_latent_image)

136
            noise = kwargs.get("noise", None)
137
            device = kwargs["device"]
138

139
140
141
142
143
            if concat_latent_image.shape[1:] != noise.shape[1:]:
                concat_latent_image = utils.common_upscale(concat_latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")

            concat_latent_image = utils.resize_to_batch_size(concat_latent_image, noise.shape[0])

144
145
146
            if denoise_mask is not None:
                if len(denoise_mask.shape) == len(noise.shape):
                    denoise_mask = denoise_mask[:,:1]
147

148
149
150
151
                denoise_mask = denoise_mask.reshape((-1, 1, denoise_mask.shape[-2], denoise_mask.shape[-1]))
                if denoise_mask.shape[-2:] != noise.shape[-2:]:
                    denoise_mask = utils.common_upscale(denoise_mask, noise.shape[-1], noise.shape[-2], "bilinear", "center")
                denoise_mask = utils.resize_to_batch_size(denoise_mask.round(), noise.shape[0])
152

153
            for ck in self.concat_keys:
154
155
                if denoise_mask is not None:
                    if ck == "mask":
156
                        cond_concat.append(denoise_mask.to(device))
157
                    elif ck == "masked_image":
158
                        cond_concat.append(concat_latent_image.to(device)) #NOTE: the latent_image should be masked by the mask in pixel space
159
160
161
162
                else:
                    if ck == "mask":
                        cond_concat.append(torch.ones_like(noise)[:,:1])
                    elif ck == "masked_image":
163
                        cond_concat.append(self.blank_inpaint_image_like(noise))
164
165
            data = torch.cat(cond_concat, dim=1)
            out['c_concat'] = comfy.conds.CONDNoiseShape(data)
166

167
168
        adm = self.encode_adm(**kwargs)
        if adm is not None:
169
            out['y'] = comfy.conds.CONDRegular(adm)
170
171
172
173
174

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)

175
176
177
178
        cross_attn_cnet = kwargs.get("cross_attn_controlnet", None)
        if cross_attn_cnet is not None:
            out['crossattn_controlnet'] = comfy.conds.CONDCrossAttn(cross_attn_cnet)

179
180
        c_concat = kwargs.get("noise_concat", None)
        if c_concat is not None:
comfyanonymous's avatar
comfyanonymous committed
181
            out['c_concat'] = comfy.conds.CONDNoiseShape(c_concat)
182

183
        return out
184

185
186
187
188
189
190
191
    def load_model_weights(self, sd, unet_prefix=""):
        to_load = {}
        keys = list(sd.keys())
        for k in keys:
            if k.startswith(unet_prefix):
                to_load[k[len(unet_prefix):]] = sd.pop(k)

192
        to_load = self.model_config.process_unet_state_dict(to_load)
193
194
        m, u = self.diffusion_model.load_state_dict(to_load, strict=False)
        if len(m) > 0:
195
            logging.warning("unet missing: {}".format(m))
196
197

        if len(u) > 0:
198
            logging.warning("unet unexpected: {}".format(u))
199
200
201
        del to_load
        return self

202
203
204
205
206
207
    def process_latent_in(self, latent):
        return self.latent_format.process_in(latent)

    def process_latent_out(self, latent):
        return self.latent_format.process_out(latent)

208
209
210
211
212
213
214
215
216
    def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
        extra_sds = []
        if clip_state_dict is not None:
            extra_sds.append(self.model_config.process_clip_state_dict_for_saving(clip_state_dict))
        if vae_state_dict is not None:
            extra_sds.append(self.model_config.process_vae_state_dict_for_saving(vae_state_dict))
        if clip_vision_state_dict is not None:
            extra_sds.append(self.model_config.process_clip_vision_state_dict_for_saving(clip_vision_state_dict))

217
        unet_state_dict = self.diffusion_model.state_dict()
218
        unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict)
219

220
221
222
        if self.model_type == ModelType.V_PREDICTION:
            unet_state_dict["v_pred"] = torch.tensor([])

223
224
225
226
        for sd in extra_sds:
            unet_state_dict.update(sd)

        return unet_state_dict
227

comfyanonymous's avatar
comfyanonymous committed
228
    def set_inpaint(self):
229
230
231
232
233
234
235
236
237
238
        self.concat_keys = ("mask", "masked_image")
        def blank_inpaint_image_like(latent_image):
            blank_image = torch.ones_like(latent_image)
            # these are the values for "zero" in pixel space translated to latent space
            blank_image[:,0] *= 0.8223
            blank_image[:,1] *= -0.6876
            blank_image[:,2] *= 0.6364
            blank_image[:,3] *= 0.1380
            return blank_image
        self.blank_inpaint_image_like = blank_inpaint_image_like
comfyanonymous's avatar
comfyanonymous committed
239

240
241
    def memory_required(self, input_shape):
        if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
242
243
244
            dtype = self.get_dtype()
            if self.manual_cast_dtype is not None:
                dtype = self.manual_cast_dtype
245
            #TODO: this needs to be tweaked
246
            area = input_shape[0] * math.prod(input_shape[2:])
247
            return (area * comfy.model_management.dtype_size(dtype) / 50) * (1024 * 1024)
248
249
        else:
            #TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory.
250
            area = input_shape[0] * math.prod(input_shape[2:])
251
252
253
            return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)


254
def unclip_adm(unclip_conditioning, device, noise_augmentor, noise_augment_merge=0.0, seed=None):
comfyanonymous's avatar
comfyanonymous committed
255
256
257
258
259
260
261
262
    adm_inputs = []
    weights = []
    noise_aug = []
    for unclip_cond in unclip_conditioning:
        for adm_cond in unclip_cond["clip_vision_output"].image_embeds:
            weight = unclip_cond["strength"]
            noise_augment = unclip_cond["noise_augmentation"]
            noise_level = round((noise_augmentor.max_noise_level - 1) * noise_augment)
263
            c_adm, noise_level_emb = noise_augmentor(adm_cond.to(device), noise_level=torch.tensor([noise_level], device=device), seed=seed)
comfyanonymous's avatar
comfyanonymous committed
264
265
266
267
268
269
270
271
272
273
274
275
276
            adm_out = torch.cat((c_adm, noise_level_emb), 1) * weight
            weights.append(weight)
            noise_aug.append(noise_augment)
            adm_inputs.append(adm_out)

    if len(noise_aug) > 1:
        adm_out = torch.stack(adm_inputs).sum(0)
        noise_augment = noise_augment_merge
        noise_level = round((noise_augmentor.max_noise_level - 1) * noise_augment)
        c_adm, noise_level_emb = noise_augmentor(adm_out[:, :noise_augmentor.time_embed.dim], noise_level=torch.tensor([noise_level], device=device))
        adm_out = torch.cat((c_adm, noise_level_emb), 1)

    return adm_out
277

comfyanonymous's avatar
comfyanonymous committed
278
class SD21UNCLIP(BaseModel):
279
280
    def __init__(self, model_config, noise_aug_config, model_type=ModelType.V_PREDICTION, device=None):
        super().__init__(model_config, model_type, device=device)
comfyanonymous's avatar
comfyanonymous committed
281
282
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(**noise_aug_config)

283
284
285
    def encode_adm(self, **kwargs):
        unclip_conditioning = kwargs.get("unclip_conditioning", None)
        device = kwargs["device"]
comfyanonymous's avatar
comfyanonymous committed
286
287
        if unclip_conditioning is None:
            return torch.zeros((1, self.adm_channels))
288
        else:
289
            return unclip_adm(unclip_conditioning, device, self.noise_augmentor, kwargs.get("unclip_noise_augment_merge", 0.05), kwargs.get("seed", 0) - 10)
290

291
292
def sdxl_pooled(args, noise_augmentor):
    if "unclip_conditioning" in args:
293
        return unclip_adm(args.get("unclip_conditioning", None), args["device"], noise_augmentor, seed=args.get("seed", 0) - 10)[:,:1280]
294
295
296
    else:
        return args["pooled_output"]

297
class SDXLRefiner(BaseModel):
298
299
    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        super().__init__(model_config, model_type, device=device)
300
        self.embedder = Timestep(256)
301
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(**{"noise_schedule_config": {"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"}, "timestep_dim": 1280})
302
303

    def encode_adm(self, **kwargs):
304
        clip_pooled = sdxl_pooled(kwargs, self.noise_augmentor)
305
306
307
308
309
310
311
312
313
314
315
316
        width = kwargs.get("width", 768)
        height = kwargs.get("height", 768)
        crop_w = kwargs.get("crop_w", 0)
        crop_h = kwargs.get("crop_h", 0)

        if kwargs.get("prompt_type", "") == "negative":
            aesthetic_score = kwargs.get("aesthetic_score", 2.5)
        else:
            aesthetic_score = kwargs.get("aesthetic_score", 6)

        out = []
        out.append(self.embedder(torch.Tensor([height])))
comfyanonymous's avatar
comfyanonymous committed
317
        out.append(self.embedder(torch.Tensor([width])))
318
        out.append(self.embedder(torch.Tensor([crop_h])))
comfyanonymous's avatar
comfyanonymous committed
319
        out.append(self.embedder(torch.Tensor([crop_w])))
320
        out.append(self.embedder(torch.Tensor([aesthetic_score])))
321
        flat = torch.flatten(torch.cat(out)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
322
323
324
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)

class SDXL(BaseModel):
325
326
    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        super().__init__(model_config, model_type, device=device)
327
        self.embedder = Timestep(256)
328
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(**{"noise_schedule_config": {"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"}, "timestep_dim": 1280})
329
330

    def encode_adm(self, **kwargs):
331
        clip_pooled = sdxl_pooled(kwargs, self.noise_augmentor)
332
333
334
335
336
337
338
339
340
        width = kwargs.get("width", 768)
        height = kwargs.get("height", 768)
        crop_w = kwargs.get("crop_w", 0)
        crop_h = kwargs.get("crop_h", 0)
        target_width = kwargs.get("target_width", width)
        target_height = kwargs.get("target_height", height)

        out = []
        out.append(self.embedder(torch.Tensor([height])))
comfyanonymous's avatar
comfyanonymous committed
341
        out.append(self.embedder(torch.Tensor([width])))
342
        out.append(self.embedder(torch.Tensor([crop_h])))
comfyanonymous's avatar
comfyanonymous committed
343
        out.append(self.embedder(torch.Tensor([crop_w])))
344
        out.append(self.embedder(torch.Tensor([target_height])))
comfyanonymous's avatar
comfyanonymous committed
345
        out.append(self.embedder(torch.Tensor([target_width])))
346
        flat = torch.flatten(torch.cat(out)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
347
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)
comfyanonymous's avatar
comfyanonymous committed
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382

class SVD_img2vid(BaseModel):
    def __init__(self, model_config, model_type=ModelType.V_PREDICTION_EDM, device=None):
        super().__init__(model_config, model_type, device=device)
        self.embedder = Timestep(256)

    def encode_adm(self, **kwargs):
        fps_id = kwargs.get("fps", 6) - 1
        motion_bucket_id = kwargs.get("motion_bucket_id", 127)
        augmentation = kwargs.get("augmentation_level", 0)

        out = []
        out.append(self.embedder(torch.Tensor([fps_id])))
        out.append(self.embedder(torch.Tensor([motion_bucket_id])))
        out.append(self.embedder(torch.Tensor([augmentation])))

        flat = torch.flatten(torch.cat(out)).unsqueeze(dim=0)
        return flat

    def extra_conds(self, **kwargs):
        out = {}
        adm = self.encode_adm(**kwargs)
        if adm is not None:
            out['y'] = comfy.conds.CONDRegular(adm)

        latent_image = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)
        device = kwargs["device"]

        if latent_image is None:
            latent_image = torch.zeros_like(noise)

        if latent_image.shape[1:] != noise.shape[1:]:
            latent_image = utils.common_upscale(latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")

383
        latent_image = utils.resize_to_batch_size(latent_image, noise.shape[0])
comfyanonymous's avatar
comfyanonymous committed
384
385
386

        out['c_concat'] = comfy.conds.CONDNoiseShape(latent_image)

387
388
389
390
        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)

comfyanonymous's avatar
comfyanonymous committed
391
392
393
394
395
        if "time_conditioning" in kwargs:
            out["time_context"] = comfy.conds.CONDCrossAttn(kwargs["time_conditioning"])

        out['num_video_frames'] = comfy.conds.CONDConstant(noise.shape[0])
        return out
396

comfyanonymous's avatar
comfyanonymous committed
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
class SV3D_u(SVD_img2vid):
    def encode_adm(self, **kwargs):
        augmentation = kwargs.get("augmentation_level", 0)

        out = []
        out.append(self.embedder(torch.flatten(torch.Tensor([augmentation]))))

        flat = torch.flatten(torch.cat(out)).unsqueeze(dim=0)
        return flat

class SV3D_p(SVD_img2vid):
    def __init__(self, model_config, model_type=ModelType.V_PREDICTION_EDM, device=None):
        super().__init__(model_config, model_type, device=device)
        self.embedder_512 = Timestep(512)

    def encode_adm(self, **kwargs):
        augmentation = kwargs.get("augmentation_level", 0)
        elevation = kwargs.get("elevation", 0) #elevation and azimuth are in degrees here
        azimuth = kwargs.get("azimuth", 0)
        noise = kwargs.get("noise", None)

        out = []
        out.append(self.embedder(torch.flatten(torch.Tensor([augmentation]))))
        out.append(self.embedder_512(torch.deg2rad(torch.fmod(torch.flatten(90 - torch.Tensor([elevation])), 360.0))))
        out.append(self.embedder_512(torch.deg2rad(torch.fmod(torch.flatten(torch.Tensor([azimuth])), 360.0))))

        out = list(map(lambda a: utils.resize_to_batch_size(a, noise.shape[0]), out))
        return torch.cat(out, dim=1)


427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
class Stable_Zero123(BaseModel):
    def __init__(self, model_config, model_type=ModelType.EPS, device=None, cc_projection_weight=None, cc_projection_bias=None):
        super().__init__(model_config, model_type, device=device)
        self.cc_projection = comfy.ops.manual_cast.Linear(cc_projection_weight.shape[1], cc_projection_weight.shape[0], dtype=self.get_dtype(), device=device)
        self.cc_projection.weight.copy_(cc_projection_weight)
        self.cc_projection.bias.copy_(cc_projection_bias)

    def extra_conds(self, **kwargs):
        out = {}

        latent_image = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)

        if latent_image is None:
            latent_image = torch.zeros_like(noise)

        if latent_image.shape[1:] != noise.shape[1:]:
            latent_image = utils.common_upscale(latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")

        latent_image = utils.resize_to_batch_size(latent_image, noise.shape[0])

        out['c_concat'] = comfy.conds.CONDNoiseShape(latent_image)

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            if cross_attn.shape[-1] != 768:
                cross_attn = self.cc_projection(cross_attn)
            out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)
        return out
456
457
458
459

class SD_X4Upscaler(BaseModel):
    def __init__(self, model_config, model_type=ModelType.V_PREDICTION, device=None):
        super().__init__(model_config, model_type, device=device)
460
        self.noise_augmentor = ImageConcatWithNoiseAugmentation(noise_schedule_config={"linear_start": 0.0001, "linear_end": 0.02}, max_noise_level=350)
461
462
463
464
465
466

    def extra_conds(self, **kwargs):
        out = {}

        image = kwargs.get("concat_image", None)
        noise = kwargs.get("noise", None)
467
468
469
470
471
        noise_augment = kwargs.get("noise_augmentation", 0.0)
        device = kwargs["device"]
        seed = kwargs["seed"] - 10

        noise_level = round((self.noise_augmentor.max_noise_level) * noise_augment)
472
473
474
475
476

        if image is None:
            image = torch.zeros_like(noise)[:,:3]

        if image.shape[1:] != noise.shape[1:]:
477
478
479
480
481
            image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")

        noise_level = torch.tensor([noise_level], device=device)
        if noise_augment > 0:
            image, noise_level = self.noise_augmentor(image.to(device), noise_level=noise_level, seed=seed)
482
483
484
485

        image = utils.resize_to_batch_size(image, noise.shape[0])

        out['c_concat'] = comfy.conds.CONDNoiseShape(image)
486
        out['y'] = comfy.conds.CONDRegular(noise_level)
487
        return out
comfyanonymous's avatar
comfyanonymous committed
488

comfyanonymous's avatar
comfyanonymous committed
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
class IP2P:
    def extra_conds(self, **kwargs):
        out = {}

        image = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)
        device = kwargs["device"]

        if image is None:
            image = torch.zeros_like(noise)

        if image.shape[1:] != noise.shape[1:]:
            image = utils.common_upscale(image.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")

        image = utils.resize_to_batch_size(image, noise.shape[0])

        out['c_concat'] = comfy.conds.CONDNoiseShape(self.process_ip2p_image_in(image))
        adm = self.encode_adm(**kwargs)
        if adm is not None:
            out['y'] = comfy.conds.CONDRegular(adm)
        return out

class SD15_instructpix2pix(IP2P, BaseModel):
    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        super().__init__(model_config, model_type, device=device)
        self.process_ip2p_image_in = lambda image: image

class SDXL_instructpix2pix(IP2P, SDXL):
    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        super().__init__(model_config, model_type, device=device)
comfyanonymous's avatar
comfyanonymous committed
519
520
521
522
        if model_type == ModelType.V_PREDICTION_EDM:
            self.process_ip2p_image_in = lambda image: comfy.latent_formats.SDXL().process_in(image) #cosxl ip2p
        else:
            self.process_ip2p_image_in = lambda image: image #diffusers ip2p
comfyanonymous's avatar
comfyanonymous committed
523
524


comfyanonymous's avatar
comfyanonymous committed
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
class StableCascade_C(BaseModel):
    def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=StageC)
        self.diffusion_model.eval().requires_grad_(False)

    def extra_conds(self, **kwargs):
        out = {}
        clip_text_pooled = kwargs["pooled_output"]
        if clip_text_pooled is not None:
            out['clip_text_pooled'] = comfy.conds.CONDRegular(clip_text_pooled)

        if "unclip_conditioning" in kwargs:
            embeds = []
            for unclip_cond in kwargs["unclip_conditioning"]:
                weight = unclip_cond["strength"]
                embeds.append(unclip_cond["clip_vision_output"].image_embeds.unsqueeze(0) * weight)
            clip_img = torch.cat(embeds, dim=1)
        else:
            clip_img = torch.zeros((1, 1, 768))
        out["clip_img"] = comfy.conds.CONDRegular(clip_img)
        out["sca"] = comfy.conds.CONDRegular(torch.zeros((1,)))
        out["crp"] = comfy.conds.CONDRegular(torch.zeros((1,)))

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['clip_text'] = comfy.conds.CONDCrossAttn(cross_attn)
        return out

comfyanonymous's avatar
comfyanonymous committed
553
554
555
556
557
558
559
560
561
562
563
564

class StableCascade_B(BaseModel):
    def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=StageB)
        self.diffusion_model.eval().requires_grad_(False)

    def extra_conds(self, **kwargs):
        out = {}
        noise = kwargs.get("noise", None)

        clip_text_pooled = kwargs["pooled_output"]
        if clip_text_pooled is not None:
comfyanonymous's avatar
comfyanonymous committed
565
            out['clip'] = comfy.conds.CONDRegular(clip_text_pooled)
comfyanonymous's avatar
comfyanonymous committed
566
567
568
569
570
571
572

        #size of prior doesn't really matter if zeros because it gets resized but I still want it to get batched
        prior = kwargs.get("stable_cascade_prior", torch.zeros((1, 16, (noise.shape[2] * 4) // 42, (noise.shape[3] * 4) // 42), dtype=noise.dtype, layout=noise.layout, device=noise.device))

        out["effnet"] = comfy.conds.CONDRegular(prior)
        out["sca"] = comfy.conds.CONDRegular(torch.zeros((1,)))
        return out
comfyanonymous's avatar
comfyanonymous committed
573
574
575
576
577
578
579
580
581
582


class SD3(BaseModel):
    def __init__(self, model_config, model_type=ModelType.FLOW, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=OpenAISignatureMMDITWrapper)

    def encode_adm(self, **kwargs):
        return kwargs["pooled_output"]

    def extra_conds(self, **kwargs):
comfyanonymous's avatar
comfyanonymous committed
583
        out = super().extra_conds(**kwargs)
comfyanonymous's avatar
comfyanonymous committed
584
585
586
587
        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
        return out
588
589
590
591
592
593
594
595
596
597
598
599

    def memory_required(self, input_shape):
        if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
            dtype = self.get_dtype()
            if self.manual_cast_dtype is not None:
                dtype = self.manual_cast_dtype
            #TODO: this probably needs to be tweaked
            area = input_shape[0] * input_shape[2] * input_shape[3]
            return (area * comfy.model_management.dtype_size(dtype) * 0.012) * (1024 * 1024)
        else:
            area = input_shape[0] * input_shape[2] * input_shape[3]
            return (area * 0.3) * (1024 * 1024)
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629


class StableAudio1(BaseModel):
    def __init__(self, model_config, seconds_start_embedder_weights, seconds_total_embedder_weights, model_type=ModelType.V_PREDICTION_CONTINUOUS, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=comfy.ldm.audio.dit.AudioDiffusionTransformer)
        self.seconds_start_embedder = comfy.ldm.audio.embedders.NumberConditioner(768, min_val=0, max_val=512)
        self.seconds_total_embedder = comfy.ldm.audio.embedders.NumberConditioner(768, min_val=0, max_val=512)
        self.seconds_start_embedder.load_state_dict(seconds_start_embedder_weights)
        self.seconds_total_embedder.load_state_dict(seconds_total_embedder_weights)

    def extra_conds(self, **kwargs):
        out = {}

        noise = kwargs.get("noise", None)
        device = kwargs["device"]

        seconds_start = kwargs.get("seconds_start", 0)
        seconds_total = kwargs.get("seconds_total", int(noise.shape[-1] / 21.53))

        seconds_start_embed = self.seconds_start_embedder([seconds_start])[0].to(device)
        seconds_total_embed = self.seconds_total_embedder([seconds_total])[0].to(device)

        global_embed = torch.cat([seconds_start_embed, seconds_total_embed], dim=-1).reshape((1, -1))
        out['global_embed'] = comfy.conds.CONDRegular(global_embed)

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            cross_attn = torch.cat([cross_attn.to(device), seconds_start_embed.repeat((cross_attn.shape[0], 1, 1)), seconds_total_embed.repeat((cross_attn.shape[0], 1, 1))], dim=1)
            out['c_crossattn'] = comfy.conds.CONDRegular(cross_attn)
        return out
630
631
632
633
634
635
636
637
638

    def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
        sd = super().state_dict_for_saving(clip_state_dict=clip_state_dict, vae_state_dict=vae_state_dict, clip_vision_state_dict=clip_vision_state_dict)
        d = {"conditioner.conditioners.seconds_start.": self.seconds_start_embedder.state_dict(), "conditioner.conditioners.seconds_total.": self.seconds_total_embedder.state_dict()}
        for k in d:
            s = d[k]
            for l in s:
                sd["{}{}".format(k, l)] = s[l]
        return sd