model_base.py 22 KB
Newer Older
comfyanonymous's avatar
comfyanonymous committed
1
import torch
2
import logging
3
from comfy.ldm.modules.diffusionmodules.openaimodel import UNetModel, Timestep
comfyanonymous's avatar
comfyanonymous committed
4
from comfy.ldm.cascade.stage_c import StageC
comfyanonymous's avatar
comfyanonymous committed
5
from comfy.ldm.cascade.stage_b import StageB
comfyanonymous's avatar
comfyanonymous committed
6
from comfy.ldm.modules.encoders.noise_aug_modules import CLIPEmbeddingNoiseAugmentation
7
from comfy.ldm.modules.diffusionmodules.upscaling import ImageConcatWithNoiseAugmentation
comfyanonymous's avatar
comfyanonymous committed
8
import comfy.model_management
9
import comfy.conds
10
import comfy.ops
11
from enum import Enum
12
from . import utils
comfyanonymous's avatar
comfyanonymous committed
13

14
15
16
class ModelType(Enum):
    """Prediction/sampling regime of a diffusion model.

    Each member is mapped by ``model_sampling`` below to a
    (sampling schedule, prediction type) pair.
    """
    EPS = 1                # model predicts the added noise (epsilon)
    V_PREDICTION = 2       # model predicts "v" targets
    V_PREDICTION_EDM = 3   # v-prediction with continuous EDM sampling
    STABLE_CASCADE = 4     # Stable Cascade sampling schedule
    EDM = 5                # EDM prediction with continuous EDM sampling
20

comfyanonymous's avatar
comfyanonymous committed
21

22
from comfy.model_sampling import EPS, V_PREDICTION, EDM, ModelSamplingDiscrete, ModelSamplingContinuousEDM, StableCascadeSampling
comfyanonymous's avatar
comfyanonymous committed
23

24

comfyanonymous's avatar
comfyanonymous committed
25
def model_sampling(model_config, model_type):
    """Build the model-sampling object for a given model type.

    Combines a sampling-schedule class with a prediction-type mixin into a
    fresh ``ModelSampling`` class and instantiates it with ``model_config``.
    """
    # model_type -> (sampling schedule class, prediction mixin)
    dispatch = {
        ModelType.EPS: (ModelSamplingDiscrete, EPS),
        ModelType.V_PREDICTION: (ModelSamplingDiscrete, V_PREDICTION),
        ModelType.V_PREDICTION_EDM: (ModelSamplingContinuousEDM, V_PREDICTION),
        ModelType.STABLE_CASCADE: (StableCascadeSampling, EPS),
        ModelType.EDM: (ModelSamplingContinuousEDM, EDM),
    }
    schedule_cls, prediction_cls = dispatch[model_type]

    class ModelSampling(schedule_cls, prediction_cls):
        pass

    return ModelSampling(model_config)


comfyanonymous's avatar
comfyanonymous committed
48
class BaseModel(torch.nn.Module):
    """Base wrapper around a diffusion UNet.

    Owns the diffusion model, its sampling object (noise schedule +
    prediction type) and the logic that turns sampler kwargs into the
    conditioning tensors (``c_concat``, ``y``, ``c_crossattn``, ...) the
    UNet forward expects.
    """

    def __init__(self, model_config, model_type=ModelType.EPS, device=None, unet_model=UNetModel):
        super().__init__()

        unet_config = model_config.unet_config
        self.latent_format = model_config.latent_format
        self.model_config = model_config
        self.manual_cast_dtype = model_config.manual_cast_dtype

        if not unet_config.get("disable_unet_model_creation", False):
            # manual_cast keeps weights in storage dtype and casts per-op;
            # otherwise just disable torch's default weight init.
            if self.manual_cast_dtype is not None:
                operations = comfy.ops.manual_cast
            else:
                operations = comfy.ops.disable_weight_init
            self.diffusion_model = unet_model(**unet_config, device=device, operations=operations)
        self.model_type = model_type
        self.model_sampling = model_sampling(model_config, model_type)

        # Number of ADM (label embedding) channels; 0 means no ADM conditioning.
        self.adm_channels = unet_config.get("adm_in_channels", None)
        if self.adm_channels is None:
            self.adm_channels = 0

        self.concat_keys = ()
        logging.info("model_type {}".format(model_type.name))
        logging.debug("adm {}".format(self.adm_channels))

    def apply_model(self, x, t, c_concat=None, c_crossattn=None, control=None, transformer_options={}, **kwargs):
        """Run one denoising step and return the denoised prediction.

        ``t`` is the sigma for the current step; input/output scaling and the
        conversion from raw model output to a denoised latent are delegated to
        ``self.model_sampling``.
        """
        sigma = t
        xc = self.model_sampling.calculate_input(sigma, x)
        if c_concat is not None:
            xc = torch.cat([xc] + [c_concat], dim=1)

        context = c_crossattn
        dtype = self.get_dtype()

        if self.manual_cast_dtype is not None:
            dtype = self.manual_cast_dtype

        xc = xc.to(dtype)
        t = self.model_sampling.timestep(t).float()
        context = context.to(dtype)
        extra_conds = {}
        for o in kwargs:
            extra = kwargs[o]
            # Cast floating tensors to the model dtype; keep integer tensors
            # (e.g. index-like conditioning) untouched.
            if hasattr(extra, "dtype"):
                if extra.dtype != torch.int and extra.dtype != torch.long:
                    extra = extra.to(dtype)
            extra_conds[o] = extra

        model_output = self.diffusion_model(xc, t, context=context, control=control, transformer_options=transformer_options, **extra_conds).float()
        return self.model_sampling.calculate_denoised(sigma, model_output, x)

    def get_dtype(self):
        return self.diffusion_model.dtype

    def is_adm(self):
        return self.adm_channels > 0

    def encode_adm(self, **kwargs):
        """Return the ADM conditioning tensor, or None. Overridden by subclasses."""
        return None

    def extra_conds(self, **kwargs):
        """Build the conditioning dict (c_concat / y / c_crossattn / ...) from sampler kwargs."""
        out = {}
        if len(self.concat_keys) > 0:
            # Inpaint-style models: build the channel-concatenated conditioning
            # from the mask and the (masked) latent image.
            cond_concat = []
            denoise_mask = kwargs.get("concat_mask", kwargs.get("denoise_mask", None))
            concat_latent_image = kwargs.get("concat_latent_image", None)
            if concat_latent_image is None:
                concat_latent_image = kwargs.get("latent_image", None)
            else:
                concat_latent_image = self.process_latent_in(concat_latent_image)

            noise = kwargs.get("noise", None)
            device = kwargs["device"]

            if concat_latent_image.shape[1:] != noise.shape[1:]:
                concat_latent_image = utils.common_upscale(concat_latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")

            concat_latent_image = utils.resize_to_batch_size(concat_latent_image, noise.shape[0])

            if denoise_mask is not None:
                if len(denoise_mask.shape) == len(noise.shape):
                    denoise_mask = denoise_mask[:,:1]

                denoise_mask = denoise_mask.reshape((-1, 1, denoise_mask.shape[-2], denoise_mask.shape[-1]))
                if denoise_mask.shape[-2:] != noise.shape[-2:]:
                    denoise_mask = utils.common_upscale(denoise_mask, noise.shape[-1], noise.shape[-2], "bilinear", "center")
                denoise_mask = utils.resize_to_batch_size(denoise_mask.round(), noise.shape[0])

            for ck in self.concat_keys:
                if denoise_mask is not None:
                    if ck == "mask":
                        cond_concat.append(denoise_mask.to(device))
                    elif ck == "masked_image":
                        cond_concat.append(concat_latent_image.to(device)) #NOTE: the latent_image should be masked by the mask in pixel space
                else:
                    # No mask provided: condition on an all-ones mask and a
                    # "blank" latent image.
                    if ck == "mask":
                        cond_concat.append(torch.ones_like(noise)[:,:1])
                    elif ck == "masked_image":
                        cond_concat.append(self.blank_inpaint_image_like(noise))
            data = torch.cat(cond_concat, dim=1)
            out['c_concat'] = comfy.conds.CONDNoiseShape(data)

        adm = self.encode_adm(**kwargs)
        if adm is not None:
            out['y'] = comfy.conds.CONDRegular(adm)

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)

        cross_attn_cnet = kwargs.get("cross_attn_controlnet", None)
        if cross_attn_cnet is not None:
            out['crossattn_controlnet'] = comfy.conds.CONDCrossAttn(cross_attn_cnet)

        c_concat = kwargs.get("noise_concat", None)
        if c_concat is not None:
            # BUGFIX: previously wrapped `data` (the inpaint concat tensor),
            # which raised NameError when concat_keys was empty and silently
            # discarded noise_concat otherwise.
            out['c_concat'] = comfy.conds.CONDNoiseShape(c_concat)

        return out

    def load_model_weights(self, sd, unet_prefix=""):
        """Load the UNet weights with ``unet_prefix`` stripped; returns self.

        Matching keys are popped out of ``sd`` so callers can detect leftovers.
        """
        to_load = {}
        keys = list(sd.keys())
        for k in keys:
            if k.startswith(unet_prefix):
                to_load[k[len(unet_prefix):]] = sd.pop(k)

        to_load = self.model_config.process_unet_state_dict(to_load)
        m, u = self.diffusion_model.load_state_dict(to_load, strict=False)
        if len(m) > 0:
            logging.warning("unet missing: {}".format(m))
        if len(u) > 0:
            logging.warning("unet unexpected: {}".format(u))
        del to_load
        return self

    def process_latent_in(self, latent):
        return self.latent_format.process_in(latent)

    def process_latent_out(self, latent):
        return self.latent_format.process_out(latent)

    def state_dict_for_saving(self, clip_state_dict=None, vae_state_dict=None, clip_vision_state_dict=None):
        """Assemble a single checkpoint state dict (UNet + optional CLIP/VAE/CLIP-vision)."""
        extra_sds = []
        if clip_state_dict is not None:
            extra_sds.append(self.model_config.process_clip_state_dict_for_saving(clip_state_dict))
        if vae_state_dict is not None:
            extra_sds.append(self.model_config.process_vae_state_dict_for_saving(vae_state_dict))
        if clip_vision_state_dict is not None:
            extra_sds.append(self.model_config.process_clip_vision_state_dict_for_saving(clip_vision_state_dict))

        unet_state_dict = self.diffusion_model.state_dict()
        unet_state_dict = self.model_config.process_unet_state_dict_for_saving(unet_state_dict)

        # Match the extra weights to an fp16 UNet so the saved file is uniform.
        if self.get_dtype() == torch.float16:
            extra_sds = map(lambda sd: utils.convert_sd_to(sd, torch.float16), extra_sds)

        # Marker key so loaders can detect v-prediction checkpoints.
        if self.model_type == ModelType.V_PREDICTION:
            unet_state_dict["v_pred"] = torch.tensor([])

        for sd in extra_sds:
            unet_state_dict.update(sd)

        return unet_state_dict

    def set_inpaint(self):
        """Switch this model to inpaint conditioning (mask + masked image concat)."""
        self.concat_keys = ("mask", "masked_image")
        def blank_inpaint_image_like(latent_image):
            blank_image = torch.ones_like(latent_image)
            # these are the values for "zero" in pixel space translated to latent space
            blank_image[:,0] *= 0.8223
            blank_image[:,1] *= -0.6876
            blank_image[:,2] *= 0.6364
            blank_image[:,3] *= 0.1380
            return blank_image
        self.blank_inpaint_image_like = blank_inpaint_image_like

    def memory_required(self, input_shape):
        """Rough estimate (in bytes) of the memory needed for a forward pass."""
        if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
            dtype = self.get_dtype()
            if self.manual_cast_dtype is not None:
                dtype = self.manual_cast_dtype
            #TODO: this needs to be tweaked
            area = input_shape[0] * input_shape[2] * input_shape[3]
            return (area * comfy.model_management.dtype_size(dtype) / 50) * (1024 * 1024)
        else:
            #TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory.
            area = input_shape[0] * input_shape[2] * input_shape[3]
            return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)


241
def unclip_adm(unclip_conditioning, device, noise_augmentor, noise_augment_merge=0.0, seed=None):
    """Turn unCLIP conditioning entries into a single ADM embedding.

    Each CLIP-vision image embedding is noise-augmented at the entry's
    "noise_augmentation" level, concatenated with the noise-level embedding
    and scaled by its "strength". Multiple embeddings are summed and
    re-augmented using ``noise_augment_merge``.
    """
    augmented = []
    strengths = []
    aug_fractions = []
    for entry in unclip_conditioning:
        strength = entry["strength"]
        aug_fraction = entry["noise_augmentation"]
        level = round((noise_augmentor.max_noise_level - 1) * aug_fraction)
        for embed in entry["clip_vision_output"].image_embeds:
            c_adm, level_emb = noise_augmentor(embed.to(device), noise_level=torch.tensor([level], device=device), seed=seed)
            adm_out = torch.cat((c_adm, level_emb), 1) * strength
            strengths.append(strength)
            aug_fractions.append(aug_fraction)
            augmented.append(adm_out)

    if len(aug_fractions) > 1:
        # Merge: sum everything, then noise-augment the summed embedding once more.
        merged = torch.stack(augmented).sum(0)
        merge_level = round((noise_augmentor.max_noise_level - 1) * noise_augment_merge)
        c_adm, level_emb = noise_augmentor(merged[:, :noise_augmentor.time_embed.dim], noise_level=torch.tensor([merge_level], device=device))
        adm_out = torch.cat((c_adm, level_emb), 1)

    return adm_out
264

comfyanonymous's avatar
comfyanonymous committed
265
class SD21UNCLIP(BaseModel):
    """SD 2.1 unCLIP variant: ADM conditioning comes from noise-augmented
    CLIP-vision image embeddings."""

    def __init__(self, model_config, noise_aug_config, model_type=ModelType.V_PREDICTION, device=None):
        super().__init__(model_config, model_type, device=device)
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(**noise_aug_config)

    def encode_adm(self, **kwargs):
        """ADM vector from unclip conditioning; zeros when none is given."""
        unclip_conditioning = kwargs.get("unclip_conditioning", None)
        device = kwargs["device"]
        if unclip_conditioning is None:
            # No image conditioning supplied: feed an all-zero ADM vector.
            return torch.zeros((1, self.adm_channels))
        return unclip_adm(unclip_conditioning, device, self.noise_augmentor,
                          kwargs.get("unclip_noise_augment_merge", 0.05),
                          kwargs.get("seed", 0) - 10)
277

278
279
def sdxl_pooled(args, noise_augmentor):
    """Pooled CLIP embedding for SDXL ADM conditioning.

    When unclip conditioning is present, derive it from the noise-augmented
    CLIP-vision embeddings (first 1280 channels); otherwise use the text
    encoder's pooled output directly.
    """
    if "unclip_conditioning" not in args:
        return args["pooled_output"]
    adm = unclip_adm(args.get("unclip_conditioning", None), args["device"], noise_augmentor, seed=args.get("seed", 0) - 10)
    return adm[:, :1280]

284
class SDXLRefiner(BaseModel):
    """SDXL refiner model; ADM packs pooled CLIP output together with
    size/crop embeddings and an aesthetic score."""

    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        super().__init__(model_config, model_type, device=device)
        self.embedder = Timestep(256)
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(
            noise_schedule_config={"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"},
            timestep_dim=1280)

    def encode_adm(self, **kwargs):
        clip_pooled = sdxl_pooled(kwargs, self.noise_augmentor)
        width = kwargs.get("width", 768)
        height = kwargs.get("height", 768)
        crop_w = kwargs.get("crop_w", 0)
        crop_h = kwargs.get("crop_h", 0)

        # Negative prompts default to a low aesthetic score, positives to a high one.
        if kwargs.get("prompt_type", "") == "negative":
            aesthetic_score = kwargs.get("aesthetic_score", 2.5)
        else:
            aesthetic_score = kwargs.get("aesthetic_score", 6)

        # Order matters: height, width, crop_h, crop_w, aesthetic_score.
        embeds = [self.embedder(torch.Tensor([v]))
                  for v in (height, width, crop_h, crop_w, aesthetic_score)]
        flat = torch.flatten(torch.cat(embeds)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)

class SDXL(BaseModel):
    """SDXL base model; ADM packs pooled CLIP output together with
    source size, crop offsets and target size embeddings."""

    def __init__(self, model_config, model_type=ModelType.EPS, device=None):
        super().__init__(model_config, model_type, device=device)
        self.embedder = Timestep(256)
        self.noise_augmentor = CLIPEmbeddingNoiseAugmentation(
            noise_schedule_config={"timesteps": 1000, "beta_schedule": "squaredcos_cap_v2"},
            timestep_dim=1280)

    def encode_adm(self, **kwargs):
        clip_pooled = sdxl_pooled(kwargs, self.noise_augmentor)
        width = kwargs.get("width", 768)
        height = kwargs.get("height", 768)
        crop_w = kwargs.get("crop_w", 0)
        crop_h = kwargs.get("crop_h", 0)
        target_width = kwargs.get("target_width", width)
        target_height = kwargs.get("target_height", height)

        # Order matters: height, width, crop_h, crop_w, target_height, target_width.
        embeds = [self.embedder(torch.Tensor([v]))
                  for v in (height, width, crop_h, crop_w, target_height, target_width)]
        flat = torch.flatten(torch.cat(embeds)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
        return torch.cat((clip_pooled.to(flat.device), flat), dim=1)
comfyanonymous's avatar
comfyanonymous committed
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369

class SVD_img2vid(BaseModel):
    """Stable Video Diffusion img2vid model.

    ADM encodes fps, motion bucket and augmentation level; the conditioning
    latent image is concatenated along channels for every frame.
    """

    def __init__(self, model_config, model_type=ModelType.V_PREDICTION_EDM, device=None):
        super().__init__(model_config, model_type, device=device)
        self.embedder = Timestep(256)

    def encode_adm(self, **kwargs):
        fps_id = kwargs.get("fps", 6) - 1
        motion_bucket_id = kwargs.get("motion_bucket_id", 127)
        augmentation = kwargs.get("augmentation_level", 0)

        embeds = [self.embedder(torch.Tensor([v]))
                  for v in (fps_id, motion_bucket_id, augmentation)]
        return torch.flatten(torch.cat(embeds)).unsqueeze(dim=0)

    def extra_conds(self, **kwargs):
        out = {}
        adm = self.encode_adm(**kwargs)
        if adm is not None:
            out['y'] = comfy.conds.CONDRegular(adm)

        latent_image = kwargs.get("concat_latent_image", None)
        noise = kwargs.get("noise", None)
        device = kwargs["device"]

        if latent_image is None:
            latent_image = torch.zeros_like(noise)

        # Match the conditioning latent to the noise spatially and in batch size.
        if latent_image.shape[1:] != noise.shape[1:]:
            latent_image = utils.common_upscale(latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")
        latent_image = utils.resize_to_batch_size(latent_image, noise.shape[0])

        out['c_concat'] = comfy.conds.CONDNoiseShape(latent_image)

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)

        if "time_conditioning" in kwargs:
            out["time_context"] = comfy.conds.CONDCrossAttn(kwargs["time_conditioning"])

        # Batch dimension is the number of video frames for this model.
        out['num_video_frames'] = comfy.conds.CONDConstant(noise.shape[0])
        return out
383

comfyanonymous's avatar
comfyanonymous committed
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
class SV3D_u(SVD_img2vid):
    """SV3D (static orbit) variant: only the augmentation level feeds ADM."""

    def encode_adm(self, **kwargs):
        augmentation = kwargs.get("augmentation_level", 0)
        emb = self.embedder(torch.flatten(torch.Tensor([augmentation])))
        return torch.flatten(emb).unsqueeze(dim=0)

class SV3D_p(SVD_img2vid):
    """SV3D (dynamic orbit) variant: ADM encodes augmentation level plus
    per-frame camera polar/azimuth angles."""

    def __init__(self, model_config, model_type=ModelType.V_PREDICTION_EDM, device=None):
        super().__init__(model_config, model_type, device=device)
        self.embedder_512 = Timestep(512)

    def encode_adm(self, **kwargs):
        augmentation = kwargs.get("augmentation_level", 0)
        elevation = kwargs.get("elevation", 0) #elevation and azimuth are in degrees here
        azimuth = kwargs.get("azimuth", 0)
        noise = kwargs.get("noise", None)

        # Convert elevation to polar angle (90 - elevation), wrap to [0, 360) degrees,
        # then to radians for the embedders.
        polar = torch.deg2rad(torch.fmod(torch.flatten(90 - torch.Tensor([elevation])), 360.0))
        azim = torch.deg2rad(torch.fmod(torch.flatten(torch.Tensor([azimuth])), 360.0))

        embeds = [
            self.embedder(torch.flatten(torch.Tensor([augmentation]))),
            self.embedder_512(polar),
            self.embedder_512(azim),
        ]
        embeds = [utils.resize_to_batch_size(e, noise.shape[0]) for e in embeds]
        return torch.cat(embeds, dim=1)


414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
class Stable_Zero123(BaseModel):
    """Stable Zero123 novel-view model.

    Carries an extra linear ``cc_projection`` whose weights are supplied at
    construction time; cross-attention conditioning that is not already
    768-dim is passed through it.
    """

    def __init__(self, model_config, model_type=ModelType.EPS, device=None, cc_projection_weight=None, cc_projection_bias=None):
        super().__init__(model_config, model_type, device=device)
        self.cc_projection = comfy.ops.manual_cast.Linear(cc_projection_weight.shape[1], cc_projection_weight.shape[0], dtype=self.get_dtype(), device=device)
        self.cc_projection.weight.copy_(cc_projection_weight)
        self.cc_projection.bias.copy_(cc_projection_bias)

    def extra_conds(self, **kwargs):
        out = {}
        noise = kwargs.get("noise", None)
        latent_image = kwargs.get("concat_latent_image", None)
        if latent_image is None:
            latent_image = torch.zeros_like(noise)

        # Match the conditioning latent to the noise spatially and in batch size.
        if latent_image.shape[1:] != noise.shape[1:]:
            latent_image = utils.common_upscale(latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")
        latent_image = utils.resize_to_batch_size(latent_image, noise.shape[0])

        out['c_concat'] = comfy.conds.CONDNoiseShape(latent_image)

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            if cross_attn.shape[-1] != 768:
                cross_attn = self.cc_projection(cross_attn)
            out['c_crossattn'] = comfy.conds.CONDCrossAttn(cross_attn)
        return out
443
444
445
446

class SD_X4Upscaler(BaseModel):
    """SD x4 upscaler: conditions on a (optionally noise-augmented) low-res
    image concatenated along channels, with the noise level as 'y'."""

    def __init__(self, model_config, model_type=ModelType.V_PREDICTION, device=None):
        super().__init__(model_config, model_type, device=device)
        self.noise_augmentor = ImageConcatWithNoiseAugmentation(noise_schedule_config={"linear_start": 0.0001, "linear_end": 0.02}, max_noise_level=350)

    def extra_conds(self, **kwargs):
        out = {}
        lowres = kwargs.get("concat_image", None)
        noise = kwargs.get("noise", None)
        noise_augment = kwargs.get("noise_augmentation", 0.0)
        device = kwargs["device"]
        seed = kwargs["seed"] - 10
        level = round((self.noise_augmentor.max_noise_level) * noise_augment)

        if lowres is None:
            # No conditioning image: use a zero 3-channel image shaped like the noise.
            lowres = torch.zeros_like(noise)[:,:3]
        if lowres.shape[1:] != noise.shape[1:]:
            lowres = utils.common_upscale(lowres.to(device), noise.shape[-1], noise.shape[-2], "bilinear", "center")

        noise_level = torch.tensor([level], device=device)
        if noise_augment > 0:
            lowres, noise_level = self.noise_augmentor(lowres.to(device), noise_level=noise_level, seed=seed)

        lowres = utils.resize_to_batch_size(lowres, noise.shape[0])

        out['c_concat'] = comfy.conds.CONDNoiseShape(lowres)
        out['y'] = comfy.conds.CONDRegular(noise_level)
        return out
comfyanonymous's avatar
comfyanonymous committed
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503

class StableCascade_C(BaseModel):
    """Stable Cascade stage C (prior) model."""

    def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=StageC)
        # Inference-only: freeze the stage C network.
        self.diffusion_model.eval().requires_grad_(False)

    def extra_conds(self, **kwargs):
        out = {}
        clip_text_pooled = kwargs["pooled_output"]
        if clip_text_pooled is not None:
            out['clip_text_pooled'] = comfy.conds.CONDRegular(clip_text_pooled)

        # Optional CLIP-vision image conditioning; zeros when absent.
        if "unclip_conditioning" in kwargs:
            embeds = [uc["clip_vision_output"].image_embeds.unsqueeze(0) * uc["strength"]
                      for uc in kwargs["unclip_conditioning"]]
            clip_img = torch.cat(embeds, dim=1)
        else:
            clip_img = torch.zeros((1, 1, 768))
        out["clip_img"] = comfy.conds.CONDRegular(clip_img)
        out["sca"] = comfy.conds.CONDRegular(torch.zeros((1,)))
        out["crp"] = comfy.conds.CONDRegular(torch.zeros((1,)))

        cross_attn = kwargs.get("cross_attn", None)
        if cross_attn is not None:
            out['clip_text'] = comfy.conds.CONDCrossAttn(cross_attn)
        return out

comfyanonymous's avatar
comfyanonymous committed
504
505
506
507
508
509
510
511
512
513
514
515

class StableCascade_B(BaseModel):
    """Stable Cascade stage B (decoder) model."""

    def __init__(self, model_config, model_type=ModelType.STABLE_CASCADE, device=None):
        super().__init__(model_config, model_type, device=device, unet_model=StageB)
        # Inference-only: freeze the stage B network.
        self.diffusion_model.eval().requires_grad_(False)

    def extra_conds(self, **kwargs):
        out = {}
        noise = kwargs.get("noise", None)

        clip_text_pooled = kwargs["pooled_output"]
        if clip_text_pooled is not None:
            out['clip'] = comfy.conds.CONDRegular(clip_text_pooled)

        #size of prior doesn't really matter if zeros because it gets resized but I still want it to get batched
        prior = kwargs.get("stable_cascade_prior", torch.zeros((1, 16, (noise.shape[2] * 4) // 42, (noise.shape[3] * 4) // 42), dtype=noise.dtype, layout=noise.layout, device=noise.device))

        out["effnet"] = comfy.conds.CONDRegular(prior)
        out["sca"] = comfy.conds.CONDRegular(torch.zeros((1,)))
        return out