"git@developer.sourcefind.cn:chenpangpang/ComfyUI.git" did not exist on "0f17993d0587254fcff06bf689dfe38300ea8834"
Unverified commit c92f3dca authored by Jairo Correa, committed by GitHub

Merge branch 'master' into image-cache

parents 006b24cc 2995a247
{
"path-intellisense.mappings": {
"../": "${workspaceFolder}/web/extensions/core"
},
"[python]": {
"editor.defaultFormatter": "ms-python.autopep8"
},
"python.formatting.provider": "none"
}
...@@ -11,7 +11,7 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
## Features
- Nodes/graph/flowchart interface to experiment and create complex Stable Diffusion workflows without needing to code anything.
- Fully supports SD1.x, SD2.x, [SDXL](https://comfyanonymous.github.io/ComfyUI_examples/sdxl/) and [Stable Video Diffusion](https://comfyanonymous.github.io/ComfyUI_examples/video/)
- Asynchronous Queue system
- Many optimizations: Only re-executes the parts of the workflow that change between executions.
- Command line option: ```--lowvram``` to make it work on GPUs with less than 3GB vram (enabled automatically on GPUs with low vram)
...@@ -30,6 +30,8 @@ This ui will let you design and execute advanced stable diffusion pipelines usin
- [unCLIP Models](https://comfyanonymous.github.io/ComfyUI_examples/unclip/)
- [GLIGEN](https://comfyanonymous.github.io/ComfyUI_examples/gligen/)
- [Model Merging](https://comfyanonymous.github.io/ComfyUI_examples/model_merging/)
- [LCM models and Loras](https://comfyanonymous.github.io/ComfyUI_examples/lcm/)
- [SDXL Turbo](https://comfyanonymous.github.io/ComfyUI_examples/sdturbo/)
- Latent previews with [TAESD](#how-to-show-high-quality-previews)
- Starts up very fast.
- Works fully offline: will never download anything.
...@@ -43,6 +45,7 @@ Workflow examples can be found on the [Examples page](https://comfyanonymous.git
|---------------------------|--------------------------------------------------------------------------------------------------------------------|
| Ctrl + Enter | Queue up current graph for generation |
| Ctrl + Shift + Enter | Queue up current graph as first for generation |
| Ctrl + Z/Ctrl + Y | Undo/Redo |
| Ctrl + S | Save workflow |
| Ctrl + O | Load workflow |
| Ctrl + A | Select all nodes |
...@@ -98,6 +101,7 @@ AMD users can install rocm and pytorch with pip if you don't have it already ins
```pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.6```
This is the command to install the nightly with ROCm 5.7 that might have some performance improvements:
```pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm5.7```
### NVIDIA
...@@ -190,7 +194,7 @@ To use a textual inversion concepts/embeddings in a text prompt put them in the
Make sure you use the regular loaders/Load Checkpoint node to load checkpoints. It will auto pick the right settings depending on your GPU.
You can set this command line setting to disable the upcasting to fp32 in some cross attention operations which will increase your speed. Note that this will very likely give you black images on SD2.x models. If you use xformers or pytorch attention this option does not do anything.
```--dont-upcast-attention```
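For example, assuming ComfyUI's usual ```python main.py``` launcher (not shown in this excerpt), the flag is simply appended to the launch command:
```python main.py --dont-upcast-attention```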
......
...@@ -54,6 +54,7 @@ class ControlNet(nn.Module):
transformer_depth_output=None,
device=None,
operations=comfy.ops,
**kwargs,
):
super().__init__()
assert use_spatial_transformer == True, "use_spatial_transformer has to be true"
......
...@@ -62,6 +62,13 @@ fpvae_group.add_argument("--fp16-vae", action="store_true", help="Run the VAE in
fpvae_group.add_argument("--fp32-vae", action="store_true", help="Run the VAE in full precision fp32.")
fpvae_group.add_argument("--bf16-vae", action="store_true", help="Run the VAE in bf16.")
fpte_group = parser.add_mutually_exclusive_group()
fpte_group.add_argument("--fp8_e4m3fn-text-enc", action="store_true", help="Store text encoder weights in fp8 (e4m3fn variant).")
fpte_group.add_argument("--fp8_e5m2-text-enc", action="store_true", help="Store text encoder weights in fp8 (e5m2 variant).")
fpte_group.add_argument("--fp16-text-enc", action="store_true", help="Store text encoder weights in fp16.")
fpte_group.add_argument("--fp32-text-enc", action="store_true", help="Store text encoder weights in fp32.")
parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.") parser.add_argument("--directml", type=int, nargs="?", metavar="DIRECTML_DEVICE", const=-1, help="Use torch-directml.")
parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize when loading models with Intel GPUs.") parser.add_argument("--disable-ipex-optimize", action="store_true", help="Disables ipex.optimize when loading models with Intel GPUs.")
......
...@@ -33,7 +33,7 @@ class ControlBase:
self.cond_hint_original = None
self.cond_hint = None
self.strength = 1.0
self.timestep_percent_range = (0.0, 1.0)
self.timestep_range = None
if device is None:
...@@ -42,7 +42,7 @@ class ControlBase:
self.previous_controlnet = None
self.global_average_pooling = False
def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0)):
self.cond_hint_original = cond_hint
self.strength = strength
self.timestep_percent_range = timestep_percent_range
......
...@@ -858,7 +858,7 @@ def predict_eps_sigma(model, input, sigma_in, **kwargs):
return (input - model(input, sigma_in, **kwargs)) / sigma
def sample_unipc(model, noise, image, sigmas, max_denoise, extra_args=None, callback=None, disable=False, noise_mask=None, variant='bh1'):
timesteps = sigmas.clone()
if sigmas[-1] == 0:
timesteps = sigmas[:]
......
...@@ -750,3 +750,61 @@ def sample_lcm(model, x, sigmas, extra_args=None, callback=None, disable=None, n
if sigmas[i + 1] > 0:
x += sigmas[i + 1] * noise_sampler(sigmas[i], sigmas[i + 1])
return x
@torch.no_grad()
def sample_heunpp2(model, x, sigmas, extra_args=None, callback=None, disable=None, s_churn=0., s_tmin=0., s_tmax=float('inf'), s_noise=1.):
# From MIT licensed: https://github.com/Carzit/sd-webui-samplers-scheduler/
extra_args = {} if extra_args is None else extra_args
s_in = x.new_ones([x.shape[0]])
s_end = sigmas[-1]
for i in trange(len(sigmas) - 1, disable=disable):
gamma = min(s_churn / (len(sigmas) - 1), 2 ** 0.5 - 1) if s_tmin <= sigmas[i] <= s_tmax else 0.
eps = torch.randn_like(x) * s_noise
sigma_hat = sigmas[i] * (gamma + 1)
if gamma > 0:
x = x + eps * (sigma_hat ** 2 - sigmas[i] ** 2) ** 0.5
denoised = model(x, sigma_hat * s_in, **extra_args)
d = to_d(x, sigma_hat, denoised)
if callback is not None:
callback({'x': x, 'i': i, 'sigma': sigmas[i], 'sigma_hat': sigma_hat, 'denoised': denoised})
dt = sigmas[i + 1] - sigma_hat
if sigmas[i + 1] == s_end:
# Euler method
x = x + d * dt
elif sigmas[i + 2] == s_end:
# Heun's method
x_2 = x + d * dt
denoised_2 = model(x_2, sigmas[i + 1] * s_in, **extra_args)
d_2 = to_d(x_2, sigmas[i + 1], denoised_2)
w = 2 * sigmas[0]
w2 = sigmas[i+1]/w
w1 = 1 - w2
d_prime = d * w1 + d_2 * w2
x = x + d_prime * dt
else:
# Heun++
x_2 = x + d * dt
denoised_2 = model(x_2, sigmas[i + 1] * s_in, **extra_args)
d_2 = to_d(x_2, sigmas[i + 1], denoised_2)
dt_2 = sigmas[i + 2] - sigmas[i + 1]
x_3 = x_2 + d_2 * dt_2
denoised_3 = model(x_3, sigmas[i + 2] * s_in, **extra_args)
d_3 = to_d(x_3, sigmas[i + 2], denoised_3)
w = 3 * sigmas[0]
w2 = sigmas[i + 1] / w
w3 = sigmas[i + 2] / w
w1 = 1 - w2 - w3
d_prime = w1 * d + w2 * d_2 + w3 * d_3
x = x + d_prime * dt
return x
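A minimal usage sketch for the new sampler (illustrative only, not part of this commit): it follows the same calling convention as the other samplers in this module, i.e. a denoiser callable, a noised latent and a decreasing sigma schedule ending in 0. The toy denoiser, the sigma values and the import path are assumptions.
```
import torch
from comfy.k_diffusion.sampling import sample_heunpp2  # import path assumed

def toy_denoiser(x, sigma, **kwargs):
    # Stand-in for a real model wrapper: pretends the clean image is all zeros.
    return torch.zeros_like(x)

sigmas = torch.cat([torch.linspace(14.6, 0.03, 10), torch.zeros(1)])  # ends at 0
x = torch.randn(1, 4, 64, 64) * sigmas[0]  # start from pure noise at sigma_max
out = sample_heunpp2(toy_denoiser, x, sigmas, disable=True)
print(out.shape)  # torch.Size([1, 4, 64, 64])
```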
...@@ -5,8 +5,10 @@ import torch.nn.functional as F
from torch import nn, einsum
from einops import rearrange, repeat
from typing import Optional, Any
from functools import partial
from .diffusionmodules.util import checkpoint, AlphaBlender, timestep_embedding
from .sub_quadratic_attention import efficient_dot_product_attention
from comfy import model_management
...@@ -276,9 +278,20 @@ def attention_split(q, k, v, heads, mask=None):
)
return r1
BROKEN_XFORMERS = False
try:
x_vers = xformers.__version__
#I think 0.0.23 is also broken (q with bs bigger than 65535 gives CUDA error)
BROKEN_XFORMERS = x_vers.startswith("0.0.21") or x_vers.startswith("0.0.22") or x_vers.startswith("0.0.23")
except:
pass
def attention_xformers(q, k, v, heads, mask=None):
b, _, dim_head = q.shape
dim_head //= heads
if BROKEN_XFORMERS:
if b * heads > 65535:
return attention_pytorch(q, k, v, heads, mask)
q, k, v = map(
lambda t: t.unsqueeze(3)
...@@ -370,53 +383,72 @@ class CrossAttention(nn.Module):
class BasicTransformerBlock(nn.Module):
def __init__(self, dim, n_heads, d_head, dropout=0., context_dim=None, gated_ff=True, checkpoint=True, ff_in=False, inner_dim=None,
disable_self_attn=False, disable_temporal_crossattention=False, switch_temporal_ca_to_sa=False, dtype=None, device=None, operations=comfy.ops):
super().__init__()
self.ff_in = ff_in or inner_dim is not None
if inner_dim is None:
inner_dim = dim
self.is_res = inner_dim == dim
if self.ff_in:
self.norm_in = nn.LayerNorm(dim, dtype=dtype, device=device)
self.ff_in = FeedForward(dim, dim_out=inner_dim, dropout=dropout, glu=gated_ff, dtype=dtype, device=device, operations=operations)
self.disable_self_attn = disable_self_attn
self.attn1 = CrossAttention(query_dim=inner_dim, heads=n_heads, dim_head=d_head, dropout=dropout,
context_dim=context_dim if self.disable_self_attn else None, dtype=dtype, device=device, operations=operations) # is a self-attention if not self.disable_self_attn
self.ff = FeedForward(inner_dim, dim_out=dim, dropout=dropout, glu=gated_ff, dtype=dtype, device=device, operations=operations)
if disable_temporal_crossattention:
if switch_temporal_ca_to_sa:
raise ValueError
else:
self.attn2 = None
else:
context_dim_attn2 = None
if not switch_temporal_ca_to_sa:
context_dim_attn2 = context_dim
self.attn2 = CrossAttention(query_dim=inner_dim, context_dim=context_dim_attn2,
heads=n_heads, dim_head=d_head, dropout=dropout, dtype=dtype, device=device, operations=operations) # is self-attn if context is none
self.norm2 = nn.LayerNorm(inner_dim, dtype=dtype, device=device)
self.norm1 = nn.LayerNorm(inner_dim, dtype=dtype, device=device)
self.norm3 = nn.LayerNorm(inner_dim, dtype=dtype, device=device)
self.checkpoint = checkpoint
self.n_heads = n_heads
self.d_head = d_head
self.switch_temporal_ca_to_sa = switch_temporal_ca_to_sa
def forward(self, x, context=None, transformer_options={}):
return checkpoint(self._forward, (x, context, transformer_options), self.parameters(), self.checkpoint)
def _forward(self, x, context=None, transformer_options={}):
extra_options = {}
block = transformer_options.get("block", None)
block_index = transformer_options.get("block_index", 0)
transformer_patches = {}
transformer_patches_replace = {}
for k in transformer_options:
if k == "patches":
transformer_patches = transformer_options[k]
elif k == "patches_replace":
transformer_patches_replace = transformer_options[k]
else:
extra_options[k] = transformer_options[k]
extra_options["n_heads"] = self.n_heads
extra_options["dim_head"] = self.d_head
if self.ff_in:
x_skip = x
x = self.ff_in(self.norm_in(x))
if self.is_res:
x += x_skip
n = self.norm1(x)
if self.disable_self_attn:
...@@ -465,31 +497,34 @@ class BasicTransformerBlock(nn.Module):
for p in patch:
x = p(x, extra_options)
if self.attn2 is not None:
n = self.norm2(x)
if self.switch_temporal_ca_to_sa:
context_attn2 = n
else:
context_attn2 = context
value_attn2 = None
if "attn2_patch" in transformer_patches:
patch = transformer_patches["attn2_patch"]
value_attn2 = context_attn2
for p in patch:
n, context_attn2, value_attn2 = p(n, context_attn2, value_attn2, extra_options)
attn2_replace_patch = transformer_patches_replace.get("attn2", {})
block_attn2 = transformer_block
if block_attn2 not in attn2_replace_patch:
block_attn2 = block
if block_attn2 in attn2_replace_patch:
if value_attn2 is None:
value_attn2 = context_attn2
n = self.attn2.to_q(n)
context_attn2 = self.attn2.to_k(context_attn2)
value_attn2 = self.attn2.to_v(value_attn2)
n = attn2_replace_patch[block_attn2](n, context_attn2, value_attn2, extra_options)
n = self.attn2.to_out(n)
else:
n = self.attn2(n, context=context_attn2, value=value_attn2)
if "attn2_output_patch" in transformer_patches: if "attn2_output_patch" in transformer_patches:
patch = transformer_patches["attn2_output_patch"] patch = transformer_patches["attn2_output_patch"]
...@@ -497,7 +532,12 @@ class BasicTransformerBlock(nn.Module): ...@@ -497,7 +532,12 @@ class BasicTransformerBlock(nn.Module):
n = p(n, extra_options) n = p(n, extra_options)
x += n x += n
x = self.ff(self.norm3(x)) + x if self.is_res:
x_skip = x
x = self.ff(self.norm3(x))
if self.is_res:
x += x_skip
return x
...@@ -565,3 +605,164 @@ class SpatialTransformer(nn.Module):
x = self.proj_out(x)
return x + x_in
class SpatialVideoTransformer(SpatialTransformer):
def __init__(
self,
in_channels,
n_heads,
d_head,
depth=1,
dropout=0.0,
use_linear=False,
context_dim=None,
use_spatial_context=False,
timesteps=None,
merge_strategy: str = "fixed",
merge_factor: float = 0.5,
time_context_dim=None,
ff_in=False,
checkpoint=False,
time_depth=1,
disable_self_attn=False,
disable_temporal_crossattention=False,
max_time_embed_period: int = 10000,
dtype=None, device=None, operations=comfy.ops
):
super().__init__(
in_channels,
n_heads,
d_head,
depth=depth,
dropout=dropout,
use_checkpoint=checkpoint,
context_dim=context_dim,
use_linear=use_linear,
disable_self_attn=disable_self_attn,
dtype=dtype, device=device, operations=operations
)
self.time_depth = time_depth
self.depth = depth
self.max_time_embed_period = max_time_embed_period
time_mix_d_head = d_head
n_time_mix_heads = n_heads
time_mix_inner_dim = int(time_mix_d_head * n_time_mix_heads)
inner_dim = n_heads * d_head
if use_spatial_context:
time_context_dim = context_dim
self.time_stack = nn.ModuleList(
[
BasicTransformerBlock(
inner_dim,
n_time_mix_heads,
time_mix_d_head,
dropout=dropout,
context_dim=time_context_dim,
# timesteps=timesteps,
checkpoint=checkpoint,
ff_in=ff_in,
inner_dim=time_mix_inner_dim,
disable_self_attn=disable_self_attn,
disable_temporal_crossattention=disable_temporal_crossattention,
dtype=dtype, device=device, operations=operations
)
for _ in range(self.depth)
]
)
assert len(self.time_stack) == len(self.transformer_blocks)
self.use_spatial_context = use_spatial_context
self.in_channels = in_channels
time_embed_dim = self.in_channels * 4
self.time_pos_embed = nn.Sequential(
operations.Linear(self.in_channels, time_embed_dim, dtype=dtype, device=device),
nn.SiLU(),
operations.Linear(time_embed_dim, self.in_channels, dtype=dtype, device=device),
)
self.time_mixer = AlphaBlender(
alpha=merge_factor, merge_strategy=merge_strategy
)
def forward(
self,
x: torch.Tensor,
context: Optional[torch.Tensor] = None,
time_context: Optional[torch.Tensor] = None,
timesteps: Optional[int] = None,
image_only_indicator: Optional[torch.Tensor] = None,
transformer_options={}
) -> torch.Tensor:
_, _, h, w = x.shape
x_in = x
spatial_context = None
if exists(context):
spatial_context = context
if self.use_spatial_context:
assert (
context.ndim == 3
), f"n dims of spatial context should be 3 but are {context.ndim}"
if time_context is None:
time_context = context
time_context_first_timestep = time_context[::timesteps]
time_context = repeat(
time_context_first_timestep, "b ... -> (b n) ...", n=h * w
)
elif time_context is not None and not self.use_spatial_context:
time_context = repeat(time_context, "b ... -> (b n) ...", n=h * w)
if time_context.ndim == 2:
time_context = rearrange(time_context, "b c -> b 1 c")
x = self.norm(x)
if not self.use_linear:
x = self.proj_in(x)
x = rearrange(x, "b c h w -> b (h w) c")
if self.use_linear:
x = self.proj_in(x)
num_frames = torch.arange(timesteps, device=x.device)
num_frames = repeat(num_frames, "t -> b t", b=x.shape[0] // timesteps)
num_frames = rearrange(num_frames, "b t -> (b t)")
t_emb = timestep_embedding(num_frames, self.in_channels, repeat_only=False, max_period=self.max_time_embed_period).to(x.dtype)
emb = self.time_pos_embed(t_emb)
emb = emb[:, None, :]
for it_, (block, mix_block) in enumerate(
zip(self.transformer_blocks, self.time_stack)
):
transformer_options["block_index"] = it_
x = block(
x,
context=spatial_context,
transformer_options=transformer_options,
)
x_mix = x
x_mix = x_mix + emb
B, S, C = x_mix.shape
x_mix = rearrange(x_mix, "(b t) s c -> (b s) t c", t=timesteps)
x_mix = mix_block(x_mix, context=time_context) #TODO: transformer_options
x_mix = rearrange(
x_mix, "(b s) t c -> (b t) s c", s=S, b=B // timesteps, c=C, t=timesteps
)
x = self.time_mixer(x_spatial=x, x_temporal=x_mix, image_only_indicator=image_only_indicator)
if self.use_linear:
x = self.proj_out(x)
x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
if not self.use_linear:
x = self.proj_out(x)
out = x + x_in
return out
...@@ -13,11 +13,78 @@ import math
import torch
import torch.nn as nn
import numpy as np
from einops import repeat, rearrange
from comfy.ldm.util import instantiate_from_config
import comfy.ops
class AlphaBlender(nn.Module):
strategies = ["learned", "fixed", "learned_with_images"]
def __init__(
self,
alpha: float,
merge_strategy: str = "learned_with_images",
rearrange_pattern: str = "b t -> (b t) 1 1",
):
super().__init__()
self.merge_strategy = merge_strategy
self.rearrange_pattern = rearrange_pattern
assert (
merge_strategy in self.strategies
), f"merge_strategy needs to be in {self.strategies}"
if self.merge_strategy == "fixed":
self.register_buffer("mix_factor", torch.Tensor([alpha]))
elif (
self.merge_strategy == "learned"
or self.merge_strategy == "learned_with_images"
):
self.register_parameter(
"mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
)
else:
raise ValueError(f"unknown merge strategy {self.merge_strategy}")
def get_alpha(self, image_only_indicator: torch.Tensor) -> torch.Tensor:
# skip_time_mix = rearrange(repeat(skip_time_mix, 'b -> (b t) () () ()', t=t), '(b t) 1 ... -> b 1 t ...', t=t)
if self.merge_strategy == "fixed":
# make shape compatible
# alpha = repeat(self.mix_factor, '1 -> b () t () ()', t=t, b=bs)
alpha = self.mix_factor
elif self.merge_strategy == "learned":
alpha = torch.sigmoid(self.mix_factor)
# make shape compatible
# alpha = repeat(alpha, '1 -> s () ()', s = t * bs)
elif self.merge_strategy == "learned_with_images":
assert image_only_indicator is not None, "need image_only_indicator ..."
alpha = torch.where(
image_only_indicator.bool(),
torch.ones(1, 1, device=image_only_indicator.device),
rearrange(torch.sigmoid(self.mix_factor), "... -> ... 1"),
)
alpha = rearrange(alpha, self.rearrange_pattern)
# make shape compatible
# alpha = repeat(alpha, '1 -> s () ()', s = t * bs)
else:
raise NotImplementedError()
return alpha
def forward(
self,
x_spatial,
x_temporal,
image_only_indicator=None,
) -> torch.Tensor:
alpha = self.get_alpha(image_only_indicator)
x = (
alpha.to(x_spatial.dtype) * x_spatial
+ (1.0 - alpha).to(x_spatial.dtype) * x_temporal
)
return x
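A short, self-contained sketch of how this blender is meant to be used (the tensor shapes and the "fixed" strategy are illustrative, not taken from this commit): with merge_strategy="fixed" the stored mix_factor is used directly as alpha, so no image_only_indicator is required.
```
import torch
from comfy.ldm.modules.diffusionmodules.util import AlphaBlender  # class defined above

blender = AlphaBlender(alpha=0.7, merge_strategy="fixed")

x_spatial = torch.randn(2, 16, 320)   # e.g. (batch * frames, tokens, channels)
x_temporal = torch.randn(2, 16, 320)

# out = 0.7 * x_spatial + 0.3 * x_temporal; "learned_with_images" would additionally
# need an image_only_indicator tensor to pick between the two per sample.
out = blender(x_spatial, x_temporal)
assert out.shape == x_spatial.shape
```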
def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
if schedule == "linear":
betas = (
......
import functools
from typing import Callable, Iterable, Union
import torch
from einops import rearrange, repeat
import comfy.ops
from .diffusionmodules.model import (
AttnBlock,
Decoder,
ResnetBlock,
)
from .diffusionmodules.openaimodel import ResBlock, timestep_embedding
from .attention import BasicTransformerBlock
def partialclass(cls, *args, **kwargs):
class NewCls(cls):
__init__ = functools.partialmethod(cls.__init__, *args, **kwargs)
return NewCls
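A quick aside on this helper, since the classes below rely on it: functools.partialmethod pre-binds some constructor arguments, so partialclass returns a subclass that is later instantiated with only the remaining ones. A tiny illustration under an assumed toy class (Greeter is hypothetical, not part of this commit):
```
import functools

def partialclass(cls, *args, **kwargs):  # copied from above so the snippet runs standalone
    class NewCls(cls):
        __init__ = functools.partialmethod(cls.__init__, *args, **kwargs)
    return NewCls

class Greeter:
    def __init__(self, greeting, name):
        self.message = f"{greeting}, {name}!"

HelloGreeter = partialclass(Greeter, "Hello")  # "greeting" is pre-bound
print(HelloGreeter("world").message)           # -> Hello, world!
```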
class VideoResBlock(ResnetBlock):
def __init__(
self,
out_channels,
*args,
dropout=0.0,
video_kernel_size=3,
alpha=0.0,
merge_strategy="learned",
**kwargs,
):
super().__init__(out_channels=out_channels, dropout=dropout, *args, **kwargs)
if video_kernel_size is None:
video_kernel_size = [3, 1, 1]
self.time_stack = ResBlock(
channels=out_channels,
emb_channels=0,
dropout=dropout,
dims=3,
use_scale_shift_norm=False,
use_conv=False,
up=False,
down=False,
kernel_size=video_kernel_size,
use_checkpoint=False,
skip_t_emb=True,
)
self.merge_strategy = merge_strategy
if self.merge_strategy == "fixed":
self.register_buffer("mix_factor", torch.Tensor([alpha]))
elif self.merge_strategy == "learned":
self.register_parameter(
"mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
)
else:
raise ValueError(f"unknown merge strategy {self.merge_strategy}")
def get_alpha(self, bs):
if self.merge_strategy == "fixed":
return self.mix_factor
elif self.merge_strategy == "learned":
return torch.sigmoid(self.mix_factor)
else:
raise NotImplementedError()
def forward(self, x, temb, skip_video=False, timesteps=None):
b, c, h, w = x.shape
if timesteps is None:
timesteps = b
x = super().forward(x, temb)
if not skip_video:
x_mix = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
x = self.time_stack(x, temb)
alpha = self.get_alpha(bs=b // timesteps)
x = alpha * x + (1.0 - alpha) * x_mix
x = rearrange(x, "b c t h w -> (b t) c h w")
return x
class AE3DConv(torch.nn.Conv2d):
def __init__(self, in_channels, out_channels, video_kernel_size=3, *args, **kwargs):
super().__init__(in_channels, out_channels, *args, **kwargs)
if isinstance(video_kernel_size, Iterable):
padding = [int(k // 2) for k in video_kernel_size]
else:
padding = int(video_kernel_size // 2)
self.time_mix_conv = torch.nn.Conv3d(
in_channels=out_channels,
out_channels=out_channels,
kernel_size=video_kernel_size,
padding=padding,
)
def forward(self, input, timesteps=None, skip_video=False):
if timesteps is None:
timesteps = input.shape[0]
x = super().forward(input)
if skip_video:
return x
x = rearrange(x, "(b t) c h w -> b c t h w", t=timesteps)
x = self.time_mix_conv(x)
return rearrange(x, "b c t h w -> (b t) c h w")
class AttnVideoBlock(AttnBlock):
def __init__(
self, in_channels: int, alpha: float = 0, merge_strategy: str = "learned"
):
super().__init__(in_channels)
# no context, single headed, as in base class
self.time_mix_block = BasicTransformerBlock(
dim=in_channels,
n_heads=1,
d_head=in_channels,
checkpoint=False,
ff_in=True,
)
time_embed_dim = self.in_channels * 4
self.video_time_embed = torch.nn.Sequential(
comfy.ops.Linear(self.in_channels, time_embed_dim),
torch.nn.SiLU(),
comfy.ops.Linear(time_embed_dim, self.in_channels),
)
self.merge_strategy = merge_strategy
if self.merge_strategy == "fixed":
self.register_buffer("mix_factor", torch.Tensor([alpha]))
elif self.merge_strategy == "learned":
self.register_parameter(
"mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
)
else:
raise ValueError(f"unknown merge strategy {self.merge_strategy}")
def forward(self, x, timesteps=None, skip_time_block=False):
if skip_time_block:
return super().forward(x)
if timesteps is None:
timesteps = x.shape[0]
x_in = x
x = self.attention(x)
h, w = x.shape[2:]
x = rearrange(x, "b c h w -> b (h w) c")
x_mix = x
num_frames = torch.arange(timesteps, device=x.device)
num_frames = repeat(num_frames, "t -> b t", b=x.shape[0] // timesteps)
num_frames = rearrange(num_frames, "b t -> (b t)")
t_emb = timestep_embedding(num_frames, self.in_channels, repeat_only=False)
emb = self.video_time_embed(t_emb) # b, n_channels
emb = emb[:, None, :]
x_mix = x_mix + emb
alpha = self.get_alpha()
x_mix = self.time_mix_block(x_mix, timesteps=timesteps)
x = alpha * x + (1.0 - alpha) * x_mix # alpha merge
x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w)
x = self.proj_out(x)
return x_in + x
def get_alpha(
self,
):
if self.merge_strategy == "fixed":
return self.mix_factor
elif self.merge_strategy == "learned":
return torch.sigmoid(self.mix_factor)
else:
raise NotImplementedError(f"unknown merge strategy {self.merge_strategy}")
def make_time_attn(
in_channels,
attn_type="vanilla",
attn_kwargs=None,
alpha: float = 0,
merge_strategy: str = "learned",
):
return partialclass(
AttnVideoBlock, in_channels, alpha=alpha, merge_strategy=merge_strategy
)
class Conv2DWrapper(torch.nn.Conv2d):
def forward(self, input: torch.Tensor, **kwargs) -> torch.Tensor:
return super().forward(input)
class VideoDecoder(Decoder):
available_time_modes = ["all", "conv-only", "attn-only"]
def __init__(
self,
*args,
video_kernel_size: Union[int, list] = 3,
alpha: float = 0.0,
merge_strategy: str = "learned",
time_mode: str = "conv-only",
**kwargs,
):
self.video_kernel_size = video_kernel_size
self.alpha = alpha
self.merge_strategy = merge_strategy
self.time_mode = time_mode
assert (
self.time_mode in self.available_time_modes
), f"time_mode parameter has to be in {self.available_time_modes}"
if self.time_mode != "attn-only":
kwargs["conv_out_op"] = partialclass(AE3DConv, video_kernel_size=self.video_kernel_size)
if self.time_mode not in ["conv-only", "only-last-conv"]:
kwargs["attn_op"] = partialclass(make_time_attn, alpha=self.alpha, merge_strategy=self.merge_strategy)
if self.time_mode not in ["attn-only", "only-last-conv"]:
kwargs["resnet_op"] = partialclass(VideoResBlock, video_kernel_size=self.video_kernel_size, alpha=self.alpha, merge_strategy=self.merge_strategy)
super().__init__(*args, **kwargs)
def get_last_layer(self, skip_time_mix=False, **kwargs):
if self.time_mode == "attn-only":
raise NotImplementedError("TODO")
else:
return (
self.conv_out.time_mix_conv.weight
if not skip_time_mix
else self.conv_out.weight
)
...@@ -10,17 +10,22 @@ from . import utils
class ModelType(Enum):
EPS = 1
V_PREDICTION = 2
V_PREDICTION_EDM = 3
from comfy.model_sampling import EPS, V_PREDICTION, ModelSamplingDiscrete, ModelSamplingContinuousEDM
def model_sampling(model_config, model_type):
s = ModelSamplingDiscrete
if model_type == ModelType.EPS:
c = EPS
elif model_type == ModelType.V_PREDICTION:
c = V_PREDICTION
elif model_type == ModelType.V_PREDICTION_EDM:
c = V_PREDICTION
s = ModelSamplingContinuousEDM
class ModelSampling(s, c):
pass
...@@ -121,6 +126,7 @@ class BaseModel(torch.nn.Module):
if k.startswith(unet_prefix):
to_load[k[len(unet_prefix):]] = sd.pop(k)
to_load = self.model_config.process_unet_state_dict(to_load)
m, u = self.diffusion_model.load_state_dict(to_load, strict=False)
if len(m) > 0:
print("unet missing:", m)
...@@ -157,6 +163,17 @@ class BaseModel(torch.nn.Module):
def set_inpaint(self):
self.inpaint_model = True
def memory_required(self, input_shape):
if comfy.model_management.xformers_enabled() or comfy.model_management.pytorch_attention_flash_attention():
#TODO: this needs to be tweaked
area = input_shape[0] * input_shape[2] * input_shape[3]
return (area * comfy.model_management.dtype_size(self.get_dtype()) / 50) * (1024 * 1024)
else:
#TODO: this formula might be too aggressive since I tweaked the sub-quad and split algorithms to use less memory.
area = input_shape[0] * input_shape[2] * input_shape[3]
return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)
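To make the heuristic concrete, a rough worked example that re-implements the two branches above as a standalone function; the 64x64 latent (a 512x512 image), the 2-byte fp16 dtype size and the function name are assumptions for illustration only.
```
def estimate_memory_bytes(input_shape, dtype_size=2, flash_attention=True):
    # Mirrors BaseModel.memory_required above: the channel dimension is ignored.
    batch, _, height, width = input_shape
    area = batch * height * width
    if flash_attention:
        return (area * dtype_size / 50) * (1024 * 1024)
    return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)

mb = 1024 * 1024
print(estimate_memory_bytes((1, 4, 64, 64)) / mb)                         # ~163.8 MB
print(estimate_memory_bytes((1, 4, 64, 64), flash_attention=False) / mb)  # ~3754.7 MB
```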
def unclip_adm(unclip_conditioning, device, noise_augmentor, noise_augment_merge=0.0):
adm_inputs = []
weights = []
...@@ -251,3 +268,48 @@ class SDXL(BaseModel):
out.append(self.embedder(torch.Tensor([target_width])))
flat = torch.flatten(torch.cat(out)).unsqueeze(dim=0).repeat(clip_pooled.shape[0], 1)
return torch.cat((clip_pooled.to(flat.device), flat), dim=1)
class SVD_img2vid(BaseModel):
def __init__(self, model_config, model_type=ModelType.V_PREDICTION_EDM, device=None):
super().__init__(model_config, model_type, device=device)
self.embedder = Timestep(256)
def encode_adm(self, **kwargs):
fps_id = kwargs.get("fps", 6) - 1
motion_bucket_id = kwargs.get("motion_bucket_id", 127)
augmentation = kwargs.get("augmentation_level", 0)
out = []
out.append(self.embedder(torch.Tensor([fps_id])))
out.append(self.embedder(torch.Tensor([motion_bucket_id])))
out.append(self.embedder(torch.Tensor([augmentation])))
flat = torch.flatten(torch.cat(out)).unsqueeze(dim=0)
return flat
def extra_conds(self, **kwargs):
out = {}
adm = self.encode_adm(**kwargs)
if adm is not None:
out['y'] = comfy.conds.CONDRegular(adm)
latent_image = kwargs.get("concat_latent_image", None)
noise = kwargs.get("noise", None)
device = kwargs["device"]
if latent_image is None:
latent_image = torch.zeros_like(noise)
if latent_image.shape[1:] != noise.shape[1:]:
latent_image = utils.common_upscale(latent_image, noise.shape[-1], noise.shape[-2], "bilinear", "center")
latent_image = utils.repeat_to_batch_size(latent_image, noise.shape[0])
out['c_concat'] = comfy.conds.CONDNoiseShape(latent_image)
if "time_conditioning" in kwargs:
out["time_context"] = comfy.conds.CONDCrossAttn(kwargs["time_conditioning"])
out['image_only_indicator'] = comfy.conds.CONDConstant(torch.zeros((1,), device=device))
out['num_video_frames'] = comfy.conds.CONDConstant(noise.shape[0])
return out
...@@ -24,7 +24,8 @@ def calculate_transformer_depth(prefix, state_dict_keys, state_dict):
last_transformer_depth = count_blocks(state_dict_keys, transformer_prefix + '{}')
context_dim = state_dict['{}0.attn2.to_k.weight'.format(transformer_prefix)].shape[1]
use_linear_in_transformer = len(state_dict['{}1.proj_in.weight'.format(prefix)].shape) == 2
time_stack = '{}1.time_stack.0.attn1.to_q.weight'.format(prefix) in state_dict or '{}1.time_mix_blocks.0.attn1.to_q.weight'.format(prefix) in state_dict
return last_transformer_depth, context_dim, use_linear_in_transformer, time_stack
return None
def detect_unet_config(state_dict, key_prefix, dtype):
...@@ -57,6 +58,7 @@ def detect_unet_config(state_dict, key_prefix, dtype):
context_dim = None
use_linear_in_transformer = False
video_model = False
current_res = 1
count = 0
...@@ -99,6 +101,7 @@ def detect_unet_config(state_dict, key_prefix, dtype):
if context_dim is None:
context_dim = out[1]
use_linear_in_transformer = out[2]
video_model = out[3]
else:
transformer_depth.append(0)
...@@ -127,6 +130,19 @@ def detect_unet_config(state_dict, key_prefix, dtype):
unet_config["transformer_depth_middle"] = transformer_depth_middle
unet_config['use_linear_in_transformer'] = use_linear_in_transformer
unet_config["context_dim"] = context_dim
if video_model:
unet_config["extra_ff_mix_layer"] = True
unet_config["use_spatial_context"] = True
unet_config["merge_strategy"] = "learned_with_images"
unet_config["merge_factor"] = 0.0
unet_config["video_kernel_size"] = [3, 1, 1]
unet_config["use_temporal_resblock"] = True
unet_config["use_temporal_attention"] = True
else:
unet_config["use_temporal_resblock"] = False
unet_config["use_temporal_attention"] = False
return unet_config
def model_config_from_unet_config(unet_config):
...@@ -186,17 +202,24 @@ def convert_config(unet_config):
def unet_config_from_diffusers_unet(state_dict, dtype):
match = {}
transformer_depth = []
attn_res = 1
down_blocks = count_blocks(state_dict, "down_blocks.{}")
for i in range(down_blocks):
attn_blocks = count_blocks(state_dict, "down_blocks.{}.attentions.".format(i) + '{}')
for ab in range(attn_blocks):
transformer_count = count_blocks(state_dict, "down_blocks.{}.attentions.{}.transformer_blocks.".format(i, ab) + '{}')
transformer_depth.append(transformer_count)
if transformer_count > 0:
match["context_dim"] = state_dict["down_blocks.{}.attentions.{}.transformer_blocks.0.attn2.to_k.weight".format(i, ab)].shape[1]
attn_res *= 2
if attn_blocks == 0:
transformer_depth.append(0)
transformer_depth.append(0)
match["attention_resolutions"] = attention_resolutions match["transformer_depth"] = transformer_depth
match["model_channels"] = state_dict["conv_in.weight"].shape[0] match["model_channels"] = state_dict["conv_in.weight"].shape[0]
match["in_channels"] = state_dict["conv_in.weight"].shape[1] match["in_channels"] = state_dict["conv_in.weight"].shape[1]
...@@ -208,50 +231,65 @@ def unet_config_from_diffusers_unet(state_dict, dtype): ...@@ -208,50 +231,65 @@ def unet_config_from_diffusers_unet(state_dict, dtype):
SDXL = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 2, 2, 10, 10], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 10,
'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 0, 2, 2, 2, 10, 10, 10],
'use_temporal_attention': False, 'use_temporal_resblock': False}
SDXL_refiner = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
'num_classes': 'sequential', 'adm_in_channels': 2560, 'dtype': dtype, 'in_channels': 4, 'model_channels': 384,
'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [0, 0, 4, 4, 4, 4, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 4,
'use_linear_in_transformer': True, 'context_dim': 1280, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 0, 4, 4, 4, 4, 4, 4, 0, 0, 0],
'use_temporal_attention': False, 'use_temporal_resblock': False}
SD21 = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
'adm_in_channels': None, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2],
'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': True,
'context_dim': 1024, 'num_head_channels': 64, 'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
'use_temporal_attention': False, 'use_temporal_resblock': False}
SD21_uncliph = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
'num_classes': 'sequential', 'adm_in_channels': 2048, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1,
'use_linear_in_transformer': True, 'context_dim': 1024, 'num_head_channels': 64, 'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
'use_temporal_attention': False, 'use_temporal_resblock': False}
SD21_unclipl = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
'num_classes': 'sequential', 'adm_in_channels': 1536, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0], 'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1,
'use_linear_in_transformer': True, 'context_dim': 1024, 'num_head_channels': 64, 'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
'use_temporal_attention': False, 'use_temporal_resblock': False}
SD15 = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False, 'adm_in_channels': None,
'dtype': dtype, 'in_channels': 4, 'model_channels': 320, 'num_res_blocks': [2, 2, 2, 2], 'transformer_depth': [1, 1, 1, 1, 1, 1, 0, 0],
'channel_mult': [1, 2, 4, 4], 'transformer_depth_middle': 1, 'use_linear_in_transformer': False, 'context_dim': 768, 'num_heads': 8,
'transformer_depth_output': [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
'use_temporal_attention': False, 'use_temporal_resblock': False}
SDXL_mid_cnet = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 0, 0, 1, 1], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 1,
'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 0, 0, 0, 0, 1, 1, 1],
'use_temporal_attention': False, 'use_temporal_resblock': False}
SDXL_small_cnet = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 0, 0, 0, 0], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 0,
'use_linear_in_transformer': True, 'num_head_channels': 64, 'context_dim': 1, 'transformer_depth_output': [0, 0, 0, 0, 0, 0, 0, 0, 0],
'use_temporal_attention': False, 'use_temporal_resblock': False}
SDXL_diffusers_inpaint = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 9, 'model_channels': 320,
'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 2, 2, 10, 10], 'channel_mult': [1, 2, 4], 'transformer_depth_middle': 10,
'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64, 'transformer_depth_output': [0, 0, 0, 2, 2, 2, 10, 10, 10],
'use_temporal_attention': False, 'use_temporal_resblock': False}
SSD_1B = {'use_checkpoint': False, 'image_size': 32, 'out_channels': 4, 'use_spatial_transformer': True, 'legacy': False,
'num_classes': 'sequential', 'adm_in_channels': 2816, 'dtype': dtype, 'in_channels': 4, 'model_channels': 320,
'num_res_blocks': [2, 2, 2], 'transformer_depth': [0, 0, 2, 2, 4, 4], 'transformer_depth_output': [0, 0, 0, 1, 1, 2, 10, 4, 4],
'channel_mult': [1, 2, 4], 'transformer_depth_middle': -1, 'use_linear_in_transformer': True, 'context_dim': 2048, 'num_head_channels': 64,
'use_temporal_attention': False, 'use_temporal_resblock': False}
supported_models = [SDXL, SDXL_refiner, SD21, SD15, SD21_uncliph, SD21_unclipl, SDXL_mid_cnet, SDXL_small_cnet, SDXL_diffusers_inpaint, SSD_1B]
for unet_config in supported_models:
matches = True
......
...@@ -133,6 +133,10 @@ else:
import xformers
import xformers.ops
XFORMERS_IS_AVAILABLE = True
try:
XFORMERS_IS_AVAILABLE = xformers._has_cpp_library
except:
pass
try:
XFORMERS_VERSION = xformers.version.__version__
print("xformers version:", XFORMERS_VERSION)
...@@ -478,6 +482,21 @@ def text_encoder_device():
else:
return torch.device("cpu")
def text_encoder_dtype(device=None):
if args.fp8_e4m3fn_text_enc:
return torch.float8_e4m3fn
elif args.fp8_e5m2_text_enc:
return torch.float8_e5m2
elif args.fp16_text_enc:
return torch.float16
elif args.fp32_text_enc:
return torch.float32
if should_use_fp16(device, prioritize_performance=False):
return torch.float16
else:
return torch.float32
def vae_device():
return get_torch_device()
...@@ -579,27 +598,6 @@ def get_free_memory(dev=None, torch_free_too=False):
else:
return mem_free_total
def batch_area_memory(area):
if xformers_enabled() or pytorch_attention_flash_attention():
#TODO: these formulas are copied from maximum_batch_area below
return (area / 20) * (1024 * 1024)
else:
return (((area * 0.6) / 0.9) + 1024) * (1024 * 1024)
def maximum_batch_area():
global vram_state
if vram_state == VRAMState.NO_VRAM:
return 0
memory_free = get_free_memory() / (1024 * 1024)
if xformers_enabled() or pytorch_attention_flash_attention():
#TODO: this needs to be tweaked
area = 20 * memory_free
else:
#TODO: this formula is because AMD sucks and has memory management issues which might be fixed in the future
area = ((memory_free - 1024) * 0.9) / (0.6)
return int(max(area, 0))
def cpu_mode():
global cpu_state
return cpu_state == CPUState.CPU
......
...@@ -37,7 +37,7 @@ class ModelPatcher:
return size
def clone(self):
n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, self.current_device, weight_inplace_update=self.weight_inplace_update)
n.patches = {}
for k in self.patches:
n.patches[k] = self.patches[k][:]
...@@ -52,6 +52,9 @@ class ModelPatcher:
return True
return False
def memory_required(self, input_shape):
return self.model.memory_required(input_shape=input_shape)
def set_model_sampler_cfg_function(self, sampler_cfg_function):
if len(inspect.signature(sampler_cfg_function).parameters) == 3:
self.model_options["sampler_cfg_function"] = lambda args: sampler_cfg_function(args["cond"], args["uncond"], args["cond_scale"]) #Old way
...@@ -93,6 +96,12 @@ class ModelPatcher:
def set_model_attn2_output_patch(self, patch):
self.set_model_patch(patch, "attn2_output_patch")
def set_model_input_block_patch(self, patch):
self.set_model_patch(patch, "input_block_patch")
def set_model_input_block_patch_after_skip(self, patch):
self.set_model_patch(patch, "input_block_patch_after_skip")
def set_model_output_block_patch(self, patch):
self.set_model_patch(patch, "output_block_patch")
...
 import torch
 import numpy as np
 from comfy.ldm.modules.diffusionmodules.util import make_beta_schedule
+import math
 
 class EPS:
     def calculate_input(self, sigma, noise):
@@ -24,7 +24,7 @@ class ModelSamplingDiscrete(torch.nn.Module):
         super().__init__()
         beta_schedule = "linear"
         if model_config is not None:
-            beta_schedule = model_config.beta_schedule
+            beta_schedule = model_config.sampling_settings.get("beta_schedule", beta_schedule)
         self._register_schedule(given_betas=None, beta_schedule=beta_schedule, timesteps=1000, linear_start=0.00085, linear_end=0.012, cosine_s=8e-3)
         self.sigma_data = 1.0
@@ -65,16 +65,65 @@ class ModelSamplingDiscrete(torch.nn.Module):
     def timestep(self, sigma):
         log_sigma = sigma.log()
         dists = log_sigma.to(self.log_sigmas.device) - self.log_sigmas[:, None]
-        return dists.abs().argmin(dim=0).view(sigma.shape)
+        return dists.abs().argmin(dim=0).view(sigma.shape).to(sigma.device)
 
     def sigma(self, timestep):
-        t = torch.clamp(timestep.float(), min=0, max=(len(self.sigmas) - 1))
+        t = torch.clamp(timestep.float().to(self.log_sigmas.device), min=0, max=(len(self.sigmas) - 1))
         low_idx = t.floor().long()
         high_idx = t.ceil().long()
         w = t.frac()
         log_sigma = (1 - w) * self.log_sigmas[low_idx] + w * self.log_sigmas[high_idx]
-        return log_sigma.exp()
+        return log_sigma.exp().to(timestep.device)
 
     def percent_to_sigma(self, percent):
-        return self.sigma(torch.tensor(percent * 999.0))
+        if percent <= 0.0:
+            return 999999999.9
+        if percent >= 1.0:
+            return 0.0
+        percent = 1.0 - percent
+        return self.sigma(torch.tensor(percent * 999.0)).item()
+
+class ModelSamplingContinuousEDM(torch.nn.Module):
+    def __init__(self, model_config=None):
+        super().__init__()
+        self.sigma_data = 1.0
+
+        if model_config is not None:
+            sampling_settings = model_config.sampling_settings
+        else:
+            sampling_settings = {}
+
+        sigma_min = sampling_settings.get("sigma_min", 0.002)
+        sigma_max = sampling_settings.get("sigma_max", 120.0)
+        self.set_sigma_range(sigma_min, sigma_max)
+
+    def set_sigma_range(self, sigma_min, sigma_max):
+        sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), 1000).exp()
+
+        self.register_buffer('sigmas', sigmas) #for compatibility with some schedulers
+        self.register_buffer('log_sigmas', sigmas.log())
+
+    @property
+    def sigma_min(self):
+        return self.sigmas[0]
+
+    @property
+    def sigma_max(self):
+        return self.sigmas[-1]
+
+    def timestep(self, sigma):
+        return 0.25 * sigma.log()
+
+    def sigma(self, timestep):
+        return (timestep / 0.25).exp()
+
+    def percent_to_sigma(self, percent):
+        if percent <= 0.0:
+            return 999999999.9
+        if percent >= 1.0:
+            return 0.0
+
+        percent = 1.0 - percent
+
+        log_sigma_min = math.log(self.sigma_min)
+        return math.exp((math.log(self.sigma_max) - log_sigma_min) * percent + log_sigma_min)
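`percent_to_sigma` now clamps at the 0% and 100% ends, and the new `ModelSamplingContinuousEDM` maps a progress fraction to a sigma by interpolating in log-sigma space. A small worked check of that formula with the default range (`sigma_min=0.002`, `sigma_max=120.0`), written as plain math so it runs without torch:

```python
# Worked check of ModelSamplingContinuousEDM.percent_to_sigma with the defaults.
import math

sigma_min, sigma_max = 0.002, 120.0

def percent_to_sigma(percent):
    if percent <= 0.0:
        return 999999999.9        # "0% done" is treated as effectively infinite sigma
    if percent >= 1.0:
        return 0.0
    percent = 1.0 - percent
    log_min = math.log(sigma_min)
    return math.exp((math.log(sigma_max) - log_min) * percent + log_min)

print(percent_to_sigma(0.0))   # 999999999.9
print(percent_to_sigma(0.5))   # ~0.49, the geometric midpoint of [0.002, 120]
print(percent_to_sigma(1.0))   # 0.0
```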
@@ -83,7 +83,7 @@ def prepare_sampling(model, noise_shape, positive, negative, noise_mask):
     real_model = None
     models, inference_memory = get_additional_models(positive, negative, model.model_dtype())
-    comfy.model_management.load_models_gpu([model] + models, comfy.model_management.batch_area_memory(noise_shape[0] * noise_shape[2] * noise_shape[3]) + inference_memory)
+    comfy.model_management.load_models_gpu([model] + models, model.memory_required([noise_shape[0] * 2] + list(noise_shape[1:])) + inference_memory)
     real_model = model.model
 
     return real_model, positive, negative, noise_mask, models
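The fixed area heuristic is replaced by asking the model itself, with the batch dimension doubled because cond and uncond are evaluated together. A toy illustration of how the input shape is built (the shape values are just an example):

```python
# Example shape handling for the new memory estimate (illustration only).
noise_shape = (1, 4, 64, 64)                                # (batch, channels, h, w) latent
input_shape = [noise_shape[0] * 2] + list(noise_shape[1:])  # cond + uncond run together
print(input_shape)                                          # [2, 4, 64, 64]
# total to reserve: model.memory_required(input_shape) + inference_memory
```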
...
@@ -11,7 +11,7 @@ import comfy.conds
 
 #The main sampling function shared by all the samplers
 #Returns denoised
-def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, model_options={}, seed=None):
+def sampling_function(model, x, timestep, uncond, cond, cond_scale, model_options={}, seed=None):
     def get_area_and_mult(conds, x_in, timestep_in):
         area = (x_in.shape[2], x_in.shape[3], 0, 0)
         strength = 1.0
@@ -134,7 +134,7 @@ def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, mod
         return out
 
-    def calc_cond_uncond_batch(model_function, cond, uncond, x_in, timestep, max_total_area, model_options):
+    def calc_cond_uncond_batch(model, cond, uncond, x_in, timestep, model_options):
         out_cond = torch.zeros_like(x_in)
         out_count = torch.ones_like(x_in) * 1e-37
@@ -170,9 +170,11 @@ def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, mod
             to_batch_temp.reverse()
             to_batch = to_batch_temp[:1]
 
+            free_memory = model_management.get_free_memory(x_in.device)
             for i in range(1, len(to_batch_temp) + 1):
                 batch_amount = to_batch_temp[:len(to_batch_temp)//i]
-                if (len(batch_amount) * first_shape[0] * first_shape[2] * first_shape[3] < max_total_area):
+                input_shape = [len(batch_amount) * first_shape[0]] + list(first_shape)[1:]
+                if model.memory_required(input_shape) < free_memory:
                     to_batch = batch_amount
                     break
@@ -218,12 +220,14 @@ def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, mod
             transformer_options["patches"] = patches
 
             transformer_options["cond_or_uncond"] = cond_or_uncond[:]
+            transformer_options["sigmas"] = timestep
+
             c['transformer_options'] = transformer_options
 
             if 'model_function_wrapper' in model_options:
-                output = model_options['model_function_wrapper'](model_function, {"input": input_x, "timestep": timestep_, "c": c, "cond_or_uncond": cond_or_uncond}).chunk(batch_chunks)
+                output = model_options['model_function_wrapper'](model.apply_model, {"input": input_x, "timestep": timestep_, "c": c, "cond_or_uncond": cond_or_uncond}).chunk(batch_chunks)
             else:
-                output = model_function(input_x, timestep_, **c).chunk(batch_chunks)
+                output = model.apply_model(input_x, timestep_, **c).chunk(batch_chunks)
             del input_x
 
             for o in range(batch_chunks):
@@ -242,11 +246,10 @@ def sampling_function(model_function, x, timestep, uncond, cond, cond_scale, mod
         return out_cond, out_uncond
 
-    max_total_area = model_management.maximum_batch_area()
     if math.isclose(cond_scale, 1.0):
         uncond = None
 
-    cond, uncond = calc_cond_uncond_batch(model_function, cond, uncond, x, timestep, max_total_area, model_options)
+    cond, uncond = calc_cond_uncond_batch(model, cond, uncond, x, timestep, model_options)
     if "sampler_cfg_function" in model_options:
         args = {"cond": x - cond, "uncond": x - uncond, "cond_scale": cond_scale, "timestep": timestep, "input": x, "sigma": timestep}
         return x - model_options["sampler_cfg_function"](args)
@@ -258,7 +261,7 @@ class CFGNoisePredictor(torch.nn.Module):
         super().__init__()
         self.inner_model = model
     def apply_model(self, x, timestep, cond, uncond, cond_scale, model_options={}, seed=None):
-        out = sampling_function(self.inner_model.apply_model, x, timestep, uncond, cond, cond_scale, model_options=model_options, seed=seed)
+        out = sampling_function(self.inner_model, x, timestep, uncond, cond, cond_scale, model_options=model_options, seed=seed)
         return out
     def forward(self, *args, **kwargs):
         return self.apply_model(*args, **kwargs)
@@ -511,52 +514,69 @@ class Sampler:
 
 class UNIPC(Sampler):
     def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
-        return uni_pc.sample_unipc(model_wrap, noise, latent_image, sigmas, sampling_function=sampling_function, max_denoise=self.max_denoise(model_wrap, sigmas), extra_args=extra_args, noise_mask=denoise_mask, callback=callback, disable=disable_pbar)
+        return uni_pc.sample_unipc(model_wrap, noise, latent_image, sigmas, max_denoise=self.max_denoise(model_wrap, sigmas), extra_args=extra_args, noise_mask=denoise_mask, callback=callback, disable=disable_pbar)
 
 class UNIPCBH2(Sampler):
     def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
-        return uni_pc.sample_unipc(model_wrap, noise, latent_image, sigmas, sampling_function=sampling_function, max_denoise=self.max_denoise(model_wrap, sigmas), extra_args=extra_args, noise_mask=denoise_mask, callback=callback, variant='bh2', disable=disable_pbar)
+        return uni_pc.sample_unipc(model_wrap, noise, latent_image, sigmas, max_denoise=self.max_denoise(model_wrap, sigmas), extra_args=extra_args, noise_mask=denoise_mask, callback=callback, variant='bh2', disable=disable_pbar)
 
-KSAMPLER_NAMES = ["euler", "euler_ancestral", "heun", "dpm_2", "dpm_2_ancestral",
+KSAMPLER_NAMES = ["euler", "euler_ancestral", "heun", "heunpp2","dpm_2", "dpm_2_ancestral",
                   "lms", "dpm_fast", "dpm_adaptive", "dpmpp_2s_ancestral", "dpmpp_sde", "dpmpp_sde_gpu",
                   "dpmpp_2m", "dpmpp_2m_sde", "dpmpp_2m_sde_gpu", "dpmpp_3m_sde", "dpmpp_3m_sde_gpu", "ddpm", "lcm"]
 
-def ksampler(sampler_name, extra_options={}, inpaint_options={}):
-    class KSAMPLER(Sampler):
-        def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
-            extra_args["denoise_mask"] = denoise_mask
-            model_k = KSamplerX0Inpaint(model_wrap)
-            model_k.latent_image = latent_image
-            if inpaint_options.get("random", False): #TODO: Should this be the default?
-                generator = torch.manual_seed(extra_args.get("seed", 41) + 1)
-                model_k.noise = torch.randn(noise.shape, generator=generator, device="cpu").to(noise.dtype).to(noise.device)
-            else:
-                model_k.noise = noise
-
-            if self.max_denoise(model_wrap, sigmas):
-                noise = noise * torch.sqrt(1.0 + sigmas[0] ** 2.0)
-            else:
-                noise = noise * sigmas[0]
-
-            k_callback = None
-            total_steps = len(sigmas) - 1
-            if callback is not None:
-                k_callback = lambda x: callback(x["i"], x["denoised"], x["x"], total_steps)
-
-            sigma_min = sigmas[-1]
-            if sigma_min == 0:
-                sigma_min = sigmas[-2]
-
-            if latent_image is not None:
-                noise += latent_image
-            if sampler_name == "dpm_fast":
-                samples = k_diffusion_sampling.sample_dpm_fast(model_k, noise, sigma_min, sigmas[0], total_steps, extra_args=extra_args, callback=k_callback, disable=disable_pbar)
-            elif sampler_name == "dpm_adaptive":
-                samples = k_diffusion_sampling.sample_dpm_adaptive(model_k, noise, sigma_min, sigmas[0], extra_args=extra_args, callback=k_callback, disable=disable_pbar)
-            else:
-                samples = getattr(k_diffusion_sampling, "sample_{}".format(sampler_name))(model_k, noise, sigmas, extra_args=extra_args, callback=k_callback, disable=disable_pbar, **extra_options)
-            return samples
-    return KSAMPLER
+class KSAMPLER(Sampler):
+    def __init__(self, sampler_function, extra_options={}, inpaint_options={}):
+        self.sampler_function = sampler_function
+        self.extra_options = extra_options
+        self.inpaint_options = inpaint_options
+
+    def sample(self, model_wrap, sigmas, extra_args, callback, noise, latent_image=None, denoise_mask=None, disable_pbar=False):
+        extra_args["denoise_mask"] = denoise_mask
+        model_k = KSamplerX0Inpaint(model_wrap)
+        model_k.latent_image = latent_image
+        if self.inpaint_options.get("random", False): #TODO: Should this be the default?
+            generator = torch.manual_seed(extra_args.get("seed", 41) + 1)
+            model_k.noise = torch.randn(noise.shape, generator=generator, device="cpu").to(noise.dtype).to(noise.device)
+        else:
+            model_k.noise = noise
+
+        if self.max_denoise(model_wrap, sigmas):
+            noise = noise * torch.sqrt(1.0 + sigmas[0] ** 2.0)
+        else:
+            noise = noise * sigmas[0]
+
+        k_callback = None
+        total_steps = len(sigmas) - 1
+        if callback is not None:
+            k_callback = lambda x: callback(x["i"], x["denoised"], x["x"], total_steps)
+
+        if latent_image is not None:
+            noise += latent_image
+
+        samples = self.sampler_function(model_k, noise, sigmas, extra_args=extra_args, callback=k_callback, disable=disable_pbar, **self.extra_options)
+        return samples
+
+def ksampler(sampler_name, extra_options={}, inpaint_options={}):
+    if sampler_name == "dpm_fast":
+        def dpm_fast_function(model, noise, sigmas, extra_args, callback, disable):
+            sigma_min = sigmas[-1]
+            if sigma_min == 0:
+                sigma_min = sigmas[-2]
+            total_steps = len(sigmas) - 1
+            return k_diffusion_sampling.sample_dpm_fast(model, noise, sigma_min, sigmas[0], total_steps, extra_args=extra_args, callback=callback, disable=disable)
+        sampler_function = dpm_fast_function
+    elif sampler_name == "dpm_adaptive":
+        def dpm_adaptive_function(model, noise, sigmas, extra_args, callback, disable):
+            sigma_min = sigmas[-1]
+            if sigma_min == 0:
+                sigma_min = sigmas[-2]
+            return k_diffusion_sampling.sample_dpm_adaptive(model, noise, sigma_min, sigmas[0], extra_args=extra_args, callback=callback, disable=disable)
        sampler_function = dpm_adaptive_function
+    else:
+        sampler_function = getattr(k_diffusion_sampling, "sample_{}".format(sampler_name))
+
+    return KSAMPLER(sampler_function, extra_options, inpaint_options)
 
 def wrap_model(model):
     model_denoise = CFGNoisePredictor(model)
@@ -617,11 +637,11 @@ def calculate_sigmas_scheduler(model, scheduler_name, steps):
         print("error invalid scheduler", self.scheduler)
     return sigmas
 
-def sampler_class(name):
+def sampler_object(name):
     if name == "uni_pc":
-        sampler = UNIPC
+        sampler = UNIPC()
     elif name == "uni_pc_bh2":
-        sampler = UNIPCBH2
+        sampler = UNIPCBH2()
     elif name == "ddim":
         sampler = ksampler("euler", inpaint_options={"random": True})
     else:
@@ -686,6 +706,6 @@ class KSampler:
         else:
            return torch.zeros_like(noise)
 
-        sampler = sampler_class(self.sampler)
+        sampler = sampler_object(self.sampler)
 
-        return sample(self.model, noise, positive, negative, cfg, self.device, sampler(), sigmas, self.model_options, latent_image=latent_image, denoise_mask=denoise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed)
+        return sample(self.model, noise, positive, negative, cfg, self.device, sampler, sigmas, self.model_options, latent_image=latent_image, denoise_mask=denoise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed)
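The net effect of this refactor: `ksampler()` now returns a ready-to-use `KSAMPLER` instance that wraps a plain sampling function, and `sampler_object()` hands back instances rather than classes. A hedged sketch of how a custom sampler function could be plugged in; it assumes the comfy package is importable and that the wrapped model follows the k-diffusion denoiser convention `model(x, sigma, **extra_args) -> denoised`, neither of which is spelled out in this diff:

```python
# Hedged sketch: wrapping a custom k-diffusion-style sampler function.
import torch
import comfy.samplers

def naive_sampler(model, noise, sigmas, extra_args=None, callback=None, disable=None):
    # toy loop: repeatedly replace x with the model's denoised estimate
    # (assumed denoiser convention: model(x, sigma, **extra_args) -> denoised)
    extra_args = extra_args or {}
    x = noise
    for i in range(len(sigmas) - 1):
        x = model(x, sigmas[i] * torch.ones((x.shape[0],), device=x.device), **extra_args)
        if callback is not None:
            callback({"i": i, "denoised": x, "x": x})
    return x

custom = comfy.samplers.KSAMPLER(naive_sampler)       # usable wherever a sampler object is expected
builtin = comfy.samplers.sampler_object("dpmpp_2m")   # an instance, no longer a class to instantiate
```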
@@ -23,6 +23,7 @@ import comfy.model_patcher
 import comfy.lora
 import comfy.t2i_adapter.adapter
 import comfy.supported_models_base
+import comfy.taesd.taesd
 
 def load_model_weights(model, sd):
     m, u = model.load_state_dict(sd, strict=False)
@@ -95,10 +96,7 @@ class CLIP:
         load_device = model_management.text_encoder_device()
         offload_device = model_management.text_encoder_offload_device()
         params['device'] = offload_device
-        if model_management.should_use_fp16(load_device, prioritize_performance=False):
-            params['dtype'] = torch.float16
-        else:
-            params['dtype'] = torch.float32
+        params['dtype'] = model_management.text_encoder_dtype(load_device)
 
         self.cond_stage_model = clip(**(params))
@@ -157,10 +155,24 @@ class VAE:
         if 'decoder.up_blocks.0.resnets.0.norm1.weight' in sd.keys(): #diffusers format
             sd = diffusers_convert.convert_vae_state_dict(sd)
 
+        self.memory_used_encode = lambda shape, dtype: (1767 * shape[2] * shape[3]) * model_management.dtype_size(dtype) #These are for AutoencoderKL and need tweaking (should be lower)
+        self.memory_used_decode = lambda shape, dtype: (2178 * shape[2] * shape[3] * 64) * model_management.dtype_size(dtype)
+
         if config is None:
-            #default SD1.x/SD2.x VAE parameters
-            ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
-            self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=4)
+            if "decoder.mid.block_1.mix_factor" in sd:
+                encoder_config = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
+                decoder_config = encoder_config.copy()
+                decoder_config["video_kernel_size"] = [3, 1, 1]
+                decoder_config["alpha"] = 0.0
+                self.first_stage_model = AutoencodingEngine(regularizer_config={'target': "comfy.ldm.models.autoencoder.DiagonalGaussianRegularizer"},
+                                                            encoder_config={'target': "comfy.ldm.modules.diffusionmodules.model.Encoder", 'params': encoder_config},
+                                                            decoder_config={'target': "comfy.ldm.modules.temporal_ae.VideoDecoder", 'params': decoder_config})
+            elif "taesd_decoder.1.weight" in sd:
+                self.first_stage_model = comfy.taesd.taesd.TAESD()
+            else:
+                #default SD1.x/SD2.x VAE parameters
+                ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
+                self.first_stage_model = AutoencoderKL(ddconfig=ddconfig, embed_dim=4)
         else:
             self.first_stage_model = AutoencoderKL(**(config['params']))
         self.first_stage_model = self.first_stage_model.eval()
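VAE construction now keys off the state dict itself: a `decoder.mid.block_1.mix_factor` key selects the video (temporal) decoder, a `taesd_decoder.1.weight` key selects TAESD, and anything else falls back to the standard AutoencoderKL. A tiny self-contained sketch of that dispatch, returning labels instead of building the real model classes:

```python
# Sketch of the key-based dispatch added above (labels only, no model construction).
def detect_vae_kind(sd):
    if "decoder.mid.block_1.mix_factor" in sd:
        return "video VAE (temporal VideoDecoder)"
    if "taesd_decoder.1.weight" in sd:
        return "TAESD"
    return "standard AutoencoderKL (SD1.x/SD2.x)"

print(detect_vae_kind({"decoder.mid.block_1.mix_factor": 0.0}))  # video VAE (temporal VideoDecoder)
print(detect_vae_kind({}))                                       # standard AutoencoderKL (SD1.x/SD2.x)
```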
@@ -175,10 +187,12 @@ class VAE:
         if device is None:
             device = model_management.vae_device()
         self.device = device
-        self.offload_device = model_management.vae_offload_device()
+        offload_device = model_management.vae_offload_device()
         self.vae_dtype = model_management.vae_dtype()
         self.first_stage_model.to(self.vae_dtype)
+        self.patcher = comfy.model_patcher.ModelPatcher(self.first_stage_model, load_device=self.device, offload_device=offload_device)
 
     def decode_tiled_(self, samples, tile_x=64, tile_y=64, overlap = 16):
         steps = samples.shape[0] * comfy.utils.get_tiled_scale_steps(samples.shape[3], samples.shape[2], tile_x, tile_y, overlap)
         steps += samples.shape[0] * comfy.utils.get_tiled_scale_steps(samples.shape[3], samples.shape[2], tile_x // 2, tile_y * 2, overlap)
@@ -207,10 +221,9 @@ class VAE:
         return samples
 
     def decode(self, samples_in):
-        self.first_stage_model = self.first_stage_model.to(self.device)
         try:
-            memory_used = (2562 * samples_in.shape[2] * samples_in.shape[3] * 64) * 1.7
-            model_management.free_memory(memory_used, self.device)
+            memory_used = self.memory_used_decode(samples_in.shape, self.vae_dtype)
+            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
             free_memory = model_management.get_free_memory(self.device)
             batch_number = int(free_memory / memory_used)
             batch_number = max(1, batch_number)
@@ -223,22 +236,19 @@ class VAE:
             print("Warning: Ran out of memory when regular VAE decoding, retrying with tiled VAE decoding.")
             pixel_samples = self.decode_tiled_(samples_in)
 
-        self.first_stage_model = self.first_stage_model.to(self.offload_device)
         pixel_samples = pixel_samples.cpu().movedim(1,-1)
         return pixel_samples
 
     def decode_tiled(self, samples, tile_x=64, tile_y=64, overlap = 16):
-        self.first_stage_model = self.first_stage_model.to(self.device)
+        model_management.load_model_gpu(self.patcher)
         output = self.decode_tiled_(samples, tile_x, tile_y, overlap)
-        self.first_stage_model = self.first_stage_model.to(self.offload_device)
         return output.movedim(1,-1)
 
     def encode(self, pixel_samples):
-        self.first_stage_model = self.first_stage_model.to(self.device)
         pixel_samples = pixel_samples.movedim(-1,1)
         try:
-            memory_used = (2078 * pixel_samples.shape[2] * pixel_samples.shape[3]) * 1.7 #NOTE: this constant along with the one in the decode above are estimated from the mem usage for the VAE and could change.
-            model_management.free_memory(memory_used, self.device)
+            memory_used = self.memory_used_encode(pixel_samples.shape, self.vae_dtype)
+            model_management.load_models_gpu([self.patcher], memory_required=memory_used)
             free_memory = model_management.get_free_memory(self.device)
             batch_number = int(free_memory / memory_used)
             batch_number = max(1, batch_number)
@@ -251,14 +261,12 @@ class VAE:
             print("Warning: Ran out of memory when regular VAE encoding, retrying with tiled VAE encoding.")
             samples = self.encode_tiled_(pixel_samples)
 
-        self.first_stage_model = self.first_stage_model.to(self.offload_device)
         return samples
 
     def encode_tiled(self, pixel_samples, tile_x=512, tile_y=512, overlap = 64):
-        self.first_stage_model = self.first_stage_model.to(self.device)
+        model_management.load_model_gpu(self.patcher)
         pixel_samples = pixel_samples.movedim(-1,1)
         samples = self.encode_tiled_(pixel_samples, tile_x=tile_x, tile_y=tile_y, overlap=overlap)
-        self.first_stage_model = self.first_stage_model.to(self.offload_device)
         return samples
 
     def get_sd(self):
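The hard-coded decode/encode constants are replaced by the `memory_used_*` lambdas defined in `__init__`, and the VAE weights are now moved by the model-management loader through `self.patcher` instead of manual `.to()` calls. A rough worked example of the decode estimate for a 1024x1024 image (128x128 latent) in fp16:

```python
# Worked example of memory_used_decode(shape, dtype) from the diff:
# (2178 * latent_h * latent_w * 64) * dtype_size
latent_h = latent_w = 128          # a 1024x1024 image downscales by 8 to a 128x128 latent
dtype_size = 2                     # bytes per element in fp16
memory_used = 2178 * latent_h * latent_w * 64 * dtype_size
print(memory_used / (1024 ** 3))   # ~4.25 GiB requested before decoding a single image
# batch_number then becomes max(1, int(free_memory / memory_used))
```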
@@ -444,6 +452,7 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
 
     if output_vae:
         vae_sd = comfy.utils.state_dict_prefix_replace(sd, {"first_stage_model.": ""}, filter_keys=True)
+        vae_sd = model_config.process_vae_state_dict(vae_sd)
         vae = VAE(sd=vae_sd)
 
     if output_clip:
@@ -468,20 +477,18 @@ def load_checkpoint_guess_config(ckpt_path, output_vae=True, output_clip=True, o
     return (model_patcher, clip, vae, clipvision)
 
-def load_unet(unet_path): #load unet in diffusers format
-    sd = comfy.utils.load_torch_file(unet_path)
+def load_unet_state_dict(sd): #load unet in diffusers format
     parameters = comfy.utils.calculate_parameters(sd)
     unet_dtype = model_management.unet_dtype(model_params=parameters)
     if "input_blocks.0.0.weight" in sd: #ldm
         model_config = model_detection.model_config_from_unet(sd, "", unet_dtype)
         if model_config is None:
-            raise RuntimeError("ERROR: Could not detect model type of: {}".format(unet_path))
+            return None
         new_sd = sd
     else: #diffusers
         model_config = model_detection.model_config_from_diffusers_unet(sd, unet_dtype)
         if model_config is None:
-            print("ERROR UNSUPPORTED UNET", unet_path)
             return None
 
         diffusers_keys = comfy.utils.unet_to_diffusers(model_config.unet_config)
@@ -501,6 +508,14 @@ def load_unet(unet_path): #load unet in diffusers format
         print("left over keys in unet:", left_over)
 
     return comfy.model_patcher.ModelPatcher(model, load_device=model_management.get_torch_device(), offload_device=offload_device)
 
+def load_unet(unet_path):
+    sd = comfy.utils.load_torch_file(unet_path)
+    model = load_unet_state_dict(sd)
+    if model is None:
+        print("ERROR UNSUPPORTED UNET", unet_path)
+        raise RuntimeError("ERROR: Could not detect model type of: {}".format(unet_path))
+    return model
+
 def save_checkpoint(output_path, model, clip, vae, metadata=None):
     model_management.load_models_gpu([model, clip.load_model()])
     sd = model.model.state_dict_for_saving(clip.get_sd(), vae.get_sd())
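Splitting `load_unet()` into `load_unet_state_dict()` plus a thin file-loading wrapper lets callers pass an already-loaded state dict; the state-dict variant returns `None` on unrecognized layouts instead of raising. A hedged usage sketch, assuming the comfy package is importable and using a placeholder checkpoint path:

```python
# Hedged usage sketch of the new split (the path below is a placeholder).
import comfy.utils
import comfy.sd

sd = comfy.utils.load_torch_file("models/unet/some_unet.safetensors")
patcher = comfy.sd.load_unet_state_dict(sd)   # None if the layout is not recognized
if patcher is None:
    raise RuntimeError("unsupported UNet state dict")
```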
...
@@ -173,9 +173,9 @@ class SDClipModel(torch.nn.Module, ClipTokenWeightEncoder):
         if getattr(self.transformer, self.inner_name).final_layer_norm.weight.dtype != torch.float32:
             precision_scope = torch.autocast
         else:
-            precision_scope = lambda a, b: contextlib.nullcontext(a)
+            precision_scope = lambda a, dtype: contextlib.nullcontext(a)
 
-        with precision_scope(model_management.get_autocast_device(device), torch.float32):
+        with precision_scope(model_management.get_autocast_device(device), dtype=torch.float32):
             attention_mask = None
             if self.enable_attention_masks:
                 attention_mask = torch.zeros_like(tokens)
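The lambda's second parameter is renamed so the no-op branch accepts the same `dtype=` keyword that `torch.autocast` takes; with the old `lambda a, b:`, the call site's `dtype=torch.float32` would raise a TypeError whenever autocast is skipped. A minimal reproduction of the pattern:

```python
# Minimal illustration: both branches must accept dtype= as a keyword.
import contextlib
import torch

use_autocast = False
if use_autocast:
    precision_scope = torch.autocast
else:
    precision_scope = lambda a, dtype: contextlib.nullcontext(a)   # old `lambda a, b:` broke dtype=...

with precision_scope("cpu", dtype=torch.float32):
    pass   # the no-op context manager now runs without a TypeError
```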
...