Initial commit

873911fa · fengyf1 · 873911fa · 873911fa · 873911fa · 873911fa
Commit 873911fa authored Feb 04, 2026 by fengyf1
20 changed files
--- a/comfy/comfy_types/examples/input_options.png
+++ b/comfy/comfy_types/examples/input_options.png
--- a/comfy/comfy_types/examples/input_types.png
+++ b/comfy/comfy_types/examples/input_types.png
--- a/comfy/comfy_types/examples/required_hint.png
+++ b/comfy/comfy_types/examples/required_hint.png
--- a/comfy/comfy_types/node_typing.py
+++ b/comfy/comfy_types/node_typing.py
+"""Comfy-specific type hinting"""
+
+from __future__ import annotations
+from typing import Literal, TypedDict, Optional
+from typing_extensions import NotRequired
+from abc import ABC, abstractmethod
+from enum import Enum
+
+
+class StrEnum(str, Enum):
+    """String-valued enum base; stands in for ``enum.StrEnum``, which only exists from Python 3.11."""
+
+    def __str__(self) -> str:
+        # Render a member as its underlying string value.
+        member_value = self.value
+        return member_value
+
+
+class IO(StrEnum):
+    """Data types accepted by node inputs and produced by node outputs.
+
+    Besides the plain type names, the ``!=`` operator understands the wildcard ``"*"`` (`ANY`)
+    and comma-separated multi-type strings such as ``"FLOAT,INT"``.
+    """
+
+    STRING = "STRING"
+    IMAGE = "IMAGE"
+    MASK = "MASK"
+    LATENT = "LATENT"
+    BOOLEAN = "BOOLEAN"
+    INT = "INT"
+    FLOAT = "FLOAT"
+    COMBO = "COMBO"
+    CONDITIONING = "CONDITIONING"
+    SAMPLER = "SAMPLER"
+    SIGMAS = "SIGMAS"
+    GUIDER = "GUIDER"
+    NOISE = "NOISE"
+    CLIP = "CLIP"
+    CONTROL_NET = "CONTROL_NET"
+    VAE = "VAE"
+    MODEL = "MODEL"
+    LORA_MODEL = "LORA_MODEL"
+    LOSS_MAP = "LOSS_MAP"
+    CLIP_VISION = "CLIP_VISION"
+    CLIP_VISION_OUTPUT = "CLIP_VISION_OUTPUT"
+    STYLE_MODEL = "STYLE_MODEL"
+    GLIGEN = "GLIGEN"
+    UPSCALE_MODEL = "UPSCALE_MODEL"
+    AUDIO = "AUDIO"
+    WEBCAM = "WEBCAM"
+    POINT = "POINT"
+    FACE_ANALYSIS = "FACE_ANALYSIS"
+    BBOX = "BBOX"
+    SEGS = "SEGS"
+    VIDEO = "VIDEO"
+
+    ANY = "*"
+    """Always matches any type, but at a price.
+
+    Causes some functionality issues (e.g. reroutes, link types), and should be avoided whenever possible.
+    """
+    NUMBER = "FLOAT,INT"
+    """A float or an int - could be either"""
+    PRIMITIVE = "STRING,FLOAT,INT,BOOLEAN"
+    """Could be any of: string, float, int, or bool"""
+
+    def __ne__(self, value: object) -> bool:
+        """Return True only when the two type strings cannot match each other.
+
+        The wildcard on either side never counts as different. Non-string values always
+        count as different. Otherwise both sides are split on commas, and they are
+        considered matching when one set of names is a subset of the other.
+        """
+        wildcard = "*"
+        if self == wildcard or value == wildcard:
+            return False
+        if not isinstance(value, str):
+            return True
+        own_types = frozenset(self.split(","))
+        other_types = frozenset(value.split(","))
+        compatible = other_types <= own_types or own_types <= other_types
+        return not compatible
+
+
+class RemoteInputOptions(TypedDict):
+    """Configuration for a remote input (the value type of the ``remote`` input option)."""
+
+    # None of the keys below is wrapped in ``NotRequired``.
+    route: str
+    """The route to the remote source."""
+    refresh_button: bool
+    """Specifies whether to show a refresh button in the UI below the widget."""
+    control_after_refresh: Literal["first", "last"]
+    """Specifies the control after the refresh button is clicked. If "first", the first item will be automatically selected, and so on."""
+    timeout: int
+    """The maximum amount of time to wait for a response from the remote source in milliseconds."""
+    max_retries: int
+    """The maximum number of retries before aborting the request."""
+    refresh: int
+    """The TTL of the remote input's value in milliseconds. Specifies the interval at which the remote input's value is refreshed."""
+
+
+class MultiSelectOptions(TypedDict):
+    """Configuration for a multi-select widget (the value type of the ``multi_select`` input option)."""
+
+    # Both keys are optional (``NotRequired``).
+    placeholder: NotRequired[str]
+    """The placeholder text to display in the multi-select widget when no items are selected."""
+    chip: NotRequired[bool]
+    """Specifies whether to use chips instead of comma separated values for the multi-select widget."""
+
+
+class InputTypeOptions(TypedDict):
+    """Provides type hinting for the return type of the INPUT_TYPES node function.
+
+    Due to IDE limitations with unions, for now all options are available for all types (e.g. `label_on` is hinted even when the type is not `IO.BOOLEAN`).
+
+    Every key is optional (``NotRequired``).
+
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/datatypes
+    """
+
+    # --- Options not tied to a specific input type ---
+    default: NotRequired[bool | str | float | int | list | tuple]
+    """The default value of the widget"""
+    defaultInput: NotRequired[bool]
+    """@deprecated in v1.16 frontend. v1.16 frontend allows input socket and widget to co-exist.
+    - defaultInput on required inputs should be dropped.
+    - defaultInput on optional inputs should be replaced with forceInput.
+    Ref: https://github.com/Comfy-Org/ComfyUI_frontend/pull/3364
+    """
+    forceInput: NotRequired[bool]
+    """Forces the input to be an input slot rather than a widget even a widget is available for the input type."""
+    lazy: NotRequired[bool]
+    """Declares that this input uses lazy evaluation"""
+    rawLink: NotRequired[bool]
+    """When a link exists, rather than receiving the evaluated value, you will receive the link (i.e. `["nodeId", <outputIndex>]`). Designed for node expansion."""
+    tooltip: NotRequired[str]
+    """Tooltip for the input (or widget), shown on pointer hover"""
+    socketless: NotRequired[bool]
+    """All inputs (including widgets) have an input socket to connect links. When ``true``, if there is a widget for this input, no socket will be created.
+    Available from frontend v1.17.5
+    Ref: https://github.com/Comfy-Org/ComfyUI_frontend/pull/3548
+    """
+    widgetType: NotRequired[str]
+    """Specifies a type to be used for widget initialization if different from the input type.
+    Available from frontend v1.18.0
+    https://github.com/Comfy-Org/ComfyUI_frontend/pull/3550"""
+    # --- Number options (``FLOAT`` | ``INT``) ---
+    # Sketch of a dedicated subclass, left commented out; the options live on this single class instead:
+    # class InputTypeNumber(InputTypeOptions):
+    # default: float | int
+    min: NotRequired[float]
+    """The minimum value of a number (``FLOAT`` | ``INT``)"""
+    max: NotRequired[float]
+    """The maximum value of a number (``FLOAT`` | ``INT``)"""
+    step: NotRequired[float]
+    """The amount to increment or decrement a widget by when stepping up/down (``FLOAT`` | ``INT``)"""
+    round: NotRequired[float]
+    """Floats are rounded by this value (``FLOAT``)"""
+    # --- Boolean options (``BOOLEAN``) ---
+    # Sketch of a dedicated subclass, left commented out:
+    # class InputTypeBoolean(InputTypeOptions):
+    # default: bool
+    label_on: NotRequired[str]
+    """The label to use in the UI when the bool is True (``BOOLEAN``)"""
+    label_off: NotRequired[str]
+    """The label to use in the UI when the bool is False (``BOOLEAN``)"""
+    # --- String options (``STRING``) ---
+    # Sketch of a dedicated subclass, left commented out:
+    # class InputTypeString(InputTypeOptions):
+    # default: str
+    multiline: NotRequired[bool]
+    """Use a multiline text box (``STRING``)"""
+    placeholder: NotRequired[str]
+    """Placeholder text to display in the UI when empty (``STRING``)"""
+    # Deprecated:
+    # defaultVal: str
+    dynamicPrompts: NotRequired[bool]
+    """Causes the front-end to evaluate dynamic prompts (``STRING``)"""
+    # --- Combo options ---
+    # Sketch of a dedicated subclass, left commented out:
+    # class InputTypeCombo(InputTypeOptions):
+    image_upload: NotRequired[bool]
+    """Specifies whether the input should have an image upload button and image preview attached to it. Requires that the input's name is `image`."""
+    image_folder: NotRequired[Literal["input", "output", "temp"]]
+    """Specifies which folder to get preview images from if the input has the ``image_upload`` flag.
+    """
+    remote: NotRequired[RemoteInputOptions]
+    """Specifies the configuration for a remote input.
+    Available after ComfyUI frontend v1.9.7
+    https://github.com/Comfy-Org/ComfyUI_frontend/pull/2422"""
+    control_after_generate: NotRequired[bool]
+    """Specifies whether a control widget should be added to the input, adding options to automatically change the value after each prompt is queued. Currently only used for INT and COMBO types."""
+    options: NotRequired[list[str | int | float]]
+    """COMBO type only. Specifies the selectable options for the combo widget.
+    Prefer:
+    ["COMBO", {"options": ["Option 1", "Option 2", "Option 3"]}]
+    Over:
+    [["Option 1", "Option 2", "Option 3"]]
+    """
+    multi_select: NotRequired[MultiSelectOptions]
+    """COMBO type only. Specifies the configuration for a multi-select widget.
+    Available after ComfyUI frontend v1.13.4
+    https://github.com/Comfy-Org/ComfyUI_frontend/pull/2987"""
+
+
+class HiddenInputTypeDict(TypedDict):
+    """Provides type hinting for the hidden entry of node INPUT_TYPES.
+
+    Each key is optional and, when present, must hold the single literal token shown in its annotation.
+    """
+
+    node_id: NotRequired[Literal["UNIQUE_ID"]]
+    """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
+    # Same ``UNIQUE_ID`` token as ``node_id``; both key names are typed here.
+    unique_id: NotRequired[Literal["UNIQUE_ID"]]
+    """UNIQUE_ID is the unique identifier of the node, and matches the id property of the node on the client side. It is commonly used in client-server communications (see messages)."""
+    prompt: NotRequired[Literal["PROMPT"]]
+    """PROMPT is the complete prompt sent by the client to the server. See the prompt object for a full description."""
+    extra_pnginfo: NotRequired[Literal["EXTRA_PNGINFO"]]
+    """EXTRA_PNGINFO is a dictionary that will be copied into the metadata of any .png files saved. Custom nodes can store additional information in this dictionary for saving (or as a way to communicate with a downstream node)."""
+    dynprompt: NotRequired[Literal["DYNPROMPT"]]
+    """DYNPROMPT is an instance of comfy_execution.graph.DynamicPrompt. It differs from PROMPT in that it may mutate during the course of execution in response to Node Expansion."""
+
+
+class InputTypeDict(TypedDict):
+    """Provides type hinting for node INPUT_TYPES.
+
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs
+    """
+
+    # ``required`` and ``optional`` share one shape: input name -> (type, options) tuple.
+    required: NotRequired[dict[str, tuple[IO, InputTypeOptions]]]
+    """Describes all inputs that must be connected for the node to execute."""
+    optional: NotRequired[dict[str, tuple[IO, InputTypeOptions]]]
+    """Describes inputs which do not need to be connected."""
+    hidden: NotRequired[HiddenInputTypeDict]
+    """Offers advanced functionality and server-client communication.
+
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs
+    """
+
+
+class ComfyNodeABC(ABC):
+    """Abstract base class for Comfy nodes.  Includes the names and expected types of attributes.
+
+    The class attributes below are annotations only: no values are assigned here.
+    ``INPUT_TYPES`` is the single abstract member.
+
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview
+    """
+
+    DESCRIPTION: str
+    """Node description, shown as a tooltip when hovering over the node.
+
+    Usage::
+
+        # Explicitly define the description
+        DESCRIPTION = "Example description here."
+
+        # Use the docstring of the node class.
+        DESCRIPTION = cleandoc(__doc__)
+    """
+    CATEGORY: str
+    """The category of the node, as per the "Add Node" menu.
+
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#category
+    """
+    EXPERIMENTAL: bool
+    """Flags a node as experimental, informing users that it may change or not work as expected."""
+    DEPRECATED: bool
+    """Flags a node as deprecated, indicating to users that they should find alternatives to this node."""
+    API_NODE: Optional[bool]
+    """Flags a node as an API node. See: https://docs.comfy.org/tutorials/api-nodes/overview."""
+
+    # Abstract classmethod: the first parameter (named ``s`` here) receives the class.
+    @classmethod
+    @abstractmethod
+    def INPUT_TYPES(s) -> InputTypeDict:
+        """Defines node inputs.
+
+        * Must include the ``required`` key, which describes all inputs that must be connected for the node to execute.
+        * The ``optional`` key can be added to describe inputs which do not need to be connected.
+        * The ``hidden`` key offers some advanced functionality.  More info at: https://docs.comfy.org/custom-nodes/backend/more_on_inputs#hidden-inputs
+
+        Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#input-types
+        """
+        # Base implementation: a node with no required inputs.
+        return {"required": {}}
+
+    OUTPUT_NODE: bool
+    """Flags this node as an output node, causing any inputs it requires to be executed.
+
+    If a node is not connected to any output nodes, that node will not be executed.  Usage::
+
+        OUTPUT_NODE = True
+
+    From the docs:
+
+    By default, a node is not considered an output. Set ``OUTPUT_NODE = True`` to specify that it is.
+
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#output-node
+    """
+    INPUT_IS_LIST: bool
+    """A flag indicating if this node implements the additional code necessary to deal with OUTPUT_IS_LIST nodes.
+
+    All inputs of ``type`` will become ``list[type]``, regardless of how many items are passed in.  This also affects ``check_lazy_status``.
+
+    From the docs:
+
+    A node can also override the default input behaviour and receive the whole list in a single call. This is done by setting a class attribute `INPUT_IS_LIST` to ``True``.
+
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
+    """
+    OUTPUT_IS_LIST: tuple[bool, ...]
+    """A tuple indicating which node outputs are lists, but will be connected to nodes that expect individual items.
+
+    Connected nodes that do not implement `INPUT_IS_LIST` will be executed once for every item in the list.
+
+    A ``tuple[bool]``, where the items match those in `RETURN_TYPES`::
+
+        RETURN_TYPES = (IO.INT, IO.INT, IO.STRING)
+        OUTPUT_IS_LIST = (True, True, False) # The string output will be handled normally
+
+    From the docs:
+
+    In order to tell Comfy that the list being returned should not be wrapped, but treated as a series of data for sequential processing,
+    the node should provide a class attribute `OUTPUT_IS_LIST`, which is a ``tuple[bool]``, of the same length as `RETURN_TYPES`,
+    specifying which outputs which should be so treated.
+
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lists#list-processing
+    """
+
+    # Annotated as ``tuple[IO, ...]``, although the usage example below also shows plain strings.
+    RETURN_TYPES: tuple[IO, ...]
+    """A tuple representing the outputs of this node.
+
+    Usage::
+
+        RETURN_TYPES = (IO.INT, "INT", "CUSTOM_TYPE")
+
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-types
+    """
+    RETURN_NAMES: tuple[str, ...]
+    """The output slot names for each item in `RETURN_TYPES`, e.g. ``RETURN_NAMES = ("count", "filter_string")``
+
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#return-names
+    """
+    OUTPUT_TOOLTIPS: tuple[str, ...]
+    """A tuple of strings to use as tooltips for node outputs, one for each item in `RETURN_TYPES`."""
+    FUNCTION: str
+    """The name of the function to execute as a literal string, e.g. `FUNCTION = "execute"`
+
+    Comfy Docs: https://docs.comfy.org/custom-nodes/backend/server_overview#function
+    """
+
+
+class CheckLazyMixin:
+    """Mixin supplying a default ``check_lazy_status`` (plus type hints) for nodes with lazy inputs."""
+
+    def check_lazy_status(self, **kwargs) -> list[str]:
+        """Return the names of the inputs that still need to be evaluated.
+
+        This default implementation asks for every input whose value is ``None``.
+
+        :kwargs: All node inputs are passed here.  An input that is ``None`` should be assumed not to have been evaluated yet.
+            When using ``INPUT_IS_LIST = True``, unevaluated will instead be ``(None,)``.
+
+        Params should match the node's execution ``FUNCTION`` (self, and all inputs by name).
+        Will be executed repeatedly until it returns an empty list, or all requested items were already evaluated (and sent as params).
+
+        Comfy Docs: https://docs.comfy.org/custom-nodes/backend/lazy_evaluation#defining-check-lazy-status
+        """
+        return [input_name for input_name, value in kwargs.items() if value is None]
+
+
+class FileLocator(TypedDict):
+    """Provides type hinting for the file location"""
+
+    # All three keys are required (none is wrapped in ``NotRequired``).
+    filename: str
+    """The filename of the file."""
+    subfolder: str
+    """The subfolder of the file."""
+    type: Literal["input", "output", "temp"]
+    """The root folder of the file."""
--- a/comfy/conds.py
+++ b/comfy/conds.py
+import torch
+import math
+import comfy.utils
+import logging
+
+
+class CONDRegular:
+    """Wraps one conditioning value and provides batching and concatenation helpers.
+
+    The helpers below use ``cond`` like a tensor (``.shape``, ``.device``, ``.size()``).
+    """
+
+    def __init__(self, cond):
+        self.cond = cond
+
+    def _copy_with(self, cond):
+        """Create a new wrapper of this object's own class around ``cond``."""
+        wrapper_cls = self.__class__
+        return wrapper_cls(cond)
+
+    def process_cond(self, batch_size, **kwargs):
+        """Return a copy whose value was repeated to ``batch_size``; extra keyword arguments are ignored."""
+        batched = comfy.utils.repeat_to_batch_size(self.cond, batch_size)
+        return self._copy_with(batched)
+
+    def can_concat(self, other):
+        """Return True when ``other`` has the same shape and sits on the same device."""
+        mine, theirs = self.cond, other.cond
+        if mine.shape != theirs.shape:
+            return False
+        if mine.device == theirs.device:
+            return True
+        logging.warning("WARNING: conds not on same device, skipping concat.")
+        return False
+
+    def concat(self, others):
+        """Concatenate this value with the values of ``others`` using ``torch.cat``."""
+        return torch.cat([self.cond, *(x.cond for x in others)])
+
+    def size(self):
+        """Return the size of the wrapped value as a plain list."""
+        return [*self.cond.size()]
+
+
+class CONDNoiseShape(CONDRegular):
+    """Conditioning that is cropped to an ``area`` before being batched."""
+
+    def process_cond(self, batch_size, area, **kwargs):
+        """Crop the value to ``area`` (when given) and repeat it to ``batch_size``.
+
+        The first half of ``area`` is read as lengths and the second half as start offsets;
+        entry ``i`` narrows tensor dimension ``i + 2``.
+        """
+        cropped = self.cond
+        if area is not None:
+            half = len(area) // 2
+            for axis in range(half):
+                start, length = area[axis + half], area[axis]
+                cropped = cropped.narrow(axis + 2, start, length)
+
+        batched = comfy.utils.repeat_to_batch_size(cropped, batch_size)
+        return self._copy_with(batched)
+
+
+class CONDCrossAttn(CONDRegular):
+    """Conditioning whose dimension 1 may differ in length between instances."""
+
+    def can_concat(self, other):
+        """Return True when ``other`` can be concatenated, possibly after repeat-padding dimension 1."""
+        shape_a = self.cond.shape
+        shape_b = other.cond.shape
+        if shape_a != shape_b:
+            # these 2 cases should not happen
+            if shape_a[0] != shape_b[0] or shape_a[2] != shape_b[2]:
+                return False
+
+            len_a, len_b = shape_a[1], shape_b[1]
+            padding_factor = math.lcm(len_a, len_b) // min(len_a, len_b)
+            # arbitrary limit on the padding because it's probably going to impact performance negatively if it's too much
+            if padding_factor > 4:
+                return False
+        if self.cond.device != other.cond.device:
+            logging.warning("WARNING: conds not on same device: skipping concat.")
+            return False
+        return True
+
+    def concat(self, others):
+        """Repeat every value along dimension 1 up to the common (least common multiple) length, then concatenate."""
+        tensors = [self.cond, *(x.cond for x in others)]
+        target_len = math.lcm(*(t.shape[1] for t in tensors))
+
+        padded = []
+        for t in tensors:
+            current_len = t.shape[1]
+            if current_len < target_len:
+                # padding with repeat doesn't change result
+                t = t.repeat(1, target_len // current_len, 1)
+            padded.append(t)
+        return torch.cat(padded)
+
+
+class CONDConstant(CONDRegular):
+    """Conditioning holding a constant value that is neither batched nor concatenated."""
+
+    def __init__(self, cond):
+        self.cond = cond
+
+    def process_cond(self, batch_size, **kwargs):
+        """Return a copy wrapping the same value; ``batch_size`` is not used."""
+        unchanged = self.cond
+        return self._copy_with(unchanged)
+
+    def can_concat(self, other):
+        """Two constants can be combined only when their values do not compare as different."""
+        differs = self.cond != other.cond
+        return not differs
+
+    def concat(self, others):
+        """Return this object's value; ``others`` is not read."""
+        return self.cond
+
+    def size(self):
+        """Always report a size of ``[1]``."""
+        return [1]
+
+
+class CONDList(CONDRegular):
+    """Conditioning holding a list of tensors that are batched and concatenated element by element."""
+
+    def __init__(self, cond):
+        self.cond = cond
+
+    def process_cond(self, batch_size, **kwargs):
+        """Return a copy in which every tensor of the list was repeated to ``batch_size``."""
+        batched = [comfy.utils.repeat_to_batch_size(c, batch_size) for c in self.cond]
+        return self._copy_with(batched)
+
+    def can_concat(self, other):
+        """Return True when both lists have the same length and pairwise identical tensor shapes."""
+        if len(self.cond) != len(other.cond):
+            return False
+        return all(mine.shape == theirs.shape for mine, theirs in zip(self.cond, other.cond))
+
+    def concat(self, others):
+        """Concatenate position ``i`` of this list with position ``i`` of every list in ``others``."""
+        return [
+            torch.cat([mine, *(x.cond[i] for x in others)])
+            for i, mine in enumerate(self.cond)
+        ]
+
+    def size(self):  # hackish implementation to make the mem estimation work
+        """Return ``[1, channels, total_elements // channels]`` as a rough size for memory estimation.
+
+        ``channels`` is dimension 1 of the last tensor that has more than one dimension (1 if none has).
+        """
+        total = 0
+        channels = 1
+        # The loop variable must not reuse the accumulator's name: doing so left the accumulator
+        # bound to a tensor whenever the last entry had fewer than two dimensions.
+        for tensor in self.cond:
+            shape = tensor.size()
+            total += math.prod(shape)
+            if len(shape) > 1:
+                channels = shape[1]
+
+        return [1, channels, total // channels]
--- a/comfy/context_windows.py
+++ b/comfy/context_windows.py
+from __future__ import annotations
+from typing import TYPE_CHECKING, Callable
+import torch
+import numpy as np
+import collections
+from dataclasses import dataclass
+from abc import ABC, abstractmethod
+import logging
+import comfy.model_management
+import comfy.patcher_extension
+if TYPE_CHECKING:
+    from comfy.model_base import BaseModel
+    from comfy.model_patcher import ModelPatcher
+    from comfy.controlnet import ControlBase
+
+
+class ContextWindowABC(ABC):
+    """Interface for a single context window over a larger tensor."""
+
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def get_tensor(self, full: torch.Tensor) -> torch.Tensor:
+        """Return the torch.Tensor that applies to the current window, taken from ``full``."""
+        raise NotImplementedError("Not implemented.")
+
+    @abstractmethod
+    def add_window(self, full: torch.Tensor, to_add: torch.Tensor) -> torch.Tensor:
+        """Apply the window tensor ``to_add`` onto ``full`` in place.
+
+        Returns a reference to the updated ``full`` tensor, not a copy.
+        """
+        raise NotImplementedError("Not implemented.")
+
+class ContextHandlerABC(ABC):
+    """Interface for objects that decide whether to use context windows and run a model through them."""
+
+    def __init__(self):
+        pass
+
+    @abstractmethod
+    def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool:
+        """Tell whether context windows should be used for this call."""
+        raise NotImplementedError("Not implemented.")
+
+    @abstractmethod
+    def get_resized_cond(self, cond_in: list[dict], x_in: torch.Tensor, window: ContextWindowABC, device=None) -> list:
+        """Return ``cond_in`` adapted to the given ``window``."""
+        raise NotImplementedError("Not implemented.")
+
+    @abstractmethod
+    def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
+        """Run ``calc_cond_batch`` through the handler's context windows."""
+        raise NotImplementedError("Not implemented.")
+
+
+
+class IndexListContextWindow(ContextWindowABC):
+    """Context window described by an explicit list of indices along one tensor dimension."""
+
+    def __init__(self, index_list: list[int], dim: int=0):
+        self.index_list = index_list
+        self.context_length = len(index_list)
+        self.dim = dim
+
+    def _index_for(self, dim: int) -> tuple:
+        """Build the index that picks ``index_list`` along ``dim`` and everything along the dimensions before it."""
+        # Must be a tuple: indexing a tensor with a plain list that mixes slices and index
+        # lists relies on deprecated non-tuple sequence indexing.
+        return (slice(None),) * dim + (self.index_list,)
+
+    def get_tensor(self, full: torch.Tensor, device=None, dim=None) -> torch.Tensor:
+        """Return the entries of ``full`` selected by this window along ``dim``, moved to ``device``.
+
+        ``dim`` defaults to the window's own dimension. When ``dim`` is 0 and ``full`` has a single
+        entry along it, ``full`` itself is returned unchanged (and is not moved to ``device``).
+        """
+        if dim is None:
+            dim = self.dim
+        if dim == 0 and full.shape[dim] == 1:
+            return full
+        return full[self._index_for(dim)].to(device)
+
+    def add_window(self, full: torch.Tensor, to_add: torch.Tensor, dim=None) -> torch.Tensor:
+        """Add ``to_add`` onto the window's entries of ``full`` in place and return ``full`` (same object)."""
+        if dim is None:
+            dim = self.dim
+        full[self._index_for(dim)] += to_add
+        return full
+
+
+class IndexListCallbacks:
+    """Names of the callback hooks used by the index-list context handler."""
+
+    EVALUATE_CONTEXT_WINDOWS = "evaluate_context_windows"
+    COMBINE_CONTEXT_WINDOW_RESULTS = "combine_context_window_results"
+    EXECUTE_START = "execute_start"
+    EXECUTE_CLEANUP = "execute_cleanup"
+
+    def init_callbacks(self):
+        """Return a new, empty callback mapping."""
+        return dict()
+
+
+@dataclass
+class ContextSchedule:
+    """A named function that produces the context windows for a run."""
+
+    name: str
+    # Called by the handler as ``func(full_length, handler, model_options)``; each item it
+    # returns is wrapped in an IndexListContextWindow.
+    func: Callable
+
+@dataclass
+class ContextFuseMethod:
+    """A named method for fusing the results of overlapping context windows."""
+
+    # The handler compares ``name`` against ``ContextFuseMethods.RELATIVE`` to pick its fusing path.
+    name: str
+    # NOTE(review): presumably the weighting function used by the non-relative path - its call
+    # site is not in this part of the file; confirm.
+    func: Callable
+
+# Result of evaluating one context window: the window's index, the outputs of calc_cond_batch,
+# the resized conds that were passed to it, and the window object itself.
+ContextResults = collections.namedtuple("ContextResults", ['window_idx', 'sub_conds_out', 'sub_conds', 'window'])
+class IndexListContextHandler(ContextHandlerABC):
+    def __init__(self, context_schedule: ContextSchedule, fuse_method: ContextFuseMethod, context_length: int=1, context_overlap: int=0, context_stride: int=1, closed_loop=False, dim=0):
+        """Store the schedule, fuse method and window settings; start at step 0 with no callbacks."""
+        # strategy objects
+        self.context_schedule, self.fuse_method = context_schedule, fuse_method
+        # window geometry
+        self.context_length, self.context_overlap = context_length, context_overlap
+        self.context_stride, self.closed_loop = context_stride, closed_loop
+        self.dim = dim
+        # runtime state
+        self._step = 0
+        self.callbacks = {}
+
+    def should_use_context(self, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]) -> bool:
+        """Return True (and log it) when ``x_in`` is longer than ``context_length`` along ``self.dim``."""
+        # for now, assume first dim is batch - should have stored on BaseModel in actual implementation
+        frame_count = x_in.size(self.dim)
+        if not frame_count > self.context_length:
+            return False
+        logging.info(f"Using context windows {self.context_length} for {frame_count} frames.")
+        return True
+
+    def prepare_control_objects(self, control: ControlBase, device=None) -> ControlBase:
+        """Visit ``control`` and every control chained through ``previous_controlnet``; return ``control`` itself."""
+        previous = control.previous_controlnet
+        if previous is not None:
+            self.prepare_control_objects(previous, device)
+        return control
+
+    def get_resized_cond(self, cond_in: list[dict], x_in: torch.Tensor, window: IndexListContextWindow, device=None) -> list:
+        """Return a copy of ``cond_in`` whose entries are subset to ``window`` where they span the full length.
+
+        Returns ``None`` when ``cond_in`` is ``None``. Otherwise returns a new list holding a shallow
+        copy of every dict in ``cond_in``, with values handled by kind:
+
+        * tensors whose size along ``self.dim`` equals that of ``x_in`` are subset with the window and
+          moved to ``device``; other tensors are only moved to ``device``;
+        * the ``"control"`` entry is passed through ``prepare_control_objects``;
+        * nested dicts are shallow-copied and their values inspected individually (see inline notes);
+        * anything else is reused as is.
+        """
+        if cond_in is None:
+            return None
+        # reuse or resize cond items to match context requirements
+        resized_cond = []
+        # cond object is a list containing a dict - outer list is irrelevant, so just loop through it
+        for actual_cond in cond_in:
+            resized_actual_cond = actual_cond.copy()
+            # now we are in the inner dict - "pooled_output" is a tensor, "control" is a ControlBase object, "model_conds" is dictionary
+            for key in actual_cond:
+                try:
+                    cond_item = actual_cond[key]
+                    if isinstance(cond_item, torch.Tensor):
+                        # check that tensor is the expected length - x.size(0)
+                        if self.dim < cond_item.ndim and cond_item.size(self.dim) == x_in.size(self.dim):
+                            # if so, it's subsetting time - tell controls the expected indices so they can handle them
+                            actual_cond_item = window.get_tensor(cond_item)
+                            resized_actual_cond[key] = actual_cond_item.to(device)
+                        else:
+                            resized_actual_cond[key] = cond_item.to(device)
+                    # look for control
+                    elif key == "control":
+                        resized_actual_cond[key] = self.prepare_control_objects(cond_item, device)
+                    elif isinstance(cond_item, dict):
+                        new_cond_item = cond_item.copy()
+                        # when in dictionary, look for tensors and CONDCrossAttn [comfy/conds.py] (has cond attr that is a tensor)
+                        # Only existing keys are reassigned below, so the dict does not change size while iterated.
+                        for cond_key, cond_value in new_cond_item.items():
+                            if isinstance(cond_value, torch.Tensor):
+                                # NOTE(review): this tests ``ndim < self.dim`` and ``size(0)``, unlike the top-level
+                                # tensor check above (``self.dim < ndim`` and ``size(self.dim)``). With ``dim == 0``
+                                # it can never be true - confirm this is intended.
+                                if cond_value.ndim < self.dim and cond_value.size(0) == x_in.size(self.dim):
+                                    new_cond_item[cond_key] = window.get_tensor(cond_value, device)
+                            # if has cond that is a Tensor, check if needs to be subset
+                            elif hasattr(cond_value, "cond") and isinstance(cond_value.cond, torch.Tensor):
+                                # NOTE(review): same rank condition as the plain-tensor case above - confirm.
+                                if cond_value.cond.ndim < self.dim and cond_value.cond.size(0) == x_in.size(self.dim):
+                                    new_cond_item[cond_key] = cond_value._copy_with(window.get_tensor(cond_value.cond, device))
+                            elif cond_key == "num_video_frames": # for SVD
+                                # copy the wrapper, then overwrite its value with the window's length
+                                new_cond_item[cond_key] = cond_value._copy_with(cond_value.cond)
+                                new_cond_item[cond_key].cond = window.context_length
+                        resized_actual_cond[key] = new_cond_item
+                    else:
+                        resized_actual_cond[key] = cond_item
+                finally:
+                    # drop the local reference after every key, whether or not the branch above raised
+                    del cond_item  # just in case to prevent VRAM issues
+            resized_cond.append(resized_actual_cond)
+        return resized_cond
+
+    def set_step(self, timestep: torch.Tensor, model_options: dict[str]):
+        """Set ``self._step`` to the index of the first sample sigma that is close to ``timestep``.
+
+        The sigmas are read from ``model_options["transformer_options"]["sample_sigmas"]``.
+        Raises ``Exception`` when no sigma matches.
+        """
+        sigmas = model_options["transformer_options"]["sample_sigmas"]
+        matching = torch.nonzero(torch.isclose(sigmas, timestep, rtol=0.0001))
+        if matching.numel() == 0:
+            raise Exception("No sample_sigmas matched current timestep; something went wrong.")
+        first_match = matching[0]
+        self._step = int(first_match.item())
+
+    def get_context_windows(self, model: BaseModel, x_in: torch.Tensor, model_options: dict[str]) -> list[IndexListContextWindow]:
+        """Ask the context schedule for index lists and wrap each one in an ``IndexListContextWindow``."""
+        total_length = x_in.size(self.dim) # TODO: choose dim based on model
+        index_lists = self.context_schedule.func(total_length, self, model_options)
+        return [IndexListContextWindow(index_list, dim=self.dim) for index_list in index_lists]
+
+    def execute(self, calc_cond_batch: Callable, model: BaseModel, conds: list[list[dict]], x_in: torch.Tensor, timestep: torch.Tensor, model_options: dict[str]):
+        """Evaluate every context window and fuse the per-window results into full-size outputs.
+
+        Returns a list with one tensor per entry of ``conds``, each created with ``torch.zeros_like(x_in)``
+        and filled in by ``combine_context_window_results``.
+
+        ``EXECUTE_START`` callbacks run before any window is evaluated. ``EXECUTE_CLEANUP`` callbacks
+        always run once the start callbacks have completed, including when evaluating or combining a
+        window raises (for example on an interrupt), so start and cleanup hooks stay paired.
+        """
+        self.set_step(timestep, model_options)
+        context_windows = self.get_context_windows(model, x_in, model_options)
+        enumerated_context_windows = list(enumerate(context_windows))
+        is_relative = self.fuse_method.name == ContextFuseMethods.RELATIVE
+
+        conds_final = [torch.zeros_like(x_in) for _ in conds]
+        # relative fusing starts its counts at one, every other method at zero
+        make_counts = torch.ones if is_relative else torch.zeros
+        counts_final = [make_counts(get_shape_for_dim(x_in, self.dim), device=x_in.device) for _ in conds]
+        biases_final = [([0.0] * x_in.shape[self.dim]) for _ in conds]
+
+        for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EXECUTE_START, self.callbacks):
+            callback(self, model, x_in, conds, timestep, model_options)
+
+        try:
+            # windows are evaluated one at a time and merged immediately
+            for enum_window in enumerated_context_windows:
+                results = self.evaluate_context_windows(calc_cond_batch, model, x_in, conds, timestep, [enum_window], model_options)
+                for result in results:
+                    self.combine_context_window_results(x_in, result.sub_conds_out, result.sub_conds, result.window, result.window_idx, len(enumerated_context_windows), timestep,
+                                                conds_final, counts_final, biases_final)
+
+            # finalize conds: relative is already normalized, everything else is
+            # normalized via division by context usage counts
+            if not is_relative:
+                for i in range(len(conds_final)):
+                    conds_final[i] /= counts_final[i]
+            del counts_final
+            return conds_final
+        finally:
+            for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EXECUTE_CLEANUP, self.callbacks):
+                callback(self, model, x_in, conds, timestep, model_options)
+
+    def evaluate_context_windows(self, calc_cond_batch: Callable, model: BaseModel, x_in: torch.Tensor, conds, timestep: torch.Tensor, enumerated_context_windows: list[tuple[int, IndexListContextWindow]],
+                                model_options, device=None, first_device=None):
+        """Run ``calc_cond_batch`` once per given ``(index, window)`` pair and collect the results.
+
+        For each window the ``EVALUATE_CONTEXT_WINDOWS`` callbacks are called, the window is stored in
+        ``model_options["transformer_options"]["context_window"]``, and ``x_in``, ``timestep`` and
+        ``conds`` are subset to the window. When ``device`` is given, the outputs are moved back to
+        ``x_in.device``. Returns a list of ``ContextResults``.
+        """
+        collected: list[ContextResults] = []
+        for window_idx, window in enumerated_context_windows:
+            # allow processing to end between context window executions for faster Cancel
+            comfy.model_management.throw_exception_if_processing_interrupted()
+
+            evaluate_callbacks = comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.EVALUATE_CONTEXT_WINDOWS, self.callbacks)
+            for callback in evaluate_callbacks:
+                callback(self, model, x_in, conds, timestep, model_options, window_idx, window, model_options, device, first_device)
+
+            # update exposed params
+            model_options["transformer_options"]["context_window"] = window
+            # get subsections of x, timestep, conds
+            window_x = window.get_tensor(x_in, device)
+            window_timestep = window.get_tensor(timestep, device, dim=0)
+            window_conds = [self.get_resized_cond(cond, x_in, window, device) for cond in conds]
+
+            window_out = calc_cond_batch(model, window_conds, window_x, window_timestep, model_options)
+            if device is not None:
+                for i, out in enumerate(window_out):
+                    window_out[i] = out.to(x_in.device)
+            collected.append(ContextResults(window_idx, window_out, window_conds, window))
+        return collected
+
+
+    def combine_context_window_results(self, x_in: torch.Tensor, sub_conds_out, sub_conds, window: IndexListContextWindow, window_idx: int, total_windows: int, timestep: torch.Tensor,
+                                    conds_final: list[torch.Tensor], counts_final: list[torch.Tensor], biases_final: list[torch.Tensor]):
+        if self.fuse_method.name == ContextFuseMethods.RELATIVE:
+            for pos, idx in enumerate(window.index_list):
+                # bias is the influence of a specific index in relation to the whole context window
+                bias = 1 - abs(idx - (window.index_list[0] + window.index_list[-1]) / 2) / ((window.index_list[-1] - window.index_list[0] + 1e-2) / 2)
+                bias = max(1e-2, bias)
+                # take weighted average relative to total bias of current idx
+                for i in range(len(sub_conds_out)):
+                    bias_total = biases_final[i][idx]
+                    prev_weight = (bias_total / (bias_total + bias))
+                    new_weight = (bias / (bias_total + bias))
+                    # account for dims of tensors
+                    idx_window = [slice(None)] * self.dim + [idx]
+                    pos_window = [slice(None)] * self.dim + [pos]
+                    # apply new values
+                    conds_final[i][idx_window] = conds_final[i][idx_window] * prev_weight + sub_conds_out[i][pos_window] * new_weight
+                    biases_final[i][idx] = bias_total + bias
+        else:
+            # add conds and counts based on weights of fuse method
+            weights = get_context_weights(window.context_length, x_in.shape[self.dim], window.index_list, self, sigma=timestep)
+            weights_tensor = match_weights_to_dim(weights, x_in, self.dim, device=x_in.device)
+            for i in range(len(sub_conds_out)):
+                window.add_window(conds_final[i], sub_conds_out[i] * weights_tensor)
+                window.add_window(counts_final[i], weights_tensor)
+
+        for callback in comfy.patcher_extension.get_all_callbacks(IndexListCallbacks.COMBINE_CONTEXT_WINDOW_RESULTS, self.callbacks):
+            callback(self, x_in, sub_conds_out, sub_conds, window, window_idx, total_windows, timestep, conds_final, counts_final, biases_final)
+
+
+def _prepare_sampling_wrapper(executor, model, noise_shape: torch.Tensor, *args, **kwargs):
+    # limit noise_shape length to context_length for more accurate vram use estimation
+    model_options = kwargs.get("model_options", None)
+    if model_options is None:
+        raise Exception("model_options not found in prepare_sampling_wrapper; this should never happen, something went wrong.")
+    handler: IndexListContextHandler = model_options.get("context_handler", None)
+    if handler is not None:
+        noise_shape = list(noise_shape)
+        noise_shape[handler.dim] = min(noise_shape[handler.dim], handler.context_length)
+    return executor(model, noise_shape, *args, **kwargs)
+
+
+def create_prepare_sampling_wrapper(model: ModelPatcher):
+    model.add_wrapper_with_key(
+        comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING,
+        "ContextWindows_prepare_sampling",
+        _prepare_sampling_wrapper
+    )
+
+
+def match_weights_to_dim(weights: list[float], x_in: torch.Tensor, dim: int, device=None) -> torch.Tensor:
+    total_dims = len(x_in.shape)
+    weights_tensor = torch.Tensor(weights).to(device=device)
+    for _ in range(dim):
+        weights_tensor = weights_tensor.unsqueeze(0)
+    for _ in range(total_dims - dim - 1):
+        weights_tensor = weights_tensor.unsqueeze(-1)
+    return weights_tensor
+
+def get_shape_for_dim(x_in: torch.Tensor, dim: int) -> list[int]:
+    total_dims = len(x_in.shape)
+    shape = []
+    for _ in range(dim):
+        shape.append(1)
+    shape.append(x_in.shape[dim])
+    for _ in range(total_dims - dim - 1):
+        shape.append(1)
+    return shape
+
+class ContextSchedules:
+    # String identifiers for the available context window schedules.
+    # Each value is a key of CONTEXT_MAPPING, which pairs it with the
+    # create_windows_* function that builds the windows.
+    UNIFORM_LOOPED = "looped_uniform"
+    UNIFORM_STANDARD = "standard_uniform"
+    STATIC_STANDARD = "standard_static"
+    BATCHED = "batched"
+
+
+# from https://github.com/neggles/animatediff-cli/blob/main/src/animatediff/pipelines/context.py
+def create_windows_uniform_looped(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]):
+    windows = []
+    if num_frames < handler.context_length:
+        windows.append(list(range(num_frames)))
+        return windows
+
+    context_stride = min(handler.context_stride, int(np.ceil(np.log2(num_frames / handler.context_length))) + 1)
+    # obtain uniform windows as normal, looping and all
+    for context_step in 1 << np.arange(context_stride):
+        pad = int(round(num_frames * ordered_halving(handler._step)))
+        for j in range(
+            int(ordered_halving(handler._step) * context_step) + pad,
+            num_frames + pad + (0 if handler.closed_loop else -handler.context_overlap),
+            (handler.context_length * context_step - handler.context_overlap),
+        ):
+            windows.append([e % num_frames for e in range(j, j + handler.context_length * context_step, context_step)])
+
+    return windows
+
+def create_windows_uniform_standard(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]):
+    """Build uniform context windows that never wrap past the last frame.
+
+    Windows are first generated the same way as the looped schedule (without
+    the closed-loop option), then every window that rolls over is shifted so
+    it ends on the final frame, and windows identical to an earlier one are
+    removed. Returns a list of index lists.
+    """
+    # unlike looped, uniform_standard does NOT allow windows that loop back to the beginning;
+    # instead, they get shifted to the corresponding end of the frames.
+    # in the case that a window (shifted or not) is identical to the previous one, it gets skipped.
+    windows = []
+    if num_frames <= handler.context_length:
+        windows.append(list(range(num_frames)))
+        return windows
+
+    context_stride = min(handler.context_stride, int(np.ceil(np.log2(num_frames / handler.context_length))) + 1)
+    # first, obtain uniform windows as normal, looping and all
+    for context_step in 1 << np.arange(context_stride):
+        pad = int(round(num_frames * ordered_halving(handler._step)))
+        for j in range(
+            int(ordered_halving(handler._step) * context_step) + pad,
+            num_frames + pad + (-handler.context_overlap),
+            (handler.context_length * context_step - handler.context_overlap),
+        ):
+            windows.append([e % num_frames for e in range(j, j + handler.context_length * context_step, context_step)])
+
+    # now that windows are created, shift any windows that loop, and delete duplicate windows
+    delete_idxs = []
+    win_i = 0
+    # a while loop is used because windows may be inserted during iteration
+    while win_i < len(windows):
+        # if the window rolls over itself, it needs to be shifted
+        is_roll, roll_idx = does_window_roll_over(windows[win_i], num_frames)
+        if is_roll:
+            roll_val = windows[win_i][roll_idx]  # roll_val might not be 0 for windows of higher strides
+            shift_window_to_end(windows[win_i], num_frames=num_frames)
+            # check if next window (cyclical) is missing roll_val
+            if roll_val not in windows[(win_i+1) % len(windows)]:
+                # need to insert new window here - just insert window starting at roll_val
+                windows.insert(win_i+1, list(range(roll_val, roll_val + handler.context_length)))
+        # delete window if it's not unique (compared against all earlier windows)
+        for pre_i in range(0, win_i):
+            if windows[win_i] == windows[pre_i]:
+                delete_idxs.append(win_i)
+                break
+        win_i += 1
+
+    # reverse delete_idxs so that they will be deleted in an order that doesn't break idx correlation
+    delete_idxs.reverse()
+    for i in delete_idxs:
+        windows.pop(i)
+
+    return windows
+
+
+def create_windows_static_standard(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]):
+    windows = []
+    if num_frames <= handler.context_length:
+        windows.append(list(range(num_frames)))
+        return windows
+    # always return the same set of windows
+    delta = handler.context_length - handler.context_overlap
+    for start_idx in range(0, num_frames, delta):
+        # if past the end of frames, move start_idx back to allow same context_length
+        ending = start_idx + handler.context_length
+        if ending >= num_frames:
+            final_delta = ending - num_frames
+            final_start_idx = start_idx - final_delta
+            windows.append(list(range(final_start_idx, final_start_idx + handler.context_length)))
+            break
+        windows.append(list(range(start_idx, start_idx + handler.context_length)))
+    return windows
+
+
+def create_windows_batched(num_frames: int, handler: IndexListContextHandler, model_options: dict[str]):
+    windows = []
+    if num_frames <= handler.context_length:
+        windows.append(list(range(num_frames)))
+        return windows
+    # always return the same set of windows;
+    # no overlap, just cut up based on context_length;
+    # last window size will be different if num_frames % opts.context_length != 0
+    for start_idx in range(0, num_frames, handler.context_length):
+        windows.append(list(range(start_idx, min(start_idx + handler.context_length, num_frames))))
+    return windows
+
+
+def create_windows_default(num_frames: int, handler: IndexListContextHandler):
+    return [list(range(num_frames))]
+
+
+# Maps each ContextSchedules identifier to the function that builds its windows.
+# Looked up by get_matching_context_schedule; create_windows_default is not registered here.
+CONTEXT_MAPPING = {
+    ContextSchedules.UNIFORM_LOOPED: create_windows_uniform_looped,
+    ContextSchedules.UNIFORM_STANDARD: create_windows_uniform_standard,
+    ContextSchedules.STATIC_STANDARD: create_windows_static_standard,
+    ContextSchedules.BATCHED: create_windows_batched,
+}
+
+
+def get_matching_context_schedule(context_schedule: str) -> ContextSchedule:
+    func = CONTEXT_MAPPING.get(context_schedule, None)
+    if func is None:
+        raise ValueError(f"Unknown context_schedule '{context_schedule}'.")
+    return ContextSchedule(context_schedule, func)
+
+
+def get_context_weights(length: int, full_length: int, idxs: list[int], handler: IndexListContextHandler, sigma: torch.Tensor=None):
+    return handler.fuse_method.func(length, sigma=sigma, handler=handler, full_length=full_length, idxs=idxs)
+
+
+def create_weights_flat(length: int, **kwargs) -> list[float]:
+    # weight is the same for all
+    return [1.0] * length
+
+def create_weights_pyramid(length: int, **kwargs) -> list[float]:
+    # weight is based on the distance away from the edge of the context window;
+    # based on weighted average concept in FreeNoise paper
+    if length % 2 == 0:
+        max_weight = length // 2
+        weight_sequence = list(range(1, max_weight + 1, 1)) + list(range(max_weight, 0, -1))
+    else:
+        max_weight = (length + 1) // 2
+        weight_sequence = list(range(1, max_weight, 1)) + [max_weight] + list(range(max_weight - 1, 0, -1))
+    return weight_sequence
+
+def create_weights_overlap_linear(length: int, full_length: int, idxs: list[int], handler: IndexListContextHandler, **kwargs):
+    # based on code in Kijai's WanVideoWrapper: https://github.com/kijai/ComfyUI-WanVideoWrapper/blob/dbb2523b37e4ccdf45127e5ae33e31362f755c8e/nodes.py#L1302
+    # only expected overlap is given different weights
+    weights_torch = torch.ones((length))
+    # blend left-side on all except first window
+    if min(idxs) > 0:
+        ramp_up = torch.linspace(1e-37, 1, handler.context_overlap)
+        weights_torch[:handler.context_overlap] = ramp_up
+    # blend right-side on all except last window
+    if max(idxs) < full_length-1:
+        ramp_down = torch.linspace(1, 1e-37, handler.context_overlap)
+        weights_torch[-handler.context_overlap:] = ramp_down
+    return weights_torch
+
+class ContextFuseMethods:
+    # String identifiers for the ways overlapping window outputs are fused.
+    # Each value is a key of FUSE_MAPPING.
+    FLAT = "flat"
+    PYRAMID = "pyramid"
+    RELATIVE = "relative"
+    OVERLAP_LINEAR = "overlap-linear"
+
+    # LIST omits RELATIVE; LIST_STATIC includes every method.
+    LIST = [PYRAMID, FLAT, OVERLAP_LINEAR]
+    LIST_STATIC = [PYRAMID, RELATIVE, FLAT, OVERLAP_LINEAR]
+
+
+# Maps each ContextFuseMethods identifier to the function that creates its weights.
+# RELATIVE shares the pyramid function; the RELATIVE branch of
+# combine_context_window_results computes its own bias and does not call
+# get_context_weights.
+FUSE_MAPPING = {
+    ContextFuseMethods.FLAT: create_weights_flat,
+    ContextFuseMethods.PYRAMID: create_weights_pyramid,
+    ContextFuseMethods.RELATIVE: create_weights_pyramid,
+    ContextFuseMethods.OVERLAP_LINEAR: create_weights_overlap_linear,
+}
+
+def get_matching_fuse_method(fuse_method: str) -> ContextFuseMethod:
+    func = FUSE_MAPPING.get(fuse_method, None)
+    if func is None:
+        raise ValueError(f"Unknown fuse_method '{fuse_method}'.")
+    return ContextFuseMethod(fuse_method, func)
+
+# Returns fraction that has denominator that is a power of 2
+def ordered_halving(val):
+    # get binary value, padded with 0s for 64 bits
+    bin_str = f"{val:064b}"
+    # flip binary value, padding included
+    bin_flip = bin_str[::-1]
+    # convert binary to int
+    as_int = int(bin_flip, 2)
+    # divide by 1 << 64, equivalent to 2**64, or 18446744073709551616,
+    # or b10000000000000000000000000000000000000000000000000000000000000000 (1 with 64 zero's)
+    return as_int / (1 << 64)
+
+
+def get_missing_indexes(windows: list[list[int]], num_frames: int) -> list[int]:
+    all_indexes = list(range(num_frames))
+    for w in windows:
+        for val in w:
+            try:
+                all_indexes.remove(val)
+            except ValueError:
+                pass
+    return all_indexes
+
+
+def does_window_roll_over(window: list[int], num_frames: int) -> tuple[bool, int]:
+    prev_val = -1
+    for i, val in enumerate(window):
+        val = val % num_frames
+        if val < prev_val:
+            return True, i
+        prev_val = val
+    return False, -1
+
+
+def shift_window_to_start(window: list[int], num_frames: int):
+    start_val = window[0]
+    for i in range(len(window)):
+        # 1) subtract each element by start_val to move vals relative to the start of all frames
+        # 2) add num_frames and take modulus to get adjusted vals
+        window[i] = ((window[i] - start_val) + num_frames) % num_frames
+
+
+def shift_window_to_end(window: list[int], num_frames: int):
+    # 1) shift window to start
+    shift_window_to_start(window, num_frames)
+    end_val = window[-1]
+    end_delta = num_frames - end_val - 1
+    for i in range(len(window)):
+        # 2) add end_delta to each val to slide windows to end
+        window[i] = window[i] + end_delta
--- a/comfy/controlnet.py
+++ b/comfy/controlnet.py
+"""
+    This file is part of ComfyUI.
+    Copyright (C) 2024 Comfy
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
+
+import torch
+from enum import Enum
+import math
+import os
+import logging
+import comfy.utils
+import comfy.model_management
+import comfy.model_detection
+import comfy.model_patcher
+import comfy.ops
+import comfy.latent_formats
+import comfy.model_base
+
+import comfy.cldm.cldm
+import comfy.t2i_adapter.adapter
+import comfy.ldm.cascade.controlnet
+import comfy.cldm.mmdit
+import comfy.ldm.hydit.controlnet
+import comfy.ldm.flux.controlnet
+import comfy.ldm.qwen_image.controlnet
+import comfy.cldm.dit_embedder
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from comfy.hooks import HookGroup
+
+
+def broadcast_image_to(tensor, target_batch_size, batched_number):
+    current_batch_size = tensor.shape[0]
+    if current_batch_size == 1:
+        return tensor
+
+    per_batch = target_batch_size // batched_number
+    tensor = tensor[:per_batch]
+
+    if per_batch > tensor.shape[0]:
+        tensor = torch.cat([tensor] * (per_batch // tensor.shape[0]) + [tensor[:(per_batch % tensor.shape[0])]], dim=0)
+
+    current_batch_size = tensor.shape[0]
+    if current_batch_size == target_batch_size:
+        return tensor
+    else:
+        return torch.cat([tensor] * batched_number, dim=0)
+
+class StrengthType(Enum):
+    # How ControlBase.control_merge scales each control output by ``strength``.
+    # CONSTANT: every output is multiplied by ``strength``.
+    CONSTANT = 1
+    # LINEAR_UP: output ``i`` is multiplied by ``strength ** (len(outputs) - i)``.
+    LINEAR_UP = 2
+
+class ControlBase:
+    def __init__(self):
+        self.cond_hint_original = None
+        self.cond_hint = None
+        self.strength = 1.0
+        self.timestep_percent_range = (0.0, 1.0)
+        self.latent_format = None
+        self.vae = None
+        self.global_average_pooling = False
+        self.timestep_range = None
+        self.compression_ratio = 8
+        self.upscale_algorithm = 'nearest-exact'
+        self.extra_args = {}
+        self.previous_controlnet = None
+        self.extra_conds = []
+        self.strength_type = StrengthType.CONSTANT
+        self.concat_mask = False
+        self.extra_concat_orig = []
+        self.extra_concat = None
+        self.extra_hooks: HookGroup = None
+        self.preprocess_image = lambda a: a
+
+    def set_cond_hint(self, cond_hint, strength=1.0, timestep_percent_range=(0.0, 1.0), vae=None, extra_concat=[]):
+        self.cond_hint_original = cond_hint
+        self.strength = strength
+        self.timestep_percent_range = timestep_percent_range
+        if self.latent_format is not None:
+            if vae is None:
+                logging.warning("WARNING: no VAE provided to the controlnet apply node when this controlnet requires one.")
+            self.vae = vae
+        self.extra_concat_orig = extra_concat.copy()
+        if self.concat_mask and len(self.extra_concat_orig) == 0:
+            self.extra_concat_orig.append(torch.tensor([[[[1.0]]]]))
+        return self
+
+    def pre_run(self, model, percent_to_timestep_function):
+        self.timestep_range = (percent_to_timestep_function(self.timestep_percent_range[0]), percent_to_timestep_function(self.timestep_percent_range[1]))
+        if self.previous_controlnet is not None:
+            self.previous_controlnet.pre_run(model, percent_to_timestep_function)
+
+    def set_previous_controlnet(self, controlnet):
+        self.previous_controlnet = controlnet
+        return self
+
+    def cleanup(self):
+        if self.previous_controlnet is not None:
+            self.previous_controlnet.cleanup()
+
+        self.cond_hint = None
+        self.extra_concat = None
+        self.timestep_range = None
+
+    def get_models(self):
+        out = []
+        if self.previous_controlnet is not None:
+            out += self.previous_controlnet.get_models()
+        return out
+
+    def get_extra_hooks(self):
+        out = []
+        if self.extra_hooks is not None:
+            out.append(self.extra_hooks)
+        if self.previous_controlnet is not None:
+            out += self.previous_controlnet.get_extra_hooks()
+        return out
+
+    def copy_to(self, c):
+        c.cond_hint_original = self.cond_hint_original
+        c.strength = self.strength
+        c.timestep_percent_range = self.timestep_percent_range
+        c.global_average_pooling = self.global_average_pooling
+        c.compression_ratio = self.compression_ratio
+        c.upscale_algorithm = self.upscale_algorithm
+        c.latent_format = self.latent_format
+        c.extra_args = self.extra_args.copy()
+        c.vae = self.vae
+        c.extra_conds = self.extra_conds.copy()
+        c.strength_type = self.strength_type
+        c.concat_mask = self.concat_mask
+        c.extra_concat_orig = self.extra_concat_orig.copy()
+        c.extra_hooks = self.extra_hooks.clone() if self.extra_hooks else None
+        c.preprocess_image = self.preprocess_image
+
+    def inference_memory_requirements(self, dtype):
+        if self.previous_controlnet is not None:
+            return self.previous_controlnet.inference_memory_requirements(dtype)
+        return 0
+
+    def control_merge(self, control, control_prev, output_dtype):
+        out = {'input':[], 'middle':[], 'output': []}
+
+        for key in control:
+            control_output = control[key]
+            applied_to = set()
+            for i in range(len(control_output)):
+                x = control_output[i]
+                if x is not None:
+                    if self.global_average_pooling:
+                        x = torch.mean(x, dim=(2, 3), keepdim=True).repeat(1, 1, x.shape[2], x.shape[3])
+
+                    if x not in applied_to: #memory saving strategy, allow shared tensors and only apply strength to shared tensors once
+                        applied_to.add(x)
+                        if self.strength_type == StrengthType.CONSTANT:
+                            x *= self.strength
+                        elif self.strength_type == StrengthType.LINEAR_UP:
+                            x *= (self.strength ** float(len(control_output) - i))
+
+                    if output_dtype is not None and x.dtype != output_dtype:
+                        x = x.to(output_dtype)
+
+                out[key].append(x)
+
+        if control_prev is not None:
+            for x in ['input', 'middle', 'output']:
+                o = out[x]
+                for i in range(len(control_prev[x])):
+                    prev_val = control_prev[x][i]
+                    if i >= len(o):
+                        o.append(prev_val)
+                    elif prev_val is not None:
+                        if o[i] is None:
+                            o[i] = prev_val
+                        else:
+                            if o[i].shape[0] < prev_val.shape[0]:
+                                o[i] = prev_val + o[i]
+                            else:
+                                o[i] = prev_val + o[i] #TODO: change back to inplace add if shared tensors stop being an issue
+        return out
+
+    def set_extra_arg(self, argument, value=None):
+        self.extra_args[argument] = value
+
+
+class ControlNet(ControlBase):
+    """Control backed by a ``control_model`` that is called once per sampling step."""
+    def __init__(self, control_model=None, global_average_pooling=False, compression_ratio=8, latent_format=None, load_device=None, manual_cast_dtype=None, extra_conds=["y"], strength_type=StrengthType.CONSTANT, concat_mask=False, preprocess_image=lambda a: a):
+        super().__init__()
+        self.control_model = control_model
+        self.load_device = load_device
+        # the wrapped patcher is only created when a model is supplied; copy() assigns it afterwards
+        if control_model is not None:
+            self.control_model_wrapped = comfy.model_patcher.ModelPatcher(self.control_model, load_device=load_device, offload_device=comfy.model_management.unet_offload_device())
+
+        self.compression_ratio = compression_ratio
+        self.global_average_pooling = global_average_pooling
+        self.model_sampling_current = None
+        self.manual_cast_dtype = manual_cast_dtype
+        self.latent_format = latent_format
+        # extends the fresh list created by ControlBase.__init__, so the default argument is not mutated
+        self.extra_conds += extra_conds
+        self.strength_type = strength_type
+        self.concat_mask = concat_mask
+        self.preprocess_image = preprocess_image
+
+    def get_control(self, x_noisy, t, cond, batched_number, transformer_options):
+        """Run the control model for one step and merge with the previous controls.
+
+        Returns the previous controls' result (or ``None``) when ``t[0]`` lies
+        outside ``timestep_range``; otherwise the result of ``control_merge``.
+        """
+        control_prev = None
+        if self.previous_controlnet is not None:
+            control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number, transformer_options)
+
+        if self.timestep_range is not None:
+            # skip this control outside its active range
+            if t[0] > self.timestep_range[0] or t[0] < self.timestep_range[1]:
+                if control_prev is not None:
+                    return control_prev
+                else:
+                    return None
+
+        dtype = self.control_model.dtype
+        if self.manual_cast_dtype is not None:
+            dtype = self.manual_cast_dtype
+
+        # (re)build the cached hint when it is missing or its spatial size no longer matches x_noisy
+        if self.cond_hint is None or x_noisy.shape[2] * self.compression_ratio != self.cond_hint.shape[2] or x_noisy.shape[3] * self.compression_ratio != self.cond_hint.shape[3]:
+            if self.cond_hint is not None:
+                del self.cond_hint
+            self.cond_hint = None
+            compression_ratio = self.compression_ratio
+            if self.vae is not None:
+                compression_ratio *= self.vae.spacial_compression_encode()
+            else:
+                if self.latent_format is not None:
+                    raise ValueError("This Controlnet needs a VAE but none was provided, please use a ControlNetApply node with a VAE input and connect it.")
+            self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, x_noisy.shape[-1] * compression_ratio, x_noisy.shape[-2] * compression_ratio, self.upscale_algorithm, "center")
+            self.cond_hint = self.preprocess_image(self.cond_hint)
+            if self.vae is not None:
+                # remember the models in use before encoding, then load them again afterwards
+                loaded_models = comfy.model_management.loaded_models(only_currently_used=True)
+                self.cond_hint = self.vae.encode(self.cond_hint.movedim(1, -1))
+                comfy.model_management.load_models_gpu(loaded_models)
+            if self.latent_format is not None:
+                self.cond_hint = self.latent_format.process_in(self.cond_hint)
+            if len(self.extra_concat_orig) > 0:
+                # resize each extra tensor to the hint and append it along dim 1
+                to_concat = []
+                for c in self.extra_concat_orig:
+                    c = c.to(self.cond_hint.device)
+                    c = comfy.utils.common_upscale(c, self.cond_hint.shape[3], self.cond_hint.shape[2], self.upscale_algorithm, "center")
+                    to_concat.append(comfy.utils.repeat_to_batch_size(c, self.cond_hint.shape[0]))
+                self.cond_hint = torch.cat([self.cond_hint] + to_concat, dim=1)
+
+            self.cond_hint = self.cond_hint.to(device=x_noisy.device, dtype=dtype)
+        if x_noisy.shape[0] != self.cond_hint.shape[0]:
+            self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)
+
+        # NOTE(review): the fallback cond['c_crossattn'] is evaluated eagerly, so that key
+        # must exist even when 'crossattn_controlnet' is present - confirm this is intended
+        context = cond.get('crossattn_controlnet', cond['c_crossattn'])
+        extra = self.extra_args.copy()
+        for c in self.extra_conds:
+            temp = cond.get(c, None)
+            if temp is not None:
+                extra[c] = comfy.model_base.convert_tensor(temp, dtype, x_noisy.device)
+
+        timestep = self.model_sampling_current.timestep(t)
+        x_noisy = self.model_sampling_current.calculate_input(t, x_noisy)
+
+        control = self.control_model(x=x_noisy.to(dtype), hint=self.cond_hint, timesteps=timestep.to(dtype), context=comfy.model_management.cast_to_device(context, x_noisy.device, dtype), **extra)
+        return self.control_merge(control, control_prev, output_dtype=None)
+
+    def copy(self):
+        """Return a new ControlNet that shares the model and wrapped patcher with this one."""
+        c = ControlNet(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
+        c.control_model = self.control_model
+        c.control_model_wrapped = self.control_model_wrapped
+        self.copy_to(c)
+        return c
+
+    def get_models(self):
+        """Return the previous controls' models followed by this control's wrapped model."""
+        out = super().get_models()
+        out.append(self.control_model_wrapped)
+        return out
+
+    def pre_run(self, model, percent_to_timestep_function):
+        """Run the base preparation and remember the model's ``model_sampling``."""
+        super().pre_run(model, percent_to_timestep_function)
+        self.model_sampling_current = model.model_sampling
+
+    def cleanup(self):
+        """Forget the current ``model_sampling`` and run the base cleanup."""
+        self.model_sampling_current = None
+        super().cleanup()
+
+class ControlLoraOps:
+    class Linear(torch.nn.Module, comfy.ops.CastWeightBiasOp):
+        def __init__(self, in_features: int, out_features: int, bias: bool = True,
+                    device=None, dtype=None) -> None:
+            super().__init__()
+            self.in_features = in_features
+            self.out_features = out_features
+            self.weight = None
+            self.up = None
+            self.down = None
+            self.bias = None
+
+        def forward(self, input):
+            weight, bias = comfy.ops.cast_bias_weight(self, input)
+            if self.up is not None:
+                return torch.nn.functional.linear(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias)
+            else:
+                return torch.nn.functional.linear(input, weight, bias)
+
+    class Conv2d(torch.nn.Module, comfy.ops.CastWeightBiasOp):
+        def __init__(
+            self,
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride=1,
+            padding=0,
+            dilation=1,
+            groups=1,
+            bias=True,
+            padding_mode='zeros',
+            device=None,
+            dtype=None
+        ):
+            super().__init__()
+            self.in_channels = in_channels
+            self.out_channels = out_channels
+            self.kernel_size = kernel_size
+            self.stride = stride
+            self.padding = padding
+            self.dilation = dilation
+            self.transposed = False
+            self.output_padding = 0
+            self.groups = groups
+            self.padding_mode = padding_mode
+
+            self.weight = None
+            self.bias = None
+            self.up = None
+            self.down = None
+
+
+        def forward(self, input):
+            weight, bias = comfy.ops.cast_bias_weight(self, input)
+            if self.up is not None:
+                return torch.nn.functional.conv2d(input, weight + (torch.mm(self.up.flatten(start_dim=1), self.down.flatten(start_dim=1))).reshape(self.weight.shape).type(input.dtype), bias, self.stride, self.padding, self.dilation, self.groups)
+            else:
+                return torch.nn.functional.conv2d(input, weight, bias, self.stride, self.padding, self.dilation, self.groups)
+
+
+class ControlLora(ControlNet):
+    def __init__(self, control_weights, global_average_pooling=False, model_options={}): #TODO? model_options
+        ControlBase.__init__(self)
+        self.control_weights = control_weights
+        self.global_average_pooling = global_average_pooling
+        self.extra_conds += ["y"]
+
+    def pre_run(self, model, percent_to_timestep_function):
+        super().pre_run(model, percent_to_timestep_function)
+        controlnet_config = model.model_config.unet_config.copy()
+        controlnet_config.pop("out_channels")
+        controlnet_config["hint_channels"] = self.control_weights["input_hint_block.0.weight"].shape[1]
+        self.manual_cast_dtype = model.manual_cast_dtype
+        dtype = model.get_dtype()
+        if self.manual_cast_dtype is None:
+            class control_lora_ops(ControlLoraOps, comfy.ops.disable_weight_init):
+                pass
+        else:
+            class control_lora_ops(ControlLoraOps, comfy.ops.manual_cast):
+                pass
+            dtype = self.manual_cast_dtype
+
+        controlnet_config["operations"] = control_lora_ops
+        controlnet_config["dtype"] = dtype
+        self.control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)
+        self.control_model.to(comfy.model_management.get_torch_device())
+        diffusion_model = model.diffusion_model
+        sd = diffusion_model.state_dict()
+
+        for k in sd:
+            weight = sd[k]
+            try:
+                comfy.utils.set_attr_param(self.control_model, k, weight)
+            except:
+                pass
+
+        for k in self.control_weights:
+            if (k not in {"lora_controlnet"}):
+                if (k.endswith(".up") or k.endswith(".down") or k.endswith(".weight") or k.endswith(".bias")) and ("__" not in k):
+                    comfy.utils.set_attr_param(self.control_model, k, self.control_weights[k].to(dtype).to(comfy.model_management.get_torch_device()))
+
+    def copy(self):
+        c = ControlLora(self.control_weights, global_average_pooling=self.global_average_pooling)
+        self.copy_to(c)
+        return c
+
+    def cleanup(self):
+        del self.control_model
+        self.control_model = None
+        super().cleanup()
+
+    def get_models(self):
+        out = ControlBase.get_models(self)
+        return out
+
+    def inference_memory_requirements(self, dtype):
+        return comfy.utils.calculate_parameters(self.control_weights) * comfy.model_management.dtype_size(dtype) + ControlBase.inference_memory_requirements(self, dtype)
+
+def controlnet_config(sd, model_options={}):
+    model_config = comfy.model_detection.model_config_from_unet(sd, "", True)
+
+    unet_dtype = model_options.get("dtype", None)
+    if unet_dtype is None:
+        weight_dtype = comfy.utils.weight_dtype(sd)
+
+        supported_inference_dtypes = list(model_config.supported_inference_dtypes)
+        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes, weight_dtype=weight_dtype)
+
+    load_device = comfy.model_management.get_torch_device()
+    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
+
+    operations = model_options.get("custom_operations", None)
+    if operations is None:
+        operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype, disable_fast_fp8=True)
+
+    offload_device = comfy.model_management.unet_offload_device()
+    return model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device
+
+def controlnet_load_state_dict(control_model, sd):
+    missing, unexpected = control_model.load_state_dict(sd, strict=False)
+
+    if len(missing) > 0:
+        logging.warning("missing controlnet keys: {}".format(missing))
+
+    if len(unexpected) > 0:
+        logging.debug("unexpected controlnet keys: {}".format(unexpected))
+    return control_model
+
+
+def load_controlnet_mmdit(sd, model_options={}):
+    new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(new_sd, model_options=model_options)
+    num_blocks = comfy.model_detection.count_blocks(new_sd, 'joint_blocks.{}.')
+    for k in sd:
+        new_sd[k] = sd[k]
+
+    concat_mask = False
+    control_latent_channels = new_sd.get("pos_embed_input.proj.weight").shape[1]
+    if control_latent_channels == 17: #inpaint controlnet
+        concat_mask = True
+
+    control_model = comfy.cldm.mmdit.ControlNet(num_blocks=num_blocks, control_latent_channels=control_latent_channels, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    control_model = controlnet_load_state_dict(control_model, new_sd)
+
+    latent_format = comfy.latent_formats.SD3()
+    latent_format.shift_factor = 0 #SD3 controlnet weirdness
+    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype)
+    return control
+
+
+class ControlNetSD35(ControlNet):
+    def pre_run(self, model, percent_to_timestep_function):
+        if self.control_model.double_y_emb:
+            missing, unexpected = self.control_model.orig_y_embedder.load_state_dict(model.diffusion_model.y_embedder.state_dict(), strict=False)
+        else:
+            missing, unexpected = self.control_model.x_embedder.load_state_dict(model.diffusion_model.x_embedder.state_dict(), strict=False)
+        super().pre_run(model, percent_to_timestep_function)
+
+    def copy(self):
+        c = ControlNetSD35(None, global_average_pooling=self.global_average_pooling, load_device=self.load_device, manual_cast_dtype=self.manual_cast_dtype)
+        c.control_model = self.control_model
+        c.control_model_wrapped = self.control_model_wrapped
+        self.copy_to(c)
+        return c
+
+def load_controlnet_sd35(sd, model_options={}):
+    control_type = -1
+    if "control_type" in sd:
+        control_type = round(sd.pop("control_type").item())
+
+    # blur_cnet = control_type == 0
+    canny_cnet = control_type == 1
+    depth_cnet = control_type == 2
+
+    new_sd = {}
+    for k in comfy.utils.MMDIT_MAP_BASIC:
+        if k[1] in sd:
+            new_sd[k[0]] = sd.pop(k[1])
+    for k in sd:
+        new_sd[k] = sd[k]
+    sd = new_sd
+
+    y_emb_shape = sd["y_embedder.mlp.0.weight"].shape
+    depth = y_emb_shape[0] // 64
+    hidden_size = 64 * depth
+    num_heads = depth
+    head_dim = hidden_size // num_heads
+    num_blocks = comfy.model_detection.count_blocks(new_sd, 'transformer_blocks.{}.')
+
+    load_device = comfy.model_management.get_torch_device()
+    offload_device = comfy.model_management.unet_offload_device()
+    unet_dtype = comfy.model_management.unet_dtype(model_params=-1)
+
+    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
+
+    operations = model_options.get("custom_operations", None)
+    if operations is None:
+        operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype, disable_fast_fp8=True)
+
+    control_model = comfy.cldm.dit_embedder.ControlNetEmbedder(img_size=None,
+                                                               patch_size=2,
+                                                               in_chans=16,
+                                                               num_layers=num_blocks,
+                                                               main_model_double=depth,
+                                                               double_y_emb=y_emb_shape[0] == y_emb_shape[1],
+                                                               attention_head_dim=head_dim,
+                                                               num_attention_heads=num_heads,
+                                                               adm_in_channels=2048,
+                                                               device=offload_device,
+                                                               dtype=unet_dtype,
+                                                               operations=operations)
+
+    control_model = controlnet_load_state_dict(control_model, sd)
+
+    latent_format = comfy.latent_formats.SD3()
+    preprocess_image = lambda a: a
+    if canny_cnet:
+        preprocess_image = lambda a: (a * 255 * 0.5 + 0.5)
+    elif depth_cnet:
+        preprocess_image = lambda a: 1.0 - a
+
+    control = ControlNetSD35(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, preprocess_image=preprocess_image)
+    return control
+
+
+
+def load_controlnet_hunyuandit(controlnet_data, model_options={}):
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(controlnet_data, model_options=model_options)
+
+    control_model = comfy.ldm.hydit.controlnet.HunYuanControlNet(operations=operations, device=offload_device, dtype=unet_dtype)
+    control_model = controlnet_load_state_dict(control_model, controlnet_data)
+
+    latent_format = comfy.latent_formats.SDXL()
+    extra_conds = ['text_embedding_mask', 'encoder_hidden_states_t5', 'text_embedding_mask_t5', 'image_meta_size', 'style', 'cos_cis_img', 'sin_cis_img']
+    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds, strength_type=StrengthType.CONSTANT)
+    return control
+
+def load_controlnet_flux_xlabs_mistoline(sd, mistoline=False, model_options={}):
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options)
+    control_model = comfy.ldm.flux.controlnet.ControlNetFlux(mistoline=mistoline, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    control_model = controlnet_load_state_dict(control_model, sd)
+    extra_conds = ['y', 'guidance']
+    control = ControlNet(control_model, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
+    return control
+
+def load_controlnet_flux_instantx(sd, model_options={}):
+    new_sd = comfy.model_detection.convert_diffusers_mmdit(sd, "")
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(new_sd, model_options=model_options)
+    for k in sd:
+        new_sd[k] = sd[k]
+
+    num_union_modes = 0
+    union_cnet = "controlnet_mode_embedder.weight"
+    if union_cnet in new_sd:
+        num_union_modes = new_sd[union_cnet].shape[0]
+
+    control_latent_channels = new_sd.get("pos_embed_input.weight").shape[1] // 4
+    concat_mask = False
+    if control_latent_channels == 17:
+        concat_mask = True
+
+    control_model = comfy.ldm.flux.controlnet.ControlNetFlux(latent_input=True, num_union_modes=num_union_modes, control_latent_channels=control_latent_channels, operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    control_model = controlnet_load_state_dict(control_model, new_sd)
+
+    latent_format = comfy.latent_formats.Flux()
+    extra_conds = ['y', 'guidance']
+    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, concat_mask=concat_mask, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
+    return control
+
+def load_controlnet_qwen_instantx(sd, model_options={}):
+    model_config, operations, load_device, unet_dtype, manual_cast_dtype, offload_device = controlnet_config(sd, model_options=model_options)
+    control_model = comfy.ldm.qwen_image.controlnet.QwenImageControlNetModel(operations=operations, device=offload_device, dtype=unet_dtype, **model_config.unet_config)
+    control_model = controlnet_load_state_dict(control_model, sd)
+    latent_format = comfy.latent_formats.Wan21()
+    extra_conds = []
+    control = ControlNet(control_model, compression_ratio=1, latent_format=latent_format, load_device=load_device, manual_cast_dtype=manual_cast_dtype, extra_conds=extra_conds)
+    return control
+
+def convert_mistoline(sd):
+    return comfy.utils.state_dict_prefix_replace(sd, {"single_controlnet_blocks.": "controlnet_single_blocks."})
+
+
+def load_controlnet_state_dict(state_dict, model=None, model_options={}):
+    """Build a control object from a state dict, choosing the loader by marker keys.
+
+    The presence of specific keys selects one of the specialised loaders
+    (Hunyuan DiT, control LoRA, Flux, SD3/SD3.5, Qwen, mistoline). Diffusers
+    UNet controlnets are first renamed to the native key layout. Anything
+    without a ``zero_convs.0.0.weight`` key (with or without a
+    ``control_model.`` prefix) is handed to ``load_t2i_adapter``.
+
+    Args:
+        state_dict: mapping of parameter names to tensors. Entries may be
+            popped from it, and tensors may be modified in place for
+            "difference" checkpoints.
+        model: optional base model, only used for "difference" checkpoints.
+        model_options: optional overrides; the keys read here are ``dtype``,
+            ``custom_operations`` and ``global_average_pooling``.
+
+    Returns:
+        The control object produced by the selected loader, or whatever
+        ``load_t2i_adapter`` returns (``None`` when nothing is recognised).
+    """
+    controlnet_data = state_dict
+    if 'after_proj_list.18.bias' in controlnet_data.keys(): #Hunyuan DiT
+        return load_controlnet_hunyuandit(controlnet_data, model_options=model_options)
+
+    if "lora_controlnet" in controlnet_data:
+        return ControlLora(controlnet_data, model_options=model_options)
+
+    controlnet_config = None
+    supported_inference_dtypes = None
+
+    if "controlnet_cond_embedding.conv_in.weight" in controlnet_data: #diffusers format
+        controlnet_config = comfy.model_detection.unet_config_from_diffusers_unet(controlnet_data)
+        diffusers_keys = comfy.utils.unet_to_diffusers(controlnet_config)
+        diffusers_keys["controlnet_mid_block.weight"] = "middle_block_out.0.weight"
+        diffusers_keys["controlnet_mid_block.bias"] = "middle_block_out.0.bias"
+
+        # map controlnet_down_blocks.N -> zero_convs.N.0 until the first missing index
+        count = 0
+        loop = True
+        while loop:
+            suffix = [".weight", ".bias"]
+            for s in suffix:
+                k_in = "controlnet_down_blocks.{}{}".format(count, s)
+                k_out = "zero_convs.{}.0{}".format(count, s)
+                if k_in not in controlnet_data:
+                    loop = False
+                    break
+                diffusers_keys[k_in] = k_out
+            count += 1
+
+        # map the conditioning embedding (conv_in, blocks.N, conv_out) onto
+        # input_hint_block indices 0, 2, 4, ...; the first missing block index
+        # is taken to be conv_out and ends the loop
+        count = 0
+        loop = True
+        while loop:
+            suffix = [".weight", ".bias"]
+            for s in suffix:
+                if count == 0:
+                    k_in = "controlnet_cond_embedding.conv_in{}".format(s)
+                else:
+                    k_in = "controlnet_cond_embedding.blocks.{}{}".format(count - 1, s)
+                k_out = "input_hint_block.{}{}".format(count * 2, s)
+                if k_in not in controlnet_data:
+                    k_in = "controlnet_cond_embedding.conv_out{}".format(s)
+                    loop = False
+                diffusers_keys[k_in] = k_out
+            count += 1
+
+        # move every mapped entry out of controlnet_data under its new name
+        new_sd = {}
+        for k in diffusers_keys:
+            if k in controlnet_data:
+                new_sd[diffusers_keys[k]] = controlnet_data.pop(k)
+
+        if "control_add_embedding.linear_1.bias" in controlnet_data: #Union Controlnet
+            controlnet_config["union_controlnet_num_control_type"] = controlnet_data["task_embedding"].shape[0]
+            # all remaining entries are moved over, renaming the attention in_proj keys
+            for k in list(controlnet_data.keys()):
+                new_k = k.replace('.attn.in_proj_', '.attn.in_proj.')
+                new_sd[new_k] = controlnet_data.pop(k)
+
+        leftover_keys = controlnet_data.keys()
+        if len(leftover_keys) > 0:
+            logging.warning("leftover keys: {}".format(leftover_keys))
+        controlnet_data = new_sd
+    elif "controlnet_blocks.0.weight" in controlnet_data:
+        if "double_blocks.0.img_attn.norm.key_norm.scale" in controlnet_data:
+            return load_controlnet_flux_xlabs_mistoline(controlnet_data, model_options=model_options)
+        elif "pos_embed_input.proj.weight" in controlnet_data:
+            if "transformer_blocks.0.adaLN_modulation.1.bias" in controlnet_data:
+                return load_controlnet_sd35(controlnet_data, model_options=model_options) #Stability sd3.5 format
+            else:
+                return load_controlnet_mmdit(controlnet_data, model_options=model_options) #SD3 diffusers controlnet
+        elif "transformer_blocks.0.img_mlp.net.0.proj.weight" in controlnet_data:
+            return load_controlnet_qwen_instantx(controlnet_data, model_options=model_options)
+        elif "controlnet_x_embedder.weight" in controlnet_data:
+            return load_controlnet_flux_instantx(controlnet_data, model_options=model_options)
+        # no inner marker matched: fall through to the generic detection below
+
+    elif "controlnet_blocks.0.linear.weight" in controlnet_data: #mistoline flux
+        return load_controlnet_flux_xlabs_mistoline(convert_mistoline(controlnet_data), mistoline=True, model_options=model_options)
+
+    # "pth" layout keeps every key under a "control_model." prefix
+    pth_key = 'control_model.zero_convs.0.0.weight'
+    pth = False
+    key = 'zero_convs.0.0.weight'
+    if pth_key in controlnet_data:
+        pth = True
+        key = pth_key
+        prefix = "control_model."
+    elif key in controlnet_data:
+        prefix = ""
+    else:
+        # not a controlnet layout: try the T2I adapter loader instead
+        net = load_t2i_adapter(controlnet_data, model_options=model_options)
+        if net is None:
+            logging.error("error could not detect control model type.")
+        return net
+
+    if controlnet_config is None:
+        model_config = comfy.model_detection.model_config_from_unet(controlnet_data, prefix, True)
+        supported_inference_dtypes = list(model_config.supported_inference_dtypes)
+        controlnet_config = model_config.unet_config
+
+    unet_dtype = model_options.get("dtype", None)
+    if unet_dtype is None:
+        weight_dtype = comfy.utils.weight_dtype(controlnet_data)
+
+        # the diffusers path has no model_config, so no dtype list was set above
+        if supported_inference_dtypes is None:
+            supported_inference_dtypes = [comfy.model_management.unet_dtype()]
+
+        unet_dtype = comfy.model_management.unet_dtype(model_params=-1, supported_dtypes=supported_inference_dtypes, weight_dtype=weight_dtype)
+
+    load_device = comfy.model_management.get_torch_device()
+
+    manual_cast_dtype = comfy.model_management.unet_manual_cast(unet_dtype, load_device)
+    operations = model_options.get("custom_operations", None)
+    if operations is None:
+        operations = comfy.ops.pick_operations(unet_dtype, manual_cast_dtype)
+
+    # NOTE: controlnet_config is modified in place (it may be model_config.unet_config itself)
+    controlnet_config["operations"] = operations
+    controlnet_config["dtype"] = unet_dtype
+    controlnet_config["device"] = comfy.model_management.unet_offload_device()
+    controlnet_config.pop("out_channels")
+    controlnet_config["hint_channels"] = controlnet_data["{}input_hint_block.0.weight".format(prefix)].shape[1]
+    control_model = comfy.cldm.cldm.ControlNet(**controlnet_config)
+
+    if pth:
+        if 'difference' in controlnet_data:
+            if model is not None:
+                comfy.model_management.load_models_gpu([model])
+                model_sd = model.model_state_dict()
+                # add the matching base-model weight to each controlnet tensor, in place
+                for x in controlnet_data:
+                    c_m = "control_model."
+                    if x.startswith(c_m):
+                        sd_key = "diffusion_model.{}".format(x[len(c_m):])
+                        if sd_key in model_sd:
+                            cd = controlnet_data[x]
+                            cd += model_sd[sd_key].type(cd.dtype).to(cd.device)
+            else:
+                logging.warning("WARNING: Loaded a diff controlnet without a model. It will very likely not work.")
+
+        # wrapper module so the "control_model."-prefixed keys resolve on load
+        class WeightsLoader(torch.nn.Module):
+            pass
+        w = WeightsLoader()
+        w.control_model = control_model
+        missing, unexpected = w.load_state_dict(controlnet_data, strict=False)
+    else:
+        missing, unexpected = control_model.load_state_dict(controlnet_data, strict=False)
+
+    if len(missing) > 0:
+        logging.warning("missing controlnet keys: {}".format(missing))
+
+    if len(unexpected) > 0:
+        logging.debug("unexpected controlnet keys: {}".format(unexpected))
+
+    global_average_pooling = model_options.get("global_average_pooling", False)
+    control = ControlNet(control_model, global_average_pooling=global_average_pooling, load_device=load_device, manual_cast_dtype=manual_cast_dtype)
+    return control
+
+def load_controlnet(ckpt_path, model=None, model_options={}):
+    model_options = model_options.copy()
+    if "global_average_pooling" not in model_options:
+        filename = os.path.splitext(ckpt_path)[0]
+        if filename.endswith("_shuffle") or filename.endswith("_shuffle_fp16"): #TODO: smarter way of enabling global_average_pooling
+            model_options["global_average_pooling"] = True
+
+    cnet = load_controlnet_state_dict(comfy.utils.load_torch_file(ckpt_path, safe_load=True), model=model, model_options=model_options)
+    if cnet is None:
+        logging.error("error checkpoint does not contain controlnet or t2i adapter data {}".format(ckpt_path))
+    return cnet
+
+class T2IAdapter(ControlBase):
+    """Control source that runs an adapter model on the hint image and caches its output."""
+
+    def __init__(self, t2i_model, channels_in, compression_ratio, upscale_algorithm, device=None):
+        """Store the adapter model and its settings.
+
+        ``device`` defaults to ``comfy.model_management.get_torch_device()``
+        when not given.
+        """
+        super().__init__()
+        self.t2i_model = t2i_model
+        self.channels_in = channels_in
+        # cached output of t2i_model for the current hint; None forces a recompute
+        self.control_input = None
+        self.compression_ratio = compression_ratio
+        self.upscale_algorithm = upscale_algorithm
+        if device is None:
+            device = comfy.model_management.get_torch_device()
+        self.device = device
+
+    def scale_image_to(self, width, height):
+        """Round ``width`` and ``height`` up to multiples of the model's ``unshuffle_amount``."""
+        unshuffle_amount = self.t2i_model.unshuffle_amount
+        width = math.ceil(width / unshuffle_amount) * unshuffle_amount
+        height = math.ceil(height / unshuffle_amount) * unshuffle_amount
+        return width, height
+
+    def get_control(self, x_noisy, t, cond, batched_number, transformer_options):
+        """Return this adapter's control merged with any previous controlnet's output.
+
+        Outside ``timestep_range`` only the previous controlnet's result (or
+        ``None``) is returned. The hint image is rescaled when its size does
+        not match ``x_noisy`` times ``compression_ratio``, and the adapter
+        model is only run when no cached output exists.
+        """
+        control_prev = None
+        if self.previous_controlnet is not None:
+            control_prev = self.previous_controlnet.get_control(x_noisy, t, cond, batched_number, transformer_options)
+
+        if self.timestep_range is not None:
+            # only the first timestep of the batch is checked against the range
+            if t[0] > self.timestep_range[0] or t[0] < self.timestep_range[1]:
+                if control_prev is not None:
+                    return control_prev
+                else:
+                    return None
+
+        if self.cond_hint is None or x_noisy.shape[2] * self.compression_ratio != self.cond_hint.shape[2] or x_noisy.shape[3] * self.compression_ratio != self.cond_hint.shape[3]:
+            if self.cond_hint is not None:
+                del self.cond_hint
+            # the hint is being rebuilt, so the cached model output is stale too
+            self.control_input = None
+            self.cond_hint = None
+            width, height = self.scale_image_to(x_noisy.shape[3] * self.compression_ratio, x_noisy.shape[2] * self.compression_ratio)
+            self.cond_hint = comfy.utils.common_upscale(self.cond_hint_original, width, height, self.upscale_algorithm, "center").float().to(self.device)
+            # single-channel adapters get the mean over dim 1 of a multi-channel hint
+            if self.channels_in == 1 and self.cond_hint.shape[1] > 1:
+                self.cond_hint = torch.mean(self.cond_hint, 1, keepdim=True)
+        if x_noisy.shape[0] != self.cond_hint.shape[0]:
+            self.cond_hint = broadcast_image_to(self.cond_hint, x_noisy.shape[0], batched_number)
+        if self.control_input is None:
+            # move the model to the device only for this forward pass, then back to CPU
+            self.t2i_model.to(x_noisy.dtype)
+            self.t2i_model.to(self.device)
+            self.control_input = self.t2i_model(self.cond_hint.to(x_noisy.dtype))
+            self.t2i_model.cpu()
+
+        # hand clones to control_merge so the cached tensors are left untouched
+        control_input = {}
+        for k in self.control_input:
+            control_input[k] = list(map(lambda a: None if a is None else a.clone(), self.control_input[k]))
+
+        return self.control_merge(control_input, control_prev, x_noisy.dtype)
+
+    def copy(self):
+        """Return a new T2IAdapter sharing the same model, with state transferred by ``copy_to``."""
+        # NOTE(review): ``device`` is not forwarded, so the copy falls back to
+        # get_torch_device() rather than self.device - confirm this is intended.
+        c = T2IAdapter(self.t2i_model, self.channels_in, self.compression_ratio, self.upscale_algorithm)
+        self.copy_to(c)
+        return c
+
+def load_t2i_adapter(t2i_data, model_options={}): #TODO: model_options
+    compression_ratio = 8
+    upscale_algorithm = 'nearest-exact'
+
+    if 'adapter' in t2i_data:
+        t2i_data = t2i_data['adapter']
+    if 'adapter.body.0.resnets.0.block1.weight' in t2i_data: #diffusers format
+        prefix_replace = {}
+        for i in range(4):
+            for j in range(2):
+                prefix_replace["adapter.body.{}.resnets.{}.".format(i, j)] = "body.{}.".format(i * 2 + j)
+            prefix_replace["adapter.body.{}.".format(i, )] = "body.{}.".format(i * 2)
+        prefix_replace["adapter."] = ""
+        t2i_data = comfy.utils.state_dict_prefix_replace(t2i_data, prefix_replace)
+    keys = t2i_data.keys()
+
+    if "body.0.in_conv.weight" in keys:
+        cin = t2i_data['body.0.in_conv.weight'].shape[1]
+        model_ad = comfy.t2i_adapter.adapter.Adapter_light(cin=cin, channels=[320, 640, 1280, 1280], nums_rb=4)
+    elif 'conv_in.weight' in keys:
+        cin = t2i_data['conv_in.weight'].shape[1]
+        channel = t2i_data['conv_in.weight'].shape[0]
+        ksize = t2i_data['body.0.block2.weight'].shape[2]
+        use_conv = False
+        down_opts = list(filter(lambda a: a.endswith("down_opt.op.weight"), keys))
+        if len(down_opts) > 0:
+            use_conv = True
+        xl = False
+        if cin == 256 or cin == 768:
+            xl = True
+        model_ad = comfy.t2i_adapter.adapter.Adapter(cin=cin, channels=[channel, channel*2, channel*4, channel*4][:4], nums_rb=2, ksize=ksize, sk=True, use_conv=use_conv, xl=xl)
+    elif "backbone.0.0.weight" in keys:
+        model_ad = comfy.ldm.cascade.controlnet.ControlNet(c_in=t2i_data['backbone.0.0.weight'].shape[1], proj_blocks=[0, 4, 8, 12, 51, 55, 59, 63])
+        compression_ratio = 32
+        upscale_algorithm = 'bilinear'
+    elif "backbone.10.blocks.0.weight" in keys:
+        model_ad = comfy.ldm.cascade.controlnet.ControlNet(c_in=t2i_data['backbone.0.weight'].shape[1], bottleneck_mode="large", proj_blocks=[0, 4, 8, 12, 51, 55, 59, 63])
+        compression_ratio = 1
+        upscale_algorithm = 'nearest-exact'
+    else:
+        return None
+
+    missing, unexpected = model_ad.load_state_dict(t2i_data)
+    if len(missing) > 0:
+        logging.warning("t2i missing {}".format(missing))
+
+    if len(unexpected) > 0:
+        logging.debug("t2i unexpected {}".format(unexpected))
+
+    return T2IAdapter(model_ad, model_ad.input_channels, compression_ratio, upscale_algorithm)
--- a/comfy/diffusers_convert.py
+++ b/comfy/diffusers_convert.py
+import re
+import torch
+import logging
+
+# conversion code from https://github.com/huggingface/diffusers/blob/main/scripts/convert_diffusers_to_original_stable_diffusion.py
+
+# ================#
+# VAE Conversion #
+# ================#
+
+# Substring renames used by convert_vae_state_dict: in each key the second
+# element (HF Diffusers) is replaced by the first (stable-diffusion), in list order.
+vae_conversion_map = [
+    # (stable-diffusion, HF Diffusers)
+    ("nin_shortcut", "conv_shortcut"),
+    ("norm_out", "conv_norm_out"),
+    ("mid.attn_1.", "mid_block.attentions.0."),
+]
+
+for i in range(4):
+    # down_blocks have two resnets
+    for j in range(2):
+        hf_down_prefix = f"encoder.down_blocks.{i}.resnets.{j}."
+        sd_down_prefix = f"encoder.down.{i}.block.{j}."
+        vae_conversion_map.append((sd_down_prefix, hf_down_prefix))
+
+    # only the first three levels get downsampler/upsampler entries
+    if i < 3:
+        hf_downsample_prefix = f"down_blocks.{i}.downsamplers.0."
+        sd_downsample_prefix = f"down.{i}.downsample."
+        vae_conversion_map.append((sd_downsample_prefix, hf_downsample_prefix))
+
+        hf_upsample_prefix = f"up_blocks.{i}.upsamplers.0."
+        sd_upsample_prefix = f"up.{3 - i}.upsample."
+        vae_conversion_map.append((sd_upsample_prefix, hf_upsample_prefix))
+
+    # up_blocks have three resnets
+    # also, up blocks in hf are numbered in reverse from sd
+    for j in range(3):
+        hf_up_prefix = f"decoder.up_blocks.{i}.resnets.{j}."
+        sd_up_prefix = f"decoder.up.{3 - i}.block.{j}."
+        vae_conversion_map.append((sd_up_prefix, hf_up_prefix))
+
+# this part accounts for mid blocks in both the encoder and the decoder
+for i in range(2):
+    hf_mid_res_prefix = f"mid_block.resnets.{i}."
+    sd_mid_res_prefix = f"mid.block_{i + 1}."
+    vae_conversion_map.append((sd_mid_res_prefix, hf_mid_res_prefix))
+
+# Extra renames that convert_vae_state_dict applies only to keys whose original
+# name contains "attentions". Two HF spellings are listed for q/k/v and proj_out.
+vae_conversion_map_attn = [
+    # (stable-diffusion, HF Diffusers)
+    ("norm.", "group_norm."),
+    ("q.", "query."),
+    ("k.", "key."),
+    ("v.", "value."),
+    ("q.", "to_q."),
+    ("k.", "to_k."),
+    ("v.", "to_v."),
+    ("proj_out.", "to_out.0."),
+    ("proj_out.", "proj_attn."),
+]
+
+
+def reshape_weight_for_sd(w, conv3d=False):
+    # convert HF linear weights to SD conv2d weights
+    if conv3d:
+        return w.reshape(*w.shape, 1, 1, 1)
+    else:
+        return w.reshape(*w.shape, 1, 1)
+
+
+def convert_vae_state_dict(vae_state_dict):
+    mapping = {k: k for k in vae_state_dict.keys()}
+    conv3d = False
+    for k, v in mapping.items():
+        for sd_part, hf_part in vae_conversion_map:
+            v = v.replace(hf_part, sd_part)
+        if v.endswith(".conv.weight"):
+            if not conv3d and vae_state_dict[k].ndim == 5:
+                conv3d = True
+        mapping[k] = v
+    for k, v in mapping.items():
+        if "attentions" in k:
+            for sd_part, hf_part in vae_conversion_map_attn:
+                v = v.replace(hf_part, sd_part)
+            mapping[k] = v
+    new_state_dict = {v: vae_state_dict[k] for k, v in mapping.items()}
+    weights_to_convert = ["q", "k", "v", "proj_out"]
+    for k, v in new_state_dict.items():
+        for weight_name in weights_to_convert:
+            if f"mid.attn_1.{weight_name}.weight" in k:
+                logging.debug(f"Reshaping {k} for SD format")
+                new_state_dict[k] = reshape_weight_for_sd(v, conv3d=conv3d)
+    return new_state_dict
+
+
+# =========================#
+# Text Encoder Conversion #
+# =========================#
+
+
+# Name fragments for the text encoder: the second element (HF Diffusers) is
+# rewritten to the first (stable-diffusion) by convert_text_enc_state_dict_v20.
+textenc_conversion_lst = [
+    # (stable-diffusion, HF Diffusers)
+    ("resblocks.", "text_model.encoder.layers."),
+    ("ln_1", "layer_norm1"),
+    ("ln_2", "layer_norm2"),
+    (".c_fc.", ".fc1."),
+    (".c_proj.", ".fc2."),
+    (".attn", ".self_attn"),
+    ("ln_final.", "transformer.text_model.final_layer_norm."),
+    ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
+    ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
+]
+# regex-escaped HF fragment -> stable-diffusion fragment
+protected = {re.escape(x[1]): x[0] for x in textenc_conversion_lst}
+# one alternation matching any HF fragment, so a key is rewritten in a single pass
+textenc_pattern = re.compile("|".join(protected.keys()))
+
+# Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp
+# (slot of the q/k/v projection within the concatenated in_proj tensor)
+code2idx = {"q": 0, "k": 1, "v": 2}
+
+
+# This function exists because at the time of writing torch.cat can't do fp8 with cuda
+def cat_tensors(tensors):
+    x = 0
+    for t in tensors:
+        x += t.shape[0]
+
+    shape = [x] + list(tensors[0].shape)[1:]
+    out = torch.empty(shape, device=tensors[0].device, dtype=tensors[0].dtype)
+
+    x = 0
+    for t in tensors:
+        out[x:x + t.shape[0]] = t
+        x += t.shape[0]
+
+    return out
+
+
+def convert_text_enc_state_dict_v20(text_enc_dict, prefix=""):
+    new_state_dict = {}
+    capture_qkv_weight = {}
+    capture_qkv_bias = {}
+    for k, v in text_enc_dict.items():
+        if not k.startswith(prefix):
+            continue
+        if (
+                k.endswith(".self_attn.q_proj.weight")
+                or k.endswith(".self_attn.k_proj.weight")
+                or k.endswith(".self_attn.v_proj.weight")
+        ):
+            k_pre = k[: -len(".q_proj.weight")]
+            k_code = k[-len("q_proj.weight")]
+            if k_pre not in capture_qkv_weight:
+                capture_qkv_weight[k_pre] = [None, None, None]
+            capture_qkv_weight[k_pre][code2idx[k_code]] = v
+            continue
+
+        if (
+                k.endswith(".self_attn.q_proj.bias")
+                or k.endswith(".self_attn.k_proj.bias")
+                or k.endswith(".self_attn.v_proj.bias")
+        ):
+            k_pre = k[: -len(".q_proj.bias")]
+            k_code = k[-len("q_proj.bias")]
+            if k_pre not in capture_qkv_bias:
+                capture_qkv_bias[k_pre] = [None, None, None]
+            capture_qkv_bias[k_pre][code2idx[k_code]] = v
+            continue
+
+        text_proj = "transformer.text_projection.weight"
+        if k.endswith(text_proj):
+            new_state_dict[k.replace(text_proj, "text_projection")] = v.transpose(0, 1).contiguous()
+        else:
+            relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k)
+            new_state_dict[relabelled_key] = v
+
+    for k_pre, tensors in capture_qkv_weight.items():
+        if None in tensors:
+            raise Exception("CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing")
+        relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k_pre)
+        new_state_dict[relabelled_key + ".in_proj_weight"] = cat_tensors(tensors)
+
+    for k_pre, tensors in capture_qkv_bias.items():
+        if None in tensors:
+            raise Exception("CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing")
+        relabelled_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], k_pre)
+        new_state_dict[relabelled_key + ".in_proj_bias"] = cat_tensors(tensors)
+
+    return new_state_dict
+
+
+def convert_text_enc_state_dict(text_enc_dict):
+    return text_enc_dict
--- a/comfy/diffusers_load.py
+++ b/comfy/diffusers_load.py
+import os
+
+import comfy.sd
+
+def first_file(path, filenames):
+    for f in filenames:
+        p = os.path.join(path, f)
+        if os.path.exists(p):
+            return p
+    return None
+
+def load_diffusers(model_path, output_vae=True, output_clip=True, embedding_directory=None):
+    """Load the parts of a diffusers-layout model directory.
+
+    Weight files are looked up in the ``unet``, ``vae``, ``text_encoder`` and
+    ``text_encoder_2`` sub-directories of ``model_path``; for each one the
+    first existing candidate file name wins.
+
+    Args:
+        model_path: Root directory of the model.
+        output_vae: When false the VAE is not loaded and ``None`` is returned for it.
+        output_clip: When false the text encoder(s) are not loaded and ``None`` is returned.
+        embedding_directory: Forwarded to ``comfy.sd.load_clip``.
+
+    Returns:
+        The tuple ``(unet, clip, vae)``.
+    """
+    weight_file_names = ["diffusion_pytorch_model.fp16.safetensors", "diffusion_pytorch_model.safetensors", "diffusion_pytorch_model.fp16.bin", "diffusion_pytorch_model.bin"]
+    encoder_file_names = ["model.fp16.safetensors", "model.safetensors", "pytorch_model.fp16.bin", "pytorch_model.bin"]
+
+    def locate(subfolder, candidates):
+        # First existing candidate inside model_path/subfolder, or None.
+        return first_file(os.path.join(model_path, subfolder), candidates)
+
+    unet_path = locate("unet", weight_file_names)
+    vae_path = locate("vae", weight_file_names)
+
+    # The first encoder entry is always present in the list (even if it is None);
+    # the second encoder is optional and only appended when its file exists.
+    text_encoder_paths = [locate("text_encoder", encoder_file_names)]
+    second_encoder_path = locate("text_encoder_2", encoder_file_names)
+    if second_encoder_path is not None:
+        text_encoder_paths.append(second_encoder_path)
+
+    unet = comfy.sd.load_diffusion_model(unet_path)
+    clip = comfy.sd.load_clip(text_encoder_paths, embedding_directory=embedding_directory) if output_clip else None
+    vae = comfy.sd.VAE(sd=comfy.utils.load_torch_file(vae_path)) if output_vae else None
+
+    return (unet, clip, vae)
--- a/comfy/extra_samplers/__pycache__/uni_pc.cpython-310.pyc
+++ b/comfy/extra_samplers/__pycache__/uni_pc.cpython-310.pyc
--- a/comfy/extra_samplers/uni_pc.py
+++ b/comfy/extra_samplers/uni_pc.py
+#code taken from: https://github.com/wl-zhao/UniPC and modified
+
+import torch
+import math
+import logging
+
+from tqdm.auto import trange
+
+
+class NoiseScheduleVP:
+    # Noise schedule of a variance-preserving (VP) forward SDE: exposes
+    # log(alpha_t), alpha_t, sigma_t and the half-logSNR lambda_t for a time
+    # label t, plus the inverse map from lambda back to t.
+    def __init__(
+            self,
+            schedule='discrete',
+            betas=None,
+            alphas_cumprod=None,
+            continuous_beta_0=0.1,
+            continuous_beta_1=20.,
+        ):
+        r"""Create a wrapper class for the forward SDE (VP type).
+
+        ***
+        Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
+                We recommend to use schedule='discrete' for the discrete-time diffusion models, especially for high-resolution images.
+        ***
+
+        The forward SDE ensures that the conditional distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
+        We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
+        Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
+
+            log_alpha_t = self.marginal_log_mean_coeff(t)
+            sigma_t = self.marginal_std(t)
+            lambda_t = self.marginal_lambda(t)
+
+        Moreover, as lambda(t) is an invertible function, we also support its inverse function:
+
+            t = self.inverse_lambda(lambda_t)
+
+        ===============================================================
+
+        We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
+
+        1. For discrete-time DPMs:
+
+            For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
+                t_i = (i + 1) / N
+            e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
+            We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
+
+            Args:
+                betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
+                alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
+
+            Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
+            When both are given, `betas` takes precedence.
+
+            **Important**:  Please pay special attention to the `alphas_cumprod` argument:
+                The `alphas_cumprod` is the \hat{alpha_n} arrays in the notations of DDPM. Specifically, DDPMs assume that
+                    q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
+                Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
+                    alpha_{t_n} = \sqrt{\hat{alpha_n}},
+                and
+                    log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
+
+
+        2. For continuous-time DPMs:
+
+            We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM).
+
+            Args:
+                continuous_beta_0: A `float` number. Stored as `beta_0`, the smallest beta for the linear schedule.
+                continuous_beta_1: A `float` number. Stored as `beta_1`, the largest beta for the linear schedule.
+
+            The cosine-schedule hyperparameters are not arguments: the constructor fixes
+            cosine_s = 0.008 and cosine_beta_max = 999., and sets the ending time to
+            T = 0.9946 for 'cosine' and T = 1. for 'linear'.
+
+        ===============================================================
+
+        Args:
+            schedule: A `str`. The noise schedule of the forward SDE. 'discrete' for discrete-time DPMs,
+                    'linear' or 'cosine' for continuous-time DPMs.
+        Returns:
+            A wrapper object of the forward SDE (VP type).
+        Raises:
+            ValueError: if `schedule` is not one of 'discrete', 'linear', 'cosine'.
+
+        ===============================================================
+
+        Example:
+
+        # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1):
+        >>> ns = NoiseScheduleVP('discrete', betas=betas)
+
+        # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1):
+        >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
+
+        # For continuous-time DPMs (VPSDE), linear schedule:
+        >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
+
+        """
+
+        if schedule not in ['discrete', 'linear', 'cosine']:
+            raise ValueError("Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(schedule))
+
+        self.schedule = schedule
+        if schedule == 'discrete':
+            if betas is not None:
+                # log(alpha_n) = 0.5 * sum_{i <= n} log(1 - beta_i)
+                log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
+            else:
+                assert alphas_cumprod is not None
+                log_alphas = 0.5 * torch.log(alphas_cumprod)
+            self.total_N = len(log_alphas)
+            self.T = 1.
+            # Interpolation knots: t_i = (i + 1) / N, stored with shape (1, N).
+            self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1))
+            self.log_alpha_array = log_alphas.reshape((1, -1,))
+        else:
+            self.total_N = 1000
+            self.beta_0 = continuous_beta_0
+            self.beta_1 = continuous_beta_1
+            self.cosine_s = 0.008
+            self.cosine_beta_max = 999.
+            self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
+            self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.))
+            # NOTE(review): redundant, self.schedule was already assigned above.
+            self.schedule = schedule
+            if schedule == 'cosine':
+                # For the cosine schedule, T = 1 will have numerical issues. So we manually set the ending time T.
+                # Note that T = 0.9946 may be not the optimal setting. However, we find it works well.
+                self.T = 0.9946
+            else:
+                self.T = 1.
+
+    def marginal_log_mean_coeff(self, t):
+        """
+        Compute log(alpha_t) of a given continuous-time label t in [0, T].
+
+        For 'discrete' the value is obtained from `interpolate_fn` over the stored
+        (t_array, log_alpha_array) knots and flattened to 1-D; for 'linear' and
+        'cosine' a closed-form expression is evaluated elementwise on `t`.
+        """
+        if self.schedule == 'discrete':
+            return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device)).reshape((-1))
+        elif self.schedule == 'linear':
+            return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
+        elif self.schedule == 'cosine':
+            log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.))
+            # Subtract the value at t = 0 so that log(alpha_0) = 0.
+            log_alpha_t =  log_alpha_fn(t) - self.cosine_log_alpha_0
+            return log_alpha_t
+        # No other schedule can occur: __init__ rejects anything else.
+
+    def marginal_alpha(self, t):
+        """
+        Compute alpha_t of a given continuous-time label t in [0, T].
+        """
+        return torch.exp(self.marginal_log_mean_coeff(t))
+
+    def marginal_std(self, t):
+        """
+        Compute sigma_t = sqrt(1 - alpha_t^2) of a given continuous-time label t in [0, T].
+        """
+        return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))
+
+    def marginal_lambda(self, t):
+        """
+        Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
+        """
+        log_mean_coeff = self.marginal_log_mean_coeff(t)
+        # log(sigma_t) = 0.5 * log(1 - alpha_t^2)
+        log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
+        return log_mean_coeff - log_std
+
+    def inverse_lambda(self, lamb):
+        """
+        Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t.
+
+        All branches use log(alpha) = -0.5 * log(1 + exp(-2 * lambda)), which follows
+        from lambda = log(alpha) - 0.5 * log(1 - alpha^2).
+        """
+        if self.schedule == 'linear':
+            tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
+            Delta = self.beta_0**2 + tmp
+            return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
+        elif self.schedule == 'discrete':
+            log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb)
+            # Both knot arrays are flipped along dim 1 before interpolating t as a function of log_alpha.
+            t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]), torch.flip(self.t_array.to(lamb.device), [1]))
+            return t.reshape((-1,))
+        else:
+            # Remaining case is the 'cosine' schedule.
+            log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb))
+            t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * (1. + self.cosine_s) / math.pi - self.cosine_s
+            t = t_fn(log_alpha)
+            return t
+
+
+def model_wrapper(
+    model,
+    noise_schedule,
+    model_type="noise",
+    model_kwargs={},
+    guidance_type="uncond",
+    condition=None,
+    unconditional_condition=None,
+    guidance_scale=1.,
+    classifier_fn=None,
+    classifier_kwargs={},
+):
+    """Create a wrapper function for the noise prediction model.
+
+    DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to
+    firstly wrap the model function to a noise prediction model that accepts the continuous time as the input.
+
+    We support four types of the diffusion model by setting `model_type`:
+
+        1. "noise": noise prediction model. (Trained by predicting noise).
+
+        2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0).
+
+        3. "v": velocity prediction model. (Trained by predicting the velocity).
+            The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2].
+
+            [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models."
+                arXiv preprint arXiv:2202.00512 (2022).
+            [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models."
+                arXiv preprint arXiv:2210.02303 (2022).
+
+        4. "score": marginal score function. (Trained by denoising score matching).
+            Note that the score function and the noise prediction model follows a simple relationship:
+            ```
+                noise(x_t, t) = -sigma_t * score(x_t, t)
+            ```
+
+    We support three types of guided sampling by DPMs by setting `guidance_type`:
+        1. "uncond": unconditional sampling by DPMs.
+            The input `model` has the following format:
+            ``
+                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
+            ``
+
+        2. "classifier": classifier guidance sampling [3] by DPMs and another classifier.
+            The input `model` has the following format:
+            ``
+                model(x, t_input, **model_kwargs) -> noise | x_start | v | score
+            ``
+
+            The input `classifier_fn` has the following format:
+            ``
+                classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond)
+            ``
+
+            [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis,"
+                in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794.
+
+        3. "classifier-free": classifier-free guidance sampling by conditional DPMs.
+            The input `model` has the following format:
+            ``
+                model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score
+            ``
+            And if cond == `unconditional_condition`, the model output is the unconditional DPM output.
+
+            [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance."
+                arXiv preprint arXiv:2207.12598 (2022).
+
+
+    The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999)
+    or continuous-time labels (i.e. epsilon to T).
+
+    We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise:
+    ``
+        def model_fn(x, t_continuous) -> noise:
+            t_input = get_model_input_time(t_continuous)
+            return noise_pred(model, x, t_input, **model_kwargs)
+    ``
+    where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver.
+
+    ===============================================================
+
+    Args:
+        model: A diffusion model with the corresponding format described above.
+        noise_schedule: A noise schedule object, such as NoiseScheduleVP.
+        model_type: A `str`. The parameterization type of the diffusion model.
+                    "noise" or "x_start" or "v" or "score".
+        model_kwargs: A `dict`. A dict for the other inputs of the model function.
+                    It is only unpacked, never mutated.
+        guidance_type: A `str`. The type of the guidance for sampling.
+                    "uncond" or "classifier" or "classifier-free".
+        condition: A pytorch tensor. The condition for the guided sampling.
+                    Only used for "classifier" or "classifier-free" guidance type.
+        unconditional_condition: A pytorch tensor. The condition for the unconditional sampling.
+                    Only used for "classifier-free" guidance type.
+        guidance_scale: A `float`. The scale for the guided sampling.
+        classifier_fn: A classifier function. Only used for the classifier guidance.
+        classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function.
+                    It is only unpacked, never mutated.
+    Returns:
+        A noise prediction model that accepts the noised data and the continuous time as the inputs.
+    Raises:
+        AssertionError: if `model_type` or `guidance_type` is not one of the supported values.
+    """
+    # Validate up front. "score" is accepted because noise_pred_fn implements it
+    # and the documentation above lists it as a supported parameterization.
+    assert model_type in ["noise", "x_start", "v", "score"]
+    assert guidance_type in ["uncond", "classifier", "classifier-free"]
+
+    def get_model_input_time(t_continuous):
+        """
+        Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time.
+        For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N].
+        For continuous-time DPMs, we just use `t_continuous`.
+        """
+        if noise_schedule.schedule == 'discrete':
+            return (t_continuous - 1. / noise_schedule.total_N) * 1000.
+        else:
+            return t_continuous
+
+    def noise_pred_fn(x, t_continuous, cond=None):
+        """Evaluate `model` and convert its output to a noise prediction."""
+        # A single time value is broadcast to one entry per batch element.
+        if t_continuous.reshape((-1,)).shape[0] == 1:
+            t_continuous = t_continuous.expand((x.shape[0]))
+        t_input = get_model_input_time(t_continuous)
+        # Forward the condition positionally when one is supplied, matching the
+        # documented `model(x, t_input, cond, **model_kwargs)` format; without
+        # this, classifier-free guidance would silently ignore `cond`.
+        if cond is None:
+            output = model(x, t_input, **model_kwargs)
+        else:
+            output = model(x, t_input, cond, **model_kwargs)
+        if model_type == "noise":
+            return output
+        elif model_type == "x_start":
+            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
+            dims = x.dim()
+            return (x - expand_dims(alpha_t, dims) * output) / expand_dims(sigma_t, dims)
+        elif model_type == "v":
+            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
+            dims = x.dim()
+            return expand_dims(alpha_t, dims) * output + expand_dims(sigma_t, dims) * x
+        elif model_type == "score":
+            sigma_t = noise_schedule.marginal_std(t_continuous)
+            dims = x.dim()
+            return -expand_dims(sigma_t, dims) * output
+
+    def cond_grad_fn(x, t_input):
+        """
+        Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t).
+        """
+        with torch.enable_grad():
+            x_in = x.detach().requires_grad_(True)
+            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
+            return torch.autograd.grad(log_prob.sum(), x_in)[0]
+
+    def model_fn(x, t_continuous):
+        """
+        The noise prediction model function that is used for DPM-Solver.
+        """
+        if t_continuous.reshape((-1,)).shape[0] == 1:
+            t_continuous = t_continuous.expand((x.shape[0]))
+        if guidance_type == "uncond":
+            return noise_pred_fn(x, t_continuous)
+        elif guidance_type == "classifier":
+            assert classifier_fn is not None
+            t_input = get_model_input_time(t_continuous)
+            cond_grad = cond_grad_fn(x, t_input)
+            sigma_t = noise_schedule.marginal_std(t_continuous)
+            noise = noise_pred_fn(x, t_continuous)
+            return noise - guidance_scale * expand_dims(sigma_t, dims=cond_grad.dim()) * cond_grad
+        elif guidance_type == "classifier-free":
+            if guidance_scale == 1. or unconditional_condition is None:
+                return noise_pred_fn(x, t_continuous, cond=condition)
+            else:
+                # Run the unconditional and conditional passes as one doubled batch.
+                x_in = torch.cat([x] * 2)
+                t_in = torch.cat([t_continuous] * 2)
+                c_in = torch.cat([unconditional_condition, condition])
+                noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
+                return noise_uncond + guidance_scale * (noise - noise_uncond)
+
+    return model_fn
+
+
+class UniPC:
+    def __init__(
+        self,
+        model_fn,
+        noise_schedule,
+        predict_x0=True,
+        thresholding=False,
+        max_val=1.,
+        variant='bh1',
+    ):
+        """Construct a UniPC solver.
+
+        Both data prediction (`predict_x0=True`) and noise prediction
+        (`predict_x0=False`) are supported.
+
+        Args:
+            model_fn: Callable invoked as `model_fn(x, t)`; stored as `self.model`.
+            noise_schedule: Noise schedule object; stored as `self.noise_schedule`.
+            predict_x0: Selects data prediction instead of noise prediction.
+            thresholding: Enables the thresholding step in `data_prediction_fn`.
+            max_val: Lower bound for the threshold used when thresholding.
+            variant: Solver variant name (`'bh1'`, `'bh2'` or `'vary_coeff'`).
+        """
+        # Wrapped model and its schedule.
+        self.noise_schedule = noise_schedule
+        self.model = model_fn
+        # Solver configuration.
+        self.predict_x0 = predict_x0
+        self.variant = variant
+        # Thresholding configuration.
+        self.max_val = max_val
+        self.thresholding = thresholding
+
+    def dynamic_thresholding_fn(self, x0, t=None):
+        """
+        The dynamic thresholding method.
+
+        Per batch element, takes the `p`-quantile of `abs(x0)`, bounds it from
+        below by a maximum value, clamps `x0` to `[-s, s]` and divides by `s`.
+
+        `__init__` does not define `dynamic_thresholding_ratio` or
+        `thresholding_max_val`, so when they are absent this falls back to the
+        values `data_prediction_fn` uses (0.995 and `self.max_val`) instead of
+        raising AttributeError. Values set on the instance still take priority.
+
+        Args:
+            x0: Tensor to threshold; the first dimension is treated as the batch.
+            t: Unused.
+        Returns:
+            The clamped and rescaled tensor.
+        """
+        dims = x0.dim()
+        p = getattr(self, "dynamic_thresholding_ratio", 0.995)
+        max_val = getattr(self, "thresholding_max_val", None)
+        if max_val is None:
+            max_val = self.max_val
+        s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1)
+        s = expand_dims(torch.maximum(s, max_val * torch.ones_like(s)), dims)
+        x0 = torch.clamp(x0, -s, s) / s
+        return x0
+
+    def noise_prediction_fn(self, x, t):
+        """
+        Evaluate the wrapped model `self.model` at `(x, t)` and return its output
+        (the noise prediction).
+        """
+        return self.model(x, t)
+
+    def data_prediction_fn(self, x, t):
+        """
+        Return the data (x0) prediction at `(x, t)`, optionally thresholded.
+
+        The noise prediction is converted with
+        `x0 = (x - sigma_t * noise) / alpha_t`. When `self.thresholding` is
+        set, `x0` is clamped to `[-s, s]` and divided by `s`, where `s` is the
+        per-sample 0.995-quantile of `abs(x0)` bounded below by `self.max_val`.
+        """
+        eps = self.noise_prediction_fn(x, t)
+        ndim = x.dim()
+        schedule = self.noise_schedule
+        alpha_t = schedule.marginal_alpha(t)
+        sigma_t = schedule.marginal_std(t)
+        x0 = (x - expand_dims(sigma_t, ndim) * eps) / expand_dims(alpha_t, ndim)
+        if not self.thresholding:
+            return x0
+
+        quantile = 0.995   # A hyperparameter in the paper of "Imagen" [1].
+        flat_abs = torch.abs(x0).reshape((x0.shape[0], -1))
+        bound = torch.quantile(flat_abs, quantile, dim=1)
+        floor = self.max_val * torch.ones_like(bound).to(bound.device)
+        bound = expand_dims(torch.maximum(bound, floor), ndim)
+        return torch.clamp(x0, -bound, bound) / bound
+
+    def model_fn(self, x, t):
+        """
+        Evaluate the model in the parameterization selected by `self.predict_x0`:
+        the data prediction when it is truthy, the noise prediction otherwise.
+        """
+        predict = self.data_prediction_fn if self.predict_x0 else self.noise_prediction_fn
+        return predict(x, t)
+
+    def get_time_steps(self, skip_type, t_T, t_0, N, device):
+        """Compute the `N + 1` time steps used for sampling, from `t_T` to `t_0`.
+
+        Args:
+            skip_type: `'logSNR'` (uniform in half-logSNR, mapped back to time),
+                `'time_uniform'` (uniform in time) or `'time_quadratic'`
+                (uniform in sqrt(time), then squared).
+            t_T: Starting time.
+            t_0: Ending time.
+            N: Number of intervals.
+            device: Device the returned tensor is moved to.
+        Raises:
+            ValueError: for any other `skip_type`.
+        """
+        n_points = N + 1
+        if skip_type == 'time_uniform':
+            return torch.linspace(t_T, t_0, n_points).to(device)
+        if skip_type == 'time_quadratic':
+            exponent = 2
+            root_start = t_T**(1. / exponent)
+            root_end = t_0**(1. / exponent)
+            return torch.linspace(root_start, root_end, n_points).pow(exponent).to(device)
+        if skip_type == 'logSNR':
+            schedule = self.noise_schedule
+            lambda_start = schedule.marginal_lambda(torch.tensor(t_T).to(device))
+            lambda_end = schedule.marginal_lambda(torch.tensor(t_0).to(device))
+            lambda_grid = torch.linspace(lambda_start.cpu().item(), lambda_end.cpu().item(), n_points).to(device)
+            return schedule.inverse_lambda(lambda_grid)
+        raise ValueError("Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type))
+
+    def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device):
+        """
+        Get the order of each step for sampling by the singlestep DPM-Solver.
+
+        `steps` is split into a list of per-step orders that sums to `steps`
+        (for the usual positive step counts), using as many steps of the
+        requested `order` as possible and lower orders for the remainder.
+
+        Returns:
+            `(timesteps_outer, orders)`. For `'logSNR'` the outer time steps are
+            `K + 1` points (K = number of outer steps); otherwise they are picked
+            from the `steps + 1` fine-grid points at the cumulative order offsets.
+        Raises:
+            ValueError: if `order` is not 1, 2 or 3.
+        """
+        if order == 1:
+            K = steps
+            orders = [1,] * steps
+        elif order == 2:
+            full, rest = divmod(steps, 2)
+            if rest == 0:
+                K = full
+                orders = [2,] * full
+            else:
+                K = full + 1
+                orders = [2,] * full + [1]
+        elif order == 3:
+            full, rest = divmod(steps, 3)
+            K = full + 1
+            if rest == 0:
+                orders = [3,] * (full - 1) + [2, 1]
+            elif rest == 1:
+                orders = [3,] * full + [1]
+            else:
+                orders = [3,] * full + [2]
+        else:
+            raise ValueError("'order' must be '1' or '2' or '3'.")
+
+        if skip_type == 'logSNR':
+            # To reproduce the results in DPM-Solver paper
+            return self.get_time_steps(skip_type, t_T, t_0, K, device), orders
+
+        fine_grid = self.get_time_steps(skip_type, t_T, t_0, steps, device)
+        boundaries = torch.cumsum(torch.tensor([0,] + orders), 0).to(device)
+        return fine_grid[boundaries], orders
+
+    def denoise_to_zero_fn(self, x, s):
+        """
+        Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty by first-order discretization.
+
+        Implemented as the data prediction at `(x, s)`.
+        """
+        return self.data_prediction_fn(x, s)
+
+    def multistep_uni_pc_update(self, x, model_prev_list, t_prev_list, t, order, **kwargs):
+        """Dispatch one multistep UniPC update to the solver chosen by `self.variant`.
+
+        A 0-dimensional `t` is first viewed as a 1-element tensor. Variants whose
+        name contains `'bh'` go to the B(h) update; anything else must be
+        `'vary_coeff'` (asserted). Extra keyword arguments are forwarded unchanged.
+        """
+        t = t.view(-1) if len(t.shape) == 0 else t
+        if 'bh' not in self.variant:
+            assert self.variant == 'vary_coeff'
+            return self.multistep_uni_pc_vary_update(x, model_prev_list, t_prev_list, t, order, **kwargs)
+        return self.multistep_uni_pc_bh_update(x, model_prev_list, t_prev_list, t, order, **kwargs)
+
+    def multistep_uni_pc_vary_update(self, x, model_prev_list, t_prev_list, t, order, use_corrector=True):
+        """One multistep UniPC update with the 'vary_coeff' solver.
+
+        Args:
+            x: Current state, i.e. the state at time `t_prev_list[-1]`.
+            model_prev_list: Previous model outputs, most recent last; at least `order` entries.
+            t_prev_list: Times matching `model_prev_list`.
+            t: Target time of this update.
+            order: Number of previous model outputs used by the update.
+            use_corrector: When true, the model is evaluated at the predicted
+                state and a corrector step replaces the prediction.
+        Returns:
+            `(x_t, model_t)`: the updated state and the model output at `(x_t, t)`
+            computed for the corrector, or `None` when no corrector was run.
+
+        The einsum pattern 'bkchw' means the stacked differences are 5-D, so
+        `x` is assumed to be 4-D (batch, channel, height, width).
+        """
+        logging.info(f'using unified predictor-corrector with order {order} (solver type: vary coeff)')
+        ns = self.noise_schedule
+        assert order <= len(model_prev_list)
+
+        # first compute rks
+        t_prev_0 = t_prev_list[-1]
+        lambda_prev_0 = ns.marginal_lambda(t_prev_0)
+        lambda_t = ns.marginal_lambda(t)
+        model_prev_0 = model_prev_list[-1]
+        sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
+        log_alpha_t = ns.marginal_log_mean_coeff(t)
+        alpha_t = torch.exp(log_alpha_t)
+
+        # Step size in half-logSNR.
+        h = lambda_t - lambda_prev_0
+
+        rks = []
+        D1s = []
+        for i in range(1, order):
+            t_prev_i = t_prev_list[-(i + 1)]
+            model_prev_i = model_prev_list[-(i + 1)]
+            lambda_prev_i = ns.marginal_lambda(t_prev_i)
+            # Position of the i-th older step relative to the current one, in units of h.
+            # NOTE(review): unlike the B(h) variant, rk is not reduced to a scalar here;
+            # torch.tensor(rks) below presumably needs single-element tensors - confirm.
+            rk = (lambda_prev_i - lambda_prev_0) / h
+            rks.append(rk)
+            D1s.append((model_prev_i - model_prev_0) / rk)
+
+        # The last entry (1.) stands for the target time t itself.
+        rks.append(1.)
+        rks = torch.tensor(rks, device=x.device)
+
+        K = len(rks)
+        # build C matrix
+        C = []
+
+        # Column k (1-based) holds rks**(k - 1) / k!.
+        col = torch.ones_like(rks)
+        for k in range(1, K + 1):
+            C.append(col)
+            col = col * rks / (k + 1)
+        C = torch.stack(C, dim=1)
+
+        if len(D1s) > 0:
+            D1s = torch.stack(D1s, dim=1) # (B, K)
+            # The predictor only uses the previous points, hence the leading sub-matrix.
+            C_inv_p = torch.linalg.inv(C[:-1, :-1])
+            A_p = C_inv_p
+
+        if use_corrector:
+            C_inv = torch.linalg.inv(C)
+            A_c = C_inv
+
+        # Data prediction integrates with -h, noise prediction with +h.
+        hh = -h if self.predict_x0 else h
+        h_phi_1 = torch.expm1(hh)
+        h_phi_ks = []
+        factorial_k = 1
+        h_phi_k = h_phi_1
+        # Recurrence producing h_phi_ks[0..K]; h_phi_ks[0] is expm1(hh).
+        for k in range(1, K + 2):
+            h_phi_ks.append(h_phi_k)
+            h_phi_k = h_phi_k / hh - 1 / factorial_k
+            factorial_k *= (k + 1)
+
+        model_t = None
+        if self.predict_x0:
+            # First-order part of the update (data-prediction form).
+            x_t_ = (
+                sigma_t / sigma_prev_0 * x
+                - alpha_t * h_phi_1 * model_prev_0
+            )
+            # now predictor
+            x_t = x_t_
+            if len(D1s) > 0:
+                # compute the residuals for predictor
+                for k in range(K - 1):
+                    x_t = x_t - alpha_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_p[k])
+            # now corrector
+            if use_corrector:
+                model_t = self.model_fn(x_t, t)
+                D1_t = (model_t - model_prev_0)
+                # The corrector restarts from the first-order part, not from the prediction.
+                x_t = x_t_
+                # k keeps its last loop value (or 0 when the loop body never runs)
+                # and is reused to index A_c after the loop.
+                k = 0
+                for k in range(K - 1):
+                    x_t = x_t - alpha_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_c[k][:-1])
+                x_t = x_t - alpha_t * h_phi_ks[K] * (D1_t * A_c[k][-1])
+        else:
+            log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
+            # First-order part of the update (noise-prediction form).
+            x_t_ = (
+                (torch.exp(log_alpha_t - log_alpha_prev_0)) * x
+                - (sigma_t * h_phi_1) * model_prev_0
+            )
+            # now predictor
+            x_t = x_t_
+            if len(D1s) > 0:
+                # compute the residuals for predictor
+                for k in range(K - 1):
+                    x_t = x_t - sigma_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_p[k])
+            # now corrector
+            if use_corrector:
+                model_t = self.model_fn(x_t, t)
+                D1_t = (model_t - model_prev_0)
+                x_t = x_t_
+                # Same index reuse as in the data-prediction branch.
+                k = 0
+                for k in range(K - 1):
+                    x_t = x_t - sigma_t * h_phi_ks[k + 1] * torch.einsum('bkchw,k->bchw', D1s, A_c[k][:-1])
+                x_t = x_t - sigma_t * h_phi_ks[K] * (D1_t * A_c[k][-1])
+        return x_t, model_t
+
+    def multistep_uni_pc_bh_update(self, x, model_prev_list, t_prev_list, t, order, x_t=None, use_corrector=True):
+        """One multistep UniPC update with the B(h) solver (variants 'bh1' / 'bh2').
+
+        Args:
+            x: Current state, i.e. the state at time `t_prev_list[-1]`.
+            model_prev_list: Previous model outputs, most recent last; at least `order` entries.
+            t_prev_list: Times matching `model_prev_list`.
+            t: Target time; indexed with `[0]` below, so it must be at least 1-D.
+            order: Number of previous model outputs used by the update.
+            x_t: Optional precomputed prediction at `t`; when given, the
+                predictor is skipped and this value is fed to the corrector.
+            use_corrector: When true, the model is evaluated at the predicted
+                state and a corrector step replaces the prediction.
+        Returns:
+            `(x_t, model_t)`: the updated state and the model output at `(x_t, t)`
+            computed for the corrector, or `None` when no corrector was run.
+        Raises:
+            NotImplementedError: if `self.variant` is neither 'bh1' nor 'bh2'.
+        """
+        # print(f'using unified predictor-corrector with order {order} (solver type: B(h))')
+        ns = self.noise_schedule
+        assert order <= len(model_prev_list)
+        dims = x.dim()
+
+        # first compute rks
+        t_prev_0 = t_prev_list[-1]
+        lambda_prev_0 = ns.marginal_lambda(t_prev_0)
+        lambda_t = ns.marginal_lambda(t)
+        model_prev_0 = model_prev_list[-1]
+        sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
+        log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
+        alpha_t = torch.exp(log_alpha_t)
+
+        # Step size in half-logSNR.
+        h = lambda_t - lambda_prev_0
+
+        rks = []
+        D1s = []
+        for i in range(1, order):
+            t_prev_i = t_prev_list[-(i + 1)]
+            model_prev_i = model_prev_list[-(i + 1)]
+            lambda_prev_i = ns.marginal_lambda(t_prev_i)
+            # Only the first entry is used; presumably every batch element shares the same time - confirm.
+            rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
+            rks.append(rk)
+            D1s.append((model_prev_i - model_prev_0) / rk)
+
+        # The last entry (1.) stands for the target time t itself.
+        rks.append(1.)
+        rks = torch.tensor(rks, device=x.device)
+
+        R = []
+        b = []
+
+        # Data prediction integrates with -h, noise prediction with +h.
+        hh = -h[0] if self.predict_x0 else h[0]
+        h_phi_1 = torch.expm1(hh) # h\phi_1(h) = e^h - 1
+        h_phi_k = h_phi_1 / hh - 1
+
+        factorial_i = 1
+
+        if self.variant == 'bh1':
+            B_h = hh
+        elif self.variant == 'bh2':
+            B_h = torch.expm1(hh)
+        else:
+            raise NotImplementedError()
+
+        # Build the linear system R @ rhos = b: row i of R is rks**(i - 1).
+        for i in range(1, order + 1):
+            R.append(torch.pow(rks, i - 1))
+            b.append(h_phi_k * factorial_i / B_h)
+            factorial_i *= (i + 1)
+            h_phi_k = h_phi_k / hh - 1 / factorial_i
+
+        R = torch.stack(R)
+        b = torch.tensor(b, device=x.device)
+
+        # now predictor
+        # The predictor runs only with history differences and without a caller-supplied x_t.
+        use_predictor = len(D1s) > 0 and x_t is None
+        if len(D1s) > 0:
+            D1s = torch.stack(D1s, dim=1) # (B, K)
+            if x_t is None:
+                # for order 2, we use a simplified version
+                if order == 2:
+                    rhos_p = torch.tensor([0.5], device=b.device)
+                else:
+                    rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
+        else:
+            D1s = None
+
+        if use_corrector:
+            # print('using corrector')
+            # for order 1, we use a simplified version
+            if order == 1:
+                rhos_c = torch.tensor([0.5], device=b.device)
+            else:
+                rhos_c = torch.linalg.solve(R, b)
+
+        model_t = None
+        if self.predict_x0:
+            # First-order part of the update (data-prediction form).
+            x_t_ = (
+                expand_dims(sigma_t / sigma_prev_0, dims) * x
+                - expand_dims(alpha_t * h_phi_1, dims)* model_prev_0
+            )
+
+            if x_t is None:
+                if use_predictor:
+                    # Contract the history axis (dim 1 of D1s) with the predictor weights.
+                    pred_res = torch.tensordot(D1s, rhos_p, dims=([1], [0]))  # torch.einsum('k,bkchw->bchw', rhos_p, D1s)
+                else:
+                    pred_res = 0
+                x_t = x_t_ - expand_dims(alpha_t * B_h, dims) * pred_res
+
+            if use_corrector:
+                model_t = self.model_fn(x_t, t)
+                if D1s is not None:
+                    corr_res = torch.tensordot(D1s, rhos_c[:-1], dims=([1], [0]))  # torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
+                else:
+                    corr_res = 0
+                D1_t = (model_t - model_prev_0)
+                # The corrector restarts from the first-order part; the last weight applies to the new evaluation.
+                x_t = x_t_ - expand_dims(alpha_t * B_h, dims) * (corr_res + rhos_c[-1] * D1_t)
+        else:
+            # First-order part of the update (noise-prediction form).
+            x_t_ = (
+                expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
+                - expand_dims(sigma_t * h_phi_1, dims) * model_prev_0
+            )
+            if x_t is None:
+                if use_predictor:
+                    # The einsum pattern fixes D1s to 5-D here, unlike the tensordot used in the other branch.
+                    pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
+                else:
+                    pred_res = 0
+                x_t = x_t_ - expand_dims(sigma_t * B_h, dims) * pred_res
+
+            if use_corrector:
+                model_t = self.model_fn(x_t, t)
+                if D1s is not None:
+                    corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
+                else:
+                    corr_res = 0
+                D1_t = (model_t - model_prev_0)
+                x_t = x_t_ - expand_dims(sigma_t * B_h, dims) * (corr_res + rhos_c[-1] * D1_t)
+        return x_t, model_t
+
+
+    def sample(self, x, timesteps, t_start=None, t_end=None, order=3, skip_type='time_uniform',
+        method='singlestep', lower_order_final=True, denoise_to_zero=False, solver_type='dpm_solver',
+        atol=0.0078, rtol=0.05, corrector=False, callback=None, disable_pbar=False
+    ):
+        """
+        Run the multistep UniPC loop over an explicit sequence of timesteps and return the final sample.
+
+        Args:
+            x: the initial sample; `x.shape[0]` is used as the batch size when expanding each timestep.
+            timesteps: tensor of `steps + 1` time values; the loop performs
+                `steps = len(timesteps) - 1` iterations.
+            order: maximum order of the multistep update; also the number of previous model
+                outputs / timesteps kept in the history lists.
+            method: only 'multistep' is implemented. Any other value, including the default
+                'singlestep', raises NotImplementedError.
+            lower_order_final: if True, the order is capped at `steps + 1 - step` near the end.
+            callback: optional callable invoked after every iteration with a dict holding
+                'x' (current sample), 'i' (iteration index) and 'denoised' (last stored model output).
+            disable_pbar: forwarded to `trange` as `disable`.
+            t_start, t_end, skip_type, denoise_to_zero, solver_type, atol, rtol, corrector:
+                accepted but not read by the current implementation.
+        Returns:
+            The sample `x` after the last update.
+        Raises:
+            NotImplementedError: if `method` is not 'multistep'.
+            AssertionError: if there are fewer steps than `order`.
+        """
+        # t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end
+        # t_T = self.noise_schedule.T if t_start is None else t_start
+        steps = len(timesteps) - 1
+        if method == 'multistep':
+            assert steps >= order
+            # timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
+            assert timesteps.shape[0] - 1 == steps
+            # with torch.no_grad():
+            for step_index in trange(steps, disable=disable_pbar):
+                if step_index == 0:
+                    # Seed the history with the model output at the first timestep; x is not updated yet.
+                    vec_t = timesteps[0].expand((x.shape[0]))
+                    model_prev_list = [self.model_fn(x, vec_t)]
+                    t_prev_list = [vec_t]
+                elif step_index < order:
+                    init_order = step_index
+                # Init the first `order` values by lower order multistep DPM-Solver.
+                # for init_order in range(1, order):
+                    vec_t = timesteps[init_order].expand(x.shape[0])
+                    x, model_x = self.multistep_uni_pc_update(x, model_prev_list, t_prev_list, vec_t, init_order, use_corrector=True)
+                    # Evaluate the model here only if the update did not hand back a model output.
+                    if model_x is None:
+                        model_x = self.model_fn(x, vec_t)
+                    # History is still growing towards `order` entries, so append rather than shift.
+                    model_prev_list.append(model_x)
+                    t_prev_list.append(vec_t)
+                else:
+                    # The last iteration performs two updates (timesteps[steps - 1] and timesteps[steps])
+                    # so that the final timestep is reached.
+                    extra_final_step = 0
+                    if step_index == (steps - 1):
+                        extra_final_step = 1
+                    for step in range(step_index, step_index + 1 + extra_final_step):
+                        vec_t = timesteps[step].expand(x.shape[0])
+                        if lower_order_final:
+                            step_order = min(order, steps + 1 - step)
+                        else:
+                            step_order = order
+                        # print('this step order:', step_order)
+                        if step == steps:
+                            # print('do not run corrector at the last step')
+                            use_corrector = False
+                        else:
+                            use_corrector = True
+                        x, model_x =  self.multistep_uni_pc_update(x, model_prev_list, t_prev_list, vec_t, step_order, use_corrector=use_corrector)
+                        # Shift the fixed-size history left by one slot to make room for this step.
+                        for i in range(order - 1):
+                            t_prev_list[i] = t_prev_list[i + 1]
+                            model_prev_list[i] = model_prev_list[i + 1]
+                        t_prev_list[-1] = vec_t
+                        # We do not need to evaluate the final model value.
+                        if step < steps:
+                            if model_x is None:
+                                model_x = self.model_fn(x, vec_t)
+                            model_prev_list[-1] = model_x
+                if callback is not None:
+                    callback({'x': x, 'i': step_index, 'denoised': model_prev_list[-1]})
+        else:
+            raise NotImplementedError()
+        # if denoise_to_zero:
+        #     x = self.denoise_to_zero_fn(x, torch.ones((x.shape[0],)).to(device) * t_0)
+        return x
+
+
+#############################################################
+# other utility functions
+#############################################################
+
+def interpolate_fn(x, xp, yp):
+    """
+    A piecewise linear function y = f(x), using xp and yp as keypoints.
+    We implement f(x) in a differentiable way (i.e. applicable for autograd).
+    The function f(x) is well-defined for all x-axis. (For x beyond the bounds of xp, we use the outmost points of xp to define the linear function.)
+
+    Args:
+        x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
+        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
+        yp: PyTorch tensor with shape [C, K].
+    Returns:
+        The function values f(x), with shape [N, C].
+    """
+    N, K = x.shape[0], xp.shape[1]
+    all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
+    sorted_all_x, x_indices = torch.sort(all_x, dim=2)
+    x_idx = torch.argmin(x_indices, dim=2)
+    cand_start_idx = x_idx - 1
+    start_idx = torch.where(
+        torch.eq(x_idx, 0),
+        torch.tensor(1, device=x.device),
+        torch.where(
+            torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
+        ),
+    )
+    end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
+    start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
+    end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
+    start_idx2 = torch.where(
+        torch.eq(x_idx, 0),
+        torch.tensor(0, device=x.device),
+        torch.where(
+            torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
+        ),
+    )
+    y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1)
+    start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
+    end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
+    cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
+    return cand
+
+
+def expand_dims(v, dims):
+    """
+    Expand the tensor `v` to the dim `dims`.
+
+    Args:
+        `v`: a PyTorch tensor with shape [N].
+        `dim`: a `int`.
+    Returns:
+        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
+    """
+    return v[(...,) + (None,)*(dims - 1)]
+
+
+class SigmaConvert:
+    schedule = ""
+    def marginal_log_mean_coeff(self, sigma):
+        return 0.5 * torch.log(1 / ((sigma * sigma) + 1))
+
+    def marginal_alpha(self, t):
+        return torch.exp(self.marginal_log_mean_coeff(t))
+
+    def marginal_std(self, t):
+        return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t)))
+
+    def marginal_lambda(self, t):
+        """
+        Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T].
+        """
+        log_mean_coeff = self.marginal_log_mean_coeff(t)
+        log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff))
+        return log_mean_coeff - log_std
+
+def predict_eps_sigma(model, input, sigma_in, **kwargs):
+    sigma = sigma_in.view(sigma_in.shape[:1] + (1,) * (input.ndim - 1))
+    input = input * ((sigma ** 2 + 1.0) ** 0.5)
+    return  (input - model(input, sigma_in, **kwargs)) / sigma
+
+
+def sample_unipc(model, noise, sigmas, extra_args=None, callback=None, disable=False, variant='bh1'):
+        timesteps = sigmas.clone()
+        if sigmas[-1] == 0:
+            timesteps = sigmas[:]
+            timesteps[-1] = 0.001
+        else:
+            timesteps = sigmas.clone()
+        ns = SigmaConvert()
+
+        noise = noise / torch.sqrt(1.0 + timesteps[0] ** 2.0)
+        model_type = "noise"
+
+        model_fn = model_wrapper(
+            lambda input, sigma, **kwargs: predict_eps_sigma(model, input, sigma, **kwargs),
+            ns,
+            model_type=model_type,
+            guidance_type="uncond",
+            model_kwargs=extra_args,
+        )
+
+        order = min(3, len(timesteps) - 2)
+        uni_pc = UniPC(model_fn, ns, predict_x0=True, thresholding=False, variant=variant)
+        x = uni_pc.sample(noise, timesteps=timesteps, skip_type="time_uniform", method="multistep", order=order, lower_order_final=True, callback=callback, disable_pbar=disable)
+        x /= ns.marginal_alpha(timesteps[-1])
+        return x
+
+def sample_unipc_bh2(model, noise, sigmas, extra_args=None, callback=None, disable=False):
+    return sample_unipc(model, noise, sigmas, extra_args, callback, disable, variant='bh2')
--- a/comfy/float.py
+++ b/comfy/float.py
+import torch
+
+def calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=None):
+    mantissa_scaled = torch.where(
+        normal_mask,
+        (abs_x / (2.0 ** (exponent - EXPONENT_BIAS)) - 1.0) * (2**MANTISSA_BITS),
+        (abs_x / (2.0 ** (-EXPONENT_BIAS + 1 - MANTISSA_BITS)))
+    )
+
+    mantissa_scaled += torch.rand(mantissa_scaled.size(), dtype=mantissa_scaled.dtype, layout=mantissa_scaled.layout, device=mantissa_scaled.device, generator=generator)
+    return mantissa_scaled.floor() / (2**MANTISSA_BITS)
+
+#Not 100% sure about this
+def manual_stochastic_round_to_float8(x, dtype, generator=None):
+    """
+    Stochastically round `x` onto the value grid of a float8 format.
+
+    The input is converted to half precision, split into sign, exponent and mantissa, the
+    mantissa is rounded by `calc_mantissa`, and the pieces are recombined and clamped to the
+    finite range of `dtype`. The result is NOT cast to `dtype` here; `stochastic_rounding`
+    copies it into a tensor of the target dtype.
+
+    Args:
+        x: tensor to round.
+        dtype: `torch.float8_e4m3fn` or `torch.float8_e5m2`.
+        generator: optional `torch.Generator` forwarded to `calc_mantissa`.
+    Returns:
+        The rounded values as a tensor (the `sign` buffer is reused as the output).
+    Raises:
+        ValueError: if `dtype` is not one of the two supported float8 formats.
+    """
+    if dtype == torch.float8_e4m3fn:
+        EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS = 4, 3, 7
+    elif dtype == torch.float8_e5m2:
+        EXPONENT_BITS, MANTISSA_BITS, EXPONENT_BIAS = 5, 2, 15
+    else:
+        raise ValueError("Unsupported dtype")
+
+    x = x.half()
+    sign = torch.sign(x)
+    abs_x = x.abs()
+    # Force the sign of exact zeros to 0 so they stay 0 after recombination.
+    sign = torch.where(abs_x == 0, 0, sign)
+
+    # Combine exponent calculation and clamping
+    exponent = torch.clamp(
+        torch.floor(torch.log2(abs_x)) + EXPONENT_BIAS,
+        0, 2**EXPONENT_BITS - 1
+    )
+
+    # Combine mantissa calculation and rounding
+    # A biased exponent of 0 marks the subnormal range; everything else is a normal number.
+    normal_mask = ~(exponent == 0)
+
+    # Overwrite abs_x in place with the rounded mantissa fraction to avoid another buffer.
+    abs_x[:] = calc_mantissa(abs_x, exponent, normal_mask, MANTISSA_BITS, EXPONENT_BIAS, generator=generator)
+
+    # Rebuild the magnitude from exponent and mantissa and fold it into `sign` in place.
+    sign *= torch.where(
+        normal_mask,
+        (2.0 ** (exponent - EXPONENT_BIAS)) * (1.0 + abs_x),
+        (2.0 ** (-EXPONENT_BIAS + 1)) * abs_x
+    )
+
+    # Clamp to the finite range of the target format (written back into `sign`).
+    inf = torch.finfo(dtype)
+    torch.clamp(sign, min=inf.min, max=inf.max, out=sign)
+    return sign
+
+
+
+def stochastic_rounding(value, dtype, seed=0):
+    if dtype == torch.float32:
+        return value.to(dtype=torch.float32)
+    if dtype == torch.float16:
+        return value.to(dtype=torch.float16)
+    if dtype == torch.bfloat16:
+        return value.to(dtype=torch.bfloat16)
+    if dtype == torch.float8_e4m3fn or dtype == torch.float8_e5m2:
+        generator = torch.Generator(device=value.device)
+        generator.manual_seed(seed)
+        output = torch.empty_like(value, dtype=dtype)
+        num_slices = max(1, (value.numel() / (4096 * 4096)))
+        slice_size = max(1, round(value.shape[0] / num_slices))
+        for i in range(0, value.shape[0], slice_size):
+            output[i:i+slice_size].copy_(manual_stochastic_round_to_float8(value[i:i+slice_size], dtype, generator=generator))
+        return output
+
+    return value.to(dtype=dtype)
--- a/comfy/gligen.py
+++ b/comfy/gligen.py
+import math
+import torch
+from torch import nn
+from .ldm.modules.attention import CrossAttention, FeedForward
+import comfy.ops
+ops = comfy.ops.manual_cast
+
+
+class GatedCrossAttentionDense(nn.Module):
+    def __init__(self, query_dim, context_dim, n_heads, d_head):
+        super().__init__()
+
+        self.attn = CrossAttention(
+            query_dim=query_dim,
+            context_dim=context_dim,
+            heads=n_heads,
+            dim_head=d_head,
+            operations=ops)
+        self.ff = FeedForward(query_dim, glu=True)
+
+        self.norm1 = ops.LayerNorm(query_dim)
+        self.norm2 = ops.LayerNorm(query_dim)
+
+        self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
+        self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
+
+        # this can be useful: we can externally change magnitude of tanh(alpha)
+        # for example, when it is set to 0, then the entire model is same as
+        # original one
+        self.scale = 1
+
+    def forward(self, x, objs):
+
+        x = x + self.scale * \
+            torch.tanh(self.alpha_attn) * self.attn(self.norm1(x), objs, objs)
+        x = x + self.scale * \
+            torch.tanh(self.alpha_dense) * self.ff(self.norm2(x))
+
+        return x
+
+
+class GatedSelfAttentionDense(nn.Module):
+    def __init__(self, query_dim, context_dim, n_heads, d_head):
+        super().__init__()
+
+        # we need a linear projection since we need cat visual feature and obj
+        # feature
+        self.linear = ops.Linear(context_dim, query_dim)
+
+        self.attn = CrossAttention(
+            query_dim=query_dim,
+            context_dim=query_dim,
+            heads=n_heads,
+            dim_head=d_head,
+            operations=ops)
+        self.ff = FeedForward(query_dim, glu=True)
+
+        self.norm1 = ops.LayerNorm(query_dim)
+        self.norm2 = ops.LayerNorm(query_dim)
+
+        self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
+        self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
+
+        # this can be useful: we can externally change magnitude of tanh(alpha)
+        # for example, when it is set to 0, then the entire model is same as
+        # original one
+        self.scale = 1
+
+    def forward(self, x, objs):
+
+        N_visual = x.shape[1]
+        objs = self.linear(objs)
+
+        x = x + self.scale * torch.tanh(self.alpha_attn) * self.attn(
+            self.norm1(torch.cat([x, objs], dim=1)))[:, 0:N_visual, :]
+        x = x + self.scale * \
+            torch.tanh(self.alpha_dense) * self.ff(self.norm2(x))
+
+        return x
+
+
+class GatedSelfAttentionDense2(nn.Module):
+    def __init__(self, query_dim, context_dim, n_heads, d_head):
+        super().__init__()
+
+        # we need a linear projection since we need cat visual feature and obj
+        # feature
+        self.linear = ops.Linear(context_dim, query_dim)
+
+        self.attn = CrossAttention(
+            query_dim=query_dim, context_dim=query_dim, dim_head=d_head, operations=ops)
+        self.ff = FeedForward(query_dim, glu=True)
+
+        self.norm1 = ops.LayerNorm(query_dim)
+        self.norm2 = ops.LayerNorm(query_dim)
+
+        self.register_parameter('alpha_attn', nn.Parameter(torch.tensor(0.)))
+        self.register_parameter('alpha_dense', nn.Parameter(torch.tensor(0.)))
+
+        # this can be useful: we can externally change magnitude of tanh(alpha)
+        # for example, when it is set to 0, then the entire model is same as
+        # original one
+        self.scale = 1
+
+    def forward(self, x, objs):
+
+        B, N_visual, _ = x.shape
+        B, N_ground, _ = objs.shape
+
+        objs = self.linear(objs)
+
+        # sanity check
+        size_v = math.sqrt(N_visual)
+        size_g = math.sqrt(N_ground)
+        assert int(size_v) == size_v, "Visual tokens must be square rootable"
+        assert int(size_g) == size_g, "Grounding tokens must be square rootable"
+        size_v = int(size_v)
+        size_g = int(size_g)
+
+        # select grounding token and resize it to visual token size as residual
+        out = self.attn(self.norm1(torch.cat([x, objs], dim=1)))[
+            :, N_visual:, :]
+        out = out.permute(0, 2, 1).reshape(B, -1, size_g, size_g)
+        out = torch.nn.functional.interpolate(
+            out, (size_v, size_v), mode='bicubic')
+        residual = out.reshape(B, -1, N_visual).permute(0, 2, 1)
+
+        # add residual to visual feature
+        x = x + self.scale * torch.tanh(self.alpha_attn) * residual
+        x = x + self.scale * \
+            torch.tanh(self.alpha_dense) * self.ff(self.norm2(x))
+
+        return x
+
+
+class FourierEmbedder():
+    def __init__(self, num_freqs=64, temperature=100):
+
+        self.num_freqs = num_freqs
+        self.temperature = temperature
+        self.freq_bands = temperature ** (torch.arange(num_freqs) / num_freqs)
+
+    @torch.no_grad()
+    def __call__(self, x, cat_dim=-1):
+        "x: arbitrary shape of tensor. dim: cat dim"
+        out = []
+        for freq in self.freq_bands:
+            out.append(torch.sin(freq * x))
+            out.append(torch.cos(freq * x))
+        return torch.cat(out, cat_dim)
+
+
+class PositionNet(nn.Module):
+    def __init__(self, in_dim, out_dim, fourier_freqs=8):
+        super().__init__()
+        self.in_dim = in_dim
+        self.out_dim = out_dim
+
+        self.fourier_embedder = FourierEmbedder(num_freqs=fourier_freqs)
+        self.position_dim = fourier_freqs * 2 * 4  # 2 is sin&cos, 4 is xyxy
+
+        self.linears = nn.Sequential(
+            ops.Linear(self.in_dim + self.position_dim, 512),
+            nn.SiLU(),
+            ops.Linear(512, 512),
+            nn.SiLU(),
+            ops.Linear(512, out_dim),
+        )
+
+        self.null_positive_feature = torch.nn.Parameter(
+            torch.zeros([self.in_dim]))
+        self.null_position_feature = torch.nn.Parameter(
+            torch.zeros([self.position_dim]))
+
+    def forward(self, boxes, masks, positive_embeddings):
+        B, N, _ = boxes.shape
+        masks = masks.unsqueeze(-1)
+        positive_embeddings = positive_embeddings
+
+        # embedding position (it may includes padding as placeholder)
+        xyxy_embedding = self.fourier_embedder(boxes)  # B*N*4 --> B*N*C
+
+        # learnable null embedding
+        positive_null = self.null_positive_feature.to(device=boxes.device, dtype=boxes.dtype).view(1, 1, -1)
+        xyxy_null = self.null_position_feature.to(device=boxes.device, dtype=boxes.dtype).view(1, 1, -1)
+
+        # replace padding with learnable null embedding
+        positive_embeddings = positive_embeddings * \
+            masks + (1 - masks) * positive_null
+        xyxy_embedding = xyxy_embedding * masks + (1 - masks) * xyxy_null
+
+        objs = self.linears(
+            torch.cat([positive_embeddings, xyxy_embedding], dim=-1))
+        assert objs.shape == torch.Size([B, N, self.out_dim])
+        return objs
+
+
+class Gligen(nn.Module):
+    def __init__(self, modules, position_net, key_dim):
+        super().__init__()
+        self.module_list = nn.ModuleList(modules)
+        self.position_net = position_net
+        self.key_dim = key_dim
+        self.max_objs = 30
+        self.current_device = torch.device("cpu")
+
+    def _set_position(self, boxes, masks, positive_embeddings):
+        objs = self.position_net(boxes, masks, positive_embeddings)
+        def func(x, extra_options):
+            key = extra_options["transformer_index"]
+            module = self.module_list[key]
+            return module(x, objs.to(device=x.device, dtype=x.dtype))
+        return func
+
+    def set_position(self, latent_image_shape, position_params, device):
+        batch, c, h, w = latent_image_shape
+        masks = torch.zeros([self.max_objs], device="cpu")
+        boxes = []
+        positive_embeddings = []
+        for p in position_params:
+            x1 = (p[4]) / w
+            y1 = (p[3]) / h
+            x2 = (p[4] + p[2]) / w
+            y2 = (p[3] + p[1]) / h
+            masks[len(boxes)] = 1.0
+            boxes += [torch.tensor((x1, y1, x2, y2)).unsqueeze(0)]
+            positive_embeddings += [p[0]]
+        append_boxes = []
+        append_conds = []
+        if len(boxes) < self.max_objs:
+            append_boxes = [torch.zeros(
+                [self.max_objs - len(boxes), 4], device="cpu")]
+            append_conds = [torch.zeros(
+                [self.max_objs - len(boxes), self.key_dim], device="cpu")]
+
+        box_out = torch.cat(
+            boxes + append_boxes).unsqueeze(0).repeat(batch, 1, 1)
+        masks = masks.unsqueeze(0).repeat(batch, 1)
+        conds = torch.cat(positive_embeddings +
+                          append_conds).unsqueeze(0).repeat(batch, 1, 1)
+        return self._set_position(
+            box_out.to(device),
+            masks.to(device),
+            conds.to(device))
+
+    def set_empty(self, latent_image_shape, device):
+        batch, c, h, w = latent_image_shape
+        masks = torch.zeros([self.max_objs], device="cpu").repeat(batch, 1)
+        box_out = torch.zeros([self.max_objs, 4],
+                              device="cpu").repeat(batch, 1, 1)
+        conds = torch.zeros([self.max_objs, self.key_dim],
+                            device="cpu").repeat(batch, 1, 1)
+        return self._set_position(
+            box_out.to(device),
+            masks.to(device),
+            conds.to(device))
+
+
+def load_gligen(sd):
+    sd_k = sd.keys()
+    output_list = []
+    key_dim = 768
+    for a in ["input_blocks", "middle_block", "output_blocks"]:
+        for b in range(20):
+            k_temp = filter(lambda k: "{}.{}.".format(a, b)
+                            in k and ".fuser." in k, sd_k)
+            k_temp = map(lambda k: (k, k.split(".fuser.")[-1]), k_temp)
+
+            n_sd = {}
+            for k in k_temp:
+                n_sd[k[1]] = sd[k[0]]
+            if len(n_sd) > 0:
+                query_dim = n_sd["linear.weight"].shape[0]
+                key_dim = n_sd["linear.weight"].shape[1]
+
+                if key_dim == 768:  # SD1.x
+                    n_heads = 8
+                    d_head = query_dim // n_heads
+                else:
+                    d_head = 64
+                    n_heads = query_dim // d_head
+
+                gated = GatedSelfAttentionDense(
+                    query_dim, key_dim, n_heads, d_head)
+                gated.load_state_dict(n_sd, strict=False)
+                output_list.append(gated)
+
+    if "position_net.null_positive_feature" in sd_k:
+        in_dim = sd["position_net.null_positive_feature"].shape[0]
+        out_dim = sd["position_net.linears.4.weight"].shape[0]
+
+        class WeightsLoader(torch.nn.Module):
+            pass
+        w = WeightsLoader()
+        w.position_net = PositionNet(in_dim, out_dim)
+        w.load_state_dict(sd, strict=False)
+
+    gligen = Gligen(output_list, w.position_net, key_dim)
+    return gligen
--- a/comfy/hooks.py
+++ b/comfy/hooks.py
+from __future__ import annotations
+from typing import TYPE_CHECKING, Callable
+import enum
+import math
+import torch
+import numpy as np
+import itertools
+import logging
+
+if TYPE_CHECKING:
+    from comfy.model_patcher import ModelPatcher, PatcherInjection
+    from comfy.model_base import BaseModel
+    from comfy.sd import CLIP
+import comfy.lora
+import comfy.model_management
+import comfy.patcher_extension
+from node_helpers import conditioning_set_values
+
+# #######################################################################################################
+# Hooks explanation
+# -------------------
+# The purpose of hooks is to allow conds to influence sampling without the need for ComfyUI core code to
+# make explicit special cases like it does for ControlNet and GLIGEN.
+#
+# This is necessary for nodes/features that are intended for use with masked or scheduled conds, or those
+# that should run special code when a 'marked' cond is used in sampling.
+# #######################################################################################################
+
+class EnumHookMode(enum.Enum):
+    '''
+    Priority of hook memory optimization vs. speed, mostly related to WeightHooks.
+
+    MinVram: No caching will occur for any operations related to hooks.
+    MaxSpeed: Excess VRAM (and RAM, once VRAM is sufficiently depleted) will be used to cache hook weights when switching hook groups.
+    '''
+    # Favor low memory use: nothing hook-related is cached.
+    MinVram = "minvram"
+    # Favor speed: spend spare VRAM/RAM on caching hook weights.
+    MaxSpeed = "maxspeed"
+
+class EnumHookType(enum.Enum):
+    '''
+    Hook types, each of which has different expected behavior.
+
+    Each Hook subclass passes its own member to Hook.__init__ as hook_type.
+    '''
+    # Set by WeightHook.
+    Weight = "weight"
+    # Set by ObjectPatchHook.
+    ObjectPatch = "object_patch"
+    # Set by AdditionalModelsHook.
+    AdditionalModels = "add_models"
+    # Set by TransformerOptionsHook (alias WrapperHook).
+    TransformerOptions = "transformer_options"
+    # Set by InjectionsHook.
+    Injections = "add_injections"
+
+class EnumWeightTarget(enum.Enum):
+    '''
+    Which kind of model a weight hook is being applied to.
+
+    Stored under the 'target' key of a target dict (see create_target_dict) and read by
+    WeightHook.add_hook_patches to choose between the clip and model strength/key map.
+    '''
+    # Any target other than Clip (including a missing one) is handled as Model by WeightHook.
+    Model = "model"
+    Clip = "clip"
+
+class EnumHookScope(enum.Enum):
+    '''
+    Determines if hook should be limited in its influence over sampling.
+
+    AllConditioning: hook will affect all conds used in sampling.
+    HookedOnly: hook will only affect the conds it was attached to.
+
+    AllConditioning is the default scope in Hook.__init__.
+    '''
+    # Hook applies to every cond used in the sampling run.
+    AllConditioning = "all_conditioning"
+    # Hook applies only to the conds it was attached to.
+    HookedOnly = "hooked_only"
+
+
+class _HookRef:
+    '''
+    Empty marker object used as a hook's identity token.
+
+    Hook stores one instance in hook_ref, Hook.clone() hands the same instance to the clone,
+    and Hook.__eq__/__hash__ compare and hash by it. No __eq__/__hash__ is defined here, so
+    Python's default identity-based comparison applies.
+    '''
+    pass
+
+
+def default_should_register(hook: Hook, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+    '''
+    Default for Hook.custom_should_register: always allow registration.
+
+    Also serves as an example of the signature a custom_should_register function needs;
+    Hook.should_register calls it with (hook, model, model_options, target_dict, registered).
+
+    Returns True unconditionally; none of the arguments are inspected.
+    '''
+    return True
+
+
+def create_target_dict(target: EnumWeightTarget=None, **kwargs) -> dict[str]:
+    '''Creates base dictionary for use with Hooks' target param.'''
+    d = {}
+    if target is not None:
+        d['target'] = target
+    d.update(kwargs)
+    return d
+
+
+class Hook:
+    def __init__(self, hook_type: EnumHookType=None, hook_ref: _HookRef=None, hook_id: str=None,
+                 hook_keyframe: HookKeyframeGroup=None, hook_scope=EnumHookScope.AllConditioning):
+        self.hook_type = hook_type
+        '''Enum identifying the general class of this hook.'''
+        self.hook_ref = hook_ref if hook_ref else _HookRef()
+        '''Reference shared between hook clones that have the same value. Should NOT be modified.'''
+        self.hook_id = hook_id
+        '''Optional string ID to identify hook; useful if need to consolidate duplicates at registration time.'''
+        self.hook_keyframe = hook_keyframe if hook_keyframe else HookKeyframeGroup()
+        '''Keyframe storage that can be referenced to get strength for current sampling step.'''
+        self.hook_scope = hook_scope
+        '''Scope of where this hook should apply in terms of the conds used in sampling run.'''
+        self.custom_should_register = default_should_register
+        '''Can be overriden with a compatible function to decide if this hook should be registered without the need to override .should_register'''
+
+    @property
+    def strength(self):
+        return self.hook_keyframe.strength
+
+    def initialize_timesteps(self, model: BaseModel):
+        self.reset()
+        self.hook_keyframe.initialize_timesteps(model)
+
+    def reset(self):
+        self.hook_keyframe.reset()
+
+    def clone(self):
+        c: Hook = self.__class__()
+        c.hook_type = self.hook_type
+        c.hook_ref = self.hook_ref
+        c.hook_id = self.hook_id
+        c.hook_keyframe = self.hook_keyframe
+        c.hook_scope = self.hook_scope
+        c.custom_should_register = self.custom_should_register
+        return c
+
+    def should_register(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        return self.custom_should_register(self, model, model_options, target_dict, registered)
+
+    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        raise NotImplementedError("add_hook_patches should be defined for Hook subclasses")
+
+    def __eq__(self, other: Hook):
+        return self.__class__ == other.__class__ and self.hook_ref == other.hook_ref
+
+    def __hash__(self):
+        return hash(self.hook_ref)
+
+class WeightHook(Hook):
+    '''
+    Hook responsible for tracking weights to be applied to some model/clip.
+
+    Note, value of hook_scope is ignored and is treated as HookedOnly.
+    '''
+    def __init__(self, strength_model=1.0, strength_clip=1.0):
+        super().__init__(hook_type=EnumHookType.Weight, hook_scope=EnumHookScope.HookedOnly)
+        self.weights: dict = None
+        self.weights_clip: dict = None
+        self.need_weight_init = True
+        self._strength_model = strength_model
+        self._strength_clip = strength_clip
+        self.hook_scope = EnumHookScope.HookedOnly # this value does not matter for WeightHooks, just for docs
+
+    @property
+    def strength_model(self):
+        return self._strength_model * self.strength
+
+    @property
+    def strength_clip(self):
+        return self._strength_clip * self.strength
+
+    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        if not self.should_register(model, model_options, target_dict, registered):
+            return False
+        weights = None
+
+        target = target_dict.get('target', None)
+        if target == EnumWeightTarget.Clip:
+            strength = self._strength_clip
+        else:
+            strength = self._strength_model
+
+        if self.need_weight_init:
+            key_map = {}
+            if target == EnumWeightTarget.Clip:
+                key_map = comfy.lora.model_lora_keys_clip(model.model, key_map)
+            else:
+                key_map = comfy.lora.model_lora_keys_unet(model.model, key_map)
+            weights = comfy.lora.load_lora(self.weights, key_map, log_missing=False)
+        else:
+            if target == EnumWeightTarget.Clip:
+                weights = self.weights_clip
+            else:
+                weights = self.weights
+        model.add_hook_patches(hook=self, patches=weights, strength_patch=strength)
+        registered.add(self)
+        return True
+        # TODO: add logs about any keys that were not applied
+
+    def clone(self):
+        c: WeightHook = super().clone()
+        c.weights = self.weights
+        c.weights_clip = self.weights_clip
+        c.need_weight_init = self.need_weight_init
+        c._strength_model = self._strength_model
+        c._strength_clip = self._strength_clip
+        return c
+
+class ObjectPatchHook(Hook):
+    def __init__(self, object_patches: dict[str]=None,
+                 hook_scope=EnumHookScope.AllConditioning):
+        super().__init__(hook_type=EnumHookType.ObjectPatch)
+        self.object_patches = object_patches
+        self.hook_scope = hook_scope
+
+    def clone(self):
+        c: ObjectPatchHook = super().clone()
+        c.object_patches = self.object_patches
+        return c
+
+    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        raise NotImplementedError("ObjectPatchHook is not supported yet in ComfyUI.")
+
+class AdditionalModelsHook(Hook):
+    '''
+    Hook responsible for telling model management any additional models that should be loaded.
+
+    Note, value of hook_scope is ignored and is treated as AllConditioning.
+    '''
+    def __init__(self, models: list[ModelPatcher]=None, key: str=None):
+        super().__init__(hook_type=EnumHookType.AdditionalModels)
+        self.models = models
+        self.key = key
+
+    def clone(self):
+        c: AdditionalModelsHook = super().clone()
+        c.models = self.models.copy() if self.models else self.models
+        c.key = self.key
+        return c
+
+    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        if not self.should_register(model, model_options, target_dict, registered):
+            return False
+        registered.add(self)
+        return True
+
+class TransformerOptionsHook(Hook):
+    '''
+    Hook responsible for adding wrappers, callbacks, patches, or anything else related to transformer_options.
+    '''
+    def __init__(self, transformers_dict: dict[str, dict[str, dict[str, list[Callable]]]]=None,
+                 hook_scope=EnumHookScope.AllConditioning):
+        super().__init__(hook_type=EnumHookType.TransformerOptions)
+        self.transformers_dict = transformers_dict
+        self.hook_scope = hook_scope
+        # Internal value used to avoid double load of transformer_options when hook_scope is AllConditioning.
+        self._skip_adding = False
+
+    def clone(self):
+        '''Clone via the parent class; the clone shares the same transformers_dict object (no copy).'''
+        cloned: TransformerOptionsHook = super().clone()
+        cloned._skip_adding = self._skip_adding
+        cloned.transformers_dict = self.transformers_dict
+        return cloned
+
+    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        '''
+        Register this hook and merge its options into ``model_options`` (in place).
+
+        Returns False without touching anything when ``should_register`` declines, True otherwise.
+        '''
+        if not self.should_register(model, model_options, target_dict, registered):
+            return False
+        applies_to_all = self.hook_scope == EnumHookScope.AllConditioning
+        add_model_options = {}
+        if applies_to_all:
+            add_model_options["transformer_options"] = self.transformers_dict
+        # NOTE: to_load_options will be used to manually load patches/wrappers/callbacks from hooks
+        add_model_options["to_load_options"] = self.transformers_dict
+        # skip_adding if included in AllConditioning to avoid double loading
+        self._skip_adding = applies_to_all
+        registered.add(self)
+        comfy.patcher_extension.merge_nested_dicts(model_options, add_model_options, copy_dict1=False)
+        return True
+
+    def on_apply_hooks(self, model: ModelPatcher, transformer_options: dict[str]):
+        '''Merge transformers_dict into ``transformer_options`` unless add_hook_patches already did so.'''
+        if self._skip_adding:
+            return
+        comfy.patcher_extension.merge_nested_dicts(transformer_options, self.transformers_dict, copy_dict1=False)
+
+# Alias: both names are bound to the very same class object.
+WrapperHook = TransformerOptionsHook
+'''Only here for backwards compatibility, WrapperHook is identical to TransformerOptionsHook.'''
+
+class InjectionsHook(Hook):
+    '''
+    Hook carrying a key and a list of PatcherInjection objects.
+
+    Registration is not implemented: add_hook_patches always raises NotImplementedError.
+    '''
+    def __init__(self, key: str=None, injections: list[PatcherInjection]=None,
+                 hook_scope=EnumHookScope.AllConditioning):
+        super().__init__(hook_type=EnumHookType.Injections)
+        self.key = key
+        self.injections = injections
+        self.hook_scope = hook_scope
+
+    def clone(self):
+        '''Clone via the parent class, carrying over ``key`` and a ``.copy()`` of ``injections``.'''
+        cloned: InjectionsHook = super().clone()
+        cloned.key = self.key
+        if self.injections:
+            cloned.injections = self.injections.copy()
+        else:
+            # falsy values (None / empty) are passed through unchanged
+            cloned.injections = self.injections
+        return cloned
+
+    def add_hook_patches(self, model: ModelPatcher, model_options: dict, target_dict: dict[str], registered: HookGroup):
+        '''Always raises NotImplementedError.'''
+        raise NotImplementedError("InjectionsHook is not supported yet in ComfyUI.")
+
+class HookGroup:
+    '''
+    Stores groups of hooks, and allows them to be queried by type.
+
+    To prevent breaking their functionality, never modify the underlying self.hooks or self._hook_dict vars directly;
+    always use the provided functions on HookGroup.
+    '''
+    def __init__(self):
+        self.hooks: list[Hook] = []
+        self._hook_dict: dict[EnumHookType, list[Hook]] = {}
+
+    def __len__(self):
+        return len(self.hooks)
+
+    def add(self, hook: Hook):
+        '''Append ``hook`` (and index it by hook_type) unless an equal hook is already stored.'''
+        if hook in self.hooks:
+            return
+        self.hooks.append(hook)
+        self._hook_dict.setdefault(hook.hook_type, []).append(hook)
+
+    def remove(self, hook: Hook):
+        '''Remove ``hook`` from the list and from the per-type index; no-op when it is not stored.'''
+        if hook not in self.hooks:
+            return
+        self.hooks.remove(hook)
+        self._hook_dict[hook.hook_type].remove(hook)
+
+    def get_type(self, hook_type: EnumHookType):
+        '''Return the stored hooks of ``hook_type``, or a new empty list when there are none.'''
+        return self._hook_dict.get(hook_type, [])
+
+    def contains(self, hook: Hook):
+        return hook in self.hooks
+
+    def is_subset_of(self, other: HookGroup):
+        '''True when every hook in this group is also in ``other`` (compared as sets).'''
+        return set(self.hooks) <= set(other.hooks)
+
+    def new_with_common_hooks(self, other: HookGroup):
+        '''Return a new group holding clones of this group's hooks that ``other`` also contains.'''
+        common = HookGroup()
+        for hook in filter(other.contains, self.hooks):
+            common.add(hook.clone())
+        return common
+
+    def clone(self):
+        '''Return a new group holding a clone of every hook in this group.'''
+        duplicate = HookGroup()
+        for hook in self.hooks:
+            duplicate.add(hook.clone())
+        return duplicate
+
+    def clone_and_combine(self, other: HookGroup):
+        '''Clone this group, then add clones of the hooks in ``other`` (which may be None).'''
+        combined = self.clone()
+        if other is None:
+            return combined
+        for hook in other.hooks:
+            combined.add(hook.clone())
+        return combined
+
+    def set_keyframes_on_hooks(self, hook_kf: HookKeyframeGroup):
+        '''Assign one shared keyframe group to every hook: a clone of ``hook_kf``, or an empty group if None.'''
+        shared_kf = HookKeyframeGroup() if hook_kf is None else hook_kf.clone()
+        for hook in self.hooks:
+            hook.hook_keyframe = shared_kf
+
+    def get_hooks_for_clip_schedule(self):
+        '''
+        Build a schedule of keyframes for the WeightHooks in this group.
+
+        Returns a list of ``((start, end), [(hook, keyframe_or_None), ...])`` entries, one entry per
+        range between consecutive boundaries gathered from every hook's keyframe ranges.
+        '''
+        # only care about WeightHooks, for now
+        per_hook_ranges: dict[WeightHook, list[tuple[tuple[float,float], HookKeyframe]]] = {}
+        for hook in self.get_type(EnumHookType.Weight):
+            keyframes = hook.hook_keyframe.keyframes
+            # if no hook keyframes, assign default value
+            if len(keyframes) == 0:
+                per_hook_ranges[hook] = [((0.0, 1.0), None)]
+                continue
+            # find ranges of values
+            ranges = []
+            prev_keyframe = keyframes[0]
+            for keyframe in keyframes:
+                if keyframe.start_percent == prev_keyframe.start_percent:
+                    # same start: the later keyframe replaces the earlier one
+                    prev_keyframe = keyframe
+                elif keyframe.start_percent > prev_keyframe.start_percent and not math.isclose(keyframe.strength, prev_keyframe.strength):
+                    # strength actually changes here, so close the previous range
+                    ranges.append(((prev_keyframe.start_percent, keyframe.start_percent), prev_keyframe))
+                    prev_keyframe = keyframe
+            # create final range, assuming last start_percent was not 1.0
+            if not math.isclose(prev_keyframe.start_percent, 1.0):
+                ranges.append(((prev_keyframe.start_percent, 1.0), prev_keyframe))
+            per_hook_ranges[hook] = ranges
+        # turn every range endpoint into a boundary (0.0 is always one)
+        boundaries_set = set()
+        for ranges in per_hook_ranges.values():
+            for t_range, _ in ranges:
+                boundaries_set.update(t_range)
+        boundaries_set.add(0.0)
+        boundaries = sorted(boundaries_set)
+        real_ranges = list(zip(boundaries, boundaries[1:]))
+        # with real ranges defined, give appropriate hooks w/ keyframes for each range
+        scheduled_keyframes: list[tuple[tuple[float,float], list[tuple[WeightHook, HookKeyframe]]]] = []
+        for t_range in real_ranges:
+            range_start, range_end = t_range
+            hooks_schedule = []
+            for hook, ranges in per_hook_ranges.items():
+                # first stored range overlapping the current one decides the keyframe (None if no overlap)
+                keyframe = next(
+                    (stored_kf for stored_range, stored_kf in ranges
+                     if stored_range[0] < range_end and stored_range[1] > range_start),
+                    None,
+                )
+                hooks_schedule.append((hook, keyframe))
+            scheduled_keyframes.append((t_range, hooks_schedule))
+        return scheduled_keyframes
+
+    def reset(self):
+        '''Call reset() on every stored hook.'''
+        for hook in self.hooks:
+            hook.reset()
+
+    @staticmethod
+    def combine_all_hooks(hooks_list: list[HookGroup], require_count=0) -> HookGroup:
+        '''
+        Combine the non-None groups in ``hooks_list`` into one group.
+
+        Returns None when no group remains, the single remaining group itself (not cloned) when there
+        is exactly one, and otherwise a new group built from clones. Raises Exception when fewer than
+        ``require_count`` non-None groups were given.
+        '''
+        actual: list[HookGroup] = [group for group in hooks_list if group is not None]
+        if len(actual) < require_count:
+            raise Exception(f"Need at least {require_count} hooks to combine, but only had {len(actual)}.")
+        # if no hooks, then return None
+        if len(actual) == 0:
+            return None
+        # if only 1 hook, just return itself without cloning
+        if len(actual) == 1:
+            return actual[0]
+        final_hook: HookGroup = actual[0].clone()
+        for group in actual[1:]:
+            final_hook = final_hook.clone_and_combine(group)
+        return final_hook
+
+
+class HookKeyframe:
+    '''A single keyframe: a strength value together with its scheduling data.'''
+    def __init__(self, strength: float, start_percent=0.0, guarantee_steps=1):
+        self.strength = strength
+        # scheduling
+        self.start_percent = float(start_percent)
+        # placeholder value; HookKeyframeGroup.initialize_timesteps overwrites it
+        self.start_t = 999999999.9
+        self.guarantee_steps = guarantee_steps
+
+    def get_effective_guarantee_steps(self, max_sigma: torch.Tensor):
+        '''If keyframe starts before current sampling range (max_sigma), treat as 0.'''
+        return 0 if self.start_t > max_sigma else self.guarantee_steps
+
+    def clone(self):
+        '''Return a new HookKeyframe with the same strength, scheduling values and start_t.'''
+        duplicate = HookKeyframe(self.strength, self.start_percent, self.guarantee_steps)
+        duplicate.start_t = self.start_t
+        return duplicate
+
+class HookKeyframeGroup:
+    '''
+    Collection of HookKeyframes plus the state needed to track which keyframe is currently active.
+
+    add() keeps the keyframes sorted by start_percent; the first keyframe is the current one until
+    prepare_current_keyframe switches to a later one.
+    '''
+    def __init__(self):
+        self.keyframes: list[HookKeyframe] = []
+        self._current_keyframe: HookKeyframe = None
+        self._current_used_steps = 0
+        self._current_index = 0
+        self._current_strength = None
+        # last curr_t handled by prepare_current_keyframe; -1. until the first call
+        self._curr_t = -1.
+
+    # properties shadow those of HookWeightsKeyframe
+    @property
+    def strength(self):
+        '''Strength of the current keyframe, or 1.0 when there is no current keyframe.'''
+        if self._current_keyframe is not None:
+            return self._current_keyframe.strength
+        return 1.0
+
+    def reset(self):
+        '''Restore the tracking state to its initial values and make the first keyframe current again.'''
+        self._current_keyframe = None
+        self._current_used_steps = 0
+        self._current_index = 0
+        self._current_strength = None
+        # must be the private _curr_t that prepare_current_keyframe compares against; writing to a
+        # differently named attribute would leave the duplicate-timestep guard armed after a reset
+        self._curr_t = -1.
+        self._set_first_as_current()
+
+    def add(self, keyframe: HookKeyframe):
+        '''Add ``keyframe``, re-sort by start_percent and make the first keyframe current.'''
+        # add to end of list, then sort
+        self.keyframes.append(keyframe)
+        self.keyframes = get_sorted_list_via_attr(self.keyframes, "start_percent")
+        self._set_first_as_current()
+
+    def _set_first_as_current(self):
+        '''Point _current_keyframe at the first keyframe, or None when the group is empty.'''
+        if len(self.keyframes) > 0:
+            self._current_keyframe = self.keyframes[0]
+        else:
+            self._current_keyframe = None
+
+    def has_guarantee_steps(self):
+        '''True when at least one keyframe has guarantee_steps greater than zero.'''
+        return any(kf.guarantee_steps > 0 for kf in self.keyframes)
+
+    def has_index(self, index: int):
+        '''True when ``index`` is a valid (non-negative) position in the keyframe list.'''
+        return 0 <= index < len(self.keyframes)
+
+    def is_empty(self):
+        return len(self.keyframes) == 0
+
+    def clone(self):
+        '''Return a new group holding clones of the keyframes; tracking state starts fresh.'''
+        c = HookKeyframeGroup()
+        for keyframe in self.keyframes:
+            c.keyframes.append(keyframe.clone())
+        c._set_first_as_current()
+        return c
+
+    def initialize_timesteps(self, model: BaseModel):
+        '''Set start_t on every keyframe from its start_percent via model.model_sampling.percent_to_sigma.'''
+        for keyframe in self.keyframes:
+            keyframe.start_t = model.model_sampling.percent_to_sigma(keyframe.start_percent)
+
+    def prepare_current_keyframe(self, curr_t: float, transformer_options: dict[str, torch.Tensor]) -> bool:
+        '''
+        Update the current keyframe for timestep ``curr_t``.
+
+        Returns False right away when the group is empty or ``curr_t`` equals the value handled by the
+        previous call. Otherwise returns True only when both the current index and the current
+        strength differ from their values before the call.
+
+        ``transformer_options["sample_sigmas"]`` is read to obtain the maximum sigma.
+        '''
+        if self.is_empty():
+            return False
+        if curr_t == self._curr_t:
+            return False
+        max_sigma = torch.max(transformer_options["sample_sigmas"])
+        prev_index = self._current_index
+        prev_strength = self._current_strength
+        # if met guaranteed steps, look for next keyframe in case need to switch
+        if self._current_used_steps >= self._current_keyframe.get_effective_guarantee_steps(max_sigma):
+            # if has next index, loop through and see if need to switch
+            if self.has_index(self._current_index+1):
+                for i in range(self._current_index+1, len(self.keyframes)):
+                    eval_c = self.keyframes[i]
+                    # check if start_t is greater or equal to curr_t
+                    # NOTE: t is in terms of sigmas, not percent, so bigger number = earlier step in sampling
+                    if eval_c.start_t >= curr_t:
+                        self._current_index = i
+                        self._current_strength = eval_c.strength
+                        self._current_keyframe = eval_c
+                        self._current_used_steps = 0
+                        # if guarantee_steps greater than zero, stop searching for other keyframes
+                        if self._current_keyframe.get_effective_guarantee_steps(max_sigma) > 0:
+                            break
+                    # if eval_c is outside the percent range, stop looking further
+                    else:
+                        break
+        # update steps current context is used
+        self._current_used_steps += 1
+        # update current timestep this was performed on
+        self._curr_t = curr_t
+        # return True if keyframe changed, False if no change
+        return prev_index != self._current_index and prev_strength != self._current_strength
+
+
+class InterpolationMethod:
+    '''Names of the supported interpolation curves, plus a helper that generates weights along them.'''
+    LINEAR = "linear"
+    EASE_IN = "ease_in"
+    EASE_OUT = "ease_out"
+    EASE_IN_OUT = "ease_in_out"
+
+    _LIST = [LINEAR, EASE_IN, EASE_OUT, EASE_IN_OUT]
+
+    @classmethod
+    def get_weights(cls, num_from: float, num_to: float, length: int, method: str, reverse=False):
+        '''
+        Return a 1-D tensor of ``length`` weights going from ``num_from`` to ``num_to``.
+
+        ``method`` selects the curve and must be one of the class constants; any other value raises
+        ValueError. With ``reverse=True`` the resulting tensor is flipped.
+
+        All math is done with torch operations so the result is always a torch.Tensor and no
+        implicit tensor/ndarray conversion takes place.
+        '''
+        diff = num_to - num_from
+        if method == cls.LINEAR:
+            weights = torch.linspace(num_from, num_to, length)
+        elif method == cls.EASE_IN:
+            index = torch.linspace(0, 1, length)
+            weights = diff * index.pow(2) + num_from
+        elif method == cls.EASE_OUT:
+            index = torch.linspace(0, 1, length)
+            weights = diff * (1 - (1 - index).pow(2)) + num_from
+        elif method == cls.EASE_IN_OUT:
+            index = torch.linspace(0, 1, length)
+            weights = diff * ((1 - torch.cos(index * math.pi)) / 2) + num_from
+        else:
+            raise ValueError(f"Unrecognized interpolation method '{method}'.")
+        if reverse:
+            weights = weights.flip(dims=(0,))
+        return weights
+
+def get_sorted_list_via_attr(objects: list, attr: str) -> list:
+    '''
+    Return ``objects`` sorted by the value of attribute ``attr``.
+
+    Objects with an equal attribute value keep their relative order (the built-in sort is stable).
+    A falsy ``objects`` (None or empty) is returned as-is; otherwise a new list is returned and the
+    input list is left untouched.
+    '''
+    if not objects:
+        return objects
+    if len(objects) <= 1:
+        # nothing to sort; still hand back a new list, and do not touch the attribute
+        return list(objects)
+    return sorted(objects, key=lambda o: getattr(o, attr))
+
+def create_transformer_options_from_hooks(model: ModelPatcher, hooks: HookGroup,  transformer_options: dict[str]=None):
+    '''
+    Let every TransformerOptions hook in ``hooks`` apply itself to ``transformer_options``.
+
+    Returns a new empty dict when ``hooks`` is None or ``model.is_clip`` is truthy; otherwise returns
+    the dict that was passed in (or a fresh one if None was given) after the hooks have run on it.
+    '''
+    # if no hooks or is not a ModelPatcher for sampling, return empty dict
+    if hooks is None or model.is_clip:
+        return {}
+    options = {} if transformer_options is None else transformer_options
+    for t_hook in hooks.get_type(EnumHookType.TransformerOptions):
+        t_hook.on_apply_hooks(model, options)
+    return options
+
+def create_hook_lora(lora: dict[str, torch.Tensor], strength_model: float, strength_clip: float):
+    '''Return a new HookGroup holding one WeightHook whose ``weights`` is set to ``lora``.'''
+    group = HookGroup()
+    weight_hook = WeightHook(strength_model=strength_model, strength_clip=strength_clip)
+    group.add(weight_hook)
+    # weights are attached after the hook has been added to the group
+    weight_hook.weights = lora
+    return group
+
+def create_hook_model_as_lora(weights_model, weights_clip, strength_model: float, strength_clip: float):
+    '''
+    Return a new HookGroup holding one WeightHook built from full weight mappings.
+
+    Each entry of ``weights_model`` / ``weights_clip`` is wrapped as ``("model_as_lora", (value,))``;
+    a mapping given as None stays None on the hook. ``need_weight_init`` is set to False.
+    '''
+    group = HookGroup()
+    weight_hook = WeightHook(strength_model=strength_model, strength_clip=strength_clip)
+    group.add(weight_hook)
+    patches_model = None
+    if weights_model is not None:
+        patches_model = {key: ("model_as_lora", (weights_model[key],)) for key in weights_model}
+    patches_clip = None
+    if weights_clip is not None:
+        patches_clip = {key: ("model_as_lora", (weights_clip[key],)) for key in weights_clip}
+    weight_hook.weights = patches_model
+    weight_hook.weights_clip = patches_clip
+    weight_hook.need_weight_init = False
+    return group
+
+def get_patch_weights_from_model(model: ModelPatcher, discard_model_sampling=True):
+    '''
+    Return ``model.model.state_dict()``, or None when ``model`` is None.
+
+    With ``discard_model_sampling`` set, every key starting with "model_sampling" is removed from
+    the returned dict (the dict is modified in place).
+    '''
+    if model is None:
+        return None
+    patches_model: dict[str, torch.Tensor] = model.model.state_dict()
+    if not discard_model_sampling:
+        return patches_model
+    # do not include ANY model_sampling components of the model that should act as a patch
+    sampling_keys = [key for key in patches_model if key.startswith("model_sampling")]
+    for key in sampling_keys:
+        patches_model.pop(key, None)
+    return patches_model
+
+# NOTE: this function shows how to register weight hooks directly on the ModelPatchers
+def load_hook_lora_for_models(model: ModelPatcher, clip: CLIP, lora: dict[str, torch.Tensor],
+                              strength_model: float, strength_clip: float):
+    '''
+    Load ``lora`` as hook patches onto clones of ``model`` and ``clip``.
+
+    Either of ``model`` / ``clip`` may be None, in which case None is returned in its place.
+    Keys of the loaded lora that neither patcher accepted are logged as warnings.
+
+    Returns ``(new_modelpatcher, new_clip, hook_group)``.
+    '''
+    key_map = {}
+    if model is not None:
+        key_map = comfy.lora.model_lora_keys_unet(model.model, key_map)
+    if clip is not None:
+        key_map = comfy.lora.model_lora_keys_clip(clip.cond_stage_model, key_map)
+
+    hook_group = HookGroup()
+    hook = WeightHook()
+    hook_group.add(hook)
+    loaded: dict[str] = comfy.lora.load_lora(lora, key_map)
+
+    new_modelpatcher = None
+    model_keys = ()
+    if model is not None:
+        new_modelpatcher = model.clone()
+        model_keys = new_modelpatcher.add_hook_patches(hook=hook, patches=loaded, strength_patch=strength_model)
+
+    new_clip = None
+    clip_keys = ()
+    if clip is not None:
+        new_clip = clip.clone()
+        clip_keys = new_clip.patcher.add_hook_patches(hook=hook, patches=loaded, strength_patch=strength_clip)
+
+    # anything accepted by neither patcher was not loaded
+    accepted = set(model_keys) | set(clip_keys)
+    for x in loaded:
+        if x not in accepted:
+            logging.warning(f"NOT LOADED {x}")
+    return (new_modelpatcher, new_clip, hook_group)
+
+def _combine_hooks_from_values(c_dict: dict[str, HookGroup], values: dict[str, HookGroup], cache: dict[tuple[HookGroup, HookGroup], HookGroup]):
+    '''
+    Merge ``values['hooks']`` into ``c_dict['hooks']`` in place.
+
+    Nothing happens when ``values`` has no 'hooks' key. When only ``values`` has one, its value is
+    stored in ``c_dict`` unless it is None. When both have one, the two are combined with
+    ``clone_and_combine``; ``cache`` (keyed by the pair of groups) is used so the same pair is
+    combined only once.
+    '''
+    hooks_key = 'hooks'
+    if hooks_key not in values:
+        return
+    incoming = values[hooks_key]
+    # hooks only exist in values: move them over as-is (a None value is not stored)
+    if hooks_key not in c_dict:
+        if incoming is not None:
+            c_dict[hooks_key] = incoming
+        return
+    # otherwise, need to combine with minimum duplication via cache
+    pair = (c_dict[hooks_key], incoming)
+    combined = cache.get(pair, None)
+    if combined is None:
+        combined = pair[0].clone_and_combine(pair[1])
+        cache[pair] = combined
+    c_dict[hooks_key] = combined
+
+def conditioning_set_values_with_hooks(conditioning, values=None, append_hooks=True,
+                                       cache: dict[tuple[HookGroup, HookGroup], HookGroup]=None):
+    '''
+    Return a copy of ``conditioning`` with ``values`` written into each entry's options dict.
+
+    Each entry is read as ``t[0]`` plus a dict ``t[1]``; the dict is shallow-copied, so the input
+    entries are not modified. When ``append_hooks`` is set, a 'hooks' key is combined with any hooks
+    already present instead of overwriting them; ``cache`` is passed through to that combination.
+
+    ``values`` defaults to no values; None is treated the same as an empty dict.
+    '''
+    # a shared mutable default ({}) would be a single dict object reused by every call
+    if values is None:
+        values = {}
+    if cache is None:
+        cache = {}
+    c = []
+    for t in conditioning:
+        n = [t[0], t[1].copy()]
+        for k in values:
+            if append_hooks and k == 'hooks':
+                _combine_hooks_from_values(n[1], values, cache)
+            else:
+                n[1][k] = values[k]
+        c.append(n)
+
+    return c
+
+def set_hooks_for_conditioning(cond, hooks: HookGroup, append_hooks=True, cache: dict[tuple[HookGroup, HookGroup], HookGroup]=None):
+    '''Return ``cond`` with ``hooks`` set under the 'hooks' key; ``cond`` itself is returned when ``hooks`` is None.'''
+    if hooks is not None:
+        hook_values = {'hooks': hooks}
+        return conditioning_set_values_with_hooks(cond, hook_values, append_hooks=append_hooks, cache=cache)
+    return cond
+
+def set_timesteps_for_conditioning(cond, timestep_range: tuple[float,float]):
+    '''Return ``cond`` with start_percent/end_percent taken from ``timestep_range``; unchanged when the range is None.'''
+    if timestep_range is not None:
+        percent_values = {"start_percent": timestep_range[0],
+                          "end_percent": timestep_range[1]}
+        return conditioning_set_values(cond, percent_values)
+    return cond
+
+def set_mask_for_conditioning(cond, mask: torch.Tensor, set_cond_area: str, strength: float):
+    '''
+    Return ``cond`` with the mask settings applied; ``cond`` itself is returned when ``mask`` is None.
+
+    A mask with fewer than 3 dimensions gets a leading dimension added. 'set_area_to_bounds' is
+    True for every ``set_cond_area`` other than 'default'.
+    '''
+    if mask is None:
+        return cond
+    if len(mask.shape) < 3:
+        mask = mask.unsqueeze(0)
+    mask_values = {'mask': mask,
+                   'set_area_to_bounds': set_cond_area != 'default',
+                   'mask_strength': strength}
+    return conditioning_set_values(cond, mask_values)
+
+def combine_conditioning(conds: list):
+    '''Concatenate the entries of every conditioning in ``conds`` into one new list.'''
+    return [entry for cond in conds for entry in cond]
+
+def combine_with_new_conds(conds: list, new_conds: list):
+    '''Pair up ``conds`` and ``new_conds`` position by position and concatenate each pair.'''
+    return [combine_conditioning([existing, added]) for existing, added in zip(conds, new_conds)]
+
+def set_conds_props(conds: list, strength: float, set_cond_area: str,
+                   mask: torch.Tensor=None, hooks: HookGroup=None, timesteps_range: tuple[float,float]=None, append_hooks=True):
+    '''
+    Apply hooks, mask and timestep range (in that order) to every conditioning in ``conds``.
+
+    Returns a new list with one processed conditioning per input conditioning.
+    '''
+    # one cache for the whole call, shared by every conditioning
+    hooks_cache = {}
+    final_conds = []
+    for cond in conds:
+        # hooks first, if provided
+        with_hooks = set_hooks_for_conditioning(cond, hooks, append_hooks=append_hooks, cache=hooks_cache)
+        # then the mask
+        with_mask = set_mask_for_conditioning(cond=with_hooks, mask=mask, strength=strength, set_cond_area=set_cond_area)
+        # then the timesteps, if present; the result is what gets stored
+        final_conds.append(set_timesteps_for_conditioning(cond=with_mask, timestep_range=timesteps_range))
+    return final_conds
+
+def set_conds_props_and_combine(conds: list, new_conds: list, strength: float=1.0, set_cond_area: str="default",
+                               mask: torch.Tensor=None, hooks: HookGroup=None, timesteps_range: tuple[float,float]=None, append_hooks=True):
+    '''
+    Apply hooks, mask and timestep range (in that order) to each conditioning in ``new_conds``, then
+    concatenate it onto the conditioning at the same position in ``conds``.
+
+    Returns a new list with one combined conditioning per (conds, new_conds) pair.
+    '''
+    # one cache for the whole call, shared by every conditioning
+    hooks_cache = {}
+    combined_conds = []
+    for existing, added in zip(conds, new_conds):
+        # hooks first, if provided
+        added = set_hooks_for_conditioning(added, hooks, append_hooks=append_hooks, cache=hooks_cache)
+        # then the mask, if provided
+        added = set_mask_for_conditioning(cond=added, mask=mask, set_cond_area=set_cond_area, strength=strength)
+        # then the timesteps, if present
+        added = set_timesteps_for_conditioning(cond=added, timestep_range=timesteps_range)
+        # finally, combine with existing conditioning and store
+        combined_conds.append(combine_conditioning([existing, added]))
+    return combined_conds
+
+def set_default_conds_and_combine(conds: list, new_conds: list,
+                                   hooks: HookGroup=None, timesteps_range: tuple[float,float]=None, append_hooks=True):
+    '''
+    Apply hooks, the ``'default': True`` marker and the timestep range (in that order) to each
+    conditioning in ``new_conds``, then concatenate it onto the conditioning at the same position
+    in ``conds``.
+
+    Returns a new list with one combined conditioning per (conds, new_conds) pair.
+    '''
+    # one cache for the whole call, shared by every conditioning
+    hooks_cache = {}
+    combined_conds = []
+    for existing, added in zip(conds, new_conds):
+        # hooks first, if provided
+        added = set_hooks_for_conditioning(added, hooks, append_hooks=append_hooks, cache=hooks_cache)
+        # next, add default_cond key to cond so that during sampling, it can be identified
+        added = conditioning_set_values(added, {'default': True})
+        # then the timesteps, if present
+        added = set_timesteps_for_conditioning(cond=added, timestep_range=timesteps_range)
+        # finally, combine with existing conditioning and store
+        combined_conds.append(combine_conditioning([existing, added]))
+    return combined_conds
--- a/comfy/image_encoders/__pycache__/dino2.cpython-310.pyc
+++ b/comfy/image_encoders/__pycache__/dino2.cpython-310.pyc
--- a/comfy/image_encoders/dino2.py
+++ b/comfy/image_encoders/dino2.py
+import torch
+from comfy.text_encoders.bert import BertAttention
+import comfy.model_management
+from comfy.ldm.modules.attention import optimized_attention_for_device
+
+
+class Dino2AttentionOutput(torch.nn.Module):
+    """Single linear projection (``dense``) from ``input_dim`` to ``output_dim``."""
+
+    def __init__(self, input_dim, output_dim, layer_norm_eps, dtype, device, operations):
+        super().__init__()
+        # layer_norm_eps is accepted but not used by this module
+        linear_cls = operations.Linear
+        self.dense = linear_cls(input_dim, output_dim, dtype=dtype, device=device)
+
+    def forward(self, x):
+        projected = self.dense(x)
+        return projected
+
+
+class Dino2AttentionBlock(torch.nn.Module):
+    """BertAttention followed by the Dino2AttentionOutput projection."""
+
+    def __init__(self, embed_dim, heads, layer_norm_eps, dtype, device, operations):
+        super().__init__()
+        self.attention = BertAttention(embed_dim, heads, dtype, device, operations)
+        # the output projection keeps the embedding width unchanged
+        self.output = Dino2AttentionOutput(embed_dim, embed_dim, layer_norm_eps, dtype, device, operations)
+
+    def forward(self, x, mask, optimized_attention):
+        attended = self.attention(x, mask, optimized_attention)
+        return self.output(attended)
+
+
+class LayerScale(torch.nn.Module):
+    """Multiplies the input by the parameter ``lambda1`` (shape ``(dim,)``, created uninitialized)."""
+
+    def __init__(self, dim, dtype, device, operations):
+        super().__init__()
+        # operations is accepted but not used by this module
+        uninitialized = torch.empty(dim, device=device, dtype=dtype)
+        self.lambda1 = torch.nn.Parameter(uninitialized)
+
+    def forward(self, x):
+        # bring the scale to the input's device/dtype before multiplying
+        scale = comfy.model_management.cast_to_device(self.lambda1, x.device, x.dtype)
+        return x * scale
+
+
+class SwiGLUFFN(torch.nn.Module):
+    """Gated feed-forward block: one input projection split into two halves, ``silu(first) * second``, then an output projection."""
+
+    def __init__(self, dim, dtype, device, operations):
+        super().__init__()
+        expanded = int(dim * 4)
+        two_thirds = int(expanded * 2 / 3)
+        # round up to the next multiple of 8
+        hidden_features = (two_thirds + 7) // 8 * 8
+
+        # weights_in produces both halves at once, hence twice the hidden width
+        self.weights_in = operations.Linear(dim, 2 * hidden_features, bias=True, device=device, dtype=dtype)
+        self.weights_out = operations.Linear(hidden_features, dim, bias=True, device=device, dtype=dtype)
+
+    def forward(self, x):
+        first_half, second_half = self.weights_in(x).chunk(2, dim=-1)
+        gated = torch.nn.functional.silu(first_half) * second_half
+        return self.weights_out(gated)
+
+
+class Dino2Block(torch.nn.Module):
+    """One encoder block: a normed attention branch and a normed MLP branch, each scaled by a LayerScale and added back to the input."""
+
+    def __init__(self, dim, num_heads, layer_norm_eps, dtype, device, operations):
+        super().__init__()
+        self.attention = Dino2AttentionBlock(dim, num_heads, layer_norm_eps, dtype, device, operations)
+        self.layer_scale1 = LayerScale(dim, dtype, device, operations)
+        self.layer_scale2 = LayerScale(dim, dtype, device, operations)
+        self.mlp = SwiGLUFFN(dim, dtype, device, operations)
+        self.norm1 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
+        self.norm2 = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
+
+    def forward(self, x, optimized_attention):
+        # attention branch (called without a mask)
+        attn_out = self.attention(self.norm1(x), None, optimized_attention)
+        x = x + self.layer_scale1(attn_out)
+        # feed-forward branch
+        mlp_out = self.mlp(self.norm2(x))
+        x = x + self.layer_scale2(mlp_out)
+        return x
+
+
+class Dino2Encoder(torch.nn.Module):
+    """Stack of ``num_layers`` Dino2Block modules applied in sequence."""
+
+    def __init__(self, dim, num_heads, layer_norm_eps, num_layers, dtype, device, operations):
+        super().__init__()
+        blocks = [Dino2Block(dim, num_heads, layer_norm_eps, dtype, device, operations) for _ in range(num_layers)]
+        self.layer = torch.nn.ModuleList(blocks)
+
+    def forward(self, x, intermediate_output=None):
+        """
+        Run ``x`` through every block.
+
+        Returns ``(x, intermediate)``, where ``intermediate`` is a clone of the output of the block at
+        index ``intermediate_output`` (negative values count from the end), or None when no block
+        index matches.
+        """
+        optimized_attention = optimized_attention_for_device(x.device, False, small_input=True)
+
+        # turn a negative index into the equivalent non-negative one
+        if intermediate_output is not None and intermediate_output < 0:
+            intermediate_output = len(self.layer) + intermediate_output
+
+        intermediate = None
+        for index, block in enumerate(self.layer):
+            x = block(x, optimized_attention)
+            if index == intermediate_output:
+                intermediate = x.clone()
+        return x, intermediate
+
+
+class Dino2PatchEmbeddings(torch.nn.Module):
+    """Projects an image into patch embeddings with a Conv2d whose kernel size and stride both equal ``patch_size``."""
+
+    def __init__(self, dim, num_channels=3, patch_size=14, image_size=518, dtype=None, device=None, operations=None):
+        super().__init__()
+        # image_size is accepted but not used by this module
+        conv_kwargs = dict(
+            in_channels=num_channels,
+            out_channels=dim,
+            kernel_size=patch_size,
+            stride=patch_size,
+            bias=True,
+            dtype=dtype,
+            device=device,
+        )
+        self.projection = operations.Conv2d(**conv_kwargs)
+
+    def forward(self, pixel_values):
+        projected = self.projection(pixel_values)
+        # flatten everything from dim 2 onward, then swap dims 1 and 2
+        return projected.flatten(2).transpose(1, 2)
+
+
+class Dino2Embeddings(torch.nn.Module):
+    """Patch embeddings with a prepended class token and added position embeddings (patch size 14, image size 518 are fixed here)."""
+
+    def __init__(self, dim, dtype, device, operations):
+        super().__init__()
+        patch_size = 14
+        image_size = 518
+        # one position per patch plus one for the class token
+        num_positions = (image_size // patch_size) ** 2 + 1
+
+        self.patch_embeddings = Dino2PatchEmbeddings(dim, patch_size=patch_size, image_size=image_size, dtype=dtype, device=device, operations=operations)
+        self.position_embeddings = torch.nn.Parameter(torch.empty(1, num_positions, dim, dtype=dtype, device=device))
+        self.cls_token = torch.nn.Parameter(torch.empty(1, 1, dim, dtype=dtype, device=device))
+        self.mask_token = torch.nn.Parameter(torch.empty(1, dim, dtype=dtype, device=device))
+
+    def forward(self, pixel_values):
+        patches = self.patch_embeddings(pixel_values)
+        # TODO: mask_token?
+        cls = self.cls_token.to(device=patches.device, dtype=patches.dtype)
+        # repeat the class token along dim 0 to match the patches, then put it in front
+        cls = cls.expand(patches.shape[0], -1, -1)
+        x = torch.cat((cls, patches), dim=1)
+        positions = comfy.model_management.cast_to_device(self.position_embeddings, x.device, x.dtype)
+        return x + positions
+
+
+class Dinov2Model(torch.nn.Module):
+    """Embeddings, encoder and a final LayerNorm, sized from ``config_dict``."""
+
+    def __init__(self, config_dict, dtype, device, operations):
+        super().__init__()
+        # required config keys, read in this order
+        num_layers = config_dict["num_hidden_layers"]
+        dim = config_dict["hidden_size"]
+        heads = config_dict["num_attention_heads"]
+        layer_norm_eps = config_dict["layer_norm_eps"]
+
+        self.embeddings = Dino2Embeddings(dim, dtype, device, operations)
+        self.encoder = Dino2Encoder(dim, heads, layer_norm_eps, num_layers, dtype, device, operations)
+        self.layernorm = operations.LayerNorm(dim, eps=layer_norm_eps, dtype=dtype, device=device)
+
+    def forward(self, pixel_values, attention_mask=None, intermediate_output=None):
+        """
+        Returns ``(x, intermediate, pooled_output, None)``.
+
+        ``x`` is the layer-normed encoder output, ``intermediate`` is whatever the encoder returned
+        for ``intermediate_output``, and ``pooled_output`` is ``x[:, 0, :]``. ``attention_mask`` is
+        accepted but not used.
+        """
+        embedded = self.embeddings(pixel_values)
+        encoded, intermediate = self.encoder(embedded, intermediate_output=intermediate_output)
+        x = self.layernorm(encoded)
+        pooled_output = x[:, 0, :]
+        return x, intermediate, pooled_output, None
--- a/comfy/image_encoders/dino2_giant.json
+++ b/comfy/image_encoders/dino2_giant.json
+{
+  "attention_probs_dropout_prob": 0.0,
+  "drop_path_rate": 0.0,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 1536,
+  "image_size": 518,
+  "initializer_range": 0.02,
+  "layer_norm_eps": 1e-06,
+  "layerscale_value": 1.0,
+  "mlp_ratio": 4,
+  "model_type": "dinov2",
+  "num_attention_heads": 24,
+  "num_channels": 3,
+  "num_hidden_layers": 40,
+  "patch_size": 14,
+  "qkv_bias": true,
+  "use_swiglu_ffn": true,
+  "image_mean": [0.485, 0.456, 0.406],
+  "image_std": [0.229, 0.224, 0.225]
+}
--- a/comfy/k_diffusion/__pycache__/deis.cpython-310.pyc
+++ b/comfy/k_diffusion/__pycache__/deis.cpython-310.pyc
--- a/comfy/k_diffusion/__pycache__/sa_solver.cpython-310.pyc
+++ b/comfy/k_diffusion/__pycache__/sa_solver.cpython-310.pyc
--- a/comfy/k_diffusion/__pycache__/sampling.cpython-310.pyc
+++ b/comfy/k_diffusion/__pycache__/sampling.cpython-310.pyc