Initial commit

75f45050 · jerrrrry · 75f45050 · 75f45050 · 75f45050 · 75f45050
Commit 75f45050 authored Jan 24, 2026 by jerrrrry
9 changed files
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
+"""
+    This file is part of ComfyUI.
+    Copyright (C) 2024 Comfy
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""
+import psutil
+import logging
+from enum import Enum
+from comfy.cli_args import args, PerformanceFeature
+import torch
+import sys
+import platform
+import weakref
+import gc
+class VRAMState(Enum):
+    """Memory-management modes selected at import time and consulted when loading models."""
+    DISABLED = 0    #No vram present: no need to move models to vram
+    NO_VRAM = 1     #Very low vram: enable all the options to save vram
+    LOW_VRAM = 2    #Requested through args.lowvram
+    NORMAL_VRAM = 3 #Module default when no vram flag is given
+    HIGH_VRAM = 4   #Requested through args.highvram or args.gpu_only
+    SHARED = 5      #No dedicated vram: memory shared between CPU and GPU but models still need to be moved between both.
+class CPUState(Enum):
+    """Kind of compute target the module was configured for."""
+    GPU = 0  # module default
+    CPU = 1  # selected when args.cpu is set
+    MPS = 2  # selected when torch.backends.mps.is_available() is true
+# Determine VRAM State
+# Defaults only; they are overwritten further down once the command line
+# arguments and the available backends have been inspected.
+vram_state = VRAMState.NORMAL_VRAM
+set_vram_to = VRAMState.NORMAL_VRAM
+cpu_state = CPUState.GPU
+# Placeholder; replaced by the real value (in MB) after get_total_memory() is defined.
+total_vram = 0
+def get_supported_float8_types():
+    """Return the float8 dtypes exposed by the installed torch build.
+
+    Older torch releases lack some (or all) of these attributes, so each one is
+    probed by name instead of being referenced directly.
+
+    Returns:
+        list of torch.dtype, in a fixed order, containing only the dtypes that
+        exist on the ``torch`` module.
+    """
+    candidate_names = (
+        "float8_e4m3fn",
+        "float8_e4m3fnuz",
+        "float8_e5m2",
+        "float8_e5m2fnuz",
+        "float8_e8m0fnu",
+    )
+    return [getattr(torch, name) for name in candidate_names if hasattr(torch, name)]
+FLOAT8_TYPES = get_supported_float8_types()
+xpu_available = False
+torch_version = ""
+# Fallback value: without it a failed parse leaves torch_version_numeric
+# undefined, and every later `torch_version_numeric >= (x, y)` check raises
+# NameError inside a broad try block, silently skipping unrelated setup.
+torch_version_numeric = (0, 0)
+try:
+    torch_version = torch.version.__version__
+    temp = torch_version.split(".")
+    torch_version_numeric = (int(temp[0]), int(temp[1]))
+except Exception:
+    # Best effort: keep the (0, 0) fallback but make the failure visible.
+    logging.warning("Could not parse pytorch version: {!r}".format(torch_version))
+lowvram_available = True
+# Optional deterministic mode; warn_only=True makes torch warn instead of
+# raising for operations without a deterministic implementation.
+if args.deterministic:
+    logging.info("Using deterministic algorithms for pytorch")
+    torch.use_deterministic_algorithms(True, warn_only=True)
+# DirectML backend: only imported when args.directml is given.
+directml_enabled = False
+if args.directml is not None:
+    import torch_directml
+    directml_enabled = True
+    device_index = args.directml
+    # A negative index selects torch_directml's default device.
+    if device_index < 0:
+        directml_device = torch_directml.device()
+    else:
+        directml_device = torch_directml.device(device_index)
+    # NOTE(review): device_name() receives the raw index even when it is negative - confirm torch_directml accepts that.
+    logging.info("Using directml with device: {}".format(torch_directml.device_name(device_index)))
+    # torch_directml.disable_tiled_resources(True)
+    lowvram_available = False #TODO: need to find a way to get free memory in directml before this can be enabled by default.
+# Best-effort probing of optional accelerator backends. Every probe is wrapped
+# in a broad try/except so a missing package or failing call leaves the flag False.
+try:
+    import intel_extension_for_pytorch as ipex  # noqa: F401
+except:
+    pass
+# Intel XPU
+try:
+    _ = torch.xpu.device_count()
+    xpu_available = torch.xpu.is_available()
+except:
+    xpu_available = False
+# Apple MPS: switches cpu_state away from its GPU default.
+try:
+    if torch.backends.mps.is_available():
+        cpu_state = CPUState.MPS
+        import torch.mps
+except:
+    pass
+# Ascend NPU
+try:
+    import torch_npu  # noqa: F401
+    _ = torch.npu.device_count()
+    npu_available = torch.npu.is_available()
+except:
+    npu_available = False
+# MLU
+try:
+    import torch_mlu  # noqa: F401
+    _ = torch.mlu.device_count()
+    mlu_available = torch.mlu.is_available()
+except:
+    mlu_available = False
+# ixuca is detected through the presence of a `torch.corex` attribute.
+try:
+    ixuca_available = hasattr(torch, "corex")
+except:
+    ixuca_available = False
+# args.cpu is applied last, so it overrides an MPS detection above.
+if args.cpu:
+    cpu_state = CPUState.CPU
+def is_intel_xpu():
+    """Return True when running in GPU mode and an Intel XPU was detected."""
+    return cpu_state == CPUState.GPU and bool(xpu_available)
+def is_ascend_npu():
+    """Return True when an Ascend NPU was detected at import time."""
+    return bool(npu_available)
+def is_mlu():
+    """Return True when an MLU device was detected at import time."""
+    return bool(mlu_available)
+def is_ixuca():
+    """Return True when the torch build exposes the `corex` attribute."""
+    return bool(ixuca_available)
+def get_torch_device():
+    """Return the device object used for inference.
+
+    DirectML takes precedence, then MPS and CPU according to ``cpu_state``;
+    in GPU mode the XPU, NPU and MLU backends are tried in that order before
+    falling back to the current CUDA device.
+    """
+    if directml_enabled:
+        return directml_device
+    if cpu_state == CPUState.MPS:
+        return torch.device("mps")
+    if cpu_state == CPUState.CPU:
+        return torch.device("cpu")
+    if is_intel_xpu():
+        return torch.device("xpu", torch.xpu.current_device())
+    if is_ascend_npu():
+        return torch.device("npu", torch.npu.current_device())
+    if is_mlu():
+        return torch.device("mlu", torch.mlu.current_device())
+    return torch.device(torch.cuda.current_device())
+def get_total_memory(dev=None, torch_total_too=False):
+    """Return the total memory of ``dev`` in bytes.
+
+    Args:
+        dev: device to query; defaults to ``get_torch_device()``.
+        torch_total_too: when true, return ``(total, torch_total)`` where
+            ``torch_total`` is the memory currently reserved by the torch
+            allocator (or the same total for cpu/mps/DirectML).
+    """
+    if dev is None:
+        dev = get_torch_device()
+    reserved_key = 'reserved_bytes.all.current'
+    if hasattr(dev, 'type') and dev.type in ('cpu', 'mps'):
+        # cpu and mps share system RAM
+        mem_total = psutil.virtual_memory().total
+        mem_total_torch = mem_total
+    elif directml_enabled:
+        mem_total = 1024 * 1024 * 1024 #TODO
+        mem_total_torch = mem_total
+    elif is_intel_xpu():
+        mem_total_torch = torch.xpu.memory_stats(dev)[reserved_key]
+        mem_total = torch.xpu.get_device_properties(dev).total_memory
+    else:
+        # npu, mlu and cuda expose the same memory_stats/mem_get_info pair
+        if is_ascend_npu():
+            backend = torch.npu
+        elif is_mlu():
+            backend = torch.mlu
+        else:
+            backend = torch.cuda
+        mem_total_torch = backend.memory_stats(dev)[reserved_key]
+        _, mem_total = backend.mem_get_info(dev)
+    return (mem_total, mem_total_torch) if torch_total_too else mem_total
+def mac_version():
+    """Return the macOS version as a tuple of ints, or None when unavailable.
+
+    ``platform.mac_ver()[0]`` is an empty string on non-mac systems, which makes
+    the int conversion fail; that (and any other ordinary error) yields None.
+    """
+    try:
+        return tuple(int(n) for n in platform.mac_ver()[0].split("."))
+    except Exception:
+        # Narrower than a bare except: KeyboardInterrupt/SystemExit still propagate.
+        return None
+# Both totals are converted from bytes to MB.
+total_vram = get_total_memory(get_torch_device()) / (1024 * 1024)
+total_ram = psutil.virtual_memory().total / (1024 * 1024)
+logging.info("Total VRAM {:0.0f} MB, total RAM {:0.0f} MB".format(total_vram, total_ram))
+# Informational logging only; failures are ignored.
+try:
+    logging.info("pytorch version: {}".format(torch_version))
+    mac_ver = mac_version()
+    if mac_ver is not None:
+        logging.info("Mac Version {}".format(mac_ver))
+except:
+    pass
+# Exception type used to recognise out-of-memory errors; falls back to the
+# generic Exception when torch.cuda.OutOfMemoryError cannot be looked up.
+try:
+    OOM_EXCEPTION = torch.cuda.OutOfMemoryError
+except:
+    OOM_EXCEPTION = Exception
+XFORMERS_VERSION = ""
+XFORMERS_ENABLED_VAE = True
+if args.disable_xformers:
+    XFORMERS_IS_AVAILABLE = False
+else:
+    try:
+        import xformers
+        import xformers.ops
+        XFORMERS_IS_AVAILABLE = True
+        # Prefer xformers' own flag when it exists.
+        try:
+            XFORMERS_IS_AVAILABLE = xformers._has_cpp_library
+        except:
+            pass
+        try:
+            XFORMERS_VERSION = xformers.version.__version__
+            logging.info("xformers version: {}".format(XFORMERS_VERSION))
+            # 0.0.18 is disabled for the VAE because of the bug described in the warning below.
+            if XFORMERS_VERSION.startswith("0.0.18"):
+                logging.warning("\nWARNING: This version of xformers has a major bug where you will get black images when generating high resolution images.")
+                logging.warning("Please downgrade or upgrade xformers to a different version.\n")
+                XFORMERS_ENABLED_VAE = False
+        except:
+            pass
+    except:
+        XFORMERS_IS_AVAILABLE = False
+def is_nvidia():
+    """Return True in GPU mode when torch was built with CUDA (``torch.version.cuda`` is set)."""
+    return cpu_state == CPUState.GPU and bool(torch.version.cuda)
+def is_amd():
+    """Return True in GPU mode when torch was built with HIP (``torch.version.hip`` is set)."""
+    return cpu_state == CPUState.GPU and bool(torch.version.hip)
+# Fraction of free memory used as a floor for weights in load_models_gpu();
+# set to 0.0 on NVIDIA.
+MIN_WEIGHT_MEMORY_RATIO = 0.4
+if is_nvidia():
+    MIN_WEIGHT_MEMORY_RATIO = 0.0
+# Decide whether pytorch attention is used. An explicit request also turns xformers off.
+ENABLE_PYTORCH_ATTENTION = False
+if args.use_pytorch_cross_attention:
+    ENABLE_PYTORCH_ATTENTION = True
+    XFORMERS_IS_AVAILABLE = False
+# Enable it automatically on NVIDIA (torch major version >= 2) and on
+# XPU/NPU/MLU/ixuca unless split or quad attention was requested.
+# Any error in this block is ignored.
+try:
+    if is_nvidia():
+        if torch_version_numeric[0] >= 2:
+            if ENABLE_PYTORCH_ATTENTION == False and args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
+                ENABLE_PYTORCH_ATTENTION = True
+    if is_intel_xpu() or is_ascend_npu() or is_mlu() or is_ixuca():
+        if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
+            ENABLE_PYTORCH_ATTENTION = True
+except:
+    pass
+SUPPORT_FP8_OPS = args.supports_fp8_compute
+# AMD: attention and fp8 support depend on the GPU architecture name and the
+# torch/ROCm versions. Any error in this block is ignored.
+try:
+    if is_amd():
+        try:
+            rocm_version = tuple(map(int, str(torch.version.hip).split(".")[:2]))
+        except:
+            # Unparseable version string: (6, -1) fails the >= (6, 4) check below.
+            rocm_version = (6, -1)
+        arch = torch.cuda.get_device_properties(get_torch_device()).gcnArchName
+        logging.info("AMD arch: {}".format(arch))
+        logging.info("ROCm version: {}".format(rocm_version))
+        if args.use_split_cross_attention == False and args.use_quad_cross_attention == False:
+            if torch_version_numeric >= (2, 7):  # works on 2.6 but doesn't actually seem to improve much
+                if any((a in arch) for a in ["gfx90a", "gfx942", "gfx1100", "gfx1101", "gfx1151"]):  # TODO: more arches, TODO: gfx950
+                    ENABLE_PYTORCH_ATTENTION = True
+#            if torch_version_numeric >= (2, 8):
+#                if any((a in arch) for a in ["gfx1201"]):
+#                    ENABLE_PYTORCH_ATTENTION = True
+        if torch_version_numeric >= (2, 7) and rocm_version >= (6, 4):
+            if any((a in arch) for a in ["gfx1201", "gfx942", "gfx950"]):  # TODO: more arches
+                SUPPORT_FP8_OPS = True
+except:
+    pass
+# Turn on all three scaled-dot-product-attention backends.
+if ENABLE_PYTORCH_ATTENTION:
+    torch.backends.cuda.enable_math_sdp(True)
+    torch.backends.cuda.enable_flash_sdp(True)
+    torch.backends.cuda.enable_mem_efficient_sdp(True)
+PRIORITIZE_FP16 = False  # TODO: remove and replace with something that shows exactly which dtype is faster than the other
+# Opt-in fp16 accumulation (PerformanceFeature.Fp16Accumulation in args.fast); errors are ignored.
+try:
+    if (is_nvidia() or is_amd()) and PerformanceFeature.Fp16Accumulation in args.fast:
+        torch.backends.cuda.matmul.allow_fp16_accumulation = True
+        PRIORITIZE_FP16 = True  # TODO: limit to cards where it actually boosts performance
+        logging.info("Enabled fp16 accumulation.")
+except:
+    pass
+try:
+    if torch_version_numeric >= (2, 5):
+        torch.backends.cuda.allow_fp16_bf16_reduction_math_sdp(True)
+except:
+    logging.warning("Warning, could not set allow_fp16_bf16_reduction_math_sdp")
+# Translate the vram command line flags. lowvram/novram are only recorded in
+# set_vram_to here and applied below if lowvram_available; highvram/gpu_only
+# take effect immediately.
+if args.lowvram:
+    set_vram_to = VRAMState.LOW_VRAM
+    lowvram_available = True
+elif args.novram:
+    set_vram_to = VRAMState.NO_VRAM
+elif args.highvram or args.gpu_only:
+    vram_state = VRAMState.HIGH_VRAM
+FORCE_FP32 = False
+if args.force_fp32:
+    logging.info("Forcing FP32, if this improves things please report it.")
+    FORCE_FP32 = True
+if lowvram_available:
+    if set_vram_to in (VRAMState.LOW_VRAM, VRAMState.NO_VRAM):
+        vram_state = set_vram_to
+# Non-GPU targets override whatever was chosen above: CPU -> DISABLED, MPS -> SHARED.
+if cpu_state != CPUState.GPU:
+    vram_state = VRAMState.DISABLED
+if cpu_state == CPUState.MPS:
+    vram_state = VRAMState.SHARED
+logging.info(f"Set vram state to: {vram_state.name}")
+DISABLE_SMART_MEMORY = args.disable_smart_memory
+if DISABLE_SMART_MEMORY:
+    logging.info("Disabling smart memory management")
+def get_torch_device_name(device):
+    """Build a human readable description of ``device`` for logging.
+
+    Objects with a ``type`` attribute are described from that type; anything
+    else is described through whichever backend was detected, CUDA last.
+    """
+    if not hasattr(device, 'type'):
+        if is_intel_xpu():
+            return "{} {}".format(device, torch.xpu.get_device_name(device))
+        if is_ascend_npu():
+            return "{} {}".format(device, torch.npu.get_device_name(device))
+        if is_mlu():
+            return "{} {}".format(device, torch.mlu.get_device_name(device))
+        return "CUDA {}: {}".format(device, torch.cuda.get_device_name(device))
+
+    kind = device.type
+    if kind == "cuda":
+        # The allocator backend lookup is best effort.
+        try:
+            allocator_backend = torch.cuda.get_allocator_backend()
+        except:
+            allocator_backend = ""
+        return "{} {} : {}".format(device, torch.cuda.get_device_name(device), allocator_backend)
+    if kind == "xpu":
+        return "{} {}".format(device, torch.xpu.get_device_name(device))
+    return "{}".format(device.type)
+# Log the selected device; a failure here only produces a warning.
+try:
+    logging.info("Device: {}".format(get_torch_device_name(get_torch_device())))
+except:
+    logging.warning("Could not pick default device.")
+# LoadedModel entries currently tracked; load_models_gpu() inserts new entries at index 0.
+current_loaded_models = []
+def module_size(module):
+    """Return the total size in bytes of all tensors in ``module.state_dict()``."""
+    return sum(tensor.nelement() * tensor.element_size() for tensor in module.state_dict().values())
+class LoadedModel:
+    """Tracks one model patcher that has been (or is about to be) loaded on a device.
+
+    Only weak references to the patcher and to the underlying model are kept,
+    so this bookkeeping object never keeps a model alive by itself.
+    Instances compare equal when they refer to the same patcher object; because
+    ``__eq__`` is defined without ``__hash__`` they are not hashable.
+    """
+    def __init__(self, model):
+        # Must be initialised before _set_model(): that call may install the
+        # finalizer, and resetting it to None afterwards would discard it.
+        self._patcher_finalizer = None
+        self._set_model(model)
+        self.device = model.load_device
+        self.real_model = None       # weakref to the loaded model, set by model_load()
+        self.currently_used = True
+        self.model_finalizer = None  # weakref.finalize created by model_load()
+
+    def _set_model(self, model):
+        """Point this entry at ``model`` and, if it has a parent, arrange a fallback to it."""
+        self._model = weakref.ref(model)
+        if model.parent is not None:
+            self._parent_model = weakref.ref(model.parent)
+            # When the patcher is collected, switch to its parent if that is still alive.
+            self._patcher_finalizer = weakref.finalize(model, self._switch_parent)
+
+    def _switch_parent(self):
+        model = self._parent_model()
+        if model is not None:
+            self._set_model(model)
+
+    @property
+    def model(self):
+        """The tracked patcher, or None once it has been garbage collected."""
+        return self._model()
+
+    def model_memory(self):
+        return self.model.model_size()
+
+    def model_loaded_memory(self):
+        return self.model.loaded_size()
+
+    def model_offloaded_memory(self):
+        return self.model.model_size() - self.model.loaded_size()
+
+    def model_memory_required(self, device):
+        """Memory still needed to have the model fully on ``device``."""
+        if device == self.model.current_loaded_device():
+            return self.model_offloaded_memory()
+        else:
+            return self.model_memory()
+
+    def model_load(self, lowvram_model_memory=0, force_patch_weights=False):
+        """Load (part of) the model on ``self.device`` and return the underlying model.
+
+        ``lowvram_model_memory == 0`` means "no limit" and is translated to a
+        very large budget for the partial load.
+        """
+        self.model.model_patches_to(self.device)
+        self.model.model_patches_to(self.model.model_dtype())
+        # if self.model.loaded_size() > 0:
+        use_more_vram = lowvram_model_memory
+        if use_more_vram == 0:
+            use_more_vram = 1e32
+        self.model_use_more_vram(use_more_vram, force_patch_weights=force_patch_weights)
+        real_model = self.model.model
+        if is_intel_xpu() and not args.disable_ipex_optimize and 'ipex' in globals() and real_model is not None:
+            with torch.no_grad():
+                real_model = ipex.optimize(real_model.eval(), inplace=True, graph_mode=True, concat_linear=True)
+        self.real_model = weakref.ref(real_model)
+        # Prune stale entries from the loaded list once the model is collected.
+        self.model_finalizer = weakref.finalize(real_model, cleanup_models)
+        return real_model
+
+    def should_reload_model(self, force_patch_weights=False):
+        if force_patch_weights and self.model.lowvram_patch_counter() > 0:
+            return True
+        return False
+
+    def model_unload(self, memory_to_free=None, unpatch_weights=True):
+        """Unload the model, partially if that frees enough memory.
+
+        Returns:
+            False when a partial unload freed at least ``memory_to_free``;
+            True when the model was fully detached.
+        """
+        if memory_to_free is not None:
+            if memory_to_free < self.model.loaded_size():
+                freed = self.model.partially_unload(self.model.offload_device, memory_to_free)
+                if freed >= memory_to_free:
+                    return False
+        self.model.detach(unpatch_weights)
+        # The finalizer only exists after model_load(); guard against a
+        # repeated unload or an unload of a never-loaded entry.
+        if self.model_finalizer is not None:
+            self.model_finalizer.detach()
+        self.model_finalizer = None
+        self.real_model = None
+        return True
+
+    def model_use_more_vram(self, extra_memory, force_patch_weights=False):
+        return self.model.partially_load(self.device, extra_memory, force_patch_weights=force_patch_weights)
+
+    def __eq__(self, other):
+        # Let Python fall back to its default comparison for foreign types
+        # instead of failing on a missing `.model` attribute.
+        if not isinstance(other, LoadedModel):
+            return NotImplemented
+        return self.model is other.model
+
+    def __del__(self):
+        if self._patcher_finalizer is not None:
+            self._patcher_finalizer.detach()
+
+    def is_dead(self):
+        """True when the underlying model is still alive but its patcher is gone."""
+        # real_model is None before model_load() and after model_unload().
+        if self.real_model is None:
+            return False
+        return self.real_model() is not None and self.model is None
+def use_more_memory(extra_memory, loaded_models, device):
+    """Hand out ``extra_memory`` to the models on ``device``, in list order.
+
+    Each model reports how much it used; distribution stops once the budget
+    is exhausted.
+    """
+    remaining = extra_memory
+    for entry in (m for m in loaded_models if m.device == device):
+        remaining -= entry.model_use_more_vram(remaining)
+        if remaining <= 0:
+            break
+def offloaded_memory(loaded_models, device):
+    """Sum the offloaded memory of every entry in ``loaded_models`` that targets ``device``."""
+    return sum(entry.model_offloaded_memory() for entry in loaded_models if entry.device == device)
+# platform.win32_ver() returns only empty strings off Windows, so any() is False there.
+WINDOWS = any(platform.win32_ver())
+# Bytes kept free for other applications: 400 MiB by default.
+EXTRA_RESERVED_VRAM = 400 * 1024 * 1024
+if WINDOWS:
+    EXTRA_RESERVED_VRAM = 600 * 1024 * 1024 #Windows is higher because of the shared vram issue
+    # total_vram is expressed in MB here
+    if total_vram > (15 * 1024):  # more extra reserved vram on 16GB+ cards
+        EXTRA_RESERVED_VRAM += 100 * 1024 * 1024
+# An explicit args.reserve_vram is multiplied by 1024**3 and replaces the defaults above.
+if args.reserve_vram is not None:
+    EXTRA_RESERVED_VRAM = args.reserve_vram * 1024 * 1024 * 1024
+    logging.debug("Reserving {}MB vram for other applications.".format(EXTRA_RESERVED_VRAM / (1024 * 1024)))
+def extra_reserved_memory():
+    """Return the number of bytes kept free for other applications (EXTRA_RESERVED_VRAM)."""
+    return EXTRA_RESERVED_VRAM
+def minimum_inference_memory():
+    """Return the smallest inference budget in bytes: 0.8 GiB plus the extra reserved memory."""
+    return (1024 * 1024 * 1024) * 0.8 + extra_reserved_memory()
+def free_memory(memory_required, device, keep_loaded=[]):
+    """Unload models from ``device`` until ``memory_required`` bytes are free.
+
+    Args:
+        memory_required: number of free bytes wanted on ``device``.
+        device: only entries loaded on this device are considered.
+        keep_loaded: entries that must not be unloaded (never mutated here).
+
+    Returns:
+        The list of entries that were fully unloaded and removed from
+        ``current_loaded_models``.
+    """
+    cleanup_models_gc()
+    unloaded_model = []   # indices of fully unloaded entries
+    can_unload = []
+    unloaded_models = []  # the entries themselves, returned to the caller
+    for i in range(len(current_loaded_models) -1, -1, -1):
+        shift_model = current_loaded_models[i]
+        if shift_model.device == device:
+            if shift_model not in keep_loaded and not shift_model.is_dead():
+                # Sort key: most offloaded memory first, then refcount, then size; the index comes last.
+                can_unload.append((-shift_model.model_offloaded_memory(), sys.getrefcount(shift_model.model), shift_model.model_memory(), i))
+                shift_model.currently_used = False
+    for x in sorted(can_unload):
+        i = x[-1]
+        memory_to_free = None
+        # With smart memory, stop as soon as enough is free and only free the shortfall;
+        # otherwise every candidate is unloaded completely.
+        if not DISABLE_SMART_MEMORY:
+            free_mem = get_free_memory(device)
+            if free_mem > memory_required:
+                break
+            memory_to_free = memory_required - free_mem
+        logging.debug(f"Unloading {current_loaded_models[i].model.model.__class__.__name__}")
+        # model_unload() returns True only for a full unload.
+        if current_loaded_models[i].model_unload(memory_to_free):
+            unloaded_model.append(i)
+    # Pop from the highest index down so the remaining indices stay valid.
+    for i in sorted(unloaded_model, reverse=True):
+        unloaded_models.append(current_loaded_models.pop(i))
+    if len(unloaded_model) > 0:
+        soft_empty_cache()
+    else:
+        if vram_state != VRAMState.HIGH_VRAM:
+            # Nothing was unloaded: still empty the cache when torch holds more than 25% of the free memory.
+            mem_free_total, mem_free_torch = get_free_memory(device, torch_free_too=True)
+            if mem_free_torch > mem_free_total * 0.25:
+                soft_empty_cache()
+    return unloaded_models
+def load_models_gpu(models, memory_required=0, force_patch_weights=False, minimum_memory_required=None, force_full_load=False):
+    """Load ``models`` (and the models their patches depend on) onto their load devices.
+
+    Args:
+        models: iterable of model patchers to load.
+        memory_required: extra bytes wanted for inference on top of the weights.
+        force_patch_weights: forwarded to ``LoadedModel.model_load``.
+        minimum_memory_required: lower bound in bytes that must stay free;
+            defaults to the value derived from ``memory_required``.
+        force_full_load: skip the partial (low vram) budget computation.
+    """
+    cleanup_models_gc()
+    global vram_state
+    inference_memory = minimum_inference_memory()
+    extra_mem = max(inference_memory, memory_required + extra_reserved_memory())
+    if minimum_memory_required is None:
+        minimum_memory_required = extra_mem
+    else:
+        minimum_memory_required = max(inference_memory, minimum_memory_required + extra_reserved_memory())
+    # Deduplicate the requested models and add the models referenced by their patches.
+    models_temp = set()
+    for m in models:
+        models_temp.add(m)
+        for mm in m.model_patches_models():
+            models_temp.add(mm)
+    models = models_temp
+    models_to_load = []
+    for x in models:
+        loaded_model = LoadedModel(x)
+        # Reuse the existing entry when this patcher is already tracked
+        # (LoadedModel equality compares the patcher identity).
+        try:
+            loaded_model_index = current_loaded_models.index(loaded_model)
+        except:
+            loaded_model_index = None
+        if loaded_model_index is not None:
+            loaded = current_loaded_models[loaded_model_index]
+            loaded.currently_used = True
+            models_to_load.append(loaded)
+        else:
+            if hasattr(x, "model"):
+                logging.info(f"Requested to load {x.model.__class__.__name__}")
+            models_to_load.append(loaded_model)
+    # Drop tracked entries that are clones of a model about to be loaded;
+    # indices are collected in descending order so pop() stays valid.
+    for loaded_model in models_to_load:
+        to_unload = []
+        for i in range(len(current_loaded_models)):
+            if loaded_model.model.is_clone(current_loaded_models[i].model):
+                to_unload = [i] + to_unload
+        for i in to_unload:
+            current_loaded_models.pop(i).model.detach(unpatch_all=False)
+    # Memory needed per device for everything that is going to be loaded.
+    total_memory_required = {}
+    for loaded_model in models_to_load:
+        total_memory_required[loaded_model.device] = total_memory_required.get(loaded_model.device, 0) + loaded_model.model_memory_required(loaded_model.device)
+    # First pass: try to free the required memory with a 10% margin plus the inference budget.
+    for device in total_memory_required:
+        if device != torch.device("cpu"):
+            free_memory(total_memory_required[device] * 1.1 + extra_mem, device)
+    # Second pass: make sure at least the minimum is free.
+    for device in total_memory_required:
+        if device != torch.device("cpu"):
+            free_mem = get_free_memory(device)
+            if free_mem < minimum_memory_required:
+                models_l = free_memory(minimum_memory_required, device)
+                logging.info("{} models unloaded.".format(len(models_l)))
+    for loaded_model in models_to_load:
+        model = loaded_model.model
+        torch_dev = model.load_device
+        if is_device_cpu(torch_dev):
+            vram_set_state = VRAMState.DISABLED
+        else:
+            vram_set_state = vram_state
+        # 0 means "load everything" (see LoadedModel.model_load).
+        lowvram_model_memory = 0
+        if lowvram_available and (vram_set_state == VRAMState.LOW_VRAM or vram_set_state == VRAMState.NORMAL_VRAM) and not force_full_load:
+            loaded_memory = loaded_model.model_loaded_memory()
+            current_free_mem = get_free_memory(torch_dev) + loaded_memory
+            # Weight budget: at least 128 MiB, otherwise the larger of "free minus minimum"
+            # and the MIN_WEIGHT_MEMORY_RATIO share (capped by free minus the inference minimum).
+            lowvram_model_memory = max(128 * 1024 * 1024, (current_free_mem - minimum_memory_required), min(current_free_mem * MIN_WEIGHT_MEMORY_RATIO, current_free_mem - minimum_inference_memory()))
+            # Subtract what is already loaded; 0.1 keeps the value non-zero so it is not read as "no limit".
+            lowvram_model_memory = max(0.1, lowvram_model_memory - loaded_memory)
+        if vram_set_state == VRAMState.NO_VRAM:
+            lowvram_model_memory = 0.1
+        loaded_model.model_load(lowvram_model_memory, force_patch_weights=force_patch_weights)
+        # Most recently loaded entries live at the front of the list.
+        current_loaded_models.insert(0, loaded_model)
+    return
+def load_model_gpu(model):
+    """Convenience wrapper: load a single model through load_models_gpu()."""
+    return load_models_gpu([model])
+def loaded_models(only_currently_used=False):
+    """Return the patchers of all tracked entries, optionally only those marked as currently used."""
+    return [
+        entry.model
+        for entry in current_loaded_models
+        if not only_currently_used or entry.currently_used
+    ]
+def cleanup_models_gc():
+    """Run a full garbage collection when a tracked entry looks leaked.
+
+    An entry is "dead" when its patcher is gone but the underlying model is
+    still alive. If any such entry exists, collect garbage, empty the cache
+    and warn about every entry that is still dead afterwards.
+    """
+    first_dead = next((entry for entry in current_loaded_models if entry.is_dead()), None)
+    if first_dead is None:
+        return
+    logging.info("Potential memory leak detected with model {}, doing a full garbage collect, for maximum performance avoid circular references in the model code.".format(first_dead.real_model().__class__.__name__))
+    gc.collect()
+    soft_empty_cache()
+    for entry in current_loaded_models:
+        if entry.is_dead():
+            logging.warning("WARNING, memory leak with model {}. Please make sure it is not being referenced from somewhere.".format(entry.real_model().__class__.__name__))
+def cleanup_models():
+    """Remove every tracked entry whose underlying model has been garbage collected."""
+    stale = [idx for idx, entry in enumerate(current_loaded_models) if entry.real_model() is None]
+    # Pop from the back so earlier indices remain valid.
+    for idx in reversed(stale):
+        current_loaded_models.pop(idx)
+def dtype_size(dtype):
+    """Return the size in bytes of one element of ``dtype``.
+
+    float16/bfloat16 and float32 are answered directly; every other dtype uses
+    ``dtype.itemsize``. Objects without that attribute (old pytorch) fall
+    back to 4.
+    """
+    if dtype == torch.float16 or dtype == torch.bfloat16:
+        return 2
+    if dtype == torch.float32:
+        return 4
+    # Old pytorch doesn't have .itemsize
+    return getattr(dtype, "itemsize", 4)
+def unet_offload_device():
+    """Device the unet is offloaded to: the compute device in HIGH_VRAM mode, otherwise the cpu."""
+    return get_torch_device() if vram_state == VRAMState.HIGH_VRAM else torch.device("cpu")
+def unet_inital_load_device(parameters, dtype):
+    """Choose the device a unet with ``parameters`` weights of ``dtype`` is first loaded on.
+
+    HIGH_VRAM/SHARED always use the compute device; disabled smart memory or
+    NO_VRAM always use the cpu. Otherwise the compute device is picked only if
+    it has more free memory than the cpu and the model fits in it.
+    """
+    compute_device = get_torch_device()
+    if vram_state in (VRAMState.HIGH_VRAM, VRAMState.SHARED):
+        return compute_device
+
+    host_device = torch.device("cpu")
+    if DISABLE_SMART_MEMORY or vram_state == VRAMState.NO_VRAM:
+        return host_device
+
+    weights_bytes = dtype_size(dtype) * parameters
+    free_on_device = get_free_memory(compute_device)
+    free_on_host = get_free_memory(host_device)
+    fits_on_device = free_on_device > free_on_host and weights_bytes < free_on_device
+    return compute_device if fits_on_device else host_device
+def maximum_vram_for_weights(device=None):
+    """Return 88% of the device's total memory minus the minimum inference memory, in bytes."""
+    return (get_total_memory(device) * 0.88 - minimum_inference_memory())
+def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32], weight_dtype=None):
+    """Pick the dtype used to store the unet weights.
+
+    Args:
+        device: device the model will run on.
+        model_params: number of parameters; a negative value is replaced by a very large number.
+        supported_dtypes: dtypes the model supports, in order of preference (never mutated here).
+        weight_dtype: dtype of the weights as stored, if known.
+
+    Returns:
+        A torch dtype; torch.float32 when nothing else qualifies.
+    """
+    if model_params < 0:
+        model_params = 1000000000000000000000
+    # Explicit command line overrides win, checked in this fixed order.
+    if args.fp32_unet:
+        return torch.float32
+    if args.fp64_unet:
+        return torch.float64
+    if args.bf16_unet:
+        return torch.bfloat16
+    if args.fp16_unet:
+        return torch.float16
+    if args.fp8_e4m3fn_unet:
+        return torch.float8_e4m3fn
+    if args.fp8_e5m2_unet:
+        return torch.float8_e5m2
+    if args.fp8_e8m0fnu_unet:
+        return torch.float8_e8m0fnu
+    # Keep fp8 weights as they are when the device computes in fp8, or when
+    # the model at 2 bytes per parameter would not fit in the weight budget.
+    fp8_dtype = None
+    if weight_dtype in FLOAT8_TYPES:
+        fp8_dtype = weight_dtype
+    if fp8_dtype is not None:
+        if supports_fp8_compute(device): #if fp8 compute is supported the casting is most likely not expensive
+            return fp8_dtype
+        free_model_memory = maximum_vram_for_weights(device)
+        if model_params * 2 > free_model_memory:
+            return fp8_dtype
+    if PRIORITIZE_FP16 or weight_dtype == torch.float16:
+        if torch.float16 in supported_dtypes and should_use_fp16(device=device, model_params=model_params):
+            return torch.float16
+    # First pass: dtypes usable without manual casting, in preference order.
+    for dt in supported_dtypes:
+        if dt == torch.float16 and should_use_fp16(device=device, model_params=model_params):
+            if torch.float16 in supported_dtypes:
+                return torch.float16
+        if dt == torch.bfloat16 and should_use_bf16(device, model_params=model_params):
+            if torch.bfloat16 in supported_dtypes:
+                return torch.bfloat16
+    # Second pass: same checks with manual_cast=True.
+    for dt in supported_dtypes:
+        if dt == torch.float16 and should_use_fp16(device=device, model_params=model_params, manual_cast=True):
+            if torch.float16 in supported_dtypes:
+                return torch.float16
+        if dt == torch.bfloat16 and should_use_bf16(device, model_params=model_params, manual_cast=True):
+            if torch.bfloat16 in supported_dtypes:
+                return torch.bfloat16
+    return torch.float32
+# None means no manual cast
+def unet_manual_cast(weight_dtype, inference_device, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32]):
+    """Return the dtype weights must be cast to for inference, or None when no cast is needed.
+
+    Args:
+        weight_dtype: dtype the weights are stored in.
+        inference_device: device inference runs on.
+        supported_dtypes: dtypes the model supports, in order of preference (never mutated here).
+    """
+    if weight_dtype == torch.float32 or weight_dtype == torch.float64:
+        return None
+    # No cast when the device can run the stored dtype directly.
+    fp16_supported = should_use_fp16(inference_device, prioritize_performance=False)
+    if fp16_supported and weight_dtype == torch.float16:
+        return None
+    bf16_supported = should_use_bf16(inference_device)
+    if bf16_supported and weight_dtype == torch.bfloat16:
+        return None
+    # From here on a cast is required; re-evaluate fp16 with prioritize_performance=True.
+    fp16_supported = should_use_fp16(inference_device, prioritize_performance=True)
+    if PRIORITIZE_FP16 and fp16_supported and torch.float16 in supported_dtypes:
+        return torch.float16
+    for dt in supported_dtypes:
+        if dt == torch.float16 and fp16_supported:
+            return torch.float16
+        if dt == torch.bfloat16 and bf16_supported:
+            return torch.bfloat16
+    return torch.float32
+def text_encoder_offload_device():
+    """Device the text encoder is offloaded to: the compute device with args.gpu_only, otherwise the cpu."""
+    return get_torch_device() if args.gpu_only else torch.device("cpu")
+def text_encoder_device():
+    """Device the text encoder runs on.
+
+    The compute device is used with args.gpu_only, or in HIGH_VRAM/NORMAL_VRAM
+    mode when fp16 is usable; in every other case the cpu is used.
+    """
+    if args.gpu_only:
+        return get_torch_device()
+    roomy = vram_state in (VRAMState.HIGH_VRAM, VRAMState.NORMAL_VRAM)
+    if roomy and should_use_fp16(prioritize_performance=False):
+        return get_torch_device()
+    return torch.device("cpu")
+def text_encoder_initial_device(load_device, offload_device, model_size=0):
+    """Choose where a text encoder of ``model_size`` bytes is first placed.
+
+    Models of at most 1 GiB (or with identical load/offload devices) start on
+    the offload device; MPS always starts on the load device. Otherwise the
+    load device is used only when it has enough free memory.
+    """
+    if load_device == offload_device or model_size <= 1024 * 1024 * 1024:
+        return offload_device
+    if is_device_mps(load_device):
+        return load_device
+
+    free_on_load = get_free_memory(load_device)
+    free_on_offload = get_free_memory(offload_device)
+    fits = free_on_load > (free_on_offload * 0.5) and model_size * 1.2 < free_on_load
+    return load_device if fits else offload_device
+def text_encoder_dtype(device=None):
+    """Return the dtype for text encoder weights.
+
+    The first matching command line override wins; without one the result is
+    torch.float16.
+    """
+    if args.fp8_e4m3fn_text_enc:
+        return torch.float8_e4m3fn
+    elif args.fp8_e5m2_text_enc:
+        return torch.float8_e5m2
+    elif args.fp16_text_enc:
+        return torch.float16
+    elif args.bf16_text_enc:
+        return torch.bfloat16
+    elif args.fp32_text_enc:
+        return torch.float32
+    # NOTE(review): both paths below return float16, so the cpu check currently has no effect on the result.
+    if is_device_cpu(device):
+        return torch.float16
+    return torch.float16
+def intermediate_device():
+    """Device for intermediate results: the compute device with args.gpu_only, otherwise the cpu."""
+    return get_torch_device() if args.gpu_only else torch.device("cpu")
+def vae_device():
+    """Device the VAE runs on: the cpu with args.cpu_vae, otherwise the compute device."""
+    return torch.device("cpu") if args.cpu_vae else get_torch_device()
+def vae_offload_device():
+    """Device the VAE is offloaded to: the compute device with args.gpu_only, otherwise the cpu."""
+    return get_torch_device() if args.gpu_only else torch.device("cpu")
+def vae_dtype(device=None, allowed_dtypes=[]):
+    """Return the dtype used for the VAE.
+
+    Command line overrides win. Otherwise the first entry of
+    ``allowed_dtypes`` that the device can use is returned, falling back to
+    torch.float32 (``allowed_dtypes`` is never mutated here).
+    """
+    if args.fp16_vae:
+        return torch.float16
+    elif args.bf16_vae:
+        return torch.bfloat16
+    elif args.fp32_vae:
+        return torch.float32
+    for d in allowed_dtypes:
+        if d == torch.float16 and should_use_fp16(device):
+            return d
+        # NOTE: bfloat16 seems to work on AMD for the VAE but is extremely slow in some cases compared to fp32
+        # slowness still a problem on pytorch nightly 2.9.0.dev20250720+rocm6.4 tested on RDNA3
+        if d == torch.bfloat16 and (not is_amd()) and should_use_bf16(device):
+            return d
+    return torch.float32
+def get_autocast_device(dev):
+    """Return ``dev.type`` when the object has one, otherwise the string "cuda"."""
+    return getattr(dev, 'type', "cuda")
+def supports_dtype(device, dtype): #TODO
+    """Return whether ``device`` supports ``dtype``.
+
+    float32 is supported everywhere; the cpu supports nothing else; other
+    devices additionally support float16 and bfloat16.
+    """
+    if dtype == torch.float32:
+        return True
+    if is_device_cpu(device):
+        return False
+    return dtype in (torch.float16, torch.bfloat16)
+def supports_cast(device, dtype): #TODO
+    """Return whether weights can be cast to ``dtype`` on ``device``.
+
+    float32/float16 always work; DirectML supports nothing else; bfloat16
+    works on everything else; the two fp8 dtypes work everywhere except MPS.
+    """
+    if dtype in (torch.float32, torch.float16):
+        return True
+    if directml_enabled: #TODO: test this
+        return False
+    if dtype == torch.bfloat16:
+        return True
+    if is_device_mps(device):
+        return False
+    return dtype in (torch.float8_e4m3fn, torch.float8_e5m2)
+def pick_weight_dtype(dtype, fallback_dtype, device=None):
+    """Return ``dtype`` unless it is missing, larger than ``fallback_dtype`` or cannot be cast on ``device``.
+
+    In each of those cases ``fallback_dtype`` is returned instead.
+    """
+    if dtype is None:
+        dtype = fallback_dtype
+    elif dtype_size(dtype) > dtype_size(fallback_dtype):
+        # never pick something wider than the fallback
+        dtype = fallback_dtype
+    if not supports_cast(device, dtype):
+        dtype = fallback_dtype
+    return dtype
+def device_supports_non_blocking(device):
+    """Return whether non-blocking transfers are considered usable for ``device``.
+    ``args.force_non_blocking`` overrides every other check.
+    """
+    if args.force_non_blocking:
+        return True
+    # Each condition below disables non blocking transfers:
+    #  - mps: pytorch bug? mps doesn't support non blocking
+    #  - xpu: does support non blocking but it is slower on iGPUs for some reason so disable by default until situation changes
+    #  - deterministic: TODO: figure out why deterministic breaks non blocking from gpu to cpu (previews)
+    #  - directml
+    blocked = is_device_mps(device) or is_intel_xpu() or args.deterministic or directml_enabled
+    return not blocked
+def device_should_use_non_blocking(device):
+    """Return whether non-blocking transfers should be used; currently always False."""
+    if device_supports_non_blocking(device):
+        # TODO: figure out why returning True here causes memory issues on Nvidia and possibly others
+        return False
+    return False
+def force_channels_last():
+    """Return True when ``args.force_channels_last`` is set, otherwise False."""
+    #TODO
+    return bool(args.force_channels_last)
+# Offload streams created so far, keyed by device; filled lazily by get_offload_stream().
+STREAMS = {}
+# Number of streams kept per device. With a value <= 1 get_offload_stream() returns None.
+NUM_STREAMS = 1
+if args.async_offload:
+    NUM_STREAMS = 2
+    logging.info("Using async weight offloading with {} streams".format(NUM_STREAMS))
+# Per-device index of the stream get_offload_stream() hands out next.
+stream_counters = {}
+def get_offload_stream(device):
+    """Return the next offload stream for ``device`` in round-robin order.
+    Returns None when ``NUM_STREAMS`` is 1 or less, or when ``device`` is
+    neither a CUDA nor an XPU device. Streams are created on first use and
+    stored in ``STREAMS``; the rotation index lives in ``stream_counters``.
+    """
+    if NUM_STREAMS <= 1:
+        return None
+    index = stream_counters.get(device, 0)
+    pool = STREAMS.get(device)
+    if pool is not None:
+        chosen = pool[index]
+        index = (index + 1) % len(pool)
+        # The stream that will be handed out next is made to wait on the current stream.
+        if is_device_cuda(device):
+            pool[index].wait_stream(torch.cuda.current_stream())
+        elif is_device_xpu(device):
+            pool[index].wait_stream(torch.xpu.current_stream())
+        stream_counters[device] = index
+        return chosen
+    # First request for this device: build its pool of streams.
+    if is_device_cuda(device):
+        pool = [torch.cuda.Stream(device=device, priority=0) for _ in range(NUM_STREAMS)]
+    elif is_device_xpu(device):
+        pool = [torch.xpu.Stream(device=device, priority=0) for _ in range(NUM_STREAMS)]
+    else:
+        return None
+    STREAMS[device] = pool
+    chosen = pool[index]
+    stream_counters[device] = (index + 1) % len(pool)
+    return chosen
+def sync_stream(device, stream):
+    """Make the current CUDA/XPU stream of ``device`` wait on ``stream``.
+    Does nothing when ``stream`` is None or the device is neither CUDA nor XPU.
+    """
+    if stream is None:
+        return
+    if is_device_cuda(device):
+        backend = torch.cuda
+    elif is_device_xpu(device):
+        backend = torch.xpu
+    else:
+        return
+    backend.current_stream().wait_stream(stream)
+def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None):
+    """Return ``weight`` converted to ``dtype`` and/or moved to ``device``.
+    When the tensor already lives on the requested device (or no device is
+    given) and no copy or dtype change is needed, ``weight`` itself is
+    returned. A cross-device move allocates an empty tensor on the target and
+    copies into it. When ``stream`` is given the work runs inside it.
+    """
+    if device is None or weight.device == device:
+        unchanged = (not copy) and (dtype is None or weight.dtype == dtype)
+        if unchanged:
+            return weight
+        if stream is None:
+            return weight.to(dtype=dtype, copy=copy)
+        with stream:
+            return weight.to(dtype=dtype, copy=copy)
+    def _transfer():
+        out = torch.empty_like(weight, dtype=dtype, device=device)
+        out.copy_(weight, non_blocking=non_blocking)
+        return out
+    if stream is None:
+        return _transfer()
+    with stream:
+        return _transfer()
+def cast_to_device(tensor, device, dtype, copy=False):
+    """Cast ``tensor`` to ``dtype`` on ``device``, non-blocking when the device supports it."""
+    return cast_to(
+        tensor,
+        dtype=dtype,
+        device=device,
+        non_blocking=device_supports_non_blocking(device),
+        copy=copy,
+    )
+def sage_attention_enabled():
+    """Return the value of the ``args.use_sage_attention`` option."""
+    enabled = args.use_sage_attention
+    return enabled
+def flash_attention_enabled():
+    """Return the value of the ``args.use_flash_attention`` option."""
+    enabled = args.use_flash_attention
+    return enabled
+def xformers_enabled():
+    """Return whether xformers may be used.
+    False unless the process is in GPU mode and none of the Intel XPU,
+    Ascend NPU, MLU, ixuca or DirectML backends is active; otherwise the
+    value of ``XFORMERS_IS_AVAILABLE``.
+    """
+    if cpu_state != CPUState.GPU:
+        return False
+    unsupported_backend = is_intel_xpu() or is_ascend_npu() or is_mlu() or is_ixuca() or directml_enabled
+    if unsupported_backend:
+        return False
+    return XFORMERS_IS_AVAILABLE
+def xformers_enabled_vae():
+    """Return ``XFORMERS_ENABLED_VAE`` when xformers is enabled at all, else False."""
+    return XFORMERS_ENABLED_VAE if xformers_enabled() else False
+def pytorch_attention_enabled():
+    """Return the module-level ``ENABLE_PYTORCH_ATTENTION`` flag."""
+    enabled = ENABLE_PYTORCH_ATTENTION
+    return enabled
+def pytorch_attention_enabled_vae():
+    """Return whether pytorch attention is used for the VAE (never on AMD)."""
+    # enabling pytorch attention on AMD currently causes crash when doing high res
+    return False if is_amd() else pytorch_attention_enabled()
+def pytorch_attention_flash_attention():
+    """Return True when pytorch attention is enabled on one of the listed backends."""
+    if not ENABLE_PYTORCH_ATTENTION:
+        return False
+    #TODO: more reliable way of checking for flash attention?
+    # AMD is included: if you have pytorch attention enabled on AMD it probably supports at least mem efficient attention
+    backend_checks = (is_nvidia, is_intel_xpu, is_ascend_npu, is_mlu, is_amd, is_ixuca)
+    return any(check() for check in backend_checks)
+def force_upcast_attention_dtype():
+    """Return the attention upcast mapping, or None when no upcast is forced.
+    The mapping ``{torch.float16: torch.float32}`` is returned when
+    ``args.force_upcast_attention`` is set or macOS is version 14.5 or newer.
+    """
+    macos_version = mac_version()
+    # black image bug on recent versions of macOS, I don't think it's ever getting fixed
+    affected_macos = macos_version is not None and macos_version >= (14, 5)
+    if args.force_upcast_attention or affected_macos:
+        return {torch.float16: torch.float32}
+    return None
+def get_free_memory(dev=None, torch_free_too=False):
+    """Return the free memory, in bytes, on ``dev``.
+    Args:
+        dev: device to query; defaults to ``get_torch_device()``.
+        torch_free_too: when true, return ``(total_free, torch_free)`` where
+            ``torch_free`` is the memory reserved by the torch allocator but
+            not active.
+    Returns:
+        The total free memory, or the tuple described above.
+    """
+    if dev is None:
+        dev = get_torch_device()
+    if getattr(dev, 'type', None) in ('cpu', 'mps'):
+        # cpu and mps report the available system memory for both values.
+        mem_free_total = psutil.virtual_memory().available
+        mem_free_torch = mem_free_total
+    elif directml_enabled:
+        mem_free_total = 1024 * 1024 * 1024 #TODO
+        mem_free_torch = mem_free_total
+    elif is_intel_xpu():
+        stats = torch.xpu.memory_stats(dev)
+        mem_active = stats['active_bytes.all.current']
+        mem_reserved = stats['reserved_bytes.all.current']
+        # Free device memory is derived from the total because mem_get_info is not used for XPU here.
+        mem_free_xpu = torch.xpu.get_device_properties(dev).total_memory - mem_reserved
+        mem_free_torch = mem_reserved - mem_active
+        mem_free_total = mem_free_xpu + mem_free_torch
+    else:
+        # NPU, MLU and CUDA expose the same memory_stats / mem_get_info calls.
+        if is_ascend_npu():
+            backend = torch.npu
+        elif is_mlu():
+            backend = torch.mlu
+        else:
+            backend = torch.cuda
+        stats = backend.memory_stats(dev)
+        mem_active = stats['active_bytes.all.current']
+        mem_reserved = stats['reserved_bytes.all.current']
+        mem_free_device, _ = backend.mem_get_info(dev)
+        mem_free_torch = mem_reserved - mem_active
+        mem_free_total = mem_free_device + mem_free_torch
+    if torch_free_too:
+        return (mem_free_total, mem_free_torch)
+    return mem_free_total
+def cpu_mode():
+    """Return True when the module-level ``cpu_state`` is ``CPUState.CPU``."""
+    running_on_cpu = cpu_state == CPUState.CPU
+    return running_on_cpu
+def mps_mode():
+    """Return True when the module-level ``cpu_state`` is ``CPUState.MPS``."""
+    running_on_mps = cpu_state == CPUState.MPS
+    return running_on_mps
+def is_device_type(device, type):
+    """Return True when ``device`` has a ``type`` attribute equal to ``type``."""
+    if not hasattr(device, 'type'):
+        return False
+    return bool(device.type == type)
+def is_device_cpu(device):
+    """Return True when ``device`` is of type 'cpu'."""
+    return is_device_type(device, type='cpu')
+def is_device_mps(device):
+    """Return True when ``device`` is of type 'mps'."""
+    return is_device_type(device, type='mps')
+def is_device_xpu(device):
+    """Return True when ``device`` is of type 'xpu'."""
+    return is_device_type(device, type='xpu')
+def is_device_cuda(device):
+    """Return True when ``device`` is of type 'cuda'."""
+    return is_device_type(device, type='cuda')
+def is_directml_enabled():
+    """Return the module-level ``directml_enabled`` flag as a strict bool."""
+    return bool(directml_enabled)
+# Decide whether fp16 should be used on `device`; returns a bool.
+# The checks run in priority order: CPU devices never use fp16, then the
+# args.force_fp16 / FORCE_FP32 overrides, then per-backend rules, and finally
+# NVIDIA rules based on compute capability and card name.
+# `model_params` and `prioritize_performance` are only read on the
+# `manual_cast` path near the end.
+def should_use_fp16(device=None, model_params=0, prioritize_performance=True, manual_cast=False):
+    if device is not None:
+        if is_device_cpu(device):
+            return False
+    # Explicit overrides; force_fp16 is checked before FORCE_FP32.
+    if args.force_fp16:
+        return True
+    if FORCE_FP32:
+        return False
+    if is_directml_enabled():
+        return True
+    if (device is not None and is_device_mps(device)) or mps_mode():
+        return True
+    if cpu_mode():
+        return False
+    if is_intel_xpu():
+        # Torch older than 2.3 is assumed to support fp16; newer versions ask the device.
+        if torch_version_numeric < (2, 3):
+            return True
+        else:
+            return torch.xpu.get_device_properties(device).has_fp16
+    if is_ascend_npu():
+        return True
+    if is_mlu():
+        return True
+    if is_ixuca():
+        return True
+    if torch.version.hip:
+        return True
+    # From here on the device is queried through torch.cuda.
+    props = torch.cuda.get_device_properties(device)
+    if props.major >= 8:
+        return True
+    if props.major < 6:
+        return False
+    #FP16 is confirmed working on a 1080 (GP104) and on latest pytorch actually seems faster than fp32
+    nvidia_10_series = ["1080", "1070", "titan x", "p3000", "p3200", "p4000", "p4200", "p5000", "p5200", "p6000", "1060", "1050", "p40", "p100", "p6", "p4"]
+    for x in nvidia_10_series:
+        # Substring match against the lower-cased card name.
+        if x in props.name.lower():
+            if WINDOWS or manual_cast:
+                return True
+            else:
+                return False #weird linux behavior where fp32 is faster
+    if manual_cast:
+        free_model_memory = maximum_vram_for_weights(device)
+        # model_params * 4 is presumably the fp32 weight size in bytes — TODO confirm
+        if (not prioritize_performance) or model_params * 4 > free_model_memory:
+            return True
+    if props.major < 7:
+        return False
+    #FP16 is just broken on these cards
+    nvidia_16_series = ["1660", "1650", "1630", "T500", "T550", "T600", "MX550", "MX450", "CMP 30HX", "T2000", "T1000", "T1200"]
+    for x in nvidia_16_series:
+        # Case-sensitive substring match, unlike the 10 series check above.
+        if x in props.name:
+            return False
+    return True
+# Decide whether bf16 should be used on `device`; returns a bool.
+# The checks run in priority order: CPU devices, FORCE_FP32 and DirectML
+# disable bf16, then per-backend rules apply, and finally torch.cuda device
+# properties decide. `model_params` and `prioritize_performance` are only
+# read on the `manual_cast` path at the end.
+def should_use_bf16(device=None, model_params=0, prioritize_performance=True, manual_cast=False):
+    if device is not None:
+        if is_device_cpu(device): #TODO ? bf16 works on CPU but is extremely slow
+            return False
+    if FORCE_FP32:
+        return False
+    if directml_enabled:
+        return False
+    if (device is not None and is_device_mps(device)) or mps_mode():
+        # NOTE(review): force_upcast_attention_dtype guards mac_version() against None,
+        # this comparison does not — confirm it cannot be None on this path.
+        if mac_version() < (14,):
+            return False
+        return True
+    if cpu_mode():
+        return False
+    if is_intel_xpu():
+        # Torch older than 2.3 is assumed to support bf16; newer versions ask torch.xpu.
+        if torch_version_numeric < (2, 3):
+            return True
+        else:
+            return torch.xpu.is_bf16_supported()
+    if is_ascend_npu():
+        return True
+    if is_ixuca():
+        return True
+    if is_amd():
+        arch = torch.cuda.get_device_properties(device).gcnArchName
+        if any((a in arch) for a in ["gfx1030", "gfx1031", "gfx1010", "gfx1011", "gfx1012", "gfx906", "gfx900", "gfx803"]):  # RDNA2 and older don't support bf16
+            # On these architectures bf16 is only reported when manual_cast is requested.
+            if manual_cast:
+                return True
+            return False
+    props = torch.cuda.get_device_properties(device)
+    if is_mlu():
+        if props.major > 3:
+            return True
+    if props.major >= 8:
+        return True
+    bf16_works = torch.cuda.is_bf16_supported()
+    if bf16_works and manual_cast:
+        free_model_memory = maximum_vram_for_weights(device)
+        # model_params * 4 is presumably the fp32 weight size in bytes — TODO confirm
+        if (not prioritize_performance) or model_params * 4 > free_model_memory:
+            return True
+    return False
+def supports_fp8_compute(device=None):
+    """Return whether fp8 compute is considered available on ``device``.
+    ``SUPPORT_FP8_OPS`` forces True. Otherwise an NVIDIA device is required:
+    compute capability 9.x or newer is accepted outright, 8.9 is accepted
+    only with torch >= 2.3 (>= 2.4 on Windows), and anything older is rejected.
+    """
+    if SUPPORT_FP8_OPS:
+        return True
+    if not is_nvidia():
+        return False
+    props = torch.cuda.get_device_properties(device)
+    capability = (props.major, props.minor)
+    if capability >= (9, 0):
+        return True
+    if capability < (8, 9):
+        return False
+    minimum_torch = (2, 4) if WINDOWS else (2, 3)
+    return not (torch_version_numeric < minimum_torch)
+def extended_fp16_support():
+    """Return True when the installed torch is version 2.7 or newer."""
+    # TODO: check why some models work with fp16 on newer torch versions but not on older
+    return not (torch_version_numeric < (2, 7))
+def soft_empty_cache(force=False):
+    """Empty the allocator cache of the active backend.
+    ``force`` is accepted but not used by this implementation.
+    """
+    if cpu_state == CPUState.MPS:
+        torch.mps.empty_cache()
+        return
+    if is_intel_xpu():
+        torch.xpu.empty_cache()
+        return
+    if is_ascend_npu():
+        torch.npu.empty_cache()
+        return
+    if is_mlu():
+        torch.mlu.empty_cache()
+        return
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+def unload_all_models():
+    """Request ``free_memory`` to free 1e30 bytes on the main torch device."""
+    device = get_torch_device()
+    free_memory(1e30, device)
+#TODO: might be cleaner to put this somewhere else
+import threading
+class InterruptProcessingException(Exception):
+    """Raised by throw_exception_if_processing_interrupted() when an interrupt is pending."""
+# Re-entrant lock guarding every read and write of `interrupt_processing`.
+interrupt_processing_mutex = threading.RLock()
+# Pending-interrupt flag; set by interrupt_current_processing() and cleared
+# by throw_exception_if_processing_interrupted() when it raises.
+interrupt_processing = False
+def interrupt_current_processing(value=True):
+    """Set the module-level interrupt flag to ``value`` while holding the mutex."""
+    global interrupt_processing
+    with interrupt_processing_mutex:
+        interrupt_processing = value
+def processing_interrupted():
+    """Return the current interrupt flag, read while holding the mutex."""
+    with interrupt_processing_mutex:
+        pending = interrupt_processing
+    return pending
+def throw_exception_if_processing_interrupted():
+    """Raise InterruptProcessingException if an interrupt is pending.
+    The flag is cleared before raising. Does nothing when no interrupt is pending.
+    """
+    global interrupt_processing
+    with interrupt_processing_mutex:
+        if not interrupt_processing:
+            return
+        interrupt_processing = False
+        raise InterruptProcessingException()
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
+"""
+    This file is part of ComfyUI.
+    Copyright (C) 2024 Comfy
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""
+from __future__ import annotations
+import collections
+import copy
+import inspect
+import logging
+import math
+import uuid
+from typing import Callable, Optional
+import torch
+import comfy.float
+import comfy.hooks
+import comfy.lora
+import comfy.model_management
+import comfy.patcher_extension
+import comfy.utils
+from comfy.comfy_types import UnetWrapperFunction
+from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP
+def string_to_seed(data):
+    """Fold ``data`` into an integer seed with a bitwise CRC-32 style loop.
+    ``data`` may yield ints or single-character strings; characters are
+    converted with ``ord`` before being mixed in.
+    """
+    crc = 0xFFFFFFFF
+    for item in data:
+        crc ^= ord(item) if isinstance(item, str) else item
+        for _ in range(8):
+            low_bit = crc & 1
+            crc >>= 1
+            if low_bit:
+                crc ^= 0xEDB88320
+    return crc ^ 0xFFFFFFFF
+def set_model_options_patch_replace(model_options, patch, name, block_name, number, transformer_index=None):
+    """Register ``patch`` as the replacement for one block in ``model_options``.
+    The ``transformer_options`` dict and the nested ``patches_replace``
+    dicts on the path to the new entry are shallow-copied before being
+    modified. The block key is ``(block_name, number)``, extended with
+    ``transformer_index`` when one is given. Returns ``model_options``.
+    """
+    to = model_options["transformer_options"].copy()
+    to["patches_replace"] = to["patches_replace"].copy() if "patches_replace" in to else {}
+    replacements = to["patches_replace"]
+    replacements[name] = replacements[name].copy() if name in replacements else {}
+    if transformer_index is None:
+        block = (block_name, number)
+    else:
+        block = (block_name, number, transformer_index)
+    replacements[name][block] = patch
+    model_options["transformer_options"] = to
+    return model_options
+def set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=False):
+    """Append ``post_cfg_function`` to the post-CFG function list in ``model_options``.
+    A new list is stored rather than extending the existing one in place.
+    Returns ``model_options``.
+    """
+    existing = model_options.get("sampler_post_cfg_function", [])
+    model_options["sampler_post_cfg_function"] = existing + [post_cfg_function]
+    if disable_cfg1_optimization:
+        model_options["disable_cfg1_optimization"] = True
+    return model_options
+def set_model_options_pre_cfg_function(model_options, pre_cfg_function, disable_cfg1_optimization=False):
+    """Append ``pre_cfg_function`` to the pre-CFG function list in ``model_options``.
+    A new list is stored rather than extending the existing one in place.
+    Returns ``model_options``.
+    """
+    existing = model_options.get("sampler_pre_cfg_function", [])
+    model_options["sampler_pre_cfg_function"] = existing + [pre_cfg_function]
+    if disable_cfg1_optimization:
+        model_options["disable_cfg1_optimization"] = True
+    return model_options
+def create_model_options_clone(orig_model_options: dict):
+    """Return a copy of ``orig_model_options`` made by ``comfy.patcher_extension.copy_nested_dicts``."""
+    cloned = comfy.patcher_extension.copy_nested_dicts(orig_model_options)
+    return cloned
+def create_hook_patches_clone(orig_hook_patches):
+    """Copy a two-level hook patch mapping.
+    Both dict levels are rebuilt and every innermost sequence is sliced into
+    a new one; the elements of those sequences are shared with the original.
+    """
+    return {
+        hook_ref: {k: orig_hook_patches[hook_ref][k][:] for k in orig_hook_patches[hook_ref]}
+        for hook_ref in orig_hook_patches
+    }
+def wipe_lowvram_weight(m):
+    """Undo the low-vram state stored on module ``m``.
+    Restores ``comfy_cast_weights`` from ``prev_comfy_cast_weights`` (and
+    deletes the latter) when present, and resets ``weight_function`` /
+    ``bias_function`` to empty lists when those attributes exist.
+    """
+    if hasattr(m, "prev_comfy_cast_weights"):
+        m.comfy_cast_weights = m.prev_comfy_cast_weights
+        del m.prev_comfy_cast_weights
+    for attr in ("weight_function", "bias_function"):
+        if hasattr(m, attr):
+            setattr(m, attr, [])
+def move_weight_functions(m, device):
+    """Move the weight and bias functions of module ``m`` to ``device``.
+    Only entries exposing ``move_to`` are moved. Returns the sum of the values
+    returned by those ``move_to`` calls, or 0 when ``device`` is None.
+    """
+    if device is None:
+        return 0
+    memory = 0
+    for attr in ("weight_function", "bias_function"):
+        for func in getattr(m, attr, ()):
+            if hasattr(func, "move_to"):
+                memory += func.move_to(device=device)
+    return memory
+class LowVramPatch:
+    """Callable that applies the patches stored for one weight key to a weight."""
+    def __init__(self, key, patches):
+        self.key = key
+        self.patches = patches
+    def __call__(self, weight):
+        key_patches = self.patches[self.key]
+        if weight.dtype in (torch.float32, torch.float16, torch.bfloat16):
+            return comfy.lora.calculate_weight(key_patches, weight, self.key, intermediate_dtype=weight.dtype)
+        # intermediate_dtype has to be one that is supported in math ops, so other
+        # dtypes are patched in float32 and rounded back to the original dtype.
+        patched = comfy.lora.calculate_weight(key_patches, weight.to(torch.float32), self.key, intermediate_dtype=torch.float32)
+        return comfy.float.stochastic_rounding(patched, weight.dtype, seed=string_to_seed(self.key))
+def get_key_weight(model, key):
+    """Resolve ``key`` on ``model`` and look for its setter/converter hooks.
+    Returns ``(weight, set_func, convert_func)``. For a dotted key the hooks
+    are the ``set_<attr>`` and ``convert_<attr>`` attributes of the parent
+    object, or None when missing. Undotted keys have no hooks.
+    """
+    parts = key.rsplit('.', 1)
+    if len(parts) < 2:
+        return comfy.utils.get_attr(model, key), None, None
+    parent_path, attr = parts
+    op = comfy.utils.get_attr(model, parent_path)
+    set_func = getattr(op, "set_{}".format(attr), None)
+    convert_func = getattr(op, "convert_{}".format(attr), None)
+    weight = getattr(op, attr)
+    if convert_func is not None:
+        weight = comfy.utils.get_attr(model, key)
+    return weight, set_func, convert_func
+class AutoPatcherEjector:
+    """Context manager that ejects a model's injections for the duration of a block.
+    On entry an injected model is ejected; on exit it is injected again. With
+    ``skip_and_inject_on_exit_only`` the model's ``skip_injection`` flag is set
+    inside the block and ``inject_model`` is always called on exit.
+    """
+    def __init__(self, model: 'ModelPatcher', skip_and_inject_on_exit_only=False):
+        self.model = model
+        self.skip_and_inject_on_exit_only = skip_and_inject_on_exit_only
+        self.was_injected = False
+        self.prev_skip_injection = False
+    def __enter__(self):
+        model = self.model
+        self.was_injected = False
+        self.prev_skip_injection = model.skip_injection
+        if self.skip_and_inject_on_exit_only:
+            model.skip_injection = True
+        if model.is_injected:
+            model.eject_model()
+            self.was_injected = True
+    def __exit__(self, *args):
+        model = self.model
+        if self.skip_and_inject_on_exit_only:
+            model.skip_injection = self.prev_skip_injection
+            model.inject_model()
+        if self.was_injected and not model.skip_injection:
+            model.inject_model()
+        model.skip_injection = self.prev_skip_injection
+class MemoryCounter:
+    """Tracks a remaining memory budget that must stay strictly above ``minimum``."""
+    def __init__(self, initial: int, minimum=0):
+        self.value = initial
+        self.minimum = minimum
+        # TODO: add a safe limit besides 0
+    def use(self, weight: torch.Tensor):
+        """Charge the byte size of ``weight`` to the budget; return whether it fit."""
+        needed = weight.nelement() * weight.element_size()
+        if not self.is_useable(needed):
+            return False
+        self.decrement(needed)
+        return True
+    def is_useable(self, used: int):
+        """Return True when spending ``used`` leaves more than ``minimum``."""
+        remaining = self.value - used
+        return remaining > self.minimum
+    def decrement(self, used: int):
+        """Subtract ``used`` from the budget without any check."""
+        self.value -= used
+class ModelPatcher:
+    # Wrap `model` so that patches, hooks, callbacks and injections can be tracked for it.
+    #   model: the wrapped model object; it is given a `device` attribute if it lacks one.
+    #   load_device / offload_device: stored as-is on the patcher.
+    #   size: model size; when not positive, model_size() computes it.
+    #   weight_inplace_update: stored as-is on the patcher.
+    def __init__(self, model, load_device, offload_device, size=0, weight_inplace_update=False):
+        self.size = size
+        self.model = model
+        # Make sure the wrapped model always carries a usable `device` attribute.
+        if not hasattr(self.model, 'device'):
+            logging.debug("Model doesn't have a device attribute.")
+            self.model.device = offload_device
+        elif self.model.device is None:
+            self.model.device = offload_device
+        # Patch bookkeeping.
+        self.patches = {}
+        self.backup = {}
+        self.object_patches = {}
+        self.object_patches_backup = {}
+        self.weight_wrapper_patches = {}
+        self.model_options = {"transformer_options":{}}
+        # Must run after self.size and self.model are set: fills self.size when it is not positive.
+        self.model_size()
+        self.load_device = load_device
+        self.offload_device = offload_device
+        self.weight_inplace_update = weight_inplace_update
+        self.force_cast_weights = False
+        # Identifies the current set of weight patches; compared in clone_has_same_weights().
+        self.patches_uuid = uuid.uuid4()
+        self.parent = None
+        # Extension state: attachments, extra models, callbacks, wrappers and injections.
+        self.attachments: dict[str] = {}
+        self.additional_models: dict[str, list[ModelPatcher]] = {}
+        self.callbacks: dict[str, dict[str, list[Callable]]] = CallbacksMP.init_callbacks()
+        self.wrappers: dict[str, dict[str, list[Callable]]] = WrappersMP.init_wrappers()
+        self.is_injected = False
+        self.skip_injection = False
+        self.injections: dict[str, list[PatcherInjection]] = {}
+        # Hook state.
+        self.hook_patches: dict[comfy.hooks._HookRef] = {}
+        self.hook_patches_backup: dict[comfy.hooks._HookRef] = None
+        self.hook_backup: dict[str, tuple[torch.Tensor, torch.device]] = {}
+        self.cached_hook_patches: dict[comfy.hooks.HookGroup, dict[str, torch.Tensor]] = {}
+        self.current_hooks: Optional[comfy.hooks.HookGroup] = None
+        self.forced_hooks: Optional[comfy.hooks.HookGroup] = None  # NOTE: only used for CLIP at this time
+        self.is_clip = False
+        self.hook_mode = comfy.hooks.EnumHookMode.MaxSpeed
+        # Load-state attributes live on the wrapped model and are only created
+        # when missing, so an existing model keeps its current values.
+        if not hasattr(self.model, 'model_loaded_weight_memory'):
+            self.model.model_loaded_weight_memory = 0
+        if not hasattr(self.model, 'lowvram_patch_counter'):
+            self.model.lowvram_patch_counter = 0
+        if not hasattr(self.model, 'model_lowvram'):
+            self.model.model_lowvram = False
+        if not hasattr(self.model, 'current_weight_patches_uuid'):
+            self.model.current_weight_patches_uuid = None
+    def model_size(self):
+        """Return the model size, computing and caching it when ``self.size`` is not positive."""
+        if not (self.size > 0):
+            self.size = comfy.model_management.module_size(self.model)
+        return self.size
+    def loaded_size(self):
+        """Return ``model_loaded_weight_memory`` of the wrapped model."""
+        wrapped = self.model
+        return wrapped.model_loaded_weight_memory
+    def lowvram_patch_counter(self):
+        """Return ``lowvram_patch_counter`` of the wrapped model."""
+        wrapped = self.model
+        return wrapped.lowvram_patch_counter
+    # Create a new patcher of the same class around the SAME wrapped model.
+    # Patch lists, options, callbacks, wrappers and injections are copied so the
+    # clone can be modified independently; the backups are shared by reference.
+    # ON_CLONE callbacks run last with (self, clone). Returns the clone.
+    def clone(self):
+        n = self.__class__(self.model, self.load_device, self.offload_device, self.size, weight_inplace_update=self.weight_inplace_update)
+        # Copy each patch list; the patch entries themselves are shared.
+        n.patches = {}
+        for k in self.patches:
+            n.patches[k] = self.patches[k][:]
+        # Same uuid as the source: the clone starts with the same weight patches.
+        n.patches_uuid = self.patches_uuid
+        n.object_patches = self.object_patches.copy()
+        n.weight_wrapper_patches = self.weight_wrapper_patches.copy()
+        n.model_options = copy.deepcopy(self.model_options)
+        # Backups are shared by reference, not copied.
+        n.backup = self.backup
+        n.object_patches_backup = self.object_patches_backup
+        n.parent = self
+        n.force_cast_weights = self.force_cast_weights
+        # attachments: use the attachment's own clone hook when it provides one
+        n.attachments = {}
+        for k in self.attachments:
+            if hasattr(self.attachments[k], "on_model_patcher_clone"):
+                n.attachments[k] = self.attachments[k].on_model_patcher_clone()
+            else:
+                n.attachments[k] = self.attachments[k]
+        # additional models: every additional patcher is cloned as well
+        for k, c in self.additional_models.items():
+            n.additional_models[k] = [x.clone() for x in c]
+        # callbacks: copy the two dict levels and each innermost container
+        for k, c in self.callbacks.items():
+            n.callbacks[k] = {}
+            for k1, c1 in c.items():
+                n.callbacks[k][k1] = c1.copy()
+        # sample wrappers: copied the same way as callbacks
+        for k, w in self.wrappers.items():
+            n.wrappers[k] = {}
+            for k1, w1 in w.items():
+                n.wrappers[k][k1] = w1.copy()
+        # injection
+        n.is_injected = self.is_injected
+        n.skip_injection = self.skip_injection
+        for k, i in self.injections.items():
+            n.injections[k] = i.copy()
+        # hooks
+        n.hook_patches = create_hook_patches_clone(self.hook_patches)
+        n.hook_patches_backup = create_hook_patches_clone(self.hook_patches_backup) if self.hook_patches_backup else self.hook_patches_backup
+        # Cached hook patches: new dicts per group, cached values shared.
+        for group in self.cached_hook_patches:
+            n.cached_hook_patches[group] = {}
+            for k in self.cached_hook_patches[group]:
+                n.cached_hook_patches[group][k] = self.cached_hook_patches[group][k]
+        n.hook_backup = self.hook_backup
+        n.current_hooks = self.current_hooks.clone() if self.current_hooks else self.current_hooks
+        n.forced_hooks = self.forced_hooks.clone() if self.forced_hooks else self.forced_hooks
+        n.is_clip = self.is_clip
+        n.hook_mode = self.hook_mode
+        for callback in self.get_all_callbacks(CallbacksMP.ON_CLONE):
+            callback(self, n)
+        return n
+    def is_clone(self, other):
+        """Return True when ``other`` wraps the very same model object as this patcher."""
+        return hasattr(other, 'model') and self.model is other.model
+    def clone_has_same_weights(self, clone: 'ModelPatcher'):
+        """Return whether ``clone`` would produce the same patched weights as this patcher.
+        Both patchers must wrap the same model and agree on hooks, hook patch
+        keys, attachment keys, additional model keys, callback and wrapper
+        counts and injection keys. After that they match when neither has
+        patches, or when they share a ``patches_uuid`` and have the same number
+        of patched keys.
+        Args:
+            clone: the patcher to compare against.
+        Returns:
+            True when the weights match, False otherwise. Every path returns an
+            explicit bool; a mismatch no longer falls through to ``None``.
+        """
+        if not self.is_clone(clone):
+            return False
+        if self.current_hooks != clone.current_hooks:
+            return False
+        if self.forced_hooks != clone.forced_hooks:
+            return False
+        if self.hook_patches.keys() != clone.hook_patches.keys():
+            return False
+        if self.attachments.keys() != clone.attachments.keys():
+            return False
+        if self.additional_models.keys() != clone.additional_models.keys():
+            return False
+        for key in self.callbacks:
+            if len(self.callbacks[key]) != len(clone.callbacks[key]):
+                return False
+        for key in self.wrappers:
+            if len(self.wrappers[key]) != len(clone.wrappers[key]):
+                return False
+        if self.injections.keys() != clone.injections.keys():
+            return False
+        if len(self.patches) == 0 and len(clone.patches) == 0:
+            return True
+        if self.patches_uuid == clone.patches_uuid:
+            if len(self.patches) == len(clone.patches):
+                return True
+            # Same uuid with a different patch count means the bookkeeping is inconsistent.
+            logging.warning("WARNING: something went wrong, same patch uuid but different length of patches.")
+        return False
+    def memory_required(self, input_shape):
+        """Return the wrapped model's ``memory_required`` result for ``input_shape``."""
+        wrapped = self.model
+        return wrapped.memory_required(input_shape=input_shape)
+    def set_model_sampler_cfg_function(self, sampler_cfg_function, disable_cfg1_optimization=False):
+        """Install ``sampler_cfg_function`` in ``model_options``.
+        A function taking exactly three parameters is treated as the old
+        calling convention and wrapped so that it receives ``cond``,
+        ``uncond`` and ``cond_scale`` from the args dict.
+        """
+        if len(inspect.signature(sampler_cfg_function).parameters) == 3:
+            def old_style_adapter(args): #Old way
+                return sampler_cfg_function(args["cond"], args["uncond"], args["cond_scale"])
+            self.model_options["sampler_cfg_function"] = old_style_adapter
+        else:
+            self.model_options["sampler_cfg_function"] = sampler_cfg_function
+        if disable_cfg1_optimization:
+            self.model_options["disable_cfg1_optimization"] = True
+    def set_model_sampler_post_cfg_function(self, post_cfg_function, disable_cfg1_optimization=False):
+        """Append ``post_cfg_function`` to this patcher's post-CFG functions."""
+        updated = set_model_options_post_cfg_function(self.model_options, post_cfg_function, disable_cfg1_optimization)
+        self.model_options = updated
+    def set_model_sampler_pre_cfg_function(self, pre_cfg_function, disable_cfg1_optimization=False):
+        """Append ``pre_cfg_function`` to this patcher's pre-CFG functions."""
+        updated = set_model_options_pre_cfg_function(self.model_options, pre_cfg_function, disable_cfg1_optimization)
+        self.model_options = updated
+    def set_model_sampler_calc_cond_batch_function(self, sampler_calc_cond_batch_function):
+        """Store the function under ``model_options["sampler_calc_cond_batch_function"]``."""
+        self.model_options.update({"sampler_calc_cond_batch_function": sampler_calc_cond_batch_function})
+    def set_model_unet_function_wrapper(self, unet_wrapper_function: UnetWrapperFunction):
+        """Store the wrapper under ``model_options["model_function_wrapper"]``."""
+        self.model_options.update({"model_function_wrapper": unet_wrapper_function})
+    def set_model_denoise_mask_function(self, denoise_mask_function):
+        """Store the function under ``model_options["denoise_mask_function"]``."""
+        self.model_options.update({"denoise_mask_function": denoise_mask_function})
+    def set_model_patch(self, patch, name):
+        """Append ``patch`` to the transformer patch list registered under ``name``.
+        A new list is stored for ``name`` rather than extending the old one in place.
+        """
+        patches = self.model_options["transformer_options"].setdefault("patches", {})
+        patches[name] = patches.get(name, []) + [patch]
+    def set_model_patch_replace(self, patch, name, block_name, number, transformer_index=None):
+        """Register ``patch`` as a block replacement via ``set_model_options_patch_replace``."""
+        updated = set_model_options_patch_replace(self.model_options, patch, name, block_name, number, transformer_index=transformer_index)
+        self.model_options = updated
+    def set_model_attn1_patch(self, patch):
+        """Add ``patch`` to the "attn1_patch" transformer patches."""
+        self.set_model_patch(patch, name="attn1_patch")
+    def set_model_attn2_patch(self, patch):
+        """Add ``patch`` to the "attn2_patch" transformer patches."""
+        self.set_model_patch(patch, name="attn2_patch")
+    def set_model_attn1_replace(self, patch, block_name, number, transformer_index=None):
+        """Register ``patch`` as the "attn1" replacement for the given block."""
+        self.set_model_patch_replace(patch, "attn1", block_name, number, transformer_index=transformer_index)
+    def set_model_attn2_replace(self, patch, block_name, number, transformer_index=None):
+        """Register ``patch`` as the "attn2" replacement for the given block."""
+        self.set_model_patch_replace(patch, "attn2", block_name, number, transformer_index=transformer_index)
+    def set_model_attn1_output_patch(self, patch):
+        """Add ``patch`` to the "attn1_output_patch" transformer patches."""
+        self.set_model_patch(patch, name="attn1_output_patch")
+    def set_model_attn2_output_patch(self, patch):
+        """Add ``patch`` to the "attn2_output_patch" transformer patches."""
+        self.set_model_patch(patch, name="attn2_output_patch")
+    def set_model_input_block_patch(self, patch):
+        """Add ``patch`` to the "input_block_patch" transformer patches."""
+        self.set_model_patch(patch, name="input_block_patch")
+    def set_model_input_block_patch_after_skip(self, patch):
+        """Add ``patch`` to the "input_block_patch_after_skip" transformer patches."""
+        self.set_model_patch(patch, name="input_block_patch_after_skip")
+    def set_model_output_block_patch(self, patch):
+        """Add ``patch`` to the "output_block_patch" transformer patches."""
+        self.set_model_patch(patch, name="output_block_patch")
+    def set_model_emb_patch(self, patch):
+        """Add ``patch`` to the "emb_patch" transformer patches."""
+        self.set_model_patch(patch, name="emb_patch")
+    def set_model_forward_timestep_embed_patch(self, patch):
+        """Add ``patch`` to the "forward_timestep_embed_patch" transformer patches."""
+        self.set_model_patch(patch, name="forward_timestep_embed_patch")
+    def set_model_double_block_patch(self, patch):
+        """Add ``patch`` to the "double_block" transformer patches."""
+        self.set_model_patch(patch, name="double_block")
+    def add_object_patch(self, name, obj):
+        """Register ``obj`` as the object patch for ``name``, replacing any previous one."""
+        self.object_patches.update({name: obj})
+    def set_model_compute_dtype(self, dtype):
+        """Patch the model's ``manual_cast_dtype`` to ``dtype``.
+        A non-None dtype also turns on ``force_cast_weights``. The patch uuid
+        is always regenerated.
+        """
+        self.add_object_patch("manual_cast_dtype", dtype)
+        self.force_cast_weights = True if dtype is not None else self.force_cast_weights
+        #TODO: optimize by preventing a full model reload for this
+        self.patches_uuid = uuid.uuid4()
+    def add_weight_wrapper(self, name, function):
+        """Append ``function`` to the weight wrappers of ``name`` and regenerate the patch uuid.
+        A new list is stored for ``name`` rather than extending the old one in place.
+        """
+        current = self.weight_wrapper_patches.get(name, [])
+        self.weight_wrapper_patches[name] = current + [function]
+        self.patches_uuid = uuid.uuid4()
+    def get_model_object(self, name: str) -> torch.nn.Module:
+        """Look up an attribute of the wrapped model by dotted path, honoring object patches.
+        Lookup order: ``self.object_patches``, then ``self.object_patches_backup``,
+        and finally the model itself through ``comfy.utils.get_attr``.
+        Args:
+            name (str): Attribute path in dot notation (e.g. "model.layer.weight")
+        Returns:
+            The first match found in the lookup order above.
+        Example:
+            patcher = ModelPatcher()
+            weight = patcher.get_model_object("layer1.conv.weight")
+        """
+        if name in self.object_patches:
+            return self.object_patches[name]
+        if name in self.object_patches_backup:
+            return self.object_patches_backup[name]
+        return comfy.utils.get_attr(self.model, name)
+    def model_patches_to(self, device):
+        """Move every patch object that has a ``to`` method to `device`, storing the result in place.
+        Covers the "patches" lists and "patches_replace" dicts under
+        ``model_options["transformer_options"]`` and the optional
+        ``model_options["model_function_wrapper"]``. Objects without ``to`` are left as they are.
+        """
+        to = self.model_options["transformer_options"]
+        if "patches" in to:
+            for patch_list in to["patches"].values():
+                for idx, patch in enumerate(patch_list):
+                    if hasattr(patch, "to"):
+                        patch_list[idx] = patch.to(device)
+        if "patches_replace" in to:
+            for replacements in to["patches_replace"].values():
+                # Only values of existing keys are reassigned, so iterating items() here is safe.
+                for k, patch in replacements.items():
+                    if hasattr(patch, "to"):
+                        replacements[k] = patch.to(device)
+        if "model_function_wrapper" in self.model_options:
+            wrap_func = self.model_options["model_function_wrapper"]
+            if hasattr(wrap_func, "to"):
+                self.model_options["model_function_wrapper"] = wrap_func.to(device)
+    def model_patches_models(self):
+        """Collect the results of ``models()`` from every patch object that provides that method.
+        Looks at the "patches" lists and "patches_replace" dicts under
+        ``model_options["transformer_options"]`` and at the optional
+        ``model_options["model_function_wrapper"]``, in that order.
+        Returns:
+            list: Concatenation of all ``models()`` results (empty when nothing provides them).
+        """
+        to = self.model_options["transformer_options"]
+        collected = []
+        if "patches" in to:
+            for patch_list in to["patches"].values():
+                for patch in patch_list:
+                    if hasattr(patch, "models"):
+                        collected += patch.models()
+        if "patches_replace" in to:
+            for replacements in to["patches_replace"].values():
+                for patch in replacements.values():
+                    if hasattr(patch, "models"):
+                        collected += patch.models()
+        if "model_function_wrapper" in self.model_options:
+            wrap_func = self.model_options["model_function_wrapper"]
+            if hasattr(wrap_func, "models"):
+                collected += wrap_func.models()
+        return collected
+    def model_dtype(self):
+        """Return ``self.model.get_dtype()`` when the model has that method, otherwise None."""
+        if not hasattr(self.model, "get_dtype"):
+            return None
+        return self.model.get_dtype()
+    def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
+        """Queue weight patches for every entry of `patches` whose key exists in the model state dict.
+        Each entry of `patches` is keyed either by a weight name (str) or by a
+        sequence ``(name, offset[, function])``. Matching entries are appended to
+        ``self.patches[name]`` as ``(strength_patch, value, strength_model, offset, function)``.
+        ``patches_uuid`` is refreshed on every call.
+        Returns:
+            list: The keys of `patches` (in their original form) that matched a model weight.
+        """
+        with self.use_ejected():
+            matched = set()
+            model_sd = self.model.state_dict()
+            for patch_key in patches:
+                if isinstance(patch_key, str):
+                    key, offset, function = patch_key, None, None
+                else:
+                    offset = patch_key[1]
+                    key = patch_key[0]
+                    function = patch_key[2] if len(patch_key) > 2 else None
+                if key not in model_sd:
+                    continue
+                matched.add(patch_key)
+                entry = (strength_patch, patches[patch_key], strength_model, offset, function)
+                self.patches.setdefault(key, []).append(entry)
+            self.patches_uuid = uuid.uuid4()
+            return list(matched)
+    def get_key_patches(self, filter_prefix=None):
+        """Map each model weight key to ``[(weight, convert_func)]`` followed by its queued patches.
+        The weight is taken from ``get_key_weight`` unless a backup exists: an entry in
+        ``self.backup`` overrides it, and an entry in ``self.hook_backup`` overrides both.
+        When ``get_key_weight`` yields no convert function, an identity function is used.
+        Args:
+            filter_prefix: When given, only keys starting with this prefix are included.
+        """
+        result = {}
+        for k in self.model_state_dict():
+            if filter_prefix is not None and not k.startswith(filter_prefix):
+                continue
+            weight, set_func, convert_func = get_key_weight(self.model, k)
+            bk = self.backup.get(k, None)
+            if bk is not None:
+                weight = bk.weight
+            hbk = self.hook_backup.get(k, None)
+            if hbk is not None:
+                weight = hbk[0]
+            if convert_func is None:
+                convert_func = lambda a, **kwargs: a
+            head = [(weight, convert_func)]
+            result[k] = head + self.patches[k] if k in self.patches else head
+        return result
+    def model_state_dict(self, filter_prefix=None):
+        """Return the model's state dict, optionally reduced to keys that start with `filter_prefix`.
+        The dict returned by ``self.model.state_dict()`` is filtered in place and returned.
+        """
+        with self.use_ejected():
+            sd = self.model.state_dict()
+            if filter_prefix is not None:
+                # Collect first, then pop, so the dict is not resized while being iterated.
+                rejected = [k for k in sd if not k.startswith(filter_prefix)]
+                for k in rejected:
+                    sd.pop(k)
+            return sd
+    def patch_weight_to_device(self, key, device_to=None, inplace_update=False):
+        """Apply the patches queued for `key` to that model weight and write the result back.
+        Does nothing when `key` has no entry in ``self.patches``.
+        Args:
+            key: State-dict key of the weight to patch.
+            device_to: When given, the float32 working copy is created on this device
+                via ``comfy.model_management.cast_to_device``; otherwise on the weight's own device.
+            inplace_update: Copy into the existing parameter instead of replacing it.
+                ``self.weight_inplace_update`` forces this on.
+        """
+        if key not in self.patches:
+            return
+        weight, set_func, convert_func = get_key_weight(self.model, key)
+        inplace_update = self.weight_inplace_update or inplace_update
+        # Back up the unpatched weight once per key on the offload device; unpatch_model
+        # restores from self.backup using the recorded inplace_update flag.
+        if key not in self.backup:
+            self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update)
+        # Work on a float32 copy so the stored weight is not touched while patches are computed.
+        if device_to is not None:
+            temp_weight = comfy.model_management.cast_to_device(weight, device_to, torch.float32, copy=True)
+        else:
+            temp_weight = weight.to(torch.float32, copy=True)
+        if convert_func is not None:
+            temp_weight = convert_func(temp_weight, inplace=True)
+        out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key)
+        if set_func is None:
+            # No custom setter: convert back to the weight's dtype with a seed derived from the key.
+            out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=string_to_seed(key))
+            if inplace_update:
+                comfy.utils.copy_to_param(self.model, key, out_weight)
+            else:
+                comfy.utils.set_attr_param(self.model, key, out_weight)
+        else:
+            # The setter returned by get_key_weight takes over writing the result.
+            set_func(out_weight, inplace_update=inplace_update, seed=string_to_seed(key))
+    def _load_list(self):
+        """List the modules of ``self.model`` that are candidates for loading.
+        A module is skipped when a recursive parameter listing yields a name that is
+        not among its direct parameter names. Remaining modules are kept when they
+        have a ``comfy_cast_weights`` attribute or at least one direct parameter.
+        Returns:
+            list: Tuples ``(module_size, module_name, module, direct_param_names)``.
+        """
+        candidates = []
+        for module_name, module in self.model.named_modules():
+            own_params = [p_name for p_name, _ in module.named_parameters(recurse=False)]
+            # skip random weights in non leaf modules
+            has_nested_params = any(p_name not in own_params for p_name, _ in module.named_parameters(recurse=True))
+            if has_nested_params:
+                continue
+            if hasattr(module, "comfy_cast_weights") or len(own_params) > 0:
+                candidates.append((comfy.model_management.module_size(module), module_name, module, own_params))
+        return candidates
+    def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
+        """Patch the model weights and move modules to `device_to` within a memory budget.
+        Modules from ``_load_list`` are visited largest first. A module with
+        ``comfy_cast_weights`` that would reach the budget is put in low-vram mode
+        (its patches run as weight/bias functions or are baked in, depending on
+        `force_patch_weights`); other modules are patched and moved completely.
+        Args:
+            device_to: Target device for fully loaded modules and weight functions.
+            lowvram_model_memory: Budget compared against the running module-size total.
+            force_patch_weights: Bake patches into low-vram weights instead of attaching LowVramPatch.
+            full_load: Ignore the budget and load every module completely.
+        """
+        with self.use_ejected():
+            self.unpatch_hooks()
+            mem_counter = 0
+            patch_counter = 0
+            lowvram_counter = 0
+            loading = self._load_list()
+            load_completely = []
+            # Tuples start with the module size, so this visits the largest modules first.
+            loading.sort(reverse=True)
+            for x in loading:
+                n = x[1]
+                m = x[2]
+                params = x[3]
+                module_mem = x[0]
+                lowvram_weight = False
+                weight_key = "{}.weight".format(n)
+                bias_key = "{}.bias".format(n)
+                # Only modules exposing comfy_cast_weights can be kept in low-vram mode.
+                if not full_load and hasattr(m, "comfy_cast_weights"):
+                    if mem_counter + module_mem >= lowvram_model_memory:
+                        lowvram_weight = True
+                        lowvram_counter += 1
+                        if hasattr(m, "prev_comfy_cast_weights"): #Already lowvramed
+                            continue
+                cast_weight = self.force_cast_weights
+                if lowvram_weight:
+                    if hasattr(m, "comfy_cast_weights"):
+                        m.weight_function = []
+                        m.bias_function = []
+                    # Either bake the patch into the weight now or attach it as a LowVramPatch function.
+                    if weight_key in self.patches:
+                        if force_patch_weights:
+                            self.patch_weight_to_device(weight_key)
+                        else:
+                            m.weight_function = [LowVramPatch(weight_key, self.patches)]
+                            patch_counter += 1
+                    if bias_key in self.patches:
+                        if force_patch_weights:
+                            self.patch_weight_to_device(bias_key)
+                        else:
+                            m.bias_function = [LowVramPatch(bias_key, self.patches)]
+                            patch_counter += 1
+                    cast_weight = True
+                else:
+                    if hasattr(m, "comfy_cast_weights"):
+                        wipe_lowvram_weight(m)
+                    if full_load or mem_counter + module_mem < lowvram_model_memory:
+                        mem_counter += module_mem
+                        load_completely.append((module_mem, n, m, params))
+                # Remember the previous comfy_cast_weights value before forcing it on.
+                if cast_weight and hasattr(m, "comfy_cast_weights"):
+                    m.prev_comfy_cast_weights = m.comfy_cast_weights
+                    m.comfy_cast_weights = True
+                if weight_key in self.weight_wrapper_patches:
+                    m.weight_function.extend(self.weight_wrapper_patches[weight_key])
+                if bias_key in self.weight_wrapper_patches:
+                    m.bias_function.extend(self.weight_wrapper_patches[bias_key])
+                # The value returned by move_weight_functions is added to the memory total.
+                mem_counter += move_weight_functions(m, device_to)
+            load_completely.sort(reverse=True)
+            for x in load_completely:
+                n = x[1]
+                m = x[2]
+                params = x[3]
+                # Modules already flagged as patched are not patched a second time.
+                if hasattr(m, "comfy_patched_weights"):
+                    if m.comfy_patched_weights == True:
+                        continue
+                for param in params:
+                    self.patch_weight_to_device("{}.{}".format(n, param), device_to=device_to)
+                logging.debug("lowvram: loaded module regularly {} {}".format(n, m))
+                m.comfy_patched_weights = True
+            # Move the modules only after all of them have been patched.
+            for x in load_completely:
+                x[2].to(device_to)
+            if lowvram_counter > 0:
+                logging.info("loaded partially {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), patch_counter))
+                self.model.model_lowvram = True
+            else:
+                logging.info("loaded completely {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load))
+                self.model.model_lowvram = False
+                if full_load:
+                    self.model.to(device_to)
+                    mem_counter = self.model_size()
+            # Record the resulting load state on the model object itself.
+            self.model.lowvram_patch_counter += patch_counter
+            self.model.device = device_to
+            self.model.model_loaded_weight_memory = mem_counter
+            self.model.current_weight_patches_uuid = self.patches_uuid
+            for callback in self.get_all_callbacks(CallbacksMP.ON_LOAD):
+                callback(self, device_to, lowvram_model_memory, force_patch_weights, full_load)
+            self.apply_hooks(self.forced_hooks, force_apply=True)
+    def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False):
+        """Apply object patches, optionally load the weights, then inject the model.
+        Each object patch is written with ``comfy.utils.set_attr``; the value it
+        returns is kept in ``object_patches_backup`` the first time a key is patched.
+        A `lowvram_model_memory` of 0 requests a full load.
+        Returns:
+            ``self.model``.
+        """
+        with self.use_ejected():
+            for attr_path, replacement in self.object_patches.items():
+                previous = comfy.utils.set_attr(self.model, attr_path, replacement)
+                if attr_path not in self.object_patches_backup:
+                    self.object_patches_backup[attr_path] = previous
+            full_load = lowvram_model_memory == 0
+            if load_weights:
+                self.load(device_to, lowvram_model_memory=lowvram_model_memory, force_patch_weights=force_patch_weights, full_load=full_load)
+        self.inject_model()
+        return self.model
+    def unpatch_model(self, device_to=None, unpatch_weights=True):
+        """Eject the model and undo object patches, optionally restoring the weights as well.
+        Args:
+            device_to: When given (and `unpatch_weights` is true), the model is moved there after restoring.
+            unpatch_weights: Restore weights from ``self.backup`` and reset the low-vram and load bookkeeping.
+        """
+        self.eject_model()
+        if unpatch_weights:
+            self.unpatch_hooks()
+            # Leave low-vram mode: move the weight functions and wipe low-vram state on every module.
+            if self.model.model_lowvram:
+                for m in self.model.modules():
+                    move_weight_functions(m, device_to)
+                    wipe_lowvram_weight(m)
+                self.model.model_lowvram = False
+                self.model.lowvram_patch_counter = 0
+            # Put back every backed-up weight the same way it was patched (in place or replaced).
+            keys = list(self.backup.keys())
+            for k in keys:
+                bk = self.backup[k]
+                if bk.inplace_update:
+                    comfy.utils.copy_to_param(self.model, k, bk.weight)
+                else:
+                    comfy.utils.set_attr_param(self.model, k, bk.weight)
+            self.model.current_weight_patches_uuid = None
+            self.backup.clear()
+            if device_to is not None:
+                self.model.to(device_to)
+                self.model.device = device_to
+            self.model.model_loaded_weight_memory = 0
+            # Drop the per-module marker so a later load patches these modules again.
+            for m in self.model.modules():
+                if hasattr(m, "comfy_patched_weights"):
+                    del m.comfy_patched_weights
+        # Object patches are reverted regardless of unpatch_weights.
+        keys = list(self.object_patches_backup.keys())
+        for k in keys:
+            comfy.utils.set_attr(self.model, k, self.object_patches_backup[k])
+        self.object_patches_backup.clear()
+    def partially_unload(self, device_to, memory_to_free=0):
+        """Move patched modules to `device_to` until more than `memory_to_free` has been freed.
+        Modules from ``_load_list`` are visited smallest first. Only modules whose
+        ``comfy_patched_weights`` is True are considered; their backed-up weights are
+        restored before the move, and patches are re-attached as LowVramPatch functions
+        when the module has ``comfy_cast_weights``.
+        Args:
+            device_to: Device the unloaded modules are moved to.
+            memory_to_free: Amount to free; the loop stops once the freed total exceeds it.
+        Returns:
+            The total counted as freed.
+        """
+        with self.use_ejected():
+            hooks_unpatched = False
+            memory_freed = 0
+            patch_counter = 0
+            unload_list = self._load_list()
+            # Tuples start with the module size, so this visits the smallest modules first.
+            unload_list.sort()
+            for unload in unload_list:
+                if memory_to_free < memory_freed:
+                    break
+                module_mem = unload[0]
+                n = unload[1]
+                m = unload[2]
+                params = unload[3]
+                lowvram_possible = hasattr(m, "comfy_cast_weights")
+                if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True:
+                    move_weight = True
+                    for param in params:
+                        key = "{}.{}".format(n, param)
+                        bk = self.backup.get(key, None)
+                        if bk is not None:
+                            # A patched weight on a module without comfy_cast_weights stays where it is.
+                            if not lowvram_possible:
+                                move_weight = False
+                                break
+                            # Hooks are unpatched at most once, before the first weight is restored.
+                            if not hooks_unpatched:
+                                self.unpatch_hooks()
+                                hooks_unpatched = True
+                            if bk.inplace_update:
+                                comfy.utils.copy_to_param(self.model, key, bk.weight)
+                            else:
+                                comfy.utils.set_attr_param(self.model, key, bk.weight)
+                            self.backup.pop(key)
+                    weight_key = "{}.weight".format(n)
+                    bias_key = "{}.bias".format(n)
+                    if move_weight:
+                        cast_weight = self.force_cast_weights
+                        m.to(device_to)
+                        module_mem += move_weight_functions(m, device_to)
+                        if lowvram_possible:
+                            # The weights were restored above, so their patches are re-attached as functions.
+                            if weight_key in self.patches:
+                                m.weight_function.append(LowVramPatch(weight_key, self.patches))
+                                patch_counter += 1
+                            if bias_key in self.patches:
+                                m.bias_function.append(LowVramPatch(bias_key, self.patches))
+                                patch_counter += 1
+                            cast_weight = True
+                        if cast_weight:
+                            m.prev_comfy_cast_weights = m.comfy_cast_weights
+                            m.comfy_cast_weights = True
+                        m.comfy_patched_weights = False
+                        memory_freed += module_mem
+                        logging.debug("freed {}".format(n))
+            # The model is marked low-vram unconditionally, even when nothing was freed.
+            self.model.model_lowvram = True
+            self.model.lowvram_patch_counter += patch_counter
+            self.model.model_loaded_weight_memory -= memory_freed
+            return memory_freed
+    def partially_load(self, device_to, extra_memory=0, force_patch_weights=False):
+        """Load more of the model onto `device_to`, allowing `extra_memory` beyond what is already loaded.
+        Args:
+            device_to: Target device passed on to ``load``.
+            extra_memory: Additional budget on top of ``model_loaded_weight_memory``.
+            force_patch_weights: Passed on to ``load``; also forces the weights to be unpatched first
+                when a patch set is currently applied.
+        Returns:
+            The increase of ``model_loaded_weight_memory``, or 0 when the model was already fully loaded.
+        """
+        with self.use_ejected(skip_and_inject_on_exit_only=True):
+            # Weights must be unpatched when the applied patch set is stale or a re-patch is forced.
+            unpatch_weights = self.model.current_weight_patches_uuid is not None and (self.model.current_weight_patches_uuid != self.patches_uuid or force_patch_weights)
+            # TODO: force_patch_weights should not unload + reload full model
+            used = self.model.model_loaded_weight_memory
+            self.unpatch_model(self.offload_device, unpatch_weights=unpatch_weights)
+            # Whatever unpatching released is added back to the budget.
+            if unpatch_weights:
+                extra_memory += (used - self.model.model_loaded_weight_memory)
+            self.patch_model(load_weights=False)
+            full_load = False
+            # Not in low-vram mode with weights loaded: nothing more to load, only re-apply forced hooks.
+            if self.model.model_lowvram == False and self.model.model_loaded_weight_memory > 0:
+                self.apply_hooks(self.forced_hooks, force_apply=True)
+                return 0
+            if self.model.model_loaded_weight_memory + extra_memory > self.model_size():
+                full_load = True
+            current_used = self.model.model_loaded_weight_memory
+            try:
+                self.load(device_to, lowvram_model_memory=current_used + extra_memory, force_patch_weights=force_patch_weights, full_load=full_load)
+            except Exception as e:
+                # Detach before re-raising so a failed load does not leave the model half loaded.
+                self.detach()
+                raise e
+            return self.model.model_loaded_weight_memory - current_used
+    def detach(self, unpatch_all=True):
+        """Eject the model, move its patches to the offload device and optionally unpatch it.
+        ON_DETACH callbacks are invoked afterwards with ``(self, unpatch_all)``.
+        Returns:
+            ``self.model``.
+        """
+        self.eject_model()
+        self.model_patches_to(self.offload_device)
+        if unpatch_all:
+            self.unpatch_model(self.offload_device, unpatch_weights=unpatch_all)
+        detach_callbacks = self.get_all_callbacks(CallbacksMP.ON_DETACH)
+        for on_detach in detach_callbacks:
+            on_detach(self, unpatch_all)
+        return self.model
+    def current_loaded_device(self):
+        """Return the ``device`` attribute recorded on ``self.model``."""
+        model = self.model
+        return model.device
+    def calculate_weight(self, patches, weight, key, intermediate_dtype=torch.float32):
+        """Deprecated wrapper: logs a warning and delegates to ``comfy.lora.calculate_weight``."""
+        deprecation_notice = "The ModelPatcher.calculate_weight function is deprecated, please use: comfy.lora.calculate_weight instead"
+        logging.warning(deprecation_notice)
+        return comfy.lora.calculate_weight(patches, weight, key, intermediate_dtype=intermediate_dtype)
+    def cleanup(self):
+        """Clean hooks, clear the model's ``current_patcher`` reference and run ON_CLEANUP callbacks."""
+        self.clean_hooks()
+        # Only models that already carry a current_patcher attribute get it reset.
+        if hasattr(self.model, "current_patcher"):
+            self.model.current_patcher = None
+        cleanup_callbacks = self.get_all_callbacks(CallbacksMP.ON_CLEANUP)
+        for on_cleanup in cleanup_callbacks:
+            on_cleanup(self)
+    def add_callback(self, call_type: str, callback: Callable):
+        """Register `callback` for `call_type` under the key None."""
+        no_key = None
+        self.add_callback_with_key(call_type, no_key, callback)
+    def add_callback_with_key(self, call_type: str, key: str, callback: Callable):
+        """Append `callback` to ``self.callbacks[call_type][key]``, creating missing levels."""
+        by_key = self.callbacks.setdefault(call_type, {})
+        by_key.setdefault(key, []).append(callback)
+    def remove_callbacks_with_key(self, call_type: str, key: str):
+        """Drop all callbacks stored for `call_type` under `key`; a missing entry is ignored."""
+        by_key = self.callbacks.get(call_type, {})
+        by_key.pop(key, None)
+    def get_callbacks(self, call_type: str, key: str):
+        """Return the callbacks stored for `call_type` under `key`, or an empty list when there are none."""
+        by_key = self.callbacks.get(call_type, {})
+        return by_key.get(key, [])
+    def get_all_callbacks(self, call_type: str):
+        """Return a new flat list with every callback stored for `call_type`, across all keys."""
+        by_key = self.callbacks.get(call_type, {})
+        return [cb for group in by_key.values() for cb in group]
+    def add_wrapper(self, wrapper_type: str, wrapper: Callable):
+        """Register `wrapper` for `wrapper_type` under the key None."""
+        no_key = None
+        self.add_wrapper_with_key(wrapper_type, no_key, wrapper)
+    def add_wrapper_with_key(self, wrapper_type: str, key: str, wrapper: Callable):
+        """Append `wrapper` to ``self.wrappers[wrapper_type][key]``, creating missing levels."""
+        by_key = self.wrappers.setdefault(wrapper_type, {})
+        by_key.setdefault(key, []).append(wrapper)
+    def remove_wrappers_with_key(self, wrapper_type: str, key: str):
+        """Drop all wrappers stored for `wrapper_type` under `key`; a missing entry is ignored."""
+        by_key = self.wrappers.get(wrapper_type, {})
+        by_key.pop(key, None)
+    def get_wrappers(self, wrapper_type: str, key: str):
+        """Return the wrappers stored for `wrapper_type` under `key`, or an empty list when there are none."""
+        by_key = self.wrappers.get(wrapper_type, {})
+        return by_key.get(key, [])
+    def get_all_wrappers(self, wrapper_type: str):
+        """Return a new flat list with every wrapper stored for `wrapper_type`, across all keys."""
+        by_key = self.wrappers.get(wrapper_type, {})
+        return [w for group in by_key.values() for w in group]
+    def set_attachments(self, key: str, attachment):
+        """Store `attachment` in ``self.attachments`` under `key`, replacing any earlier value."""
+        store = self.attachments
+        store[key] = attachment
+    def remove_attachments(self, key: str):
+        """Remove the attachment stored under `key`; a missing key is ignored."""
+        self.attachments.pop(key, None)
+    def get_attachment(self, key: str):
+        """Return the attachment stored under `key`, or None when there is none."""
+        store = self.attachments
+        return store.get(key, None)
+    def set_injections(self, key: str, injections: list[PatcherInjection]):
+        """Store the list `injections` in ``self.injections`` under `key`, replacing any earlier list."""
+        store = self.injections
+        store[key] = injections
+    def remove_injections(self, key: str):
+        """Remove the injections stored under `key`; a missing key is ignored."""
+        self.injections.pop(key, None)
+    def get_injections(self, key: str):
+        """Return the injections stored under `key`, or None when there are none."""
+        store = self.injections
+        return store.get(key, None)
+    def set_additional_models(self, key: str, models: list['ModelPatcher']):
+        """Store the list `models` in ``self.additional_models`` under `key`, replacing any earlier list."""
+        store = self.additional_models
+        store[key] = models
+    def remove_additional_models(self, key: str):
+        """Remove the additional models stored under `key`; a missing key is ignored."""
+        self.additional_models.pop(key, None)
+    def get_additional_models_with_key(self, key: str):
+        """Return the additional models stored under `key`, or an empty list when there are none."""
+        store = self.additional_models
+        return store.get(key, [])
+    def get_additional_models(self):
+        """Return a new flat list with the additional models stored under every key."""
+        return [model for group in self.additional_models.values() for model in group]
+    def get_nested_additional_models(self):
+        """Return this patcher's additional models followed by their additional models, level by level.
+        Each model appears once; models already seen are not expanded again, so
+        circular references do not cause an endless walk.
+        """
+        frontier = self.get_additional_models()
+        seen = set(frontier)
+        real_all_models = []
+        while True:
+            real_all_models.extend(frontier)
+            discovered = []
+            for model in frontier:
+                for candidate in model.get_additional_models():
+                    if candidate not in seen:
+                        discovered.append(candidate)
+                        seen.add(candidate)
+            if len(discovered) == 0:
+                return real_all_models
+            frontier = discovered
+    def use_ejected(self, skip_and_inject_on_exit_only=False):
+        """Create an AutoPatcherEjector for this patcher (used as a ``with`` context throughout this class)."""
+        ejector = AutoPatcherEjector(self, skip_and_inject_on_exit_only=skip_and_inject_on_exit_only)
+        return ejector
+    def inject_model(self):
+        """Run every stored injection's ``inject`` and then the ON_INJECT_MODEL callbacks.
+        Nothing happens when the model is already injected or ``skip_injection`` is set.
+        ``is_injected`` becomes True only if at least one injection ran.
+        """
+        if self.is_injected or self.skip_injection:
+            return
+        all_injections = (inj for group in self.injections.values() for inj in group)
+        for inj in all_injections:
+            inj.inject(self)
+            self.is_injected = True
+        if not self.is_injected:
+            return
+        for callback in self.get_all_callbacks(CallbacksMP.ON_INJECT_MODEL):
+            callback(self)
+    def eject_model(self):
+        """Run every stored injection's ``eject``, clear ``is_injected`` and run ON_EJECT_MODEL callbacks.
+        Nothing happens when the model is not currently injected.
+        """
+        if not self.is_injected:
+            return
+        all_injections = (inj for group in self.injections.values() for inj in group)
+        for inj in all_injections:
+            inj.eject(self)
+        self.is_injected = False
+        eject_callbacks = self.get_all_callbacks(CallbacksMP.ON_EJECT_MODEL)
+        for on_eject in eject_callbacks:
+            on_eject(self)
+    def pre_run(self):
+        """Point the model's ``current_patcher`` at this patcher and run ON_PRE_RUN callbacks."""
+        # Only models that already carry a current_patcher attribute get it assigned.
+        if hasattr(self.model, "current_patcher"):
+            self.model.current_patcher = self
+        pre_run_callbacks = self.get_all_callbacks(CallbacksMP.ON_PRE_RUN)
+        for on_pre_run in pre_run_callbacks:
+            on_pre_run(self)
+    def prepare_state(self, timestep):
+        """Run every ON_PREPARE_STATE callback with ``(self, timestep)``."""
+        prepare_callbacks = self.get_all_callbacks(CallbacksMP.ON_PREPARE_STATE)
+        for on_prepare in prepare_callbacks:
+            on_prepare(self, timestep)
+    def restore_hook_patches(self):
+        """Replace ``hook_patches`` with ``hook_patches_backup`` when a backup exists, then clear the backup."""
+        if self.hook_patches_backup is None:
+            return
+        self.hook_patches = self.hook_patches_backup
+        self.hook_patches_backup = None
+    def set_hook_mode(self, hook_mode: comfy.hooks.EnumHookMode):
+        """Store `hook_mode` on the patcher; patch_hooks compares it against ``EnumHookMode.MaxSpeed``."""
+        new_mode = hook_mode
+        self.hook_mode = new_mode
+    def prepare_hook_patches_current_keyframe(self, t: torch.Tensor, hook_group: comfy.hooks.HookGroup, model_options: dict[str]):
+        """Advance each hook's keyframe to ``t[0]`` and invalidate cached state for hooks that changed.
+        For every hook whose keyframe reports a change, cached hook patches of groups
+        containing that hook are dropped so their weights get recalculated. If a
+        changed hook is part of ``self.current_hooks``, hook patches are reset with ``patch_hooks(None)``.
+        """
+        curr_t = t[0]
+        transformer_options = model_options.get("transformer_options", {})
+        reset_current_hooks = False
+        for hook in hook_group.hooks:
+            changed = hook.hook_keyframe.prepare_current_keyframe(curr_t=curr_t, transformer_options=transformer_options)
+            if not changed:
+                continue
+            # reset current_hooks if contains hook that changed
+            if self.current_hooks is not None and any(current_hook == hook for current_hook in self.current_hooks.hooks):
+                reset_current_hooks = True
+            # remove any cached HookGroups that contain this hook;
+            # this will cause the weights to be recalculated when sampling
+            stale_groups = [group for group in list(self.cached_hook_patches.keys()) if group.contains(hook)]
+            for group in stale_groups:
+                self.cached_hook_patches.pop(group)
+        if reset_current_hooks:
+            self.patch_hooks(None)
+    def register_all_hook_patches(self, hooks: comfy.hooks.HookGroup, target_dict: dict[str], model_options: dict=None,
+                                  registered: comfy.hooks.HookGroup = None):
+        """Register the weight hooks of `hooks` that this patcher does not know yet.
+        Hooks whose ``hook_ref`` is already in ``self.hook_patches`` are only added to
+        `registered`. For the others, ``hook_patches`` is cloned into
+        ``hook_patches_backup`` first and each hook then registers itself through
+        its own ``add_hook_patches``. ON_REGISTER_ALL_HOOK_PATCHES callbacks run last.
+        Returns:
+            The `registered` hook group (a new ``comfy.hooks.HookGroup`` when none was passed).
+        """
+        self.restore_hook_patches()
+        if registered is None:
+            registered = comfy.hooks.HookGroup()
+        # handle WeightHooks
+        weight_hooks_to_register: list[comfy.hooks.WeightHook] = []
+        for hook in hooks.get_type(comfy.hooks.EnumHookType.Weight):
+            if hook.hook_ref in self.hook_patches:
+                registered.add(hook)
+            else:
+                weight_hooks_to_register.append(hook)
+        if weight_hooks_to_register:
+            # clone hook_patches to become backup so that any non-dynamic hooks will return to their original state
+            self.hook_patches_backup = create_hook_patches_clone(self.hook_patches)
+            for hook in weight_hooks_to_register:
+                hook.add_hook_patches(self, model_options, target_dict, registered)
+        register_callbacks = self.get_all_callbacks(CallbacksMP.ON_REGISTER_ALL_HOOK_PATCHES)
+        for on_register in register_callbacks:
+            on_register(self, hooks, target_dict, model_options, registered)
+        return registered
+    def add_hook_patches(self, hook: comfy.hooks.WeightHook, patches, strength_patch=1.0, strength_model=1.0):
+        """Queue weight patches for `hook`, keeping only entries whose key exists in the model state dict.
+        Mirrors add_patches, but stores the entries per hook in
+        ``self.hook_patches[hook.hook_ref]`` as
+        ``(strength_patch, value, strength_model, offset, function)`` tuples.
+        ``patches_uuid`` is refreshed on every call.
+        Returns:
+            list: The keys of `patches` (in their original form) that matched a model weight.
+        """
+        with self.use_ejected():
+            # NOTE: this mirrors behavior of add_patches func
+            per_hook: dict[str,list] = self.hook_patches.get(hook.hook_ref, {})
+            matched = set()
+            model_sd = self.model.state_dict()
+            for patch_key in patches:
+                if isinstance(patch_key, str):
+                    key, offset, function = patch_key, None, None
+                else:
+                    offset = patch_key[1]
+                    key = patch_key[0]
+                    function = patch_key[2] if len(patch_key) > 2 else None
+                if key not in model_sd:
+                    continue
+                matched.add(patch_key)
+                entry = (strength_patch, patches[patch_key], strength_model, offset, function)
+                per_hook.setdefault(key, []).append(entry)
+            self.hook_patches[hook.hook_ref] = per_hook
+            # since should care about these patches too to determine if same model, reroll patches_uuid
+            self.patches_uuid = uuid.uuid4()
+            return list(matched)
+    def get_combined_hook_patches(self, hooks: comfy.hooks.HookGroup):
+        """Merge the registered patches of every hook in `hooks` into one dict keyed by weight name.
+        Patches of a hook whose strength is (close to) 1.0 are reused as stored; for any
+        other strength a copy of each patch is made with its first element multiplied
+        by the hook strength.
+        Returns:
+            dict: weight key -> list of patch tuples; empty when `hooks` is None.
+        """
+        # combined_patches will contain  weights of all relevant hooks, per key
+        combined_patches = {}
+        if hooks is None:
+            return combined_patches
+        for hook in hooks.hooks:
+            hook_patches: dict = self.hook_patches.get(hook.hook_ref, {})
+            for key, key_patches in hook_patches.items():
+                merged: list[tuple] = combined_patches.setdefault(key, [])
+                if math.isclose(hook.strength, 1.0):
+                    merged.extend(key_patches)
+                    continue
+                # patches are stored as tuples: (strength_patch, patch_value, strength_model, offset, function)
+                for patch in key_patches:
+                    scaled = list(patch)
+                    scaled[0] *= hook.strength
+                    merged.append(tuple(scaled))
+        return combined_patches
+    def apply_hooks(self, hooks: comfy.hooks.HookGroup, transformer_options: dict=None, force_apply=False):
+        """Patch the model for `hooks` unless they are already current, then build transformer options.
+        Patching is skipped when `hooks` equals ``self.current_hooks`` and either
+        `force_apply` is false, or the patcher is not a clip patcher and `hooks` is None.
+        When patching happens, ON_APPLY_HOOKS callbacks run afterwards.
+        Returns:
+            The result of ``comfy.hooks.create_transformer_options_from_hooks``.
+        """
+        # TODO: return transformer_options dict with any additions from hooks
+        needs_patch = True
+        if self.current_hooks == hooks:
+            if not force_apply or (not self.is_clip and hooks is None):
+                needs_patch = False
+        if needs_patch:
+            self.patch_hooks(hooks=hooks)
+            for callback in self.get_all_callbacks(CallbacksMP.ON_APPLY_HOOKS):
+                callback(self, hooks)
+        return comfy.hooks.create_transformer_options_from_hooks(self, hooks, transformer_options)
+    def patch_hooks(self, hooks: comfy.hooks.HookGroup):
+        """Make `hooks` the active hook group, patching or restoring hooked weights as needed.
+        With ``hooks=None`` all hook patches are removed through ``unpatch_hooks``.
+        Otherwise cached weights for this group (``self.cached_hook_patches``) are
+        reused when present; if not, the combined hook patches are computed and
+        applied key by key. ``self.current_hooks`` is updated at the end.
+        Args:
+            hooks: Hook group to apply, or None to remove all hook patches.
+        """
+        with self.use_ejected():
+            if hooks is not None:
+                # A set gives O(1) membership tests; the loops below test every patched key against it.
+                model_sd_keys = set(self.model_state_dict().keys())
+                memory_counter = None
+                if self.hook_mode == comfy.hooks.EnumHookMode.MaxSpeed:
+                    # TODO: minimum_counter should have a minimum that conforms to loaded model requirements
+                    memory_counter = MemoryCounter(initial=comfy.model_management.get_free_memory(self.load_device),
+                                                minimum=comfy.model_management.minimum_inference_memory()*2)
+                # if have cached weights for hooks, use it
+                cached_weights = self.cached_hook_patches.get(hooks, None)
+                if cached_weights is not None:
+                    # Keys not covered by the cache are handed to unpatch_hooks afterwards
+                    # (presumably the set of keys to restore - confirm against unpatch_hooks).
+                    remaining_keys = set(model_sd_keys)
+                    for key in cached_weights:
+                        if key not in model_sd_keys:
+                            logging.warning(f"Cached hook could not patch. Key does not exist in model: {key}")
+                            continue
+                        self.patch_cached_hook_weights(cached_weights=cached_weights, key=key, memory_counter=memory_counter)
+                        remaining_keys.remove(key)
+                    self.unpatch_hooks(remaining_keys)
+                else:
+                    self.unpatch_hooks()
+                    relevant_patches = self.get_combined_hook_patches(hooks=hooks)
+                    original_weights = None
+                    # The key patches are only gathered when there is something to apply.
+                    if len(relevant_patches) > 0:
+                        original_weights = self.get_key_patches()
+                    for key in relevant_patches:
+                        if key not in model_sd_keys:
+                            logging.warning(f"Cached hook would not patch. Key does not exist in model: {key}")
+                            continue
+                        self.patch_hook_weight_to_device(hooks=hooks, combined_patches=relevant_patches, key=key, original_weights=original_weights,
+                                                            memory_counter=memory_counter)
+            else:
+                self.unpatch_hooks()
+            self.current_hooks = hooks
+    def patch_cached_hook_weights(self, cached_weights: dict, key: str, memory_counter: MemoryCounter):
+        if key not in self.hook_backup:
+            weight: torch.Tensor = comfy.utils.get_attr(self.model, key)
+            target_device = self.offload_device
+            if self.hook_mode == comfy.hooks.EnumHookMode.MaxSpeed:
+                used = memory_counter.use(weight)
+                if used:
+                    target_device = weight.device
+            self.hook_backup[key] = (weight.to(device=target_device, copy=True), weight.device)
+        comfy.utils.copy_to_param(self.model, key, cached_weights[key][0].to(device=cached_weights[key][1]))
+    def clear_cached_hook_weights(self):
+        """Empty ``self.cached_hook_patches`` and then call ``self.patch_hooks(None)``."""
+        self.cached_hook_patches.clear()
+        # Passing None asks patch_hooks to apply "no hooks".
+        self.patch_hooks(None)
+    def patch_hook_weight_to_device(self, hooks: comfy.hooks.HookGroup, combined_patches: dict, key: str, original_weights: dict, memory_counter: MemoryCounter):
+        """Compute the hook-patched weight for ``key`` and write it into the model.
+        Args:
+            hooks: hook group; used as the key into ``self.cached_hook_patches``.
+            combined_patches: mapping of weight key to patches; a missing ``key`` is a no-op.
+            key: name of the model weight to patch.
+            original_weights: forwarded to ``comfy.lora.calculate_weight``; the entry for
+                ``key`` is deleted after the patched weight has been computed.
+            memory_counter: consulted only when the hook mode is ``MaxSpeed``.
+        """
+        if key not in combined_patches:
+            return
+        weight, set_func, convert_func = get_key_weight(self.model, key)
+        weight: torch.Tensor
+        if key not in self.hook_backup:
+            # First patch of this key: keep a copy of the current weight plus the device it lives on.
+            target_device = self.offload_device
+            if self.hook_mode == comfy.hooks.EnumHookMode.MaxSpeed:
+                # A truthy result from the counter keeps the backup on the weight's own device.
+                used = memory_counter.use(weight)
+                if used:
+                    target_device = weight.device
+            self.hook_backup[key] = (weight.to(device=target_device, copy=True), weight.device)
+        # TODO: properly handle LowVramPatch, if it ends up an issue
+        # Work on a float32 copy (copy=True) on the weight's device so the live weight is untouched until the write-back.
+        temp_weight = comfy.model_management.cast_to_device(weight, weight.device, torch.float32, copy=True)
+        if convert_func is not None:
+            temp_weight = convert_func(temp_weight, inplace=True)
+        out_weight = comfy.lora.calculate_weight(combined_patches[key],
+                                                 temp_weight,
+                                                 key, original_weights=original_weights)
+        # NOTE(review): presumably removed so the entry is not kept alive longer than needed - confirm.
+        del original_weights[key]
+        if set_func is None:
+            # No custom setter: round to the weight's dtype (seeded from the key) and copy into the parameter.
+            out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=string_to_seed(key))
+            comfy.utils.copy_to_param(self.model, key, out_weight)
+        else:
+            set_func(out_weight, inplace_update=True, seed=string_to_seed(key))
+        if self.hook_mode == comfy.hooks.EnumHookMode.MaxSpeed:
+            # TODO: disable caching if not enough system RAM to do so
+            # Cache the patched result, stored as (tensor, device to load it onto).
+            target_device = self.offload_device
+            used = memory_counter.use(weight)
+            if used:
+                target_device = weight.device
+            self.cached_hook_patches.setdefault(hooks, {})
+            self.cached_hook_patches[hooks][key] = (out_weight.to(device=target_device, copy=False), weight.device)
+        del temp_weight
+        del out_weight
+        del weight
+    def unpatch_hooks(self, whitelist_keys_set: set[str]=None) -> None:
+        with self.use_ejected():
+            if len(self.hook_backup) == 0:
+                self.current_hooks = None
+                return
+            keys = list(self.hook_backup.keys())
+            if whitelist_keys_set:
+                for k in keys:
+                    if k in whitelist_keys_set:
+                        comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))
+                        self.hook_backup.pop(k)
+            else:
+                for k in keys:
+                    comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))
+                self.hook_backup.clear()
+                self.current_hooks = None
+    def clean_hooks(self):
+        """Restore all hook-backed-up weights, then drop the cached hook patches."""
+        self.unpatch_hooks()
+        self.clear_cached_hook_weights()
+    def __del__(self):
+        """Finalizer: call ``self.detach`` with ``unpatch_all=False``."""
+        self.detach(unpatch_all=False)
--- a/comfy/model_sampling.py
+++ b/comfy/model_sampling.py
+import torch
+from comfy.ldm.modules.diffusionmodules.util import make_beta_schedule
+import math
+def rescale_zero_terminal_snr_sigmas(sigmas):
+    alphas_cumprod = 1 / ((sigmas * sigmas) + 1)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= (alphas_bar_sqrt_T)
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas_bar[-1] = 4.8973451890853435e-08
+    return ((1 - alphas_bar) / alphas_bar) ** 0.5
+class EPS:
+    def calculate_input(self, sigma, noise):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
+        return noise / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
+    def calculate_denoised(self, sigma, model_output, model_input):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
+        return model_input - model_output * sigma
+    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
+        if max_denoise:
+            noise = noise * torch.sqrt(1.0 + sigma ** 2.0)
+        else:
+            noise = noise * sigma
+        noise += latent_image
+        return noise
+    def inverse_noise_scaling(self, sigma, latent):
+        return latent
+class V_PREDICTION(EPS):
+    def calculate_denoised(self, sigma, model_output, model_input):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
+        return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) - model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
+class EDM(V_PREDICTION):
+    def calculate_denoised(self, sigma, model_output, model_input):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
+        return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) + model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
+class CONST:
+    def calculate_input(self, sigma, noise):
+        return noise
+    def calculate_denoised(self, sigma, model_output, model_input):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
+        return model_input - model_output * sigma
+    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
+        return sigma * noise + (1.0 - sigma) * latent_image
+    def inverse_noise_scaling(self, sigma, latent):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (latent.ndim - 1))
+        return latent / (1.0 - sigma)
+class X0(EPS):
+    """``EPS`` variant that uses the model output directly as the denoised sample."""
+    def calculate_denoised(self, sigma, model_output, model_input):
+        # sigma and model_input are ignored here.
+        return model_output
+class IMG_TO_IMG(X0):
+    """``X0`` variant that also passes the input through without any sigma scaling."""
+    def calculate_input(self, sigma, noise):
+        # sigma is ignored; the input is returned as-is.
+        return noise
+class COSMOS_RFLOW:
+    def calculate_input(self, sigma, noise):
+        sigma = (sigma / (sigma + 1))
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
+        return noise * (1.0 - sigma)
+    def calculate_denoised(self, sigma, model_output, model_input):
+        sigma = (sigma / (sigma + 1))
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
+        return model_input * (1.0 - sigma) - model_output * sigma
+    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
+        noise = noise * sigma
+        noise += latent_image
+        return noise
+    def inverse_noise_scaling(self, sigma, latent):
+        return latent
+class ModelSamplingDiscrete(torch.nn.Module):
+    def __init__(self, model_config=None, zsnr=None):
+        super().__init__()
+        if model_config is not None:
+            sampling_settings = model_config.sampling_settings
+        else:
+            sampling_settings = {}
+        beta_schedule = sampling_settings.get("beta_schedule", "linear")
+        linear_start = sampling_settings.get("linear_start", 0.00085)
+        linear_end = sampling_settings.get("linear_end", 0.012)
+        timesteps = sampling_settings.get("timesteps", 1000)
+        if zsnr is None:
+            zsnr = sampling_settings.get("zsnr", False)
+        self._register_schedule(given_betas=None, beta_schedule=beta_schedule, timesteps=timesteps, linear_start=linear_start, linear_end=linear_end, cosine_s=8e-3, zsnr=zsnr)
+        self.sigma_data = 1.0
+    def _register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
+                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3, zsnr=False):
+        if given_betas is not None:
+            betas = given_betas
+        else:
+            betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
+        alphas = 1. - betas
+        alphas_cumprod = torch.cumprod(alphas, dim=0)
+        timesteps, = betas.shape
+        self.num_timesteps = int(timesteps)
+        self.linear_start = linear_start
+        self.linear_end = linear_end
+        self.zsnr = zsnr
+        # self.register_buffer('betas', torch.tensor(betas, dtype=torch.float32))
+        # self.register_buffer('alphas_cumprod', torch.tensor(alphas_cumprod, dtype=torch.float32))
+        # self.register_buffer('alphas_cumprod_prev', torch.tensor(alphas_cumprod_prev, dtype=torch.float32))
+        sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
+        if self.zsnr:
+            sigmas = rescale_zero_terminal_snr_sigmas(sigmas)
+        self.set_sigmas(sigmas)
+    def set_sigmas(self, sigmas):
+        self.register_buffer('sigmas', sigmas.float())
+        self.register_buffer('log_sigmas', sigmas.log().float())
+    @property
+    def sigma_min(self):
+        return self.sigmas[0]
+    @property
+    def sigma_max(self):
+        return self.sigmas[-1]
+    def timestep(self, sigma):
+        log_sigma = sigma.log()
+        dists = log_sigma.to(self.log_sigmas.device) - self.log_sigmas[:, None]
+        return dists.abs().argmin(dim=0).view(sigma.shape).to(sigma.device)
+    def sigma(self, timestep):
+        t = torch.clamp(timestep.float().to(self.log_sigmas.device), min=0, max=(len(self.sigmas) - 1))
+        low_idx = t.floor().long()
+        high_idx = t.ceil().long()
+        w = t.frac()
+        log_sigma = (1 - w) * self.log_sigmas[low_idx] + w * self.log_sigmas[high_idx]
+        return log_sigma.exp().to(timestep.device)
+    def percent_to_sigma(self, percent):
+        if percent <= 0.0:
+            return 999999999.9
+        if percent >= 1.0:
+            return 0.0
+        percent = 1.0 - percent
+        return self.sigma(torch.tensor(percent * 999.0)).item()
+class ModelSamplingDiscreteEDM(ModelSamplingDiscrete):
+    """``ModelSamplingDiscrete`` with a closed-form sigma <-> timestep mapping instead of table lookups."""
+    def timestep(self, sigma):
+        # timestep = 0.25 * ln(sigma)
+        return 0.25 * sigma.log()
+    def sigma(self, timestep):
+        # Inverse of timestep(): sigma = exp(timestep / 0.25)
+        return (timestep / 0.25).exp()
+class ModelSamplingContinuousEDM(torch.nn.Module):
+    def __init__(self, model_config=None):
+        super().__init__()
+        if model_config is not None:
+            sampling_settings = model_config.sampling_settings
+        else:
+            sampling_settings = {}
+        sigma_min = sampling_settings.get("sigma_min", 0.002)
+        sigma_max = sampling_settings.get("sigma_max", 120.0)
+        sigma_data = sampling_settings.get("sigma_data", 1.0)
+        self.set_parameters(sigma_min, sigma_max, sigma_data)
+    def set_parameters(self, sigma_min, sigma_max, sigma_data):
+        self.sigma_data = sigma_data
+        sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), 1000).exp()
+        self.register_buffer('sigmas', sigmas) #for compatibility with some schedulers
+        self.register_buffer('log_sigmas', sigmas.log())
+    @property
+    def sigma_min(self):
+        return self.sigmas[0]
+    @property
+    def sigma_max(self):
+        return self.sigmas[-1]
+    def timestep(self, sigma):
+        return 0.25 * sigma.log()
+    def sigma(self, timestep):
+        return (timestep / 0.25).exp()
+    def percent_to_sigma(self, percent):
+        if percent <= 0.0:
+            return 999999999.9
+        if percent >= 1.0:
+            return 0.0
+        percent = 1.0 - percent
+        log_sigma_min = math.log(self.sigma_min)
+        return math.exp((math.log(self.sigma_max) - log_sigma_min) * percent + log_sigma_min)
+class ModelSamplingContinuousV(ModelSamplingContinuousEDM):
+    """``ModelSamplingContinuousEDM`` with an arctangent-based sigma <-> timestep mapping."""
+    def timestep(self, sigma):
+        # timestep = atan(sigma) * 2 / pi
+        return sigma.atan() / math.pi * 2
+    def sigma(self, timestep):
+        # Inverse of timestep(): sigma = tan(timestep * pi / 2)
+        return (timestep * math.pi / 2).tan()
+def time_snr_shift(alpha, t):
+    if alpha == 1.0:
+        return t
+    return alpha * t / (1 + (alpha - 1) * t)
+class ModelSamplingDiscreteFlow(torch.nn.Module):
+    def __init__(self, model_config=None):
+        super().__init__()
+        if model_config is not None:
+            sampling_settings = model_config.sampling_settings
+        else:
+            sampling_settings = {}
+        self.set_parameters(shift=sampling_settings.get("shift", 1.0), multiplier=sampling_settings.get("multiplier", 1000))
+    def set_parameters(self, shift=1.0, timesteps=1000, multiplier=1000):
+        self.shift = shift
+        self.multiplier = multiplier
+        ts = self.sigma((torch.arange(1, timesteps + 1, 1) / timesteps) * multiplier)
+        self.register_buffer('sigmas', ts)
+    @property
+    def sigma_min(self):
+        return self.sigmas[0]
+    @property
+    def sigma_max(self):
+        return self.sigmas[-1]
+    def timestep(self, sigma):
+        return sigma * self.multiplier
+    def sigma(self, timestep):
+        return time_snr_shift(self.shift, timestep / self.multiplier)
+    def percent_to_sigma(self, percent):
+        if percent <= 0.0:
+            return 1.0
+        if percent >= 1.0:
+            return 0.0
+        return time_snr_shift(self.shift, 1.0 - percent)
+class StableCascadeSampling(ModelSamplingDiscrete):
+    def __init__(self, model_config=None):
+        super().__init__()
+        if model_config is not None:
+            sampling_settings = model_config.sampling_settings
+        else:
+            sampling_settings = {}
+        self.set_parameters(sampling_settings.get("shift", 1.0))
+    def set_parameters(self, shift=1.0, cosine_s=8e-3):
+        self.shift = shift
+        self.cosine_s = torch.tensor(cosine_s)
+        self._init_alpha_cumprod = torch.cos(self.cosine_s / (1 + self.cosine_s) * torch.pi * 0.5) ** 2
+        #This part is just for compatibility with some schedulers in the codebase
+        self.num_timesteps = 10000
+        sigmas = torch.empty((self.num_timesteps), dtype=torch.float32)
+        for x in range(self.num_timesteps):
+            t = (x + 1) / self.num_timesteps
+            sigmas[x] = self.sigma(t)
+        self.set_sigmas(sigmas)
+    def sigma(self, timestep):
+        alpha_cumprod = (torch.cos((timestep + self.cosine_s) / (1 + self.cosine_s) * torch.pi * 0.5) ** 2 / self._init_alpha_cumprod)
+        if self.shift != 1.0:
+            var = alpha_cumprod
+            logSNR = (var/(1-var)).log()
+            logSNR += 2 * torch.log(1.0 / torch.tensor(self.shift))
+            alpha_cumprod = logSNR.sigmoid()
+        alpha_cumprod = alpha_cumprod.clamp(0.0001, 0.9999)
+        return ((1 - alpha_cumprod) / alpha_cumprod) ** 0.5
+    def timestep(self, sigma):
+        var = 1 / ((sigma * sigma) + 1)
+        var = var.clamp(0, 1.0)
+        s, min_var = self.cosine_s.to(var.device), self._init_alpha_cumprod.to(var.device)
+        t = (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s
+        return t
+    def percent_to_sigma(self, percent):
+        if percent <= 0.0:
+            return 999999999.9
+        if percent >= 1.0:
+            return 0.0
+        percent = 1.0 - percent
+        return self.sigma(torch.tensor(percent))
+def flux_time_shift(mu: float, sigma: float, t):
+    return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
+class ModelSamplingFlux(torch.nn.Module):
+    def __init__(self, model_config=None):
+        super().__init__()
+        if model_config is not None:
+            sampling_settings = model_config.sampling_settings
+        else:
+            sampling_settings = {}
+        self.set_parameters(shift=sampling_settings.get("shift", 1.15))
+    def set_parameters(self, shift=1.15, timesteps=10000):
+        self.shift = shift
+        ts = self.sigma((torch.arange(1, timesteps + 1, 1) / timesteps))
+        self.register_buffer('sigmas', ts)
+    @property
+    def sigma_min(self):
+        return self.sigmas[0]
+    @property
+    def sigma_max(self):
+        return self.sigmas[-1]
+    def timestep(self, sigma):
+        return sigma
+    def sigma(self, timestep):
+        return flux_time_shift(self.shift, 1.0, timestep)
+    def percent_to_sigma(self, percent):
+        if percent <= 0.0:
+            return 1.0
+        if percent >= 1.0:
+            return 0.0
+        return flux_time_shift(self.shift, 1.0, 1.0 - percent)
+class ModelSamplingCosmosRFlow(ModelSamplingContinuousEDM):
+    def timestep(self, sigma):
+        return sigma / (sigma + 1)
+    def sigma(self, timestep):
+        sigma_max = self.sigma_max
+        if timestep >= (sigma_max / (sigma_max + 1)):
+            return sigma_max
+        return timestep / (1 - timestep)
--- a/comfy/ops.py
+++ b/comfy/ops.py
+"""
+    This file is part of ComfyUI.
+    Copyright (C) 2024 Stability AI
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""
+import torch
+import logging
+import comfy.model_management
+from comfy.cli_args import args, PerformanceFeature
+import comfy.float
+import comfy.rmsnorm
+import contextlib
+def scaled_dot_product_attention(q, k, v, *args, **kwargs):
+    return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs)
+try:
+    if torch.cuda.is_available():
+        from torch.nn.attention import SDPBackend, sdpa_kernel
+        import inspect
+        if "set_priority" in inspect.signature(sdpa_kernel).parameters:
+            SDPA_BACKEND_PRIORITY = [
+                SDPBackend.FLASH_ATTENTION,
+                SDPBackend.EFFICIENT_ATTENTION,
+                SDPBackend.MATH,
+            ]
+            SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION)
+            def scaled_dot_product_attention(q, k, v, *args, **kwargs):
+                with sdpa_kernel(SDPA_BACKEND_PRIORITY, set_priority=True):
+                    return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs)
+        else:
+            logging.warning("Torch version too old to set sdpa backend priority.")
+except (ModuleNotFoundError, TypeError):
+    logging.warning("Could not set sdpa backend priority.")
+cast_to = comfy.model_management.cast_to #TODO: remove once no more references
+def cast_to_input(weight, input, non_blocking=False, copy=True):
+    """Cast ``weight`` to the dtype and device of ``input`` via ``comfy.model_management.cast_to``."""
+    return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)
+def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
+    if input is not None:
+        if dtype is None:
+            dtype = input.dtype
+        if bias_dtype is None:
+            bias_dtype = dtype
+        if device is None:
+            device = input.device
+    offload_stream = comfy.model_management.get_offload_stream(device)
+    if offload_stream is not None:
+        wf_context = offload_stream
+    else:
+        wf_context = contextlib.nullcontext()
+    bias = None
+    non_blocking = comfy.model_management.device_supports_non_blocking(device)
+    if s.bias is not None:
+        has_function = len(s.bias_function) > 0
+        bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=has_function, stream=offload_stream)
+        if has_function:
+            with wf_context:
+                for f in s.bias_function:
+                    bias = f(bias)
+    has_function = len(s.weight_function) > 0
+    weight = comfy.model_management.cast_to(s.weight, dtype, device, non_blocking=non_blocking, copy=has_function, stream=offload_stream)
+    if has_function:
+        with wf_context:
+            for f in s.weight_function:
+                weight = f(weight)
+    comfy.model_management.sync_stream(device, offload_stream)
+    return weight, bias
+class CastWeightBiasOp:
+    """Mixin with the class-level defaults read by the layer ``forward`` overrides and ``cast_bias_weight``."""
+    # When True, forward() takes the forward_comfy_cast_weights path.
+    comfy_cast_weights = False
+    # Callables applied in order to the cast weight / bias by cast_bias_weight.
+    # NOTE(review): these lists are class attributes, so an in-place append would be shared by
+    # every layer - presumably callers rebind them per instance; confirm.
+    weight_function = []
+    bias_function = []
+class disable_weight_init:
+    class Linear(torch.nn.Linear, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+        def forward_comfy_cast_weights(self, input):
+            weight, bias = cast_bias_weight(self, input)
+            return torch.nn.functional.linear(input, weight, bias)
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+    class Conv1d(torch.nn.Conv1d, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+        def forward_comfy_cast_weights(self, input):
+            weight, bias = cast_bias_weight(self, input)
+            return self._conv_forward(input, weight, bias)
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+    class Conv2d(torch.nn.Conv2d, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+        def forward_comfy_cast_weights(self, input):
+            weight, bias = cast_bias_weight(self, input)
+            return self._conv_forward(input, weight, bias)
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+    class Conv3d(torch.nn.Conv3d, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+        def forward_comfy_cast_weights(self, input):
+            weight, bias = cast_bias_weight(self, input)
+            return self._conv_forward(input, weight, bias)
+        def forward(self, *args, **kwargs):
+            # import pdb
+            # pdb.set_trace()
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+    class GroupNorm(torch.nn.GroupNorm, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+        def forward_comfy_cast_weights(self, input):
+            weight, bias = cast_bias_weight(self, input)
+            return torch.nn.functional.group_norm(input, self.num_groups, weight, bias, self.eps)
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+    class LayerNorm(torch.nn.LayerNorm, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+        def forward_comfy_cast_weights(self, input):
+            if self.weight is not None:
+                weight, bias = cast_bias_weight(self, input)
+            else:
+                weight = None
+                bias = None
+            return torch.nn.functional.layer_norm(input, self.normalized_shape, weight, bias, self.eps)
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+    class RMSNorm(comfy.rmsnorm.RMSNorm, CastWeightBiasOp):
+        def reset_parameters(self):
+            self.bias = None
+            return None
+        def forward_comfy_cast_weights(self, input):
+            if self.weight is not None:
+                weight, bias = cast_bias_weight(self, input)
+            else:
+                weight = None
+            return comfy.rmsnorm.rms_norm(input, weight, self.eps)  # TODO: switch to commented out line when old torch is deprecated
+            # return torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps)
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+    class ConvTranspose2d(torch.nn.ConvTranspose2d, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+        def forward_comfy_cast_weights(self, input, output_size=None):
+            num_spatial_dims = 2
+            output_padding = self._output_padding(
+                input, output_size, self.stride, self.padding, self.kernel_size,
+                num_spatial_dims, self.dilation)
+            weight, bias = cast_bias_weight(self, input)
+            return torch.nn.functional.conv_transpose2d(
+                input, weight, bias, self.stride, self.padding,
+                output_padding, self.groups, self.dilation)
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+    class ConvTranspose1d(torch.nn.ConvTranspose1d, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+        def forward_comfy_cast_weights(self, input, output_size=None):
+            num_spatial_dims = 1
+            output_padding = self._output_padding(
+                input, output_size, self.stride, self.padding, self.kernel_size,
+                num_spatial_dims, self.dilation)
+            weight, bias = cast_bias_weight(self, input)
+            return torch.nn.functional.conv_transpose1d(
+                input, weight, bias, self.stride, self.padding,
+                output_padding, self.groups, self.dilation)
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+    class Embedding(torch.nn.Embedding, CastWeightBiasOp):
+        def reset_parameters(self):
+            self.bias = None
+            return None
+        def forward_comfy_cast_weights(self, input, out_dtype=None):
+            output_dtype = out_dtype
+            if self.weight.dtype == torch.float16 or self.weight.dtype == torch.bfloat16:
+                out_dtype = None
+            weight, bias = cast_bias_weight(self, device=input.device, dtype=out_dtype)
+            return torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype)
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                if "out_dtype" in kwargs:
+                    kwargs.pop("out_dtype")
+                return super().forward(*args, **kwargs)
+    @classmethod
+    def conv_nd(s, dims, *args, **kwargs):
+        if dims == 2:
+            return s.Conv2d(*args, **kwargs)
+        elif dims == 3:
+            return s.Conv3d(*args, **kwargs)
+        else:
+            raise ValueError(f"unsupported dimensions: {dims}")
+class manual_cast(disable_weight_init):
+    """Same layers as ``disable_weight_init`` but with ``comfy_cast_weights`` switched on,
+    so every forward pass goes through ``forward_comfy_cast_weights``.
+    """
+    class Linear(disable_weight_init.Linear):
+        comfy_cast_weights = True
+    class Conv1d(disable_weight_init.Conv1d):
+        comfy_cast_weights = True
+    class Conv2d(disable_weight_init.Conv2d):
+        comfy_cast_weights = True
+    class Conv3d(disable_weight_init.Conv3d):
+        comfy_cast_weights = True
+    class GroupNorm(disable_weight_init.GroupNorm):
+        comfy_cast_weights = True
+    class LayerNorm(disable_weight_init.LayerNorm):
+        comfy_cast_weights = True
+    class ConvTranspose2d(disable_weight_init.ConvTranspose2d):
+        comfy_cast_weights = True
+    class ConvTranspose1d(disable_weight_init.ConvTranspose1d):
+        comfy_cast_weights = True
+    class RMSNorm(disable_weight_init.RMSNorm):
+        comfy_cast_weights = True
+    class Embedding(disable_weight_init.Embedding):
+        comfy_cast_weights = True
+def fp8_linear(self, input):
+    dtype = self.weight.dtype
+    if dtype not in [torch.float8_e4m3fn]:
+        return None
+    tensor_2d = False
+    if len(input.shape) == 2:
+        tensor_2d = True
+        input = input.unsqueeze(1)
+    input_shape = input.shape
+    input_dtype = input.dtype
+    if len(input.shape) == 3:
+        w, bias = cast_bias_weight(self, input, dtype=dtype, bias_dtype=input_dtype)
+        w = w.t()
+        scale_weight = self.scale_weight
+        scale_input = self.scale_input
+        if scale_weight is None:
+            scale_weight = torch.ones((), device=input.device, dtype=torch.float32)
+        else:
+            scale_weight = scale_weight.to(input.device)
+        if scale_input is None:
+            scale_input = torch.ones((), device=input.device, dtype=torch.float32)
+            input = torch.clamp(input, min=-448, max=448, out=input)
+            input = input.reshape(-1, input_shape[2]).to(dtype).contiguous()
+        else:
+            scale_input = scale_input.to(input.device)
+            input = (input * (1.0 / scale_input).to(input_dtype)).reshape(-1, input_shape[2]).to(dtype).contiguous()
+        if bias is not None:
+            o = torch._scaled_mm(input, w, out_dtype=input_dtype, bias=bias, scale_a=scale_input, scale_b=scale_weight)
+        else:
+            o = torch._scaled_mm(input, w, out_dtype=input_dtype, scale_a=scale_input, scale_b=scale_weight)
+        if isinstance(o, tuple):
+            o = o[0]
+        if tensor_2d:
+            return o.reshape(input_shape[0], -1)
+        return o.reshape((-1, input_shape[1], self.weight.shape[0]))
+    return None
+class fp8_ops(manual_cast):
+    class Linear(manual_cast.Linear):
+        def reset_parameters(self):
+            self.scale_weight = None
+            self.scale_input = None
+            return None
+        def forward_comfy_cast_weights(self, input):
+            try:
+                out = fp8_linear(self, input)
+                if out is not None:
+                    return out
+            except Exception as e:
+                logging.info("Exception during fp8 op: {}".format(e))
+            weight, bias = cast_bias_weight(self, input)
+            return torch.nn.functional.linear(input, weight, bias)
+def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None):
+    logging.info("Using scaled fp8: fp8 matrix mult: {}, scale input: {}".format(fp8_matrix_mult, scale_input))
+    class scaled_fp8_op(manual_cast):
+        class Linear(manual_cast.Linear):
+            def __init__(self, *args, **kwargs):
+                if override_dtype is not None:
+                    kwargs['dtype'] = override_dtype
+                super().__init__(*args, **kwargs)
+            def reset_parameters(self):
+                if not hasattr(self, 'scale_weight'):
+                    self.scale_weight = torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False)
+                if not scale_input:
+                    self.scale_input = None
+                if not hasattr(self, 'scale_input'):
+                    self.scale_input = torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False)
+                return None
+            def forward_comfy_cast_weights(self, input):
+                if fp8_matrix_mult:
+                    out = fp8_linear(self, input)
+                    if out is not None:
+                        return out
+                weight, bias = cast_bias_weight(self, input)
+                if weight.numel() < input.numel(): #TODO: optimize
+                    return torch.nn.functional.linear(input, weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype), bias)
+                else:
+                    return torch.nn.functional.linear(input * self.scale_weight.to(device=weight.device, dtype=weight.dtype), weight, bias)
+            def convert_weight(self, weight, inplace=False, **kwargs):
+                if inplace:
+                    weight *= self.scale_weight.to(device=weight.device, dtype=weight.dtype)
+                    return weight
+                else:
+                    return weight * self.scale_weight.to(device=weight.device, dtype=weight.dtype)
+            def set_weight(self, weight, inplace_update=False, seed=None, **kwargs):
+                weight = comfy.float.stochastic_rounding(weight / self.scale_weight.to(device=weight.device, dtype=weight.dtype), self.weight.dtype, seed=seed)
+                if inplace_update:
+                    self.weight.data.copy_(weight)
+                else:
+                    self.weight = torch.nn.Parameter(weight, requires_grad=False)
+    return scaled_fp8_op
+CUBLAS_IS_AVAILABLE = False
+try:
+    from cublas_ops import CublasLinear
+    CUBLAS_IS_AVAILABLE = True
+except ImportError:
+    pass
+if CUBLAS_IS_AVAILABLE:
+    # Only defined when the optional cublas_ops import succeeded.
+    class cublas_ops(disable_weight_init):
+        class Linear(CublasLinear, disable_weight_init.Linear):
+            def reset_parameters(self):
+                # No-op override: returns None without touching any parameter.
+                return None
+            def forward_comfy_cast_weights(self, input):
+                # CublasLinear is the first base, so super() looks there first
+                # (presumably CublasLinear.forward — confirm in cublas_ops).
+                return super().forward(input)
+            def forward(self, *args, **kwargs):
+                return super().forward(*args, **kwargs)
+def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None):
+    fp8_compute = comfy.model_management.supports_fp8_compute(load_device)
+    if scaled_fp8 is not None:
+        return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8)
+    if (
+        fp8_compute and
+        (fp8_optimizations or PerformanceFeature.Fp8MatrixMultiplication in args.fast) and
+        not disable_fast_fp8
+    ):
+        return fp8_ops
+    if (
+        PerformanceFeature.CublasOps in args.fast and
+        CUBLAS_IS_AVAILABLE and
+        weight_dtype == torch.float16 and
+        (compute_dtype == torch.float16 or compute_dtype is None)
+    ):
+        logging.info("Using cublas ops")
+        return cublas_ops
+    if compute_dtype is None or weight_dtype == compute_dtype:
+        return disable_weight_init
+    return manual_cast
--- a/comfy/options.py
+++ b/comfy/options.py
+# Module-level flag, False until enable_args_parsing() changes it.
+# NOTE(review): presumably read elsewhere to decide whether CLI arguments are parsed — confirm.
+args_parsing = False
+def enable_args_parsing(enable=True):
+    """Set the module-level args_parsing flag to enable (True by default)."""
+    global args_parsing
+    args_parsing = enable
--- a/comfy/patcher_extension.py
+++ b/comfy/patcher_extension.py
+from __future__ import annotations
+from typing import Callable
+class CallbacksMP:
+    """String constants naming callback types, plus a factory for an empty callbacks dict."""
+    ON_CLONE = "on_clone"
+    # Note: the next two values carry an "_after" suffix that the constant names do not.
+    ON_LOAD = "on_load_after"
+    ON_DETACH = "on_detach_after"
+    ON_CLEANUP = "on_cleanup"
+    ON_PRE_RUN = "on_pre_run"
+    ON_PREPARE_STATE = "on_prepare_state"
+    ON_APPLY_HOOKS = "on_apply_hooks"
+    ON_REGISTER_ALL_HOOK_PATCHES = "on_register_all_hook_patches"
+    ON_INJECT_MODEL = "on_inject_model"
+    ON_EJECT_MODEL = "on_eject_model"
+    # callbacks dict is in the format:
+    # {"call_type": {"key": [Callable1, Callable2, ...]} }
+    @classmethod
+    def init_callbacks(cls) -> dict[str, dict[str, list[Callable]]]:
+        """Return a new, empty callbacks dict."""
+        return {}
+def add_callback(call_type: str, callback: Callable, transformer_options: dict, is_model_options=False):
+    add_callback_with_key(call_type, None, callback, transformer_options, is_model_options)
+def add_callback_with_key(call_type: str, key: str, callback: Callable, transformer_options: dict, is_model_options=False):
+    if is_model_options:
+        transformer_options = transformer_options.setdefault("transformer_options", {})
+    callbacks: dict[str, dict[str, list]] = transformer_options.setdefault("callbacks", {})
+    c = callbacks.setdefault(call_type, {}).setdefault(key, [])
+    c.append(callback)
+def get_callbacks_with_key(call_type: str, key: str, transformer_options: dict, is_model_options=False):
+    if is_model_options:
+        transformer_options = transformer_options.get("transformer_options", {})
+    c_list = []
+    callbacks: dict[str, list] = transformer_options.get("callbacks", {})
+    c_list.extend(callbacks.get(call_type, {}).get(key, []))
+    return c_list
+def get_all_callbacks(call_type: str, transformer_options: dict, is_model_options=False):
+    if is_model_options:
+        transformer_options = transformer_options.get("transformer_options", {})
+    c_list = []
+    callbacks: dict[str, list] = transformer_options.get("callbacks", {})
+    for c in callbacks.get(call_type, {}).values():
+        c_list.extend(c)
+    return c_list
+class WrappersMP:
+    """String constants naming wrapper types, plus a factory for an empty wrappers dict."""
+    OUTER_SAMPLE = "outer_sample"
+    PREPARE_SAMPLING = "prepare_sampling"
+    SAMPLER_SAMPLE = "sampler_sample"
+    PREDICT_NOISE = "predict_noise"
+    CALC_COND_BATCH = "calc_cond_batch"
+    APPLY_MODEL = "apply_model"
+    DIFFUSION_MODEL = "diffusion_model"
+    # wrappers dict is in the format:
+    # {"wrapper_type": {"key": [Callable1, Callable2, ...]} }
+    @classmethod
+    def init_wrappers(cls) -> dict[str, dict[str, list[Callable]]]:
+        """Return a new, empty wrappers dict."""
+        return {}
+def add_wrapper(wrapper_type: str, wrapper: Callable, transformer_options: dict, is_model_options=False):
+    add_wrapper_with_key(wrapper_type, None, wrapper, transformer_options, is_model_options)
+def add_wrapper_with_key(wrapper_type: str, key: str, wrapper: Callable, transformer_options: dict, is_model_options=False):
+    if is_model_options:
+        transformer_options = transformer_options.setdefault("transformer_options", {})
+    wrappers: dict[str, dict[str, list]] = transformer_options.setdefault("wrappers", {})
+    w = wrappers.setdefault(wrapper_type, {}).setdefault(key, [])
+    w.append(wrapper)
+def get_wrappers_with_key(wrapper_type: str, key: str, transformer_options: dict, is_model_options=False):
+    if is_model_options:
+        transformer_options = transformer_options.get("transformer_options", {})
+    w_list = []
+    wrappers: dict[str, list] = transformer_options.get("wrappers", {})
+    w_list.extend(wrappers.get(wrapper_type, {}).get(key, []))
+    return w_list
+def get_all_wrappers(wrapper_type: str, transformer_options: dict, is_model_options=False):
+    if is_model_options:
+        transformer_options = transformer_options.get("transformer_options", {})
+    w_list = []
+    wrappers: dict[str, list] = transformer_options.get("wrappers", {})
+    for w in wrappers.get(wrapper_type, {}).values():
+        w_list.extend(w)
+    return w_list
+class WrapperExecutor:
+    """Handles call stack of wrappers around a function in an ordered manner."""
+    def __init__(self, original: Callable, class_obj: object, wrappers: list[Callable], idx: int):
+        # NOTE: class_obj exists so that wrappers surrounding a class method can access
+        #       the class instance at runtime via executor.class_obj
+        self.original = original
+        self.class_obj = class_obj
+        self.wrappers = wrappers.copy()
+        self.idx = idx
+        self.is_last = idx == len(wrappers)
+    def __call__(self, *args, **kwargs):
+        """Calls the next wrapper or original function, whichever is appropriate."""
+        new_executor = self._create_next_executor()
+        return new_executor.execute(*args, **kwargs)
+    def execute(self, *args, **kwargs):
+        """Used to initiate executor internally - DO NOT use this if you received executor in wrapper."""
+        args = list(args)
+        kwargs = dict(kwargs)
+        if self.is_last:
+            return self.original(*args, **kwargs)
+        return self.wrappers[self.idx](self, *args, **kwargs)
+    def _create_next_executor(self) -> 'WrapperExecutor':
+        new_idx = self.idx + 1
+        if new_idx > len(self.wrappers):
+            raise Exception("Wrapper idx exceeded available wrappers; something went very wrong.")
+        if self.class_obj is None:
+            return WrapperExecutor.new_executor(self.original, self.wrappers, new_idx)
+        return WrapperExecutor.new_class_executor(self.original, self.class_obj, self.wrappers, new_idx)
+    @classmethod
+    def new_executor(cls, original: Callable, wrappers: list[Callable], idx=0):
+        return cls(original, class_obj=None, wrappers=wrappers, idx=idx)
+    @classmethod
+    def new_class_executor(cls, original: Callable, class_obj: object, wrappers: list[Callable], idx=0):
+        return cls(original, class_obj, wrappers, idx=idx)
+class PatcherInjection:
+    def __init__(self, inject: Callable, eject: Callable):
+        self.inject = inject
+        self.eject = eject
+def copy_nested_dicts(input_dict: dict):
+    new_dict = input_dict.copy()
+    for key, value in input_dict.items():
+        if isinstance(value, dict):
+            new_dict[key] = copy_nested_dicts(value)
+        elif isinstance(value, list):
+            new_dict[key] = value.copy()
+    return new_dict
+def merge_nested_dicts(dict1: dict, dict2: dict, copy_dict1=True):
+    if copy_dict1:
+        merged_dict = copy_nested_dicts(dict1)
+    else:
+        merged_dict = dict1
+    for key, value in dict2.items():
+        if isinstance(value, dict):
+            curr_value = merged_dict.setdefault(key, {})
+            merged_dict[key] = merge_nested_dicts(value, curr_value)
+        elif isinstance(value, list):
+            merged_dict.setdefault(key, []).extend(value)
+        else:
+            merged_dict[key] = value
+    return merged_dict
--- a/comfy/rmsnorm.py
+++ b/comfy/rmsnorm.py
+import torch
+import comfy.model_management
+import numbers
+import logging
+RMSNorm = None
+try:
+    rms_norm_torch = torch.nn.functional.rms_norm
+    RMSNorm = torch.nn.RMSNorm
+except:
+    rms_norm_torch = None
+    logging.warning("Please update pytorch to use native RMSNorm")
+def rms_norm(x, weight=None, eps=1e-6):
+    if rms_norm_torch is not None and not (torch.jit.is_tracing() or torch.jit.is_scripting()):
+        if weight is None:
+            return rms_norm_torch(x, (x.shape[-1],), eps=eps)
+        else:
+            return rms_norm_torch(x, weight.shape, weight=comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
+    else:
+        r = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
+        if weight is None:
+            return r
+        else:
+            return r * comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device)
+if RMSNorm is None:
+    class RMSNorm(torch.nn.Module):
+        def __init__(
+            self,
+            normalized_shape,
+            eps=1e-6,
+            elementwise_affine=True,
+            device=None,
+            dtype=None,
+        ):
+            factory_kwargs = {"device": device, "dtype": dtype}
+            super().__init__()
+            if isinstance(normalized_shape, numbers.Integral):
+                # mypy error: incompatible types in assignment
+                normalized_shape = (normalized_shape,)  # type: ignore[assignment]
+            self.normalized_shape = tuple(normalized_shape)  # type: ignore[arg-type]
+            self.eps = eps
+            self.elementwise_affine = elementwise_affine
+            if self.elementwise_affine:
+                self.weight = torch.nn.Parameter(
+                    torch.empty(self.normalized_shape, **factory_kwargs)
+                )
+            else:
+                self.register_parameter("weight", None)
+            self.bias = None
+        def forward(self, x):
+            return rms_norm(x, self.weight, self.eps)
--- a/comfy/sample.py
+++ b/comfy/sample.py
+import torch
+import comfy.model_management
+import comfy.samplers
+import comfy.utils
+import numpy as np
+import logging
+def prepare_noise(latent_image, seed, noise_inds=None):
+    """
+    creates random noise given a latent image and a seed.
+    optional arg skip can be used to skip and discard x number of noise generations for a given seed
+    """
+    generator = torch.manual_seed(seed)
+    if noise_inds is None:
+        return torch.randn(latent_image.size(), dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu")
+    unique_inds, inverse = np.unique(noise_inds, return_inverse=True)
+    noises = []
+    for i in range(unique_inds[-1]+1):
+        noise = torch.randn([1] + list(latent_image.size())[1:], dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu")
+        if i in unique_inds:
+            noises.append(noise)
+    noises = [noises[i] for i in inverse]
+    noises = torch.cat(noises, axis=0)
+    return noises
+def fix_empty_latent_channels(model, latent_image):
+    latent_format = model.get_model_object("latent_format") #Resize the empty latent image so it has the right number of channels
+    if latent_format.latent_channels != latent_image.shape[1] and torch.count_nonzero(latent_image) == 0:
+        latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1)
+    if latent_format.latent_dimensions == 3 and latent_image.ndim == 4:
+        latent_image = latent_image.unsqueeze(2)
+    return latent_image
+def prepare_sampling(model, noise_shape, positive, negative, noise_mask):
+    logging.warning("Warning: comfy.sample.prepare_sampling isn't used anymore and can be removed")
+    return model, positive, negative, noise_mask, []
+def cleanup_additional_models(models):
+    """Legacy stub: only logs a warning; models is not used and nothing is returned."""
+    logging.warning("Warning: comfy.sample.cleanup_additional_models isn't used anymore and can be removed")
+def sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False, noise_mask=None, sigmas=None, callback=None, disable_pbar=False, seed=None):
+    sampler = comfy.samplers.KSampler(model, steps=steps, device=model.load_device, sampler=sampler_name, scheduler=scheduler, denoise=denoise, model_options=model.model_options)
+    samples = sampler.sample(noise, positive, negative, cfg=cfg, latent_image=latent_image, start_step=start_step, last_step=last_step, force_full_denoise=force_full_denoise, denoise_mask=noise_mask, sigmas=sigmas, callback=callback, disable_pbar=disable_pbar, seed=seed)
+    samples = samples.to(comfy.model_management.intermediate_device())
+    return samples
+def sample_custom(model, noise, cfg, sampler, sigmas, positive, negative, latent_image, noise_mask=None, callback=None, disable_pbar=False, seed=None):
+    samples = comfy.samplers.sample(model, noise, positive, negative, cfg, model.load_device, sampler, sigmas, model_options=model.model_options, latent_image=latent_image, denoise_mask=noise_mask, callback=callback, disable_pbar=disable_pbar, seed=seed)
+    samples = samples.to(comfy.model_management.intermediate_device())
+    return samples
--- a/comfy/sampler_helpers.py
+++ b/comfy/sampler_helpers.py
+from __future__ import annotations
+import uuid
+import math
+import collections
+import comfy.model_management
+import comfy.conds
+import comfy.utils
+import comfy.hooks
+import comfy.patcher_extension
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from comfy.model_patcher import ModelPatcher
+    from comfy.model_base import BaseModel
+    from comfy.controlnet import ControlBase
+def prepare_mask(noise_mask, shape, device):
+    return comfy.utils.reshape_mask(noise_mask, shape).to(device)
+def get_models_from_cond(cond, model_type):
+    models = []
+    for c in cond:
+        if model_type in c:
+            if isinstance(c[model_type], list):
+                models += c[model_type]
+            else:
+                models += [c[model_type]]
+    return models
+def get_hooks_from_cond(cond, full_hooks: comfy.hooks.HookGroup):
+    # get hooks from conds, and collect cnets so they can be checked for extra_hooks
+    cnets: list[ControlBase] = []
+    for c in cond:
+        if 'hooks' in c:
+            for hook in c['hooks'].hooks:
+                full_hooks.add(hook)
+        if 'control' in c:
+            cnets.append(c['control'])
+    def get_extra_hooks_from_cnet(cnet: ControlBase, _list: list):
+        if cnet.extra_hooks is not None:
+            _list.append(cnet.extra_hooks)
+        if cnet.previous_controlnet is None:
+            return _list
+        return get_extra_hooks_from_cnet(cnet.previous_controlnet, _list)
+    hooks_list = []
+    cnets = set(cnets)
+    for base_cnet in cnets:
+        get_extra_hooks_from_cnet(base_cnet, hooks_list)
+    extra_hooks = comfy.hooks.HookGroup.combine_all_hooks(hooks_list)
+    if extra_hooks is not None:
+        for hook in extra_hooks.hooks:
+            full_hooks.add(hook)
+    return full_hooks
+def convert_cond(cond):
+    out = []
+    for c in cond:
+        temp = c[1].copy()
+        model_conds = temp.get("model_conds", {})
+        if c[0] is not None:
+            temp["cross_attn"] = c[0]
+        temp["model_conds"] = model_conds
+        temp["uuid"] = uuid.uuid4()
+        out.append(temp)
+    return out
+def get_additional_models(conds, dtype):
+    """loads additional models in conditioning"""
+    cnets: list[ControlBase] = []
+    gligen = []
+    add_models = []
+    for k in conds:
+        cnets += get_models_from_cond(conds[k], "control")
+        gligen += get_models_from_cond(conds[k], "gligen")
+        add_models += get_models_from_cond(conds[k], "additional_models")
+    control_nets = set(cnets)
+    inference_memory = 0
+    control_models = []
+    for m in control_nets:
+        control_models += m.get_models()
+        inference_memory += m.inference_memory_requirements(dtype)
+    gligen = [x[1] for x in gligen]
+    models = control_models + gligen + add_models
+    return models, inference_memory
+def get_additional_models_from_model_options(model_options: dict[str]=None):
+    """loads additional models from registered AddModels hooks"""
+    models = []
+    if model_options is not None and "registered_hooks" in model_options:
+        registered: comfy.hooks.HookGroup = model_options["registered_hooks"]
+        for hook in registered.get_type(comfy.hooks.EnumHookType.AdditionalModels):
+            hook: comfy.hooks.AdditionalModelsHook
+            models.extend(hook.models)
+    return models
+def cleanup_additional_models(models):
+    """cleanup additional models that were loaded"""
+    for m in models:
+        if hasattr(m, 'cleanup'):
+            m.cleanup()
+def estimate_memory(model, noise_shape, conds):
+    cond_shapes = collections.defaultdict(list)
+    cond_shapes_min = {}
+    for _, cs in conds.items():
+        for cond in cs:
+            for k, v in model.model.extra_conds_shapes(**cond).items():
+                cond_shapes[k].append(v)
+                if cond_shapes_min.get(k, None) is None:
+                    cond_shapes_min[k] = [v]
+                elif math.prod(v) > math.prod(cond_shapes_min[k][0]):
+                    cond_shapes_min[k] = [v]
+    memory_required = model.model.memory_required([noise_shape[0] * 2] + list(noise_shape[1:]), cond_shapes=cond_shapes)
+    minimum_memory_required = model.model.memory_required([noise_shape[0]] + list(noise_shape[1:]), cond_shapes=cond_shapes_min)
+    return memory_required, minimum_memory_required
+def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None):
+    executor = comfy.patcher_extension.WrapperExecutor.new_executor(
+        _prepare_sampling,
+        comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING, model_options, is_model_options=True)
+    )
+    return executor.execute(model, noise_shape, conds, model_options=model_options)
+def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None):
+    real_model: BaseModel = None
+    models, inference_memory = get_additional_models(conds, model.model_dtype())
+    models += get_additional_models_from_model_options(model_options)
+    models += model.get_nested_additional_models()  # TODO: does this require inference_memory update?
+    memory_required, minimum_memory_required = estimate_memory(model, noise_shape, conds)
+    comfy.model_management.load_models_gpu([model] + models, memory_required=memory_required + inference_memory, minimum_memory_required=minimum_memory_required + inference_memory)
+    real_model = model.model
+    return real_model, conds, models
+def cleanup_models(conds, models):
+    cleanup_additional_models(models)
+    control_cleanup = []
+    for k in conds:
+        control_cleanup += get_models_from_cond(conds[k], "control")
+    cleanup_additional_models(set(control_cleanup))
+def prepare_model_patcher(model: ModelPatcher, conds, model_options: dict):
+    '''
+    Registers hooks from conds.
+    Also merges the ModelPatcher's wrappers and callbacks into
+    model_options["transformer_options"] (that key must already exist), stores
+    the registered hooks under model_options["registered_hooks"] when there are
+    any, and returns model_options["to_load_options"] (created if missing)
+    after merging the transformer_options wrappers and callbacks into it.
+    model_options is modified in place.
+    '''
+    # check for hooks in conds - if not registered, see if can be applied
+    hooks = comfy.hooks.HookGroup()
+    for k in conds:
+        get_hooks_from_cond(conds[k], hooks)
+    # add wrappers and callbacks from ModelPatcher to transformer_options
+    # (copy_dict1=False merges into the existing dicts, so the return value is not needed)
+    comfy.patcher_extension.merge_nested_dicts(model_options["transformer_options"].setdefault("wrappers", {}), model.wrappers, copy_dict1=False)
+    comfy.patcher_extension.merge_nested_dicts(model_options["transformer_options"].setdefault("callbacks", {}), model.callbacks, copy_dict1=False)
+    # begin registering hooks
+    registered = comfy.hooks.HookGroup()
+    target_dict = comfy.hooks.create_target_dict(comfy.hooks.EnumWeightTarget.Model)
+    # handle all TransformerOptionsHooks
+    for hook in hooks.get_type(comfy.hooks.EnumHookType.TransformerOptions):
+        hook: comfy.hooks.TransformerOptionsHook
+        hook.add_hook_patches(model, model_options, target_dict, registered)
+    # handle all AddModelsHooks
+    for hook in hooks.get_type(comfy.hooks.EnumHookType.AdditionalModels):
+        hook: comfy.hooks.AdditionalModelsHook
+        hook.add_hook_patches(model, model_options, target_dict, registered)
+    # handle all WeightHooks by registering on ModelPatcher
+    model.register_all_hook_patches(hooks, target_dict, model_options, registered)
+    # add registered_hooks onto model_options for further reference
+    if len(registered) > 0:
+        model_options["registered_hooks"] = registered
+    # merge original wrappers and callbacks with hooked wrappers and callbacks
+    to_load_options: dict[str] = model_options.setdefault("to_load_options", {})
+    for wc_name in ["wrappers", "callbacks"]:
+        # both keys exist in transformer_options thanks to the setdefault calls earlier in this function
+        comfy.patcher_extension.merge_nested_dicts(to_load_options.setdefault(wc_name, {}), model_options["transformer_options"][wc_name],
+                                                    copy_dict1=False)
+    return to_load_options