init commit for comui

5e2c95b7 · wuxk1 · 5e2c95b7 · 5e2c95b7 · 5e2c95b7 · 5e2c95b7
Commit 5e2c95b7 authored Jan 07, 2026 by wuxk1
8 changed files
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
+"""
+    This file is part of ComfyUI.
+    Copyright (C) 2024 Comfy
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
+from __future__ import annotations
+
+import collections
+import copy
+import inspect
+import logging
+import math
+import uuid
+from typing import Callable, Optional
+
+import torch
+
+import comfy.float
+import comfy.hooks
+import comfy.lora
+import comfy.model_management
+import comfy.patcher_extension
+import comfy.utils
+from comfy.comfy_types import UnetWrapperFunction
+from comfy.patcher_extension import CallbacksMP, PatcherInjection, WrappersMP
+
+
+def string_to_seed(data):
+    """Fold ``data`` into a 32-bit checksum using a bitwise CRC-style loop.
+
+    ``data`` is iterated element by element; ``str`` elements are converted
+    with ``ord`` first, so both strings and byte sequences are accepted.
+    The register starts at 0xFFFFFFFF, uses the constant 0xEDB88320 for the
+    feedback step, and is inverted before being returned.
+    """
+    register = 0xFFFFFFFF
+    for item in data:
+        code = ord(item) if isinstance(item, str) else item
+        register ^= code
+        for _ in range(8):
+            low_bit_set = register & 1
+            register >>= 1
+            if low_bit_set:
+                register ^= 0xEDB88320
+    return register ^ 0xFFFFFFFF
+
+def set_model_options_patch_replace(model_options, patch, name, block_name, number, transformer_index=None):
+    """Store ``patch`` as the replacement for one block inside ``model_options``.
+
+    The ``transformer_options`` dict, its ``patches_replace`` dict and the
+    per-``name`` dict are each shallow-copied before the write, so dicts that
+    were reachable from ``model_options`` beforehand are left unmodified.
+
+    Args:
+        model_options: Dict holding a ``"transformer_options"`` entry; that
+            entry is rebound to the updated copy.
+        patch: Object stored as the replacement.
+        name: Key under ``patches_replace``.
+        block_name: First element of the block key.
+        number: Second element of the block key.
+        transformer_index: Optional third element of the block key.
+
+    Returns:
+        The same ``model_options`` object that was passed in.
+    """
+    transformer_opts = model_options["transformer_options"].copy()
+
+    if "patches_replace" in transformer_opts:
+        replacements = transformer_opts["patches_replace"].copy()
+    else:
+        replacements = {}
+    per_name = replacements[name].copy() if name in replacements else {}
+
+    if transformer_index is None:
+        block = (block_name, number)
+    else:
+        block = (block_name, number, transformer_index)
+    per_name[block] = patch
+
+    replacements[name] = per_name
+    transformer_opts["patches_replace"] = replacements
+    model_options["transformer_options"] = transformer_opts
+    return model_options
+
+def set_model_options_post_cfg_function(model_options, post_cfg_function, disable_cfg1_optimization=False):
+    """Append ``post_cfg_function`` to ``model_options["sampler_post_cfg_function"]``.
+
+    A new list is built instead of appending in place. When
+    ``disable_cfg1_optimization`` is truthy, the entry of the same name is
+    set to ``True``. Returns ``model_options``.
+    """
+    key = "sampler_post_cfg_function"
+    registered = model_options.get(key, [])
+    model_options[key] = registered + [post_cfg_function]
+    if disable_cfg1_optimization:
+        model_options["disable_cfg1_optimization"] = True
+    return model_options
+
+def set_model_options_pre_cfg_function(model_options, pre_cfg_function, disable_cfg1_optimization=False):
+    """Append ``pre_cfg_function`` to ``model_options["sampler_pre_cfg_function"]``.
+
+    A new list is built instead of appending in place. When
+    ``disable_cfg1_optimization`` is truthy, the entry of the same name is
+    set to ``True``. Returns ``model_options``.
+    """
+    key = "sampler_pre_cfg_function"
+    registered = model_options.get(key, [])
+    model_options[key] = registered + [pre_cfg_function]
+    if disable_cfg1_optimization:
+        model_options["disable_cfg1_optimization"] = True
+    return model_options
+
+def create_model_options_clone(orig_model_options: dict):
+    """Return the result of ``comfy.patcher_extension.copy_nested_dicts`` for ``orig_model_options``."""
+    cloned_options = comfy.patcher_extension.copy_nested_dicts(orig_model_options)
+    return cloned_options
+
+def create_hook_patches_clone(orig_hook_patches):
+    """Copy a two-level mapping of hook patches.
+
+    The outer and inner dicts are rebuilt and every innermost sequence is
+    duplicated with a slice copy; the elements inside those sequences are
+    shared with the original.
+    """
+    return {
+        hook_ref: {key: entries[:] for key, entries in per_key.items()}
+        for hook_ref, per_key in orig_hook_patches.items()
+    }
+
+def wipe_lowvram_weight(m):
+    """Undo the low-VRAM bookkeeping stored on module ``m``.
+
+    If ``prev_comfy_cast_weights`` is present it is moved back into
+    ``comfy_cast_weights`` and deleted. ``weight_function`` and
+    ``bias_function`` are reset to empty lists when they exist.
+    """
+    if hasattr(m, "prev_comfy_cast_weights"):
+        m.comfy_cast_weights = m.prev_comfy_cast_weights
+        del m.prev_comfy_cast_weights
+
+    for attr in ("weight_function", "bias_function"):
+        if hasattr(m, attr):
+            setattr(m, attr, [])
+
+def move_weight_functions(m, device):
+    """Move the weight/bias functions attached to ``m`` to ``device``.
+
+    Every entry of ``m.weight_function`` and ``m.bias_function`` that has a
+    ``move_to`` method is called with ``device=device``.
+
+    Returns:
+        The sum of the values returned by those ``move_to`` calls, or ``0``
+        when ``device`` is ``None``.
+    """
+    if device is None:
+        return 0
+
+    total = 0
+    for attr in ("weight_function", "bias_function"):
+        for fn in getattr(m, attr, ()):
+            if hasattr(fn, "move_to"):
+                total += fn.move_to(device=device)
+    return total
+
+class LowVramPatch:
+    """Callable that applies the patches registered for one key to a weight.
+
+    ``patches`` is kept by reference and indexed with ``key`` at call time.
+    """
+    def __init__(self, key, patches):
+        self.key = key
+        self.patches = patches
+    def __call__(self, weight):
+        key_patches = self.patches[self.key]
+        if weight.dtype in [torch.float32, torch.float16, torch.bfloat16]:
+            return comfy.lora.calculate_weight(key_patches, weight, self.key, intermediate_dtype=weight.dtype)
+
+        #intermediate_dtype has to be one that is supported in math ops
+        intermediate_dtype = torch.float32
+        patched = comfy.lora.calculate_weight(key_patches, weight.to(intermediate_dtype), self.key, intermediate_dtype=intermediate_dtype)
+        # Round back to the weight's own dtype with a seed derived from the key.
+        return comfy.float.stochastic_rounding(patched, weight.dtype, seed=string_to_seed(self.key))
+
+def get_key_weight(model, key):
+    """Look up the attribute named by ``key`` plus its optional setter/converter.
+
+    ``key`` is split at its last dot into an owner path and an attribute
+    name. When the owner object has ``set_<attr>`` or ``convert_<attr>``
+    attributes they are returned alongside the weight.
+
+    Returns:
+        ``(weight, set_func, convert_func)``; the two functions are ``None``
+        when ``key`` has no dot or the owner does not define them.
+    """
+    owner_path, dot, attr_name = key.rpartition('.')
+    if not dot:
+        return comfy.utils.get_attr(model, key), None, None
+
+    op = comfy.utils.get_attr(model, owner_path)
+    set_func = getattr(op, "set_{}".format(attr_name), None)
+    convert_func = getattr(op, "convert_{}".format(attr_name), None)
+
+    weight = getattr(op, attr_name)
+    if convert_func is not None:
+        # With a converter present the weight is re-read through comfy.utils.get_attr.
+        weight = comfy.utils.get_attr(model, key)
+
+    return weight, set_func, convert_func
+
+class AutoPatcherEjector:
+    """Context manager that ejects a patcher's injections on entry and restores them on exit.
+
+    With ``skip_and_inject_on_exit_only`` set, ``skip_injection`` is forced
+    to ``True`` for the duration of the block and ``inject_model()`` is
+    called on exit after the previous flag value has been restored.
+    """
+    def __init__(self, model: 'ModelPatcher', skip_and_inject_on_exit_only=False):
+        self.model = model
+        self.skip_and_inject_on_exit_only = skip_and_inject_on_exit_only
+        self.was_injected = False
+        self.prev_skip_injection = False
+
+    def __enter__(self):
+        patcher = self.model
+        self.was_injected = False
+        self.prev_skip_injection = patcher.skip_injection
+        if self.skip_and_inject_on_exit_only:
+            patcher.skip_injection = True
+        if patcher.is_injected:
+            patcher.eject_model()
+            self.was_injected = True
+
+    def __exit__(self, *args):
+        patcher = self.model
+        if self.skip_and_inject_on_exit_only:
+            patcher.skip_injection = self.prev_skip_injection
+            patcher.inject_model()
+        # Re-inject only if this manager ejected and injection is not being skipped.
+        if self.was_injected and not patcher.skip_injection:
+            patcher.inject_model()
+        patcher.skip_injection = self.prev_skip_injection
+
+class MemoryCounter:
+    """Running byte budget that tensors can be charged against.
+
+    ``value`` is the remaining budget; a charge is accepted only if the
+    budget would stay strictly above ``minimum`` afterwards.
+    """
+    def __init__(self, initial: int, minimum=0):
+        self.value = initial
+        self.minimum = minimum
+        # TODO: add a safe limit besides 0
+
+    def use(self, weight: torch.Tensor):
+        """Charge ``weight``'s byte size to the budget; return whether it fit."""
+        needed = weight.nelement() * weight.element_size()
+        if not self.is_useable(needed):
+            return False
+        self.decrement(needed)
+        return True
+
+    def is_useable(self, used: int):
+        """Return ``True`` when spending ``used`` leaves more than ``minimum``."""
+        remaining = self.value - used
+        return remaining > self.minimum
+
+    def decrement(self, used: int):
+        """Subtract ``used`` from the budget without any check."""
+        self.value -= used
+
+class ModelPatcher:
+    def __init__(self, model, load_device, offload_device, size=0, weight_inplace_update=False):
+        """Wrap ``model`` and set up empty patch, hook and injection bookkeeping.
+
+        Args:
+            model: Object to manage. Missing bookkeeping attributes
+                (``device``, ``model_loaded_weight_memory``,
+                ``lowvram_patch_counter``, ``model_lowvram``,
+                ``current_weight_patches_uuid``) are created on it.
+            load_device: Stored as ``self.load_device``.
+            offload_device: Stored as ``self.offload_device``; also used as
+                the model's ``device`` when that is missing or ``None``.
+            size: Known model size; when not positive it is computed by
+                ``model_size()``.
+            weight_inplace_update: Stored flag read by
+                ``patch_weight_to_device``.
+        """
+        self.size = size
+        self.model = model
+        if not hasattr(self.model, 'device'):
+            logging.debug("Model doesn't have a device attribute.")
+            self.model.device = offload_device
+        elif self.model.device is None:
+            self.model.device = offload_device
+
+        # Weight and object patch state.
+        self.patches = {}
+        self.backup = {}
+        self.object_patches = {}
+        self.object_patches_backup = {}
+        self.weight_wrapper_patches = {}
+        self.model_options = {"transformer_options":{}}
+        self.model_size()
+        self.load_device = load_device
+        self.offload_device = offload_device
+        self.weight_inplace_update = weight_inplace_update
+        self.force_cast_weights = False
+        self.patches_uuid = uuid.uuid4()
+        self.parent = None
+
+        # Extension points.
+        self.attachments: dict[str] = {}
+        self.additional_models: dict[str, list[ModelPatcher]] = {}
+        self.callbacks: dict[str, dict[str, list[Callable]]] = CallbacksMP.init_callbacks()
+        self.wrappers: dict[str, dict[str, list[Callable]]] = WrappersMP.init_wrappers()
+
+        # Injection state.
+        self.is_injected = False
+        self.skip_injection = False
+        self.injections: dict[str, list[PatcherInjection]] = {}
+
+        # Hook state.
+        self.hook_patches: dict[comfy.hooks._HookRef] = {}
+        self.hook_patches_backup: dict[comfy.hooks._HookRef] = None
+        self.hook_backup: dict[str, tuple[torch.Tensor, torch.device]] = {}
+        self.cached_hook_patches: dict[comfy.hooks.HookGroup, dict[str, torch.Tensor]] = {}
+        self.current_hooks: Optional[comfy.hooks.HookGroup] = None
+        self.forced_hooks: Optional[comfy.hooks.HookGroup] = None  # NOTE: only used for CLIP at this time
+        self.is_clip = False
+        self.hook_mode = comfy.hooks.EnumHookMode.MaxSpeed
+
+        # Bookkeeping attributes live on the model itself; only create the
+        # ones that are not there yet.
+        model_defaults = (
+            ('model_loaded_weight_memory', 0),
+            ('lowvram_patch_counter', 0),
+            ('model_lowvram', False),
+            ('current_weight_patches_uuid', None),
+        )
+        for attr_name, default in model_defaults:
+            if not hasattr(self.model, attr_name):
+                setattr(self.model, attr_name, default)
+
+    def model_size(self):
+        """Return the model size, computing and caching it when ``self.size`` is not positive."""
+        if not (self.size > 0):
+            self.size = comfy.model_management.module_size(self.model)
+        return self.size
+
+    def loaded_size(self):
+        """Return the ``model_loaded_weight_memory`` value stored on the model."""
+        loaded_bytes = self.model.model_loaded_weight_memory
+        return loaded_bytes
+
+    def lowvram_patch_counter(self):
+        """Return the ``lowvram_patch_counter`` value stored on the model."""
+        counter = self.model.lowvram_patch_counter
+        return counter
+
+    def clone(self):
+        """Create a new patcher of the same class that wraps the same model.
+
+        Patch lists, object patches, weight wrappers, callbacks, wrappers,
+        injections and hook patches are copied at the container level;
+        ``model_options`` is deep-copied. ``backup``,
+        ``object_patches_backup`` and ``hook_backup`` are shared by reference
+        with the original. ``ON_CLONE`` callbacks are invoked with
+        ``(self, clone)`` before the clone is returned.
+        """
+        n = self.__class__(self.model, self.load_device, self.offload_device, self.size, weight_inplace_update=self.weight_inplace_update)
+        n.patches = {}
+        for k in self.patches:
+            # slice copy: the per-key list is new, its entries are shared
+            n.patches[k] = self.patches[k][:]
+        n.patches_uuid = self.patches_uuid
+
+        n.object_patches = self.object_patches.copy()
+        n.weight_wrapper_patches = self.weight_wrapper_patches.copy()
+        n.model_options = copy.deepcopy(self.model_options)
+        # backups are the same dict objects as on the original, not copies
+        n.backup = self.backup
+        n.object_patches_backup = self.object_patches_backup
+        n.parent = self
+
+        n.force_cast_weights = self.force_cast_weights
+
+        # attachments
+        n.attachments = {}
+        for k in self.attachments:
+            # attachments may supply their own clone via on_model_patcher_clone()
+            if hasattr(self.attachments[k], "on_model_patcher_clone"):
+                n.attachments[k] = self.attachments[k].on_model_patcher_clone()
+            else:
+                n.attachments[k] = self.attachments[k]
+        # additional models
+        for k, c in self.additional_models.items():
+            n.additional_models[k] = [x.clone() for x in c]
+        # callbacks
+        for k, c in self.callbacks.items():
+            n.callbacks[k] = {}
+            for k1, c1 in c.items():
+                n.callbacks[k][k1] = c1.copy()
+        # sample wrappers
+        for k, w in self.wrappers.items():
+            n.wrappers[k] = {}
+            for k1, w1 in w.items():
+                n.wrappers[k][k1] = w1.copy()
+        # injection
+        n.is_injected = self.is_injected
+        n.skip_injection = self.skip_injection
+        for k, i in self.injections.items():
+            n.injections[k] = i.copy()
+        # hooks
+        n.hook_patches = create_hook_patches_clone(self.hook_patches)
+        n.hook_patches_backup = create_hook_patches_clone(self.hook_patches_backup) if self.hook_patches_backup else self.hook_patches_backup
+        for group in self.cached_hook_patches:
+            # new per-group dict, cached values shared
+            n.cached_hook_patches[group] = {}
+            for k in self.cached_hook_patches[group]:
+                n.cached_hook_patches[group][k] = self.cached_hook_patches[group][k]
+        n.hook_backup = self.hook_backup
+        n.current_hooks = self.current_hooks.clone() if self.current_hooks else self.current_hooks
+        n.forced_hooks = self.forced_hooks.clone() if self.forced_hooks else self.forced_hooks
+        n.is_clip = self.is_clip
+        n.hook_mode = self.hook_mode
+
+        for callback in self.get_all_callbacks(CallbacksMP.ON_CLONE):
+            callback(self, n)
+        return n
+
+    def is_clone(self, other):
+        """Return ``True`` when ``other`` has a ``model`` attribute that is this patcher's model object."""
+        return hasattr(other, 'model') and self.model is other.model
+
+    def clone_has_same_weights(self, clone: 'ModelPatcher'):
+        """Report whether ``clone`` wraps the same model with equivalent patch state.
+
+        The check fails as soon as any of these differ: the wrapped model,
+        ``current_hooks``, ``forced_hooks``, the key sets of ``hook_patches``,
+        ``attachments``, ``additional_models`` and ``injections``, or the
+        per-key entry counts of ``callbacks`` and ``wrappers``. After that,
+        two patchers match when both have no patches, or when they share a
+        ``patches_uuid`` and the same number of patched keys.
+
+        Returns:
+            ``True`` or ``False``. A UUID or patch-count mismatch yields an
+            explicit ``False`` rather than falling through to ``None``.
+        """
+        if not self.is_clone(clone):
+            return False
+
+        if self.current_hooks != clone.current_hooks:
+            return False
+        if self.forced_hooks != clone.forced_hooks:
+            return False
+        if self.hook_patches.keys() != clone.hook_patches.keys():
+            return False
+        if self.attachments.keys() != clone.attachments.keys():
+            return False
+        if self.additional_models.keys() != clone.additional_models.keys():
+            return False
+        for key in self.callbacks:
+            if len(self.callbacks[key]) != len(clone.callbacks[key]):
+                return False
+        for key in self.wrappers:
+            if len(self.wrappers[key]) != len(clone.wrappers[key]):
+                return False
+        if self.injections.keys() != clone.injections.keys():
+            return False
+
+        if len(self.patches) == 0 and len(clone.patches) == 0:
+            return True
+
+        if self.patches_uuid == clone.patches_uuid:
+            if len(self.patches) != len(clone.patches):
+                logging.warning("WARNING: something went wrong, same patch uuid but different length of patches.")
+            else:
+                return True
+
+        # Different UUIDs, or the inconsistent state warned about above.
+        return False
+
+    def memory_required(self, input_shape):
+        """Forward to the model's ``memory_required`` with ``input_shape`` as a keyword argument."""
+        required = self.model.memory_required(input_shape=input_shape)
+        return required
+
+    def set_model_sampler_cfg_function(self, sampler_cfg_function, disable_cfg1_optimization=False):
+        """Register ``sampler_cfg_function`` in ``model_options``.
+
+        A function that takes exactly three parameters is wrapped so that it
+        receives ``args["cond"]``, ``args["uncond"]`` and
+        ``args["cond_scale"]``; any other function is stored unchanged.
+        """
+        takes_three = len(inspect.signature(sampler_cfg_function).parameters) == 3
+        if takes_three:
+            def legacy_adapter(args):  #Old way
+                return sampler_cfg_function(args["cond"], args["uncond"], args["cond_scale"])
+            registered = legacy_adapter
+        else:
+            registered = sampler_cfg_function
+        self.model_options["sampler_cfg_function"] = registered
+        if disable_cfg1_optimization:
+            self.model_options["disable_cfg1_optimization"] = True
+
+    def set_model_sampler_post_cfg_function(self, post_cfg_function, disable_cfg1_optimization=False):
+        """Add ``post_cfg_function`` to this patcher's options via ``set_model_options_post_cfg_function``."""
+        updated_options = set_model_options_post_cfg_function(self.model_options, post_cfg_function, disable_cfg1_optimization)
+        self.model_options = updated_options
+
+    def set_model_sampler_pre_cfg_function(self, pre_cfg_function, disable_cfg1_optimization=False):
+        """Add ``pre_cfg_function`` to this patcher's options via ``set_model_options_pre_cfg_function``."""
+        updated_options = set_model_options_pre_cfg_function(self.model_options, pre_cfg_function, disable_cfg1_optimization)
+        self.model_options = updated_options
+
+    def set_model_sampler_calc_cond_batch_function(self, sampler_calc_cond_batch_function):
+        """Store the function under ``model_options["sampler_calc_cond_batch_function"]``, replacing any previous one."""
+        options = self.model_options
+        options["sampler_calc_cond_batch_function"] = sampler_calc_cond_batch_function
+
+    def set_model_unet_function_wrapper(self, unet_wrapper_function: UnetWrapperFunction):
+        """Store the wrapper under ``model_options["model_function_wrapper"]``, replacing any previous one."""
+        options = self.model_options
+        options["model_function_wrapper"] = unet_wrapper_function
+
+    def set_model_denoise_mask_function(self, denoise_mask_function):
+        """Store the function under ``model_options["denoise_mask_function"]``, replacing any previous one."""
+        options = self.model_options
+        options["denoise_mask_function"] = denoise_mask_function
+
+    def set_model_patch(self, patch, name):
+        """Append ``patch`` to the list stored at ``transformer_options["patches"][name]``.
+
+        The ``patches`` dict is created on first use and is modified in
+        place; the per-name list is rebuilt rather than appended to.
+        """
+        registry = self.model_options["transformer_options"].setdefault("patches", {})
+        registry[name] = registry.get(name, []) + [patch]
+
+    def set_model_patch_replace(self, patch, name, block_name, number, transformer_index=None):
+        """Register a block replacement patch via ``set_model_options_patch_replace``."""
+        updated_options = set_model_options_patch_replace(self.model_options, patch, name, block_name, number, transformer_index=transformer_index)
+        self.model_options = updated_options
+
+    def set_model_attn1_patch(self, patch):
+        """Register ``patch`` under the ``"attn1_patch"`` name."""
+        patch_name = "attn1_patch"
+        self.set_model_patch(patch, patch_name)
+
+    def set_model_attn2_patch(self, patch):
+        """Register ``patch`` under the ``"attn2_patch"`` name."""
+        patch_name = "attn2_patch"
+        self.set_model_patch(patch, patch_name)
+
+    def set_model_attn1_replace(self, patch, block_name, number, transformer_index=None):
+        """Register a replacement patch for the given block under the ``"attn1"`` name."""
+        replace_name = "attn1"
+        self.set_model_patch_replace(patch, replace_name, block_name, number, transformer_index)
+
+    def set_model_attn2_replace(self, patch, block_name, number, transformer_index=None):
+        """Register a replacement patch for the given block under the ``"attn2"`` name."""
+        replace_name = "attn2"
+        self.set_model_patch_replace(patch, replace_name, block_name, number, transformer_index)
+
+    def set_model_attn1_output_patch(self, patch):
+        """Register ``patch`` under the ``"attn1_output_patch"`` name."""
+        patch_name = "attn1_output_patch"
+        self.set_model_patch(patch, patch_name)
+
+    def set_model_attn2_output_patch(self, patch):
+        """Register ``patch`` under the ``"attn2_output_patch"`` name."""
+        patch_name = "attn2_output_patch"
+        self.set_model_patch(patch, patch_name)
+
+    def set_model_input_block_patch(self, patch):
+        """Register ``patch`` under the ``"input_block_patch"`` name."""
+        patch_name = "input_block_patch"
+        self.set_model_patch(patch, patch_name)
+
+    def set_model_input_block_patch_after_skip(self, patch):
+        """Register ``patch`` under the ``"input_block_patch_after_skip"`` name."""
+        patch_name = "input_block_patch_after_skip"
+        self.set_model_patch(patch, patch_name)
+
+    def set_model_output_block_patch(self, patch):
+        """Register ``patch`` under the ``"output_block_patch"`` name."""
+        patch_name = "output_block_patch"
+        self.set_model_patch(patch, patch_name)
+
+    def set_model_emb_patch(self, patch):
+        """Register ``patch`` under the ``"emb_patch"`` name."""
+        patch_name = "emb_patch"
+        self.set_model_patch(patch, patch_name)
+
+    def set_model_forward_timestep_embed_patch(self, patch):
+        """Register ``patch`` under the ``"forward_timestep_embed_patch"`` name."""
+        patch_name = "forward_timestep_embed_patch"
+        self.set_model_patch(patch, patch_name)
+
+    def set_model_double_block_patch(self, patch):
+        """Register ``patch`` under the ``"double_block"`` name."""
+        patch_name = "double_block"
+        self.set_model_patch(patch, patch_name)
+
+    def add_object_patch(self, name, obj):
+        """Record ``obj`` as the object patch for ``name``, replacing any earlier one."""
+        self.object_patches.update({name: obj})
+
+    def set_model_compute_dtype(self, dtype):
+        """Patch ``manual_cast_dtype`` with ``dtype`` and invalidate the patch UUID.
+
+        A non-``None`` dtype also turns on ``force_cast_weights``; passing
+        ``None`` leaves that flag as it was.
+        """
+        self.add_object_patch("manual_cast_dtype", dtype)
+        wants_cast = dtype is not None
+        if wants_cast:
+            self.force_cast_weights = True
+        self.patches_uuid = uuid.uuid4() #TODO: optimize by preventing a full model reload for this
+
+    def add_weight_wrapper(self, name, function):
+        """Append ``function`` to the weight wrappers for ``name`` and invalidate the patch UUID.
+
+        The per-name list is rebuilt rather than appended to in place.
+        """
+        existing = self.weight_wrapper_patches.get(name, [])
+        self.weight_wrapper_patches[name] = existing + [function]
+        self.patches_uuid = uuid.uuid4()
+
+    def get_model_object(self, name: str) -> torch.nn.Module:
+        """Resolve ``name`` to an object, giving object patches priority.
+
+        Lookup order: ``self.object_patches``, then
+        ``self.object_patches_backup``, then the attribute of ``self.model``
+        found by ``comfy.utils.get_attr``.
+
+        Args:
+            name (str): The attribute path using dot notation (e.g. "model.layer.weight")
+
+        Returns:
+            The value of the requested attribute
+
+        Example:
+            weight = patcher.get_model_object("layer1.conv.weight")
+        """
+        if name in self.object_patches:
+            return self.object_patches[name]
+        if name in self.object_patches_backup:
+            return self.object_patches_backup[name]
+        return comfy.utils.get_attr(self.model, name)
+
+    def model_patches_to(self, device):
+        """Move every registered patch object that has a ``to`` method to ``device``.
+
+        Covers the lists under ``transformer_options["patches"]``, the dicts
+        under ``transformer_options["patches_replace"]`` and the
+        ``model_function_wrapper`` option. Each such entry is replaced by
+        whatever its ``to(device)`` call returns.
+        """
+        transformer_opts = self.model_options["transformer_options"]
+
+        for patch_list in transformer_opts.get("patches", {}).values():
+            for index, entry in enumerate(patch_list):
+                if hasattr(entry, "to"):
+                    patch_list[index] = entry.to(device)
+
+        for block_patches in transformer_opts.get("patches_replace", {}).values():
+            for block_key, entry in block_patches.items():
+                if hasattr(entry, "to"):
+                    block_patches[block_key] = entry.to(device)
+
+        if "model_function_wrapper" in self.model_options:
+            wrap_func = self.model_options["model_function_wrapper"]
+            if hasattr(wrap_func, "to"):
+                self.model_options["model_function_wrapper"] = wrap_func.to(device)
+
+    def model_patches_models(self):
+        """Collect the results of ``models()`` from every registered patch object that has it.
+
+        Looks at the lists under ``transformer_options["patches"]``, the
+        dicts under ``transformer_options["patches_replace"]`` and the
+        ``model_function_wrapper`` option, in that order.
+
+        Returns:
+            A single list concatenating all returned model lists.
+        """
+        transformer_opts = self.model_options["transformer_options"]
+        collected = []
+
+        for patch_list in transformer_opts.get("patches", {}).values():
+            for entry in patch_list:
+                if hasattr(entry, "models"):
+                    collected += entry.models()
+
+        for block_patches in transformer_opts.get("patches_replace", {}).values():
+            for entry in block_patches.values():
+                if hasattr(entry, "models"):
+                    collected += entry.models()
+
+        if "model_function_wrapper" in self.model_options:
+            wrap_func = self.model_options["model_function_wrapper"]
+            if hasattr(wrap_func, "models"):
+                collected += wrap_func.models()
+
+        return collected
+
+    def model_dtype(self):
+        """Return ``self.model.get_dtype()``, or ``None`` when the model has no ``get_dtype``."""
+        if not hasattr(self.model, "get_dtype"):
+            return None
+        return self.model.get_dtype()
+
+    def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
+        """Queue weight patches for every key that exists in the model's state dict.
+
+        Keys of ``patches`` are either a plain string, or a sequence
+        ``(key, offset)`` / ``(key, offset, function)``. Each accepted patch
+        is appended to ``self.patches[key]`` as
+        ``(strength_patch, value, strength_model, offset, function)``.
+
+        Returns:
+            A list of the ``patches`` keys that were accepted (unordered).
+            ``patches_uuid`` is regenerated on every call.
+        """
+        with self.use_ejected():
+            accepted = set()
+            model_sd = self.model.state_dict()
+            for k in patches:
+                if isinstance(k, str):
+                    key, offset, function = k, None, None
+                else:
+                    key = k[0]
+                    offset = k[1]
+                    function = k[2] if len(k) > 2 else None
+
+                if key not in model_sd:
+                    continue
+                accepted.add(k)
+                entry = (strength_patch, patches[k], strength_model, offset, function)
+                self.patches.setdefault(key, []).append(entry)
+
+            self.patches_uuid = uuid.uuid4()
+            return list(accepted)
+
+    def get_key_patches(self, filter_prefix=None):
+        """Build, per state-dict key, a list of the base weight followed by its patches.
+
+        The first list element is ``(weight, convert_func)``. The weight
+        comes from ``hook_backup`` if the key is there, otherwise from
+        ``backup``, otherwise from the model. ``convert_func`` falls back to
+        a function that returns its first argument unchanged.
+
+        Args:
+            filter_prefix: When given, only keys starting with it are included.
+        """
+        def passthrough(a, **kwargs):
+            return a
+
+        result = {}
+        for k in self.model_state_dict():
+            if filter_prefix is not None and not k.startswith(filter_prefix):
+                continue
+
+            bk = self.backup.get(k, None)
+            hbk = self.hook_backup.get(k, None)
+            weight, _, convert_func = get_key_weight(self.model, k)
+            if bk is not None:
+                weight = bk.weight
+            if hbk is not None:
+                # hook backup wins over the regular backup
+                weight = hbk[0]
+            if convert_func is None:
+                convert_func = passthrough
+
+            result[k] = [(weight, convert_func)] + self.patches.get(k, [])
+        return result
+
+    def model_state_dict(self, filter_prefix=None):
+        """Return the model's state dict, read while injections are ejected.
+
+        Args:
+            filter_prefix: When given, entries whose key does not start with
+                it are removed from the returned dict.
+        """
+        with self.use_ejected():
+            sd = self.model.state_dict()
+            if filter_prefix is not None:
+                unwanted = [k for k in sd if not k.startswith(filter_prefix)]
+                for k in unwanted:
+                    sd.pop(k)
+            return sd
+
+    def patch_weight_to_device(self, key, device_to=None, inplace_update=False):
+        """Apply the patches registered for ``key`` and write the result into the model.
+
+        Does nothing when ``key`` has no entry in ``self.patches``. The
+        current weight is stored in ``self.backup`` the first time a key is
+        patched. The patched weight is computed from a float32 copy.
+
+        Args:
+            key: State-dict style name of the weight to patch.
+            device_to: Device for the temporary float32 copy; when ``None``
+                the copy is made without naming a device.
+            inplace_update: Copy into the existing parameter instead of
+                replacing it. ``self.weight_inplace_update`` also enables this.
+        """
+        if key not in self.patches:
+            return
+
+        weight, set_func, convert_func = get_key_weight(self.model, key)
+        inplace_update = self.weight_inplace_update or inplace_update
+
+        # Back up only once per key; the backup is made with copy=inplace_update.
+        if key not in self.backup:
+            self.backup[key] = collections.namedtuple('Dimension', ['weight', 'inplace_update'])(weight.to(device=self.offload_device, copy=inplace_update), inplace_update)
+
+        # Work on a float32 copy so the original weight is not touched while patching.
+        if device_to is not None:
+            temp_weight = comfy.model_management.cast_to_device(weight, device_to, torch.float32, copy=True)
+        else:
+            temp_weight = weight.to(torch.float32, copy=True)
+        if convert_func is not None:
+            temp_weight = convert_func(temp_weight, inplace=True)
+
+        out_weight = comfy.lora.calculate_weight(self.patches[key], temp_weight, key)
+        if set_func is None:
+            # No custom setter: round back to the weight's dtype with a key-derived seed.
+            out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=string_to_seed(key))
+            if inplace_update:
+                comfy.utils.copy_to_param(self.model, key, out_weight)
+            else:
+                comfy.utils.set_attr_param(self.model, key, out_weight)
+        else:
+            # The owner's set_<attr> function receives the unrounded result.
+            set_func(out_weight, inplace_update=inplace_update, seed=string_to_seed(key))
+
+    def _load_list(self):
+        """List the modules that own all of their parameters directly.
+
+        A module is skipped when any parameter name from its recursive
+        listing is missing from its direct listing. Remaining modules are
+        kept if they have ``comfy_cast_weights`` or at least one direct
+        parameter.
+
+        Returns:
+            A list of ``(module_size, name, module, direct_param_names)`` tuples.
+        """
+        loading = []
+        for n, m in self.model.named_modules():
+            params = [name for name, _ in m.named_parameters(recurse=False)]
+            # skip random weights in non leaf modules
+            skip = any(name not in params for name, _ in m.named_parameters(recurse=True))
+            if skip:
+                continue
+            if hasattr(m, "comfy_cast_weights") or len(params) > 0:
+                loading.append((comfy.model_management.module_size(m), n, m, params))
+        return loading
+
+    def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
+        """Patch module weights and move them to ``device_to``, fully or partially.
+
+        Modules from ``_load_list()`` are visited in descending sort order
+        (largest size first). Unless ``full_load`` is set, a module with
+        ``comfy_cast_weights`` that would push the running total to
+        ``lowvram_model_memory`` or beyond is left in low-VRAM mode: its
+        patches become ``LowVramPatch`` weight/bias functions (or are applied
+        immediately when ``force_patch_weights`` is set) and weight casting
+        is turned on. All other modules that fit are patched and moved.
+
+        Afterwards the model's ``model_lowvram``, ``lowvram_patch_counter``,
+        ``device``, ``model_loaded_weight_memory`` and
+        ``current_weight_patches_uuid`` attributes are updated, ``ON_LOAD``
+        callbacks run, and ``forced_hooks`` are applied.
+        """
+        with self.use_ejected():
+            self.unpatch_hooks()
+            mem_counter = 0
+            patch_counter = 0
+            lowvram_counter = 0
+            loading = self._load_list()
+
+            load_completely = []
+            loading.sort(reverse=True)
+            for x in loading:
+                n = x[1]
+                m = x[2]
+                params = x[3]
+                module_mem = x[0]
+
+                lowvram_weight = False
+
+                weight_key = "{}.weight".format(n)
+                bias_key = "{}.bias".format(n)
+
+                # Only modules exposing comfy_cast_weights can be kept in low-VRAM mode.
+                if not full_load and hasattr(m, "comfy_cast_weights"):
+                    if mem_counter + module_mem >= lowvram_model_memory:
+                        lowvram_weight = True
+                        lowvram_counter += 1
+                        if hasattr(m, "prev_comfy_cast_weights"): #Already lowvramed
+                            continue
+
+                cast_weight = self.force_cast_weights
+                if lowvram_weight:
+                    if hasattr(m, "comfy_cast_weights"):
+                        m.weight_function = []
+                        m.bias_function = []
+
+                    if weight_key in self.patches:
+                        if force_patch_weights:
+                            self.patch_weight_to_device(weight_key)
+                        else:
+                            # Defer patching: the patch is attached as a weight function instead.
+                            m.weight_function = [LowVramPatch(weight_key, self.patches)]
+                            patch_counter += 1
+                    if bias_key in self.patches:
+                        if force_patch_weights:
+                            self.patch_weight_to_device(bias_key)
+                        else:
+                            m.bias_function = [LowVramPatch(bias_key, self.patches)]
+                            patch_counter += 1
+
+                    cast_weight = True
+                else:
+                    if hasattr(m, "comfy_cast_weights"):
+                        wipe_lowvram_weight(m)
+
+                    if full_load or mem_counter + module_mem < lowvram_model_memory:
+                        mem_counter += module_mem
+                        load_completely.append((module_mem, n, m, params))
+
+                if cast_weight and hasattr(m, "comfy_cast_weights"):
+                    # Remember the previous flag so wipe_lowvram_weight can restore it.
+                    m.prev_comfy_cast_weights = m.comfy_cast_weights
+                    m.comfy_cast_weights = True
+
+                if weight_key in self.weight_wrapper_patches:
+                    m.weight_function.extend(self.weight_wrapper_patches[weight_key])
+
+                if bias_key in self.weight_wrapper_patches:
+                    m.bias_function.extend(self.weight_wrapper_patches[bias_key])
+
+                # Whatever move_weight_functions reports is added to the loaded total.
+                mem_counter += move_weight_functions(m, device_to)
+
+            load_completely.sort(reverse=True)
+            for x in load_completely:
+                n = x[1]
+                m = x[2]
+                params = x[3]
+                # Modules already flagged as patched are not patched again.
+                if hasattr(m, "comfy_patched_weights"):
+                    if m.comfy_patched_weights == True:
+                        continue
+
+                for param in params:
+                    self.patch_weight_to_device("{}.{}".format(n, param), device_to=device_to)
+
+                logging.debug("lowvram: loaded module regularly {} {}".format(n, m))
+                m.comfy_patched_weights = True
+
+            # Second pass: move every fully loaded module to the target device.
+            for x in load_completely:
+                x[2].to(device_to)
+
+            if lowvram_counter > 0:
+                logging.info("loaded partially {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), patch_counter))
+                self.model.model_lowvram = True
+            else:
+                logging.info("loaded completely {} {} {}".format(lowvram_model_memory / (1024 * 1024), mem_counter / (1024 * 1024), full_load))
+                self.model.model_lowvram = False
+                if full_load:
+                    self.model.to(device_to)
+                    mem_counter = self.model_size()
+
+            self.model.lowvram_patch_counter += patch_counter
+            self.model.device = device_to
+            self.model.model_loaded_weight_memory = mem_counter
+            self.model.current_weight_patches_uuid = self.patches_uuid
+
+            for callback in self.get_all_callbacks(CallbacksMP.ON_LOAD):
+                callback(self, device_to, lowvram_model_memory, force_patch_weights, full_load)
+
+            self.apply_hooks(self.forced_hooks, force_apply=True)
+
+    def patch_model(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False):
+        """Apply object patches, optionally load patched weights, then inject.
+
+        Each object patch is written onto the model; the value it replaces is
+        saved in ``object_patches_backup`` unless a backup for that name
+        already exists. A ``lowvram_model_memory`` of ``0`` requests a full
+        load.
+
+        Returns:
+            The wrapped model.
+        """
+        with self.use_ejected():
+            for k, replacement in self.object_patches.items():
+                old = comfy.utils.set_attr(self.model, k, replacement)
+                if k not in self.object_patches_backup:
+                    self.object_patches_backup[k] = old
+
+            full_load = True if lowvram_model_memory == 0 else False
+
+            if load_weights:
+                self.load(device_to, lowvram_model_memory=lowvram_model_memory, force_patch_weights=force_patch_weights, full_load=full_load)
+        self.inject_model()
+        return self.model
+
+    def unpatch_model(self, device_to=None, unpatch_weights=True):
+        """Eject injections and restore the model to its unpatched state.
+
+        Object patches are always restored from ``object_patches_backup``.
+        Weight restoration happens only when ``unpatch_weights`` is true.
+
+        Args:
+            device_to: When not ``None`` (and weights are being unpatched),
+                the model is moved there and its ``device`` updated.
+            unpatch_weights: Restore backed-up weights, clear low-VRAM state
+                and reset the loaded-memory bookkeeping.
+        """
+        self.eject_model()
+        if unpatch_weights:
+            self.unpatch_hooks()
+            if self.model.model_lowvram:
+                for m in self.model.modules():
+                    move_weight_functions(m, device_to)
+                    wipe_lowvram_weight(m)
+
+                self.model.model_lowvram = False
+                self.model.lowvram_patch_counter = 0
+
+            # Snapshot the keys before iterating over the backup.
+            keys = list(self.backup.keys())
+
+            for k in keys:
+                bk = self.backup[k]
+                # Restore the same way the weight was patched: in place or by replacement.
+                if bk.inplace_update:
+                    comfy.utils.copy_to_param(self.model, k, bk.weight)
+                else:
+                    comfy.utils.set_attr_param(self.model, k, bk.weight)
+
+            self.model.current_weight_patches_uuid = None
+            self.backup.clear()
+
+            if device_to is not None:
+                self.model.to(device_to)
+                self.model.device = device_to
+            self.model.model_loaded_weight_memory = 0
+
+            # Drop the per-module "already patched" markers set by load().
+            for m in self.model.modules():
+                if hasattr(m, "comfy_patched_weights"):
+                    del m.comfy_patched_weights
+
+        keys = list(self.object_patches_backup.keys())
+        for k in keys:
+            comfy.utils.set_attr(self.model, k, self.object_patches_backup[k])
+
+        self.object_patches_backup.clear()
+
+    def partially_unload(self, device_to, memory_to_free=0):
+        """Move patched modules to ``device_to`` until more than ``memory_to_free`` has been freed.
+
+        Returns the amount of memory counted as freed. Also marks the model as
+        low-VRAM and updates ``lowvram_patch_counter`` and
+        ``model_loaded_weight_memory`` on the model.
+        """
+        with self.use_ejected():
+            hooks_unpatched = False
+            memory_freed = 0
+            patch_counter = 0
+            unload_list = self._load_list()
+            # ascending sort; entries are indexed below as (module_mem, name, module, params)
+            unload_list.sort()
+            for unload in unload_list:
+                # stop once strictly more than the requested amount has been freed
+                if memory_to_free < memory_freed:
+                    break
+                module_mem = unload[0]
+                n = unload[1]
+                m = unload[2]
+                params = unload[3]
+
+                lowvram_possible = hasattr(m, "comfy_cast_weights")
+                # only modules currently flagged as patched are considered
+                if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights == True:
+                    move_weight = True
+                    for param in params:
+                        key = "{}.{}".format(n, param)
+                        bk = self.backup.get(key, None)
+                        if bk is not None:
+                            # a backed-up param on a module without comfy_cast_weights: leave the module where it is
+                            if not lowvram_possible:
+                                move_weight = False
+                                break
+
+                            # hooks are unpatched at most once, right before the first weight restore
+                            if not hooks_unpatched:
+                                self.unpatch_hooks()
+                                hooks_unpatched = True
+
+                            # write the backed-up weight back and forget the backup entry
+                            if bk.inplace_update:
+                                comfy.utils.copy_to_param(self.model, key, bk.weight)
+                            else:
+                                comfy.utils.set_attr_param(self.model, key, bk.weight)
+                            self.backup.pop(key)
+
+                    weight_key = "{}.weight".format(n)
+                    bias_key = "{}.bias".format(n)
+                    if move_weight:
+                        cast_weight = self.force_cast_weights
+                        m.to(device_to)
+                        module_mem += move_weight_functions(m, device_to)
+                        if lowvram_possible:
+                            # patches for this module are attached as LowVramPatch entries on its weight/bias function lists
+                            if weight_key in self.patches:
+                                m.weight_function.append(LowVramPatch(weight_key, self.patches))
+                                patch_counter += 1
+                            if bias_key in self.patches:
+                                m.bias_function.append(LowVramPatch(bias_key, self.patches))
+                                patch_counter += 1
+                            cast_weight = True
+
+                        if cast_weight:
+                            # remember the previous flag before forcing it on
+                            m.prev_comfy_cast_weights = m.comfy_cast_weights
+                            m.comfy_cast_weights = True
+                        m.comfy_patched_weights = False
+                        memory_freed += module_mem
+                        logging.debug("freed {}".format(n))
+
+            self.model.model_lowvram = True
+            self.model.lowvram_patch_counter += patch_counter
+            self.model.model_loaded_weight_memory -= memory_freed
+            return memory_freed
+
+    def partially_load(self, device_to, extra_memory=0, force_patch_weights=False):
+        """Load more of the model onto ``device_to`` using up to ``extra_memory`` additional memory.
+
+        Returns the increase in ``model_loaded_weight_memory``, or 0 when the
+        model is not in low-VRAM mode and already has loaded weight memory.
+        If ``self.load`` raises, the patcher is detached and the exception is
+        re-raised.
+        """
+        with self.use_ejected(skip_and_inject_on_exit_only=True):
+            # weights are unpatched only if some patches are applied and they are stale or a re-patch is forced
+            unpatch_weights = self.model.current_weight_patches_uuid is not None and (self.model.current_weight_patches_uuid != self.patches_uuid or force_patch_weights)
+            # TODO: force_patch_weights should not unload + reload full model
+            used = self.model.model_loaded_weight_memory
+            self.unpatch_model(self.offload_device, unpatch_weights=unpatch_weights)
+            if unpatch_weights:
+                # whatever the unpatch released is added to the budget for the reload
+                extra_memory += (used - self.model.model_loaded_weight_memory)
+
+            # re-apply object patches only; weights are loaded further down
+            self.patch_model(load_weights=False)
+            full_load = False
+            if self.model.model_lowvram == False and self.model.model_loaded_weight_memory > 0:
+                self.apply_hooks(self.forced_hooks, force_apply=True)
+                return 0
+            # the budget exceeds the model size, so request a full load
+            if self.model.model_loaded_weight_memory + extra_memory > self.model_size():
+                full_load = True
+            current_used = self.model.model_loaded_weight_memory
+            try:
+                self.load(device_to, lowvram_model_memory=current_used + extra_memory, force_patch_weights=force_patch_weights, full_load=full_load)
+            except Exception as e:
+                # detach before propagating the failure
+                self.detach()
+                raise e
+
+            return self.model.model_loaded_weight_memory - current_used
+
+    def detach(self, unpatch_all=True):
+        self.eject_model()
+        self.model_patches_to(self.offload_device)
+        if unpatch_all:
+            self.unpatch_model(self.offload_device, unpatch_weights=unpatch_all)
+        for callback in self.get_all_callbacks(CallbacksMP.ON_DETACH):
+            callback(self, unpatch_all)
+        return self.model
+
+    def current_loaded_device(self):
+        """Return the device recorded on the wrapped model (``self.model.device``)."""
+        return self.model.device
+
+    def calculate_weight(self, patches, weight, key, intermediate_dtype=torch.float32):
+        """Deprecated: log a warning and forward all arguments to ``comfy.lora.calculate_weight``."""
+        logging.warning("The ModelPatcher.calculate_weight function is deprecated, please use: comfy.lora.calculate_weight instead")
+        return comfy.lora.calculate_weight(patches, weight, key, intermediate_dtype=intermediate_dtype)
+
+    def cleanup(self):
+        self.clean_hooks()
+        if hasattr(self.model, "current_patcher"):
+            self.model.current_patcher = None
+        for callback in self.get_all_callbacks(CallbacksMP.ON_CLEANUP):
+            callback(self)
+
+    def add_callback(self, call_type: str, callback: Callable):
+        """Register ``callback`` for ``call_type`` under the ``None`` key."""
+        self.add_callback_with_key(call_type, None, callback)
+
+    def add_callback_with_key(self, call_type: str, key: str, callback: Callable):
+        c = self.callbacks.setdefault(call_type, {}).setdefault(key, [])
+        c.append(callback)
+
+    def remove_callbacks_with_key(self, call_type: str, key: str):
+        c = self.callbacks.get(call_type, {})
+        if key in c:
+            c.pop(key)
+
+    def get_callbacks(self, call_type: str, key: str):
+        return self.callbacks.get(call_type, {}).get(key, [])
+
+    def get_all_callbacks(self, call_type: str):
+        c_list = []
+        for c in self.callbacks.get(call_type, {}).values():
+            c_list.extend(c)
+        return c_list
+
+    def add_wrapper(self, wrapper_type: str, wrapper: Callable):
+        """Register ``wrapper`` for ``wrapper_type`` under the ``None`` key."""
+        self.add_wrapper_with_key(wrapper_type, None, wrapper)
+
+    def add_wrapper_with_key(self, wrapper_type: str, key: str, wrapper: Callable):
+        w = self.wrappers.setdefault(wrapper_type, {}).setdefault(key, [])
+        w.append(wrapper)
+
+    def remove_wrappers_with_key(self, wrapper_type: str, key: str):
+        w = self.wrappers.get(wrapper_type, {})
+        if key in w:
+            w.pop(key)
+
+    def get_wrappers(self, wrapper_type: str, key: str):
+        return self.wrappers.get(wrapper_type, {}).get(key, [])
+
+    def get_all_wrappers(self, wrapper_type: str):
+        w_list = []
+        for w in self.wrappers.get(wrapper_type, {}).values():
+            w_list.extend(w)
+        return w_list
+
+    def set_attachments(self, key: str, attachment):
+        """Store ``attachment`` under ``key``, replacing any previous value."""
+        self.attachments[key] = attachment
+
+    def remove_attachments(self, key: str):
+        if key in self.attachments:
+            self.attachments.pop(key)
+
+    def get_attachment(self, key: str):
+        """Return the attachment stored under ``key``, or ``None`` when there is none."""
+        return self.attachments.get(key, None)
+
+    def set_injections(self, key: str, injections: list[PatcherInjection]):
+        """Store the ``injections`` list under ``key``, replacing any previous value."""
+        self.injections[key] = injections
+
+    def remove_injections(self, key: str):
+        if key in self.injections:
+            self.injections.pop(key)
+
+    def get_injections(self, key: str):
+        """Return the injections stored under ``key``, or ``None`` when there are none."""
+        return self.injections.get(key, None)
+
+    def set_additional_models(self, key: str, models: list['ModelPatcher']):
+        """Store the ``models`` list under ``key``, replacing any previous value."""
+        self.additional_models[key] = models
+
+    def remove_additional_models(self, key: str):
+        if key in self.additional_models:
+            self.additional_models.pop(key)
+
+    def get_additional_models_with_key(self, key: str):
+        """Return the additional models stored under ``key``, or an empty list when there are none."""
+        return self.additional_models.get(key, [])
+
+    def get_additional_models(self):
+        all_models = []
+        for models in self.additional_models.values():
+            all_models.extend(models)
+        return all_models
+
+    def get_nested_additional_models(self):
+        def _evaluate_sub_additional_models(prev_models: list[ModelPatcher], cache_set: set[ModelPatcher]):
+            '''Make sure circular references do not cause infinite recursion.'''
+            next_models = []
+            for model in prev_models:
+                candidates = model.get_additional_models()
+                for c in candidates:
+                    if c not in cache_set:
+                        next_models.append(c)
+                        cache_set.add(c)
+            if len(next_models) == 0:
+                return prev_models
+            return prev_models + _evaluate_sub_additional_models(next_models, cache_set)
+
+        all_models = self.get_additional_models()
+        models_set = set(all_models)
+        real_all_models = _evaluate_sub_additional_models(prev_models=all_models, cache_set=models_set)
+        return real_all_models
+
+    def use_ejected(self, skip_and_inject_on_exit_only=False):
+        """Return an ``AutoPatcherEjector`` for this patcher (used as a ``with`` context elsewhere in the class)."""
+        return AutoPatcherEjector(self, skip_and_inject_on_exit_only=skip_and_inject_on_exit_only)
+
+    def inject_model(self):
+        if self.is_injected or self.skip_injection:
+            return
+        for injections in self.injections.values():
+            for inj in injections:
+                inj.inject(self)
+                self.is_injected = True
+        if self.is_injected:
+            for callback in self.get_all_callbacks(CallbacksMP.ON_INJECT_MODEL):
+                callback(self)
+
+    def eject_model(self):
+        if not self.is_injected:
+            return
+        for injections in self.injections.values():
+            for inj in injections:
+                inj.eject(self)
+        self.is_injected = False
+        for callback in self.get_all_callbacks(CallbacksMP.ON_EJECT_MODEL):
+            callback(self)
+
+    def pre_run(self):
+        if hasattr(self.model, "current_patcher"):
+            self.model.current_patcher = self
+        for callback in self.get_all_callbacks(CallbacksMP.ON_PRE_RUN):
+            callback(self)
+
+    def prepare_state(self, timestep):
+        """Call every ON_PREPARE_STATE callback with ``(self, timestep)``."""
+        for callback in self.get_all_callbacks(CallbacksMP.ON_PREPARE_STATE):
+            callback(self, timestep)
+
+    def restore_hook_patches(self):
+        if self.hook_patches_backup is not None:
+            self.hook_patches = self.hook_patches_backup
+            self.hook_patches_backup = None
+
+    def set_hook_mode(self, hook_mode: comfy.hooks.EnumHookMode):
+        """Set the hook mode consulted when hook weights are patched (see the ``MaxSpeed`` checks)."""
+        self.hook_mode = hook_mode
+
+    def prepare_hook_patches_current_keyframe(self, t: torch.Tensor, hook_group: comfy.hooks.HookGroup, model_options: dict[str]):
+        curr_t = t[0]
+        reset_current_hooks = False
+        transformer_options = model_options.get("transformer_options", {})
+        for hook in hook_group.hooks:
+            changed = hook.hook_keyframe.prepare_current_keyframe(curr_t=curr_t, transformer_options=transformer_options)
+            # if keyframe changed, remove any cached HookGroups that contain hook with the same hook_ref;
+            # this will cause the weights to be recalculated when sampling
+            if changed:
+                # reset current_hooks if contains hook that changed
+                if self.current_hooks is not None:
+                    for current_hook in self.current_hooks.hooks:
+                        if current_hook == hook:
+                            reset_current_hooks = True
+                            break
+                for cached_group in list(self.cached_hook_patches.keys()):
+                    if cached_group.contains(hook):
+                        self.cached_hook_patches.pop(cached_group)
+        if reset_current_hooks:
+            self.patch_hooks(None)
+
+    def register_all_hook_patches(self, hooks: comfy.hooks.HookGroup, target_dict: dict[str], model_options: dict=None,
+                                  registered: comfy.hooks.HookGroup = None):
+        self.restore_hook_patches()
+        if registered is None:
+            registered = comfy.hooks.HookGroup()
+        # handle WeightHooks
+        weight_hooks_to_register: list[comfy.hooks.WeightHook] = []
+        for hook in hooks.get_type(comfy.hooks.EnumHookType.Weight):
+            if hook.hook_ref not in self.hook_patches:
+                weight_hooks_to_register.append(hook)
+            else:
+                registered.add(hook)
+        if len(weight_hooks_to_register) > 0:
+            # clone hook_patches to become backup so that any non-dynamic hooks will return to their original state
+            self.hook_patches_backup = create_hook_patches_clone(self.hook_patches)
+            for hook in weight_hooks_to_register:
+                hook.add_hook_patches(self, model_options, target_dict, registered)
+        for callback in self.get_all_callbacks(CallbacksMP.ON_REGISTER_ALL_HOOK_PATCHES):
+            callback(self, hooks, target_dict, model_options, registered)
+        return registered
+
+    def add_hook_patches(self, hook: comfy.hooks.WeightHook, patches, strength_patch=1.0, strength_model=1.0):
+        with self.use_ejected():
+            # NOTE: this mirrors behavior of add_patches func
+            current_hook_patches: dict[str,list] = self.hook_patches.get(hook.hook_ref, {})
+            p = set()
+            model_sd = self.model.state_dict()
+            for k in patches:
+                offset = None
+                function = None
+                if isinstance(k, str):
+                    key = k
+                else:
+                    offset = k[1]
+                    key = k[0]
+                    if len(k) > 2:
+                        function = k[2]
+
+                if key in model_sd:
+                    p.add(k)
+                    current_patches: list[tuple] = current_hook_patches.get(key, [])
+                    current_patches.append((strength_patch, patches[k], strength_model, offset, function))
+                    current_hook_patches[key] = current_patches
+            self.hook_patches[hook.hook_ref] = current_hook_patches
+            # since should care about these patches too to determine if same model, reroll patches_uuid
+            self.patches_uuid = uuid.uuid4()
+            return list(p)
+
+    def get_combined_hook_patches(self, hooks: comfy.hooks.HookGroup):
+        # combined_patches will contain  weights of all relevant hooks, per key
+        combined_patches = {}
+        if hooks is not None:
+            for hook in hooks.hooks:
+                hook_patches: dict = self.hook_patches.get(hook.hook_ref, {})
+                for key in hook_patches.keys():
+                    current_patches: list[tuple] = combined_patches.get(key, [])
+                    if math.isclose(hook.strength, 1.0):
+                        current_patches.extend(hook_patches[key])
+                    else:
+                        # patches are stored as tuples: (strength_patch, (tuple_with_weights,), strength_model)
+                        for patch in hook_patches[key]:
+                            new_patch = list(patch)
+                            new_patch[0] *= hook.strength
+                            current_patches.append(tuple(new_patch))
+                    combined_patches[key] = current_patches
+        return combined_patches
+
+    def apply_hooks(self, hooks: comfy.hooks.HookGroup, transformer_options: dict=None, force_apply=False):
+        # TODO: return transformer_options dict with any additions from hooks
+        if self.current_hooks == hooks and (not force_apply or (not self.is_clip and hooks is None)):
+            return comfy.hooks.create_transformer_options_from_hooks(self, hooks, transformer_options)
+        self.patch_hooks(hooks=hooks)
+        for callback in self.get_all_callbacks(CallbacksMP.ON_APPLY_HOOKS):
+            callback(self, hooks)
+        return comfy.hooks.create_transformer_options_from_hooks(self, hooks, transformer_options)
+
+    def patch_hooks(self, hooks: comfy.hooks.HookGroup):
+        with self.use_ejected():
+            if hooks is not None:
+                model_sd_keys = list(self.model_state_dict().keys())
+                memory_counter = None
+                if self.hook_mode == comfy.hooks.EnumHookMode.MaxSpeed:
+                    # TODO: minimum_counter should have a minimum that conforms to loaded model requirements
+                    memory_counter = MemoryCounter(initial=comfy.model_management.get_free_memory(self.load_device),
+                                                minimum=comfy.model_management.minimum_inference_memory()*2)
+                # if have cached weights for hooks, use it
+                cached_weights = self.cached_hook_patches.get(hooks, None)
+                if cached_weights is not None:
+                    model_sd_keys_set = set(model_sd_keys)
+                    for key in cached_weights:
+                        if key not in model_sd_keys:
+                            logging.warning(f"Cached hook could not patch. Key does not exist in model: {key}")
+                            continue
+                        self.patch_cached_hook_weights(cached_weights=cached_weights, key=key, memory_counter=memory_counter)
+                        model_sd_keys_set.remove(key)
+                    self.unpatch_hooks(model_sd_keys_set)
+                else:
+                    self.unpatch_hooks()
+                    relevant_patches = self.get_combined_hook_patches(hooks=hooks)
+                    original_weights = None
+                    if len(relevant_patches) > 0:
+                        original_weights = self.get_key_patches()
+                    for key in relevant_patches:
+                        if key not in model_sd_keys:
+                            logging.warning(f"Cached hook would not patch. Key does not exist in model: {key}")
+                            continue
+                        self.patch_hook_weight_to_device(hooks=hooks, combined_patches=relevant_patches, key=key, original_weights=original_weights,
+                                                            memory_counter=memory_counter)
+            else:
+                self.unpatch_hooks()
+            self.current_hooks = hooks
+
+    def patch_cached_hook_weights(self, cached_weights: dict, key: str, memory_counter: MemoryCounter):
+        if key not in self.hook_backup:
+            weight: torch.Tensor = comfy.utils.get_attr(self.model, key)
+            target_device = self.offload_device
+            if self.hook_mode == comfy.hooks.EnumHookMode.MaxSpeed:
+                used = memory_counter.use(weight)
+                if used:
+                    target_device = weight.device
+            self.hook_backup[key] = (weight.to(device=target_device, copy=True), weight.device)
+        comfy.utils.copy_to_param(self.model, key, cached_weights[key][0].to(device=cached_weights[key][1]))
+
+    def clear_cached_hook_weights(self):
+        """Drop all cached hook weights, then reset the applied hooks via ``patch_hooks(None)``."""
+        self.cached_hook_patches.clear()
+        self.patch_hooks(None)
+
+    def patch_hook_weight_to_device(self, hooks: comfy.hooks.HookGroup, combined_patches: dict, key: str, original_weights: dict, memory_counter: MemoryCounter):
+        """Compute the hook-patched weight for ``key`` and write it into the model.
+
+        Does nothing when ``key`` has no entry in ``combined_patches``. The
+        current weight is backed up in ``hook_backup`` (once per key) before
+        it is overwritten. In MaxSpeed hook mode the patched weight is also
+        stored in ``cached_hook_patches`` for this hook group. The entry for
+        ``key`` is removed from ``original_weights``.
+        """
+        if key not in combined_patches:
+            return
+
+        weight, set_func, convert_func = get_key_weight(self.model, key)
+        weight: torch.Tensor
+        if key not in self.hook_backup:
+            # the backup copy goes to the offload device unless MaxSpeed mode and the memory counter accepts it
+            target_device = self.offload_device
+            if self.hook_mode == comfy.hooks.EnumHookMode.MaxSpeed:
+                used = memory_counter.use(weight)
+                if used:
+                    target_device = weight.device
+            self.hook_backup[key] = (weight.to(device=target_device, copy=True), weight.device)
+        # TODO: properly handle LowVramPatch, if it ends up an issue
+        # work on a float32 copy located on the weight's own device
+        temp_weight = comfy.model_management.cast_to_device(weight, weight.device, torch.float32, copy=True)
+        if convert_func is not None:
+            temp_weight = convert_func(temp_weight, inplace=True)
+
+        out_weight = comfy.lora.calculate_weight(combined_patches[key],
+                                                 temp_weight,
+                                                 key, original_weights=original_weights)
+        del original_weights[key]
+        if set_func is None:
+            # round back to the weight's dtype with a seed derived from the key, then copy into the parameter
+            out_weight = comfy.float.stochastic_rounding(out_weight, weight.dtype, seed=string_to_seed(key))
+            comfy.utils.copy_to_param(self.model, key, out_weight)
+        else:
+            set_func(out_weight, inplace_update=True, seed=string_to_seed(key))
+        if self.hook_mode == comfy.hooks.EnumHookMode.MaxSpeed:
+            # TODO: disable caching if not enough system RAM to do so
+            # cache entry is (patched weight, device of the model weight)
+            target_device = self.offload_device
+            used = memory_counter.use(weight)
+            if used:
+                target_device = weight.device
+            self.cached_hook_patches.setdefault(hooks, {})
+            self.cached_hook_patches[hooks][key] = (out_weight.to(device=target_device, copy=False), weight.device)
+        del temp_weight
+        del out_weight
+        del weight
+
+    def unpatch_hooks(self, whitelist_keys_set: set[str]=None) -> None:
+        with self.use_ejected():
+            if len(self.hook_backup) == 0:
+                self.current_hooks = None
+                return
+            keys = list(self.hook_backup.keys())
+            if whitelist_keys_set:
+                for k in keys:
+                    if k in whitelist_keys_set:
+                        comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))
+                        self.hook_backup.pop(k)
+            else:
+                for k in keys:
+                    comfy.utils.copy_to_param(self.model, k, self.hook_backup[k][0].to(device=self.hook_backup[k][1]))
+
+                self.hook_backup.clear()
+                self.current_hooks = None
+
+    def clean_hooks(self):
+        """Remove applied hook patches and drop all cached hook weights."""
+        self.unpatch_hooks()
+        self.clear_cached_hook_weights()
+
+    def __del__(self):
+        """On finalization, detach without unpatching the model (``unpatch_all=False``)."""
+        self.detach(unpatch_all=False)
+
--- a/comfy/model_sampling.py
+++ b/comfy/model_sampling.py
+import torch
+from comfy.ldm.modules.diffusionmodules.util import make_beta_schedule
+import math
+
+def rescale_zero_terminal_snr_sigmas(sigmas):
+    alphas_cumprod = 1 / ((sigmas * sigmas) + 1)
+    alphas_bar_sqrt = alphas_cumprod.sqrt()
+
+    # Store old values.
+    alphas_bar_sqrt_0 = alphas_bar_sqrt[0].clone()
+    alphas_bar_sqrt_T = alphas_bar_sqrt[-1].clone()
+
+    # Shift so the last timestep is zero.
+    alphas_bar_sqrt -= (alphas_bar_sqrt_T)
+
+    # Scale so the first timestep is back to the old value.
+    alphas_bar_sqrt *= alphas_bar_sqrt_0 / (alphas_bar_sqrt_0 - alphas_bar_sqrt_T)
+
+    # Convert alphas_bar_sqrt to betas
+    alphas_bar = alphas_bar_sqrt**2  # Revert sqrt
+    alphas_bar[-1] = 4.8973451890853435e-08
+    return ((1 - alphas_bar) / alphas_bar) ** 0.5
+
+class EPS:
+    def calculate_input(self, sigma, noise):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
+        return noise / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
+
+    def calculate_denoised(self, sigma, model_output, model_input):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
+        return model_input - model_output * sigma
+
+    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
+        if max_denoise:
+            noise = noise * torch.sqrt(1.0 + sigma ** 2.0)
+        else:
+            noise = noise * sigma
+
+        noise += latent_image
+        return noise
+
+    def inverse_noise_scaling(self, sigma, latent):
+        return latent
+
+class V_PREDICTION(EPS):
+    def calculate_denoised(self, sigma, model_output, model_input):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
+        return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) - model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
+
+class EDM(V_PREDICTION):
+    def calculate_denoised(self, sigma, model_output, model_input):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
+        return model_input * self.sigma_data ** 2 / (sigma ** 2 + self.sigma_data ** 2) + model_output * sigma * self.sigma_data / (sigma ** 2 + self.sigma_data ** 2) ** 0.5
+
+class CONST:
+    def calculate_input(self, sigma, noise):
+        return noise
+
+    def calculate_denoised(self, sigma, model_output, model_input):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
+        return model_input - model_output * sigma
+
+    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
+        return sigma * noise + (1.0 - sigma) * latent_image
+
+    def inverse_noise_scaling(self, sigma, latent):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (latent.ndim - 1))
+        return latent / (1.0 - sigma)
+
+class X0(EPS):
+    """EPS variant that treats the model output itself as the denoised result."""
+    def calculate_denoised(self, sigma, model_output, model_input):
+        # sigma and model_input are ignored
+        return model_output
+
+class IMG_TO_IMG(X0):
+    """X0 variant that passes the input through unscaled."""
+    def calculate_input(self, sigma, noise):
+        # sigma is ignored
+        return noise
+
+class COSMOS_RFLOW:
+    def calculate_input(self, sigma, noise):
+        sigma = (sigma / (sigma + 1))
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
+        return noise * (1.0 - sigma)
+
+    def calculate_denoised(self, sigma, model_output, model_input):
+        sigma = (sigma / (sigma + 1))
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (model_output.ndim - 1))
+        return model_input * (1.0 - sigma) - model_output * sigma
+
+    def noise_scaling(self, sigma, noise, latent_image, max_denoise=False):
+        sigma = sigma.view(sigma.shape[:1] + (1,) * (noise.ndim - 1))
+        noise = noise * sigma
+        noise += latent_image
+        return noise
+
+    def inverse_noise_scaling(self, sigma, latent):
+        return latent
+
+class ModelSamplingDiscrete(torch.nn.Module):
+    def __init__(self, model_config=None, zsnr=None):
+        super().__init__()
+
+        if model_config is not None:
+            sampling_settings = model_config.sampling_settings
+        else:
+            sampling_settings = {}
+
+        beta_schedule = sampling_settings.get("beta_schedule", "linear")
+        linear_start = sampling_settings.get("linear_start", 0.00085)
+        linear_end = sampling_settings.get("linear_end", 0.012)
+        timesteps = sampling_settings.get("timesteps", 1000)
+
+        if zsnr is None:
+            zsnr = sampling_settings.get("zsnr", False)
+
+        self._register_schedule(given_betas=None, beta_schedule=beta_schedule, timesteps=timesteps, linear_start=linear_start, linear_end=linear_end, cosine_s=8e-3, zsnr=zsnr)
+        self.sigma_data = 1.0
+
+    def _register_schedule(self, given_betas=None, beta_schedule="linear", timesteps=1000,
+                          linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3, zsnr=False):
+        if given_betas is not None:
+            betas = given_betas
+        else:
+            betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end, cosine_s=cosine_s)
+        alphas = 1. - betas
+        alphas_cumprod = torch.cumprod(alphas, dim=0)
+
+        timesteps, = betas.shape
+        self.num_timesteps = int(timesteps)
+        self.linear_start = linear_start
+        self.linear_end = linear_end
+        self.zsnr = zsnr
+
+        # self.register_buffer('betas', torch.tensor(betas, dtype=torch.float32))
+        # self.register_buffer('alphas_cumprod', torch.tensor(alphas_cumprod, dtype=torch.float32))
+        # self.register_buffer('alphas_cumprod_prev', torch.tensor(alphas_cumprod_prev, dtype=torch.float32))
+
+        sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
+        if self.zsnr:
+            sigmas = rescale_zero_terminal_snr_sigmas(sigmas)
+
+        self.set_sigmas(sigmas)
+
+    def set_sigmas(self, sigmas):
+        self.register_buffer('sigmas', sigmas.float())
+        self.register_buffer('log_sigmas', sigmas.log().float())
+
+    @property
+    def sigma_min(self):
+        return self.sigmas[0]
+
+    @property
+    def sigma_max(self):
+        return self.sigmas[-1]
+
+    def timestep(self, sigma):
+        log_sigma = sigma.log()
+        dists = log_sigma.to(self.log_sigmas.device) - self.log_sigmas[:, None]
+        return dists.abs().argmin(dim=0).view(sigma.shape).to(sigma.device)
+
+    def sigma(self, timestep):
+        t = torch.clamp(timestep.float().to(self.log_sigmas.device), min=0, max=(len(self.sigmas) - 1))
+        low_idx = t.floor().long()
+        high_idx = t.ceil().long()
+        w = t.frac()
+        log_sigma = (1 - w) * self.log_sigmas[low_idx] + w * self.log_sigmas[high_idx]
+        return log_sigma.exp().to(timestep.device)
+
+    def percent_to_sigma(self, percent):
+        if percent <= 0.0:
+            return 999999999.9
+        if percent >= 1.0:
+            return 0.0
+        percent = 1.0 - percent
+        return self.sigma(torch.tensor(percent * 999.0)).item()
+
+class ModelSamplingDiscreteEDM(ModelSamplingDiscrete):
+    """ModelSamplingDiscrete with closed-form conversions: timestep = 0.25 * log(sigma)."""
+    def timestep(self, sigma):
+        # closed form instead of the nearest-entry table lookup of the base class
+        return 0.25 * sigma.log()
+
+    def sigma(self, timestep):
+        # inverse of timestep(): exp(timestep / 0.25)
+        return (timestep / 0.25).exp()
+
+class ModelSamplingContinuousEDM(torch.nn.Module):
+    def __init__(self, model_config=None):
+        super().__init__()
+        if model_config is not None:
+            sampling_settings = model_config.sampling_settings
+        else:
+            sampling_settings = {}
+
+        sigma_min = sampling_settings.get("sigma_min", 0.002)
+        sigma_max = sampling_settings.get("sigma_max", 120.0)
+        sigma_data = sampling_settings.get("sigma_data", 1.0)
+        self.set_parameters(sigma_min, sigma_max, sigma_data)
+
+    def set_parameters(self, sigma_min, sigma_max, sigma_data):
+        self.sigma_data = sigma_data
+        sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), 1000).exp()
+
+        self.register_buffer('sigmas', sigmas) #for compatibility with some schedulers
+        self.register_buffer('log_sigmas', sigmas.log())
+
+    @property
+    def sigma_min(self):
+        return self.sigmas[0]
+
+    @property
+    def sigma_max(self):
+        return self.sigmas[-1]
+
+    def timestep(self, sigma):
+        return 0.25 * sigma.log()
+
+    def sigma(self, timestep):
+        return (timestep / 0.25).exp()
+
+    def percent_to_sigma(self, percent):
+        if percent <= 0.0:
+            return 999999999.9
+        if percent >= 1.0:
+            return 0.0
+        percent = 1.0 - percent
+
+        log_sigma_min = math.log(self.sigma_min)
+        return math.exp((math.log(self.sigma_max) - log_sigma_min) * percent + log_sigma_min)
+
+
+class ModelSamplingContinuousV(ModelSamplingContinuousEDM):
+    """ModelSamplingContinuousEDM with an arctangent timestep mapping."""
+    def timestep(self, sigma):
+        # atan(sigma) scaled by 2 / pi
+        return sigma.atan() / math.pi * 2
+
+    def sigma(self, timestep):
+        # inverse of timestep(): tan(timestep * pi / 2)
+        return (timestep * math.pi / 2).tan()
+
+
+def time_snr_shift(alpha, t):
+    if alpha == 1.0:
+        return t
+    return alpha * t / (1 + (alpha - 1) * t)
+
+class ModelSamplingDiscreteFlow(torch.nn.Module):
+    """Flow-style schedule: sigma is the SNR-shifted value of ``timestep / multiplier``."""
+
+    def __init__(self, model_config=None):
+        super().__init__()
+        settings = {} if model_config is None else model_config.sampling_settings
+
+        shift = settings.get("shift", 1.0)
+        multiplier = settings.get("multiplier", 1000)
+        self.set_parameters(shift=shift, multiplier=multiplier)
+
+    def set_parameters(self, shift=1.0, timesteps=1000, multiplier=1000):
+        """Store ``shift``/``multiplier`` and rebuild the ``sigmas`` buffer with ``timesteps`` entries."""
+        self.shift = shift
+        self.multiplier = multiplier
+        # Fractions 1/timesteps .. 1, scaled to timestep units before conversion.
+        fractions = torch.arange(1, timesteps + 1, 1) / timesteps
+        self.register_buffer('sigmas', self.sigma(fractions * multiplier))
+
+    @property
+    def sigma_min(self):
+        """First entry of the sigma table."""
+        return self.sigmas[0]
+
+    @property
+    def sigma_max(self):
+        """Last entry of the sigma table."""
+        return self.sigmas[-1]
+
+    def timestep(self, sigma):
+        """Return ``sigma * multiplier``."""
+        return sigma * self.multiplier
+
+    def sigma(self, timestep):
+        """Return the shifted value of ``timestep / multiplier``."""
+        return time_snr_shift(self.shift, timestep / self.multiplier)
+
+    def percent_to_sigma(self, percent):
+        """Map a fraction to a sigma; ``<= 0`` gives ``1.0`` and ``>= 1`` gives ``0.0``."""
+        if percent <= 0.0:
+            return 1.0
+        if percent >= 1.0:
+            return 0.0
+        return time_snr_shift(self.shift, 1.0 - percent)
+
+class StableCascadeSampling(ModelSamplingDiscrete):
+    """Cosine-schedule sampling with an optional log-SNR shift."""
+
+    def __init__(self, model_config=None):
+        super().__init__()
+
+        settings = {} if model_config is None else model_config.sampling_settings
+        self.set_parameters(settings.get("shift", 1.0))
+
+    def set_parameters(self, shift=1.0, cosine_s=8e-3):
+        """Store the schedule constants and rebuild the sigma table."""
+        self.shift = shift
+        self.cosine_s = torch.tensor(cosine_s)
+        start_phase = self.cosine_s / (1 + self.cosine_s) * torch.pi * 0.5
+        self._init_alpha_cumprod = torch.cos(start_phase) ** 2
+
+        #This part is just for compatibility with some schedulers in the codebase
+        self.num_timesteps = 10000
+        table = torch.empty((self.num_timesteps), dtype=torch.float32)
+        for idx in range(self.num_timesteps):
+            table[idx] = self.sigma((idx + 1) / self.num_timesteps)
+
+        self.set_sigmas(table)
+
+    def sigma(self, timestep):
+        """Convert a timestep to a sigma through the cosine alpha-cumprod curve."""
+        phase = (timestep + self.cosine_s) / (1 + self.cosine_s) * torch.pi * 0.5
+        alpha_cumprod = torch.cos(phase) ** 2 / self._init_alpha_cumprod
+
+        if self.shift != 1.0:
+            # Apply the shift in log-SNR space, then map back with a sigmoid.
+            log_snr = (alpha_cumprod / (1 - alpha_cumprod)).log()
+            log_snr += 2 * torch.log(1.0 / torch.tensor(self.shift))
+            alpha_cumprod = log_snr.sigmoid()
+
+        alpha_cumprod = alpha_cumprod.clamp(0.0001, 0.9999)
+        return ((1 - alpha_cumprod) / alpha_cumprod) ** 0.5
+
+    def timestep(self, sigma):
+        """Convert a sigma back to a timestep."""
+        var = (1 / ((sigma * sigma) + 1)).clamp(0, 1.0)
+        s = self.cosine_s.to(var.device)
+        min_var = self._init_alpha_cumprod.to(var.device)
+        return (((var * min_var) ** 0.5).acos() / (torch.pi * 0.5)) * (1 + s) - s
+
+    def percent_to_sigma(self, percent):
+        """Map a fraction to a sigma; ``<= 0`` gives a large sentinel and ``>= 1`` gives ``0.0``."""
+        if percent <= 0.0:
+            return 999999999.9
+        if percent >= 1.0:
+            return 0.0
+
+        return self.sigma(torch.tensor(1.0 - percent))
+
+
+def flux_time_shift(mu: float, sigma: float, t):
+    """Return ``exp(mu) / (exp(mu) + (1 / t - 1) ** sigma)``."""
+    exp_mu = math.exp(mu)
+    return exp_mu / (exp_mu + (1 / t - 1) ** sigma)
+
+class ModelSamplingFlux(torch.nn.Module):
+    """Flux schedule: sigma is ``flux_time_shift(shift, 1.0, timestep)``."""
+
+    def __init__(self, model_config=None):
+        super().__init__()
+        settings = {} if model_config is None else model_config.sampling_settings
+        self.set_parameters(shift=settings.get("shift", 1.15))
+
+    def set_parameters(self, shift=1.15, timesteps=10000):
+        """Store ``shift`` and rebuild the ``sigmas`` buffer with ``timesteps`` entries."""
+        self.shift = shift
+        fractions = torch.arange(1, timesteps + 1, 1) / timesteps
+        self.register_buffer('sigmas', self.sigma(fractions))
+
+    @property
+    def sigma_min(self):
+        """First entry of the sigma table."""
+        return self.sigmas[0]
+
+    @property
+    def sigma_max(self):
+        """Last entry of the sigma table."""
+        return self.sigmas[-1]
+
+    def timestep(self, sigma):
+        """Timesteps and sigmas coincide here, so the input is returned unchanged."""
+        return sigma
+
+    def sigma(self, timestep):
+        """Return the time-shifted value of ``timestep``."""
+        return flux_time_shift(self.shift, 1.0, timestep)
+
+    def percent_to_sigma(self, percent):
+        """Map a fraction to a sigma; ``<= 0`` gives ``1.0`` and ``>= 1`` gives ``0.0``."""
+        if percent <= 0.0:
+            return 1.0
+        if percent >= 1.0:
+            return 0.0
+        return flux_time_shift(self.shift, 1.0, 1.0 - percent)
+
+
+class ModelSamplingCosmosRFlow(ModelSamplingContinuousEDM):
+    """EDM sigma table with the mapping ``timestep = sigma / (sigma + 1)``."""
+
+    def timestep(self, sigma):
+        """Return ``sigma / (sigma + 1)``."""
+        return sigma / (sigma + 1)
+
+    def sigma(self, timestep):
+        """Invert ``timestep``, saturating at ``sigma_max`` once its timestep is reached."""
+        ceiling = self.sigma_max
+        saturated = timestep >= (ceiling / (ceiling + 1))
+        return ceiling if saturated else timestep / (1 - timestep)
--- a/comfy/ops.py
+++ b/comfy/ops.py
+"""
+    This file is part of ComfyUI.
+    Copyright (C) 2024 Stability AI
+
+    This program is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
+import torch
+import logging
+import comfy.model_management
+from comfy.cli_args import args, PerformanceFeature
+import comfy.float
+import comfy.rmsnorm
+import contextlib
+
+
+# lmslim is an optional dependency. It provides the INT8 GEMM (quant_ops) and
+# the per-token activation quantizer used by the int8 Linear ops further down
+# in this file. The import is deliberately best effort: any failure only
+# disables those ops, so it is reported through logging (with the underlying
+# reason) instead of aborting the import of this module.
+try:
+    from lmslim import quant_ops
+    import lmslimquant
+    from lmslim.layers.gemm.int8_utils import per_token_quant_int8
+except Exception as e:
+    logging.warning("Please install lmslim if you want to infer gptq, awq or w8a8 models ({})".format(e))
+
+
+def scaled_dot_product_attention(q, k, v, *args, **kwargs):
+    """Forward every argument unchanged to torch's scaled_dot_product_attention."""
+    sdpa = torch.nn.functional.scaled_dot_product_attention
+    return sdpa(q, k, v, *args, **kwargs)
+
+
+# When CUDA is available and this torch build's sdpa_kernel accepts a
+# `set_priority` argument, replace the plain wrapper defined above with one
+# that runs attention under an explicit backend priority list.
+try:
+    if torch.cuda.is_available():
+        from torch.nn.attention import SDPBackend, sdpa_kernel
+        import inspect
+        # Feature-detect `set_priority` instead of comparing torch versions.
+        if "set_priority" in inspect.signature(sdpa_kernel).parameters:
+            SDPA_BACKEND_PRIORITY = [
+                SDPBackend.FLASH_ATTENTION,
+                SDPBackend.EFFICIENT_ATTENTION,
+                SDPBackend.MATH,
+            ]
+
+            # cuDNN attention goes to the front of the priority list.
+            SDPA_BACKEND_PRIORITY.insert(0, SDPBackend.CUDNN_ATTENTION)
+
+            def scaled_dot_product_attention(q, k, v, *args, **kwargs):
+                # Same call as the fallback wrapper, executed inside the priority context.
+                with sdpa_kernel(SDPA_BACKEND_PRIORITY, set_priority=True):
+                    return torch.nn.functional.scaled_dot_product_attention(q, k, v, *args, **kwargs)
+        else:
+            logging.warning("Torch version too old to set sdpa backend priority.")
+except (ModuleNotFoundError, TypeError):
+    # Only these two exception types are treated as "feature unavailable";
+    # the wrapper defined earlier stays in place.
+    logging.warning("Could not set sdpa backend priority.")
+
+cast_to = comfy.model_management.cast_to #TODO: remove once no more references
+
+def cast_to_input(weight, input, non_blocking=False, copy=True):
+    """Cast ``weight`` to the dtype and device of ``input`` via comfy.model_management.cast_to."""
+    target_dtype, target_device = input.dtype, input.device
+    return comfy.model_management.cast_to(weight, target_dtype, target_device, non_blocking=non_blocking, copy=copy)
+
+def cast_bias_weight(s, input=None, dtype=None, device=None, bias_dtype=None):
+    """Return ``(weight, bias)`` of module ``s`` cast for the current computation.
+
+    When ``input`` is given, any of ``dtype``, ``bias_dtype`` and ``device``
+    left as ``None`` is taken from it (``bias_dtype`` follows ``dtype``).
+    After casting, the callables in ``s.weight_function`` and
+    ``s.bias_function`` are applied in order. ``bias`` is ``None`` when the
+    module has no bias.
+    """
+    if input is not None:
+        dtype = input.dtype if dtype is None else dtype
+        bias_dtype = dtype if bias_dtype is None else bias_dtype
+        device = input.device if device is None else device
+
+    offload_stream = comfy.model_management.get_offload_stream(device)
+    # The patch functions run inside the offload stream context when one exists.
+    wf_context = contextlib.nullcontext() if offload_stream is None else offload_stream
+    non_blocking = comfy.model_management.device_supports_non_blocking(device)
+
+    bias = None
+    if s.bias is not None:
+        bias_patched = len(s.bias_function) > 0
+        # A copy is requested only when patch functions will be applied to the result.
+        bias = comfy.model_management.cast_to(s.bias, bias_dtype, device, non_blocking=non_blocking, copy=bias_patched, stream=offload_stream)
+
+        if bias_patched:
+            with wf_context:
+                for fn in s.bias_function:
+                    bias = fn(bias)
+
+    weight_patched = len(s.weight_function) > 0
+    weight = comfy.model_management.cast_to(s.weight, dtype, device, non_blocking=non_blocking, copy=weight_patched, stream=offload_stream)
+    if weight_patched:
+        with wf_context:
+            for fn in s.weight_function:
+                weight = fn(weight)
+
+    comfy.model_management.sync_stream(device, offload_stream)
+    return weight, bias
+
+class CastWeightBiasOp:
+    """Mixin holding the class-level defaults read by the casting ops in this file."""
+    # When True, the layer's forward() always takes the forward_comfy_cast_weights path.
+    comfy_cast_weights = False
+    # Callables applied in order to the cast weight / bias by cast_bias_weight.
+    # These lists live on the class, so they are shared by every layer that
+    # does not assign its own.
+    weight_function = []
+    bias_function = []
+
+class disable_weight_init:
+    """Namespace of torch layer subclasses whose ``reset_parameters`` does nothing.
+
+    Each layer's ``forward`` switches to ``forward_comfy_cast_weights`` when
+    ``comfy_cast_weights`` is set or when any weight/bias patch function is
+    registered; otherwise it uses the stock torch forward. The cast path gets
+    its weight and bias from ``cast_bias_weight``.
+    """
+    class Linear(torch.nn.Linear, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+
+        def forward_comfy_cast_weights(self, input):
+            weight, bias = cast_bias_weight(self, input)
+            return torch.nn.functional.linear(input, weight, bias)
+
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+
+    class Conv1d(torch.nn.Conv1d, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+
+        def forward_comfy_cast_weights(self, input):
+            weight, bias = cast_bias_weight(self, input)
+            return self._conv_forward(input, weight, bias)
+
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+
+    class Conv2d(torch.nn.Conv2d, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+
+        def forward_comfy_cast_weights(self, input):
+            weight, bias = cast_bias_weight(self, input)
+            return self._conv_forward(input, weight, bias)
+
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+
+    class Conv3d(torch.nn.Conv3d, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+
+        def forward_comfy_cast_weights(self, input):
+            weight, bias = cast_bias_weight(self, input)
+            return self._conv_forward(input, weight, bias)
+
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+
+    class GroupNorm(torch.nn.GroupNorm, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+
+        def forward_comfy_cast_weights(self, input):
+            weight, bias = cast_bias_weight(self, input)
+            return torch.nn.functional.group_norm(input, self.num_groups, weight, bias, self.eps)
+
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+
+    class LayerNorm(torch.nn.LayerNorm, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+
+        def forward_comfy_cast_weights(self, input):
+            # A LayerNorm without a weight is normalised with no weight and no bias.
+            if self.weight is not None:
+                weight, bias = cast_bias_weight(self, input)
+            else:
+                weight = None
+                bias = None
+            return torch.nn.functional.layer_norm(input, self.normalized_shape, weight, bias, self.eps)
+
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+
+    class RMSNorm(comfy.rmsnorm.RMSNorm, CastWeightBiasOp):
+        def reset_parameters(self):
+            # cast_bias_weight reads `s.bias`, so the attribute has to exist.
+            self.bias = None
+            return None
+
+        def forward_comfy_cast_weights(self, input):
+            if self.weight is not None:
+                # The returned bias is not used by rms_norm below.
+                weight, bias = cast_bias_weight(self, input)
+            else:
+                weight = None
+            return comfy.rmsnorm.rms_norm(input, weight, self.eps)  # TODO: switch to commented out line when old torch is deprecated
+            # return torch.nn.functional.rms_norm(input, self.normalized_shape, weight, self.eps)
+
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+
+    class ConvTranspose2d(torch.nn.ConvTranspose2d, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+
+        def forward_comfy_cast_weights(self, input, output_size=None):
+            num_spatial_dims = 2
+            output_padding = self._output_padding(
+                input, output_size, self.stride, self.padding, self.kernel_size,
+                num_spatial_dims, self.dilation)
+
+            weight, bias = cast_bias_weight(self, input)
+            return torch.nn.functional.conv_transpose2d(
+                input, weight, bias, self.stride, self.padding,
+                output_padding, self.groups, self.dilation)
+
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+
+    class ConvTranspose1d(torch.nn.ConvTranspose1d, CastWeightBiasOp):
+        def reset_parameters(self):
+            return None
+
+        def forward_comfy_cast_weights(self, input, output_size=None):
+            num_spatial_dims = 1
+            output_padding = self._output_padding(
+                input, output_size, self.stride, self.padding, self.kernel_size,
+                num_spatial_dims, self.dilation)
+
+            weight, bias = cast_bias_weight(self, input)
+            return torch.nn.functional.conv_transpose1d(
+                input, weight, bias, self.stride, self.padding,
+                output_padding, self.groups, self.dilation)
+
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                return super().forward(*args, **kwargs)
+
+    class Embedding(torch.nn.Embedding, CastWeightBiasOp):
+        def reset_parameters(self):
+            # cast_bias_weight reads `s.bias`, so the attribute has to exist.
+            self.bias = None
+            return None
+
+        def forward_comfy_cast_weights(self, input, out_dtype=None):
+            # The result is always converted to the requested out_dtype, but
+            # float16/bfloat16 weights are looked up in their own dtype first.
+            output_dtype = out_dtype
+            if self.weight.dtype == torch.float16 or self.weight.dtype == torch.bfloat16:
+                out_dtype = None
+            weight, bias = cast_bias_weight(self, device=input.device, dtype=out_dtype)
+            return torch.nn.functional.embedding(input, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse).to(dtype=output_dtype)
+
+        def forward(self, *args, **kwargs):
+            if self.comfy_cast_weights or len(self.weight_function) > 0 or len(self.bias_function) > 0:
+                return self.forward_comfy_cast_weights(*args, **kwargs)
+            else:
+                # `out_dtype` is dropped before delegating to the stock forward.
+                if "out_dtype" in kwargs:
+                    kwargs.pop("out_dtype")
+                return super().forward(*args, **kwargs)
+
+    @classmethod
+    def conv_nd(s, dims, *args, **kwargs):
+        # Build this namespace's Conv2d or Conv3d; any other `dims` raises ValueError.
+        if dims == 2:
+            return s.Conv2d(*args, **kwargs)
+        elif dims == 3:
+            return s.Conv3d(*args, **kwargs)
+        else:
+            raise ValueError(f"unsupported dimensions: {dims}")
+
+
+class manual_cast(disable_weight_init):
+    """Variant of ``disable_weight_init`` whose layers set ``comfy_cast_weights = True``.
+
+    With the flag set, each layer's ``forward`` always goes through
+    ``forward_comfy_cast_weights``.
+    """
+    class Linear(disable_weight_init.Linear):
+        comfy_cast_weights = True
+
+    class Conv1d(disable_weight_init.Conv1d):
+        comfy_cast_weights = True
+
+    class Conv2d(disable_weight_init.Conv2d):
+        comfy_cast_weights = True
+
+    class Conv3d(disable_weight_init.Conv3d):
+        comfy_cast_weights = True
+
+    class GroupNorm(disable_weight_init.GroupNorm):
+        comfy_cast_weights = True
+
+    class LayerNorm(disable_weight_init.LayerNorm):
+        comfy_cast_weights = True
+
+    class ConvTranspose2d(disable_weight_init.ConvTranspose2d):
+        comfy_cast_weights = True
+
+    class ConvTranspose1d(disable_weight_init.ConvTranspose1d):
+        comfy_cast_weights = True
+
+    class RMSNorm(disable_weight_init.RMSNorm):
+        comfy_cast_weights = True
+
+    class Embedding(disable_weight_init.Embedding):
+        comfy_cast_weights = True
+
+
+
+from typing import Optional
+class manual_cast_int8_per_channel(manual_cast):
+    """Ops set whose ``Linear`` runs an INT8 GEMM through lmslim's ``quant_ops``.
+
+    Every other layer type is inherited from ``manual_cast``.
+    """
+
+    class Linear(torch.nn.Module):
+        """Linear layer that quantizes its weight to INT8 on the first forward call.
+
+        The weight is quantized symmetrically with one scale per output row;
+        activations are quantized on every call by ``per_token_quant_int8``.
+        The quantized weight and its scales are cached as plain attributes.
+        """
+
+        def __init__(self, in_features, out_features, bias=True, dtype=None, device=None):
+            super().__init__()
+            self.in_features = in_features
+            self.out_features = out_features
+            self.weight = torch.nn.Parameter(torch.empty((out_features, in_features), dtype=dtype, device=device), requires_grad=False)
+            if bias:
+                self.bias = torch.nn.Parameter(torch.empty(out_features, dtype=dtype, device=device))
+            else:
+                self.register_parameter("bias", None)
+
+            # Filled in lazily by forward(), from whatever self.weight holds at that time.
+            self.weight_quant = None
+            self.weight_scale = None
+
+        def blaslt_scaled_mm(self,
+                             a: torch.Tensor,
+                             b: torch.Tensor,
+                             scale_a: torch.Tensor,
+                             scale_b: torch.Tensor,
+                             out_dtype: torch.dtype,
+                             bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+            """Run the INT8 GEMM on ``a`` and ``b`` and add ``bias`` when given.
+
+            ``m`` and ``k`` are read from ``a.shape`` and ``n`` from
+            ``b.shape[0]``, so both operands must be 2-D. Scales are passed to
+            the kernel as float32.
+            """
+            m = a.shape[0]
+            n = b.shape[0]
+            k = a.shape[1]
+            _, out = quant_ops.hipblaslt_w8a8_gemm(a, b, scale_a.to(torch.float32), scale_b.to(torch.float32), m, n, k, 'NT', out_dtype)
+            if bias is not None:
+                out += bias
+            return out
+
+        def weight_quant_int8(self, weight):
+            """Quantize ``weight`` to INT8 with one symmetric scale per row.
+
+            Returns ``(w_q, scales)``: ``w_q`` is int8 with the shape of
+            ``weight`` and ``scales`` is float32 with shape ``(rows, 1)``.
+
+            Raises:
+                ValueError: if the scales contain NaN (i.e. the weight did).
+            """
+            org_w_shape = weight.shape
+            w = weight.to(torch.bfloat16)
+            # The clamp keeps all-zero rows from producing a zero scale.
+            max_val = w.abs().amax(dim=1, keepdim=True).clamp(min=1e-5)
+            qmin, qmax = -128, 127
+            scales = (max_val / qmax).float()
+
+            # Explicit raise instead of assert so the check survives `python -O`.
+            # An int8 tensor cannot hold NaN, so only the scales are checked.
+            if torch.isnan(scales).any():
+                raise ValueError("NaN encountered while computing int8 weight scales")
+
+            w_q = torch.clamp(torch.round(w / scales), qmin, qmax).to(torch.int8)
+
+            scales = scales.view(org_w_shape[0], -1)
+            w_q = w_q.reshape(org_w_shape)
+
+            return w_q, scales
+
+        def forward(self, input):
+            """Apply the layer to ``input`` of shape ``(..., in_features)``.
+
+            All leading dimensions are flattened for the 2-D GEMM and restored
+            on the result, so batched inputs work as well as a leading
+            dimension of 1.
+            """
+            lead_shape = input.shape[:-1]
+            input_2d = input.reshape(-1, input.shape[-1])
+
+            if self.weight_quant is None:
+                self.weight_quant, self.weight_scale = self.weight_quant_int8(self.weight)
+                # Layers built with bias=False have no bias to convert.
+                if self.bias is not None:
+                    self.bias = torch.nn.Parameter(self.bias.to(input.dtype))
+
+            input_quant, input_scale = per_token_quant_int8(input_2d)
+            output_tensor = self.blaslt_scaled_mm(input_quant, self.weight_quant, input_scale, self.weight_scale, input.dtype, self.bias)
+
+            return output_tensor.reshape((*lead_shape, output_tensor.shape[-1]))
+
+class manual_cast_int8(manual_cast):
+    """Ops set whose ``Linear`` holds INT8 weights plus per-row scales as buffers.
+
+    Every other layer type is inherited from ``manual_cast``.
+    """
+
+    class Linear(torch.nn.Module, CastWeightBiasOp):
+        """Linear layer computing an INT8 GEMM through lmslim's ``quant_ops``.
+
+        ``weight`` is an int8 buffer of shape ``(out_features, in_features)``
+        and ``weight_scale`` a float16 buffer of shape ``(out_features, 1)``.
+        Activations are quantized on every call by ``per_token_quant_int8``.
+        """
+        __constants__ = ['in_features', 'out_features']
+        in_features: int
+        out_features: int
+        weight: torch.Tensor
+
+        def __init__(self, in_features: int, out_features: int, bias: bool = True,
+                     device=None, dtype=None) -> None:
+            # `dtype` is accepted for signature compatibility only; the buffer
+            # and bias dtypes below are fixed.
+            super().__init__()
+            # Logged at debug level: this runs once per layer.
+            logging.debug("manual_cast_int8.Linear: int8 weights (%s -> %s)", in_features, out_features)
+            self.in_features = in_features
+            self.out_features = out_features
+            self.register_buffer("weight", torch.empty((out_features, in_features), dtype=torch.int8, device=device))
+            self.register_buffer("weight_scale", torch.empty((out_features, 1), dtype=torch.float16, device=device))
+            if bias:
+                self.bias = torch.nn.Parameter(torch.empty(out_features, dtype=torch.float16, device=device))
+            else:
+                self.register_parameter('bias', None)
+            self.reset_parameters()
+
+        def reset_parameters(self) -> None:
+            """Do nothing; the buffers are left uninitialised."""
+            return None
+
+        def verify_quant_gemm(self, input_q, weight_q, input_scale, weight_scale, out_dtype: torch.dtype,
+                              bias):
+            """CPU reference for ``blaslt_scaled_mm``, used to check the kernel.
+
+            Takes the same arguments and returns the result on the device of
+            ``input_q``, including ``bias`` when one is given.
+            """
+            # Integer GEMM on the CPU, widened to int32 first.
+            y_q = (input_q.cpu().int() @ (weight_q.cpu().int().t()))
+
+            # Dequantize with the outer product of the two scale vectors.
+            y_deq = y_q * ((input_scale * weight_scale.t()).cpu())
+
+            out = y_deq.to(out_dtype).to(input_q.device)
+            if bias is not None:
+                out += bias
+            return out
+
+        def blaslt_scaled_mm(self,
+                        a: torch.Tensor,
+                        b: torch.Tensor,
+                        scale_a: torch.Tensor,
+                        scale_b: torch.Tensor,
+                        out_dtype: torch.dtype,
+                        bias) -> torch.Tensor:
+            """Run the INT8 GEMM on ``a`` and ``b`` and add ``bias`` when given.
+
+            ``m`` and ``k`` are read from ``a.shape`` and ``n`` from
+            ``b.shape[0]``, so both operands must be 2-D.
+            """
+            m = a.shape[0]
+            n = b.shape[0]
+            k = a.shape[1]
+
+            _, output = quant_ops.hipblaslt_w8a8_gemm(a, b, scale_a, scale_b, m, n, k, 'NT', out_dtype)
+            if bias is not None:
+                output += bias
+            return output
+
+        def quantize_symmetric_per_row_int8(self, x: torch.Tensor):
+            """Quantize ``x`` to INT8 symmetrically with one scale per row (dim=1).
+
+            Args:
+                x: tensor of shape [B, N], dtype in {float32, float16, bfloat16}
+
+            Returns:
+                x_q: quantized int8 tensor, shape [B, N]
+                scales: scale per row, shape [B, 1], returned as float32
+            """
+            assert x.ndim == 2, f"Expected 2D input, got {x.shape}"
+            assert x.dtype in [torch.float32, torch.float16, torch.bfloat16]
+
+            # Largest absolute value of each row, shape [B, 1].
+            max_abs = x.abs().amax(dim=1, keepdim=True)
+
+            # scale = max_abs / 127; an all-zero row gets scale 1 to avoid dividing by zero.
+            scales = torch.where(
+                max_abs == 0,
+                torch.tensor(1.0, dtype=x.dtype, device=x.device),
+                max_abs / 127.0
+            )
+
+            # Divide and round in float32 to avoid bfloat16 precision problems.
+            x_f32 = x.to(torch.float32)
+            scales_f32 = scales.to(torch.float32)
+            x_q_f32 = torch.round(x_f32 / scales_f32)
+
+            # Clamp to [-127, 127] and convert to int8.
+            x_q = torch.clamp(x_q_f32, -127, 127).to(torch.int8)
+
+            return x_q, scales_f32
+
+        def forward(self, input_tensor: torch.Tensor):
+            """Apply the layer to ``input_tensor`` of shape ``(..., in_features)``.
+
+            All leading dimensions are flattened for the 2-D GEMM and restored
+            on the result, so batched inputs work as well as a leading
+            dimension of 1.
+            """
+            lead_shape = input_tensor.shape[:-1]
+            dtype = input_tensor.dtype
+            input_2d = input_tensor.reshape(-1, input_tensor.shape[-1])
+
+            input_tensor_quant, input_tensor_scale = per_token_quant_int8(input_2d)
+            output_tensor = self.blaslt_scaled_mm(input_tensor_quant, self.weight, input_tensor_scale, self.weight_scale.to(torch.float32), dtype, self.bias)
+
+            return output_tensor.reshape((*lead_shape, output_tensor.shape[-1]))
+
+        def extra_repr(self) -> str:
+            return f'in_features={self.in_features}, out_features={self.out_features}, bias={self.bias is not None}'
+
+def fp8_linear(self, input):
+    """Try to compute the linear layer with ``torch._scaled_mm`` on fp8 weights.
+
+    Returns ``None`` when the weight is not ``float8_e4m3fn`` or when
+    ``input`` is neither 2-D nor 3-D, so the caller can fall back to another
+    path. Without an input scale, ``input`` is clamped in place to [-448, 448].
+    """
+    weight_dtype = self.weight.dtype
+    if weight_dtype != torch.float8_e4m3fn:
+        return None
+
+    # 2-D inputs are handled as 3-D and flattened back on return.
+    tensor_2d = len(input.shape) == 2
+    if tensor_2d:
+        input = input.unsqueeze(1)
+
+    input_shape = input.shape
+    input_dtype = input.dtype
+    if len(input_shape) != 3:
+        return None
+
+    w, bias = cast_bias_weight(self, input, dtype=weight_dtype, bias_dtype=input_dtype)
+    w = w.t()
+
+    scale_weight = self.scale_weight
+    if scale_weight is None:
+        scale_weight = torch.ones((), device=input.device, dtype=torch.float32)
+    else:
+        scale_weight = scale_weight.to(input.device)
+
+    scale_input = self.scale_input
+    if scale_input is None:
+        scale_input = torch.ones((), device=input.device, dtype=torch.float32)
+        input = torch.clamp(input, min=-448, max=448, out=input)
+        input = input.reshape(-1, input_shape[2]).to(weight_dtype).contiguous()
+    else:
+        scale_input = scale_input.to(input.device)
+        rescaled = input * (1.0 / scale_input).to(input_dtype)
+        input = rescaled.reshape(-1, input_shape[2]).to(weight_dtype).contiguous()
+
+    # The bias keyword is passed only when there is a bias.
+    mm_kwargs = {"out_dtype": input_dtype}
+    if bias is not None:
+        mm_kwargs["bias"] = bias
+    o = torch._scaled_mm(input, w, scale_a=scale_input, scale_b=scale_weight, **mm_kwargs)
+
+    if isinstance(o, tuple):
+        o = o[0]
+
+    if tensor_2d:
+        return o.reshape(input_shape[0], -1)
+
+    return o.reshape((-1, input_shape[1], self.weight.shape[0]))
+
+class fp8_ops(manual_cast):
+    """Ops set whose ``Linear`` tries the fp8 matmul first and falls back to a cast linear."""
+
+    class Linear(manual_cast.Linear):
+        def reset_parameters(self):
+            # No scales by default; fp8_linear substitutes ones for missing scales.
+            self.scale_weight = None
+            self.scale_input = None
+            return None
+
+        def forward_comfy_cast_weights(self, input):
+            result = None
+            try:
+                result = fp8_linear(self, input)
+            except Exception as e:
+                # Best effort: log the failure and use the regular path below.
+                logging.info("Exception during fp8 op: {}".format(e))
+
+            if result is not None:
+                return result
+
+            weight, bias = cast_bias_weight(self, input)
+            return torch.nn.functional.linear(input, weight, bias)
+
+def scaled_fp8_ops(fp8_matrix_mult=False, scale_input=False, override_dtype=None):
+    """Build an ops class whose ``Linear`` multiplies its weight by a stored ``scale_weight``.
+
+    ``fp8_matrix_mult`` enables the ``fp8_linear`` fast path, ``scale_input``
+    controls whether a ``scale_input`` parameter is created, and
+    ``override_dtype`` (when not ``None``) replaces the layer's ``dtype``
+    constructor argument.
+    """
+    logging.info("Using scaled fp8: fp8 matrix mult: {}, scale input: {}".format(fp8_matrix_mult, scale_input))
+
+    class scaled_fp8_op(manual_cast):
+        class Linear(manual_cast.Linear):
+            def __init__(self, *args, **kwargs):
+                if override_dtype is not None:
+                    kwargs['dtype'] = override_dtype
+                super().__init__(*args, **kwargs)
+
+            def reset_parameters(self):
+                def unit_scale():
+                    # Scalar float32 parameter equal to 1, on the weight's device.
+                    return torch.nn.parameter.Parameter(data=torch.ones((), device=self.weight.device, dtype=torch.float32), requires_grad=False)
+
+                if not hasattr(self, 'scale_weight'):
+                    self.scale_weight = unit_scale()
+
+                if not scale_input:
+                    self.scale_input = None
+
+                if not hasattr(self, 'scale_input'):
+                    self.scale_input = unit_scale()
+                return None
+
+            def forward_comfy_cast_weights(self, input):
+                if fp8_matrix_mult:
+                    fast = fp8_linear(self, input)
+                    if fast is not None:
+                        return fast
+
+                weight, bias = cast_bias_weight(self, input)
+                scale = self.scale_weight.to(device=weight.device, dtype=weight.dtype)
+
+                # Apply the scale to whichever operand has fewer elements. #TODO: optimize
+                if weight.numel() < input.numel():
+                    return torch.nn.functional.linear(input, weight * scale, bias)
+                return torch.nn.functional.linear(input * scale, weight, bias)
+
+            def convert_weight(self, weight, inplace=False, **kwargs):
+                scale = self.scale_weight.to(device=weight.device, dtype=weight.dtype)
+                if not inplace:
+                    return weight * scale
+                weight *= scale
+                return weight
+
+            def set_weight(self, weight, inplace_update=False, seed=None, **kwargs):
+                scale = self.scale_weight.to(device=weight.device, dtype=weight.dtype)
+                weight = comfy.float.stochastic_rounding(weight / scale, self.weight.dtype, seed=seed)
+                if inplace_update:
+                    self.weight.data.copy_(weight)
+                else:
+                    self.weight = torch.nn.Parameter(weight, requires_grad=False)
+
+    return scaled_fp8_op
+
+# Optional dependency probe: the flag records whether CublasLinear could be imported.
+try:
+    from cublas_ops import CublasLinear
+except ImportError:
+    CUBLAS_IS_AVAILABLE = False
+else:
+    CUBLAS_IS_AVAILABLE = True
+
+# Defined only when the optional cublas_ops package was imported successfully.
+if CUBLAS_IS_AVAILABLE:
+    class cublas_ops(disable_weight_init):
+        """Ops set whose ``Linear`` is built on ``CublasLinear``."""
+        class Linear(CublasLinear, disable_weight_init.Linear):
+            def reset_parameters(self):
+                return None
+
+            # Both entry points delegate to the next forward() in the MRO;
+            # CublasLinear is listed first among the bases.
+            def forward_comfy_cast_weights(self, input):
+                return super().forward(input)
+
+            def forward(self, *args, **kwargs):
+                return super().forward(*args, **kwargs)
+
+def pick_operations(weight_dtype, compute_dtype, load_device=None, disable_fast_fp8=False, fp8_optimizations=False, scaled_fp8=None, int8_optimizations=None):
+    """Choose the ops class to build a model with.
+
+    Checked in order: int8 ops, scaled fp8 ops, fast fp8 ops, cublas ops,
+    then ``disable_weight_init`` when no cast is needed and ``manual_cast``
+    otherwise.
+    """
+    if int8_optimizations:
+        return manual_cast_int8_per_channel
+
+    fp8_compute = comfy.model_management.supports_fp8_compute(load_device)
+    if scaled_fp8 is not None:
+        return scaled_fp8_ops(fp8_matrix_mult=fp8_compute and fp8_optimizations, scale_input=fp8_optimizations, override_dtype=scaled_fp8)
+
+    use_fast_fp8 = (
+        fp8_compute
+        and (fp8_optimizations or PerformanceFeature.Fp8MatrixMultiplication in args.fast)
+        and not disable_fast_fp8
+    )
+    if use_fast_fp8:
+        return fp8_ops
+
+    use_cublas = (
+        PerformanceFeature.CublasOps in args.fast
+        and CUBLAS_IS_AVAILABLE
+        and weight_dtype == torch.float16
+        and (compute_dtype == torch.float16 or compute_dtype is None)
+    )
+    if use_cublas:
+        logging.info("Using cublas ops")
+        return cublas_ops
+
+    # No cast is needed when weights are already stored in the compute dtype.
+    no_cast_needed = compute_dtype is None or weight_dtype == compute_dtype
+    return disable_weight_init if no_cast_needed else manual_cast
--- a/comfy/options.py
+++ b/comfy/options.py
+
+# Module-level flag; changed through enable_args_parsing().
+args_parsing = False
+
+def enable_args_parsing(enable=True):
+    """Set the module-level ``args_parsing`` flag to ``enable``."""
+    global args_parsing
+    args_parsing = enable
--- a/comfy/patcher_extension.py
+++ b/comfy/patcher_extension.py
+from __future__ import annotations
+from typing import Callable
+
+class CallbacksMP:
+    """String constants naming the callback types, plus a factory for an empty callbacks dict."""
+    ON_CLONE = "on_clone"
+    ON_LOAD = "on_load_after"
+    ON_DETACH = "on_detach_after"
+    ON_CLEANUP = "on_cleanup"
+    ON_PRE_RUN = "on_pre_run"
+    ON_PREPARE_STATE = "on_prepare_state"
+    ON_APPLY_HOOKS = "on_apply_hooks"
+    ON_REGISTER_ALL_HOOK_PATCHES = "on_register_all_hook_patches"
+    ON_INJECT_MODEL = "on_inject_model"
+    ON_EJECT_MODEL = "on_eject_model"
+
+    # callbacks dict is in the format:
+    # {"call_type": {"key": [Callable1, Callable2, ...]} }
+    @classmethod
+    def init_callbacks(cls) -> dict[str, dict[str, list[Callable]]]:
+        """Return a new, empty callbacks dict."""
+        return {}
+
+def add_callback(call_type: str, callback: Callable, transformer_options: dict, is_model_options=False):
+    """Register ``callback`` for ``call_type`` under the ``None`` key."""
+    add_callback_with_key(
+        call_type,
+        None,
+        callback,
+        transformer_options,
+        is_model_options,
+    )
+
+def add_callback_with_key(call_type: str, key: str, callback: Callable, transformer_options: dict, is_model_options=False):
+    """Append ``callback`` to the list stored at ``callbacks[call_type][key]``.
+
+    Missing intermediate dicts and the list are created in place. When
+    ``is_model_options`` is true, the dict passed in is treated as model
+    options and its ``"transformer_options"`` entry (created if absent) is used.
+    """
+    target = transformer_options
+    if is_model_options:
+        target = target.setdefault("transformer_options", {})
+    per_type = target.setdefault("callbacks", {}).setdefault(call_type, {})
+    per_type.setdefault(key, []).append(callback)
+
+def get_callbacks_with_key(call_type: str, key: str, transformer_options: dict, is_model_options=False):
+    """Return a new list of the callbacks stored at ``callbacks[call_type][key]``.
+
+    Any missing level yields an empty list. When ``is_model_options`` is
+    true, the lookup starts from the ``"transformer_options"`` entry.
+    """
+    source = transformer_options.get("transformer_options", {}) if is_model_options else transformer_options
+    per_type = source.get("callbacks", {}).get(call_type, {})
+    return list(per_type.get(key, []))
+
+def get_all_callbacks(call_type: str, transformer_options: dict, is_model_options=False):
+    """Return every callback registered for ``call_type``, across all keys.
+
+    The lists stored under each key are concatenated in the dict's
+    iteration order. When ``is_model_options`` is true, the lookup starts
+    from the ``"transformer_options"`` entry.
+    """
+    source = transformer_options.get("transformer_options", {}) if is_model_options else transformer_options
+    per_type = source.get("callbacks", {}).get(call_type, {})
+    return [callback for group in per_type.values() for callback in group]
+
+class WrappersMP:
+    """String constants naming the wrapper types, plus a factory for an empty wrappers dict."""
+    OUTER_SAMPLE = "outer_sample"
+    PREPARE_SAMPLING = "prepare_sampling"
+    SAMPLER_SAMPLE = "sampler_sample"
+    PREDICT_NOISE = "predict_noise"
+    CALC_COND_BATCH = "calc_cond_batch"
+    APPLY_MODEL = "apply_model"
+    DIFFUSION_MODEL = "diffusion_model"
+
+    # wrappers dict is in the format:
+    # {"wrapper_type": {"key": [Callable1, Callable2, ...]} }
+    @classmethod
+    def init_wrappers(cls) -> dict[str, dict[str, list[Callable]]]:
+        """Return a new, empty wrappers dict."""
+        return {}
+
+def add_wrapper(wrapper_type: str, wrapper: Callable, transformer_options: dict, is_model_options=False):
+    """Register ``wrapper`` for ``wrapper_type`` under the ``None`` key."""
+    add_wrapper_with_key(
+        wrapper_type,
+        None,
+        wrapper,
+        transformer_options,
+        is_model_options,
+    )
+
+def add_wrapper_with_key(wrapper_type: str, key: str, wrapper: Callable, transformer_options: dict, is_model_options=False):
+    """Append ``wrapper`` to the list stored at ``wrappers[wrapper_type][key]``.
+
+    Missing intermediate dicts and the list are created in place. When
+    ``is_model_options`` is true, the dict passed in is treated as model
+    options and its ``"transformer_options"`` entry (created if absent) is used.
+    """
+    target = transformer_options
+    if is_model_options:
+        target = target.setdefault("transformer_options", {})
+    per_type = target.setdefault("wrappers", {}).setdefault(wrapper_type, {})
+    per_type.setdefault(key, []).append(wrapper)
+
+def get_wrappers_with_key(wrapper_type: str, key: str, transformer_options: dict, is_model_options=False):
+    """Return a new list of the wrappers stored at ``wrappers[wrapper_type][key]``.
+
+    Any missing level yields an empty list. When ``is_model_options`` is
+    true, the lookup starts from the ``"transformer_options"`` entry.
+    """
+    source = transformer_options.get("transformer_options", {}) if is_model_options else transformer_options
+    per_type = source.get("wrappers", {}).get(wrapper_type, {})
+    return list(per_type.get(key, []))
+
+def get_all_wrappers(wrapper_type: str, transformer_options: dict, is_model_options=False):
+    """Return every wrapper registered for ``wrapper_type``, across all keys.
+
+    The lists stored under each key are concatenated in the dict's
+    iteration order. When ``is_model_options`` is true, the lookup starts
+    from the ``"transformer_options"`` entry.
+    """
+    source = transformer_options.get("transformer_options", {}) if is_model_options else transformer_options
+    per_type = source.get("wrappers", {}).get(wrapper_type, {})
+    return [wrapper for group in per_type.values() for wrapper in group]
+
+class WrapperExecutor:
+    """Runs an ordered chain of wrappers around a callable.
+
+    Each executor holds a position ``idx`` in the wrapper list. Executing it
+    calls ``wrappers[idx]`` with the executor itself as first argument; the
+    wrapper continues the chain by calling that executor, which hands over
+    to an executor for ``idx + 1``. Once ``idx`` equals the number of
+    wrappers, the original callable is invoked.
+    """
+    def __init__(self, original: Callable, class_obj: object, wrappers: list[Callable], idx: int):
+        # NOTE: class_obj is kept so that wrappers placed around a class method
+        #       can reach the instance at runtime through executor.class_obj
+        self.original = original
+        self.class_obj = class_obj
+        self.wrappers = wrappers.copy()
+        self.idx = idx
+        # True once every wrapper has been passed; execute() then calls original.
+        self.is_last = len(wrappers) == idx
+
+    def __call__(self, *args, **kwargs):
+        """Continue the chain: run the next wrapper, or the original if none remain."""
+        return self._create_next_executor().execute(*args, **kwargs)
+
+    def execute(self, *args, **kwargs):
+        """Start the chain at this executor's position - DO NOT call this from inside a wrapper."""
+        call_args = list(args)
+        call_kwargs = dict(kwargs)
+        if not self.is_last:
+            return self.wrappers[self.idx](self, *call_args, **call_kwargs)
+        return self.original(*call_args, **call_kwargs)
+
+    def _create_next_executor(self) -> 'WrapperExecutor':
+        """Build the executor for the following position, keeping class_obj if one is set."""
+        next_idx = self.idx + 1
+        if next_idx > len(self.wrappers):
+            raise Exception("Wrapper idx exceeded available wrappers; something went very wrong.")
+        if self.class_obj is not None:
+            return WrapperExecutor.new_class_executor(self.original, self.class_obj, self.wrappers, next_idx)
+        return WrapperExecutor.new_executor(self.original, self.wrappers, next_idx)
+
+    @classmethod
+    def new_executor(cls, original: Callable, wrappers: list[Callable], idx=0):
+        """Create an executor for a plain callable (class_obj is None)."""
+        return cls(original, class_obj=None, wrappers=wrappers, idx=idx)
+
+    @classmethod
+    def new_class_executor(cls, original: Callable, class_obj: object, wrappers: list[Callable], idx=0):
+        """Create an executor that also carries ``class_obj`` for the wrappers."""
+        return cls(original, class_obj, wrappers, idx=idx)
+
+class PatcherInjection:
+    """Pairs an ``inject`` callable with its matching ``eject`` callable."""
+    def __init__(self, inject: Callable, eject: Callable):
+        # Both callables are stored as given; nothing is invoked here.
+        self.inject = inject
+        self.eject = eject
+
+def copy_nested_dicts(input_dict: dict):
+    """Copy a dict, recursing into dict values and shallow-copying list values.
+
+    Values of any other type are shared with the input. Elements inside
+    the copied lists are shared as well.
+    """
+    duplicate = input_dict.copy()
+    duplicate.update({
+        name: copy_nested_dicts(item) if isinstance(item, dict) else item.copy()
+        for name, item in input_dict.items()
+        if isinstance(item, (dict, list))
+    })
+    return duplicate
+
+def merge_nested_dicts(dict1: dict, dict2: dict, copy_dict1=True):
+    """Merge ``dict2`` into ``dict1`` and return the result.
+
+    With ``copy_dict1`` true the merge works on a nested copy of ``dict1``;
+    otherwise ``dict1`` itself is modified and returned.
+
+    For each entry of ``dict2``:
+    - list values are appended to the list already stored (created if absent);
+    - dict values are merged recursively. The recursive call copies the
+      ``dict2`` value and merges the existing value into it, so inside nested
+      dicts the ``dict2`` list items come first and the existing scalars win;
+    - any other value overwrites the existing entry.
+    """
+    merged = copy_nested_dicts(dict1) if copy_dict1 else dict1
+    for name, incoming in dict2.items():
+        if isinstance(incoming, list):
+            merged.setdefault(name, []).extend(incoming)
+        elif isinstance(incoming, dict):
+            existing = merged.setdefault(name, {})
+            # Argument order matters: incoming is the base, existing is merged in.
+            merged[name] = merge_nested_dicts(incoming, existing)
+        else:
+            merged[name] = incoming
+    return merged
--- a/comfy/rmsnorm.py
+++ b/comfy/rmsnorm.py
+import torch
+import comfy.model_management
+import numbers
+import logging
+
+# Prefer PyTorch's native RMSNorm when this torch build provides it.
+# RMSNorm stays None otherwise, which enables the fallback class defined
+# later in this module.
+RMSNorm = None
+
+try:
+    rms_norm_torch = torch.nn.functional.rms_norm
+    RMSNorm = torch.nn.RMSNorm
+except AttributeError:
+    # A missing attribute is the only expected failure here. A bare
+    # ``except:`` would also swallow KeyboardInterrupt and SystemExit.
+    rms_norm_torch = None
+    logging.warning("Please update pytorch to use native RMSNorm")
+
+
+def rms_norm(x, weight=None, eps=1e-6):
+    """Apply RMS normalisation to ``x``, optionally scaled by ``weight``.
+
+    The native torch implementation is used when it is available and the
+    code is not being traced or scripted by torch.jit. Otherwise the
+    result is computed manually over the last dimension. ``weight``, when
+    given, is cast to the dtype and device of ``x`` before use.
+    """
+    use_native = (
+        rms_norm_torch is not None
+        and not torch.jit.is_tracing()
+        and not torch.jit.is_scripting()
+    )
+    if use_native:
+        if weight is None:
+            return rms_norm_torch(x, (x.shape[-1],), eps=eps)
+        cast_weight = comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device)
+        return rms_norm_torch(x, weight.shape, weight=cast_weight, eps=eps)
+
+    # Manual fallback: x / sqrt(mean(x^2) + eps) over the last dimension.
+    normalized = x * torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
+    if weight is None:
+        return normalized
+    return normalized * comfy.model_management.cast_to(weight, dtype=x.dtype, device=x.device)
+
+
+if RMSNorm is None:
+    class RMSNorm(torch.nn.Module):
+        """Fallback RMSNorm module, defined only when the native class was not found.
+
+        The weight parameter is allocated with ``torch.empty`` and is not
+        initialised here.
+        """
+        def __init__(
+            self,
+            normalized_shape,
+            eps=1e-6,
+            elementwise_affine=True,
+            device=None,
+            dtype=None,
+        ):
+            super().__init__()
+            # Accept either a single integer or an iterable of dimensions.
+            if isinstance(normalized_shape, numbers.Integral):
+                shape = (normalized_shape,)
+            else:
+                shape = tuple(normalized_shape)
+            self.normalized_shape = shape
+            self.eps = eps
+            self.elementwise_affine = elementwise_affine
+            if not self.elementwise_affine:
+                self.register_parameter("weight", None)
+            else:
+                self.weight = torch.nn.Parameter(
+                    torch.empty(self.normalized_shape, device=device, dtype=dtype)
+                )
+            self.bias = None
+
+        def forward(self, x):
+            """Normalise ``x`` with rms_norm using this module's weight and eps."""
+            return rms_norm(x, self.weight, self.eps)
--- a/comfy/sample.py
+++ b/comfy/sample.py
+import torch
+import comfy.model_management
+import comfy.samplers
+import comfy.utils
+import numpy as np
+import logging
+
+def prepare_noise(latent_image, seed, noise_inds=None):
+    """
+    Create CPU noise shaped like a latent image from a seed.
+
+    Without noise_inds, a single tensor with the latent's full size is drawn.
+    With noise_inds, one single-sample tensor is drawn for every index from 0
+    up to the largest requested index. Only the draws whose index was
+    requested are kept, and the result holds one entry per element of
+    noise_inds, in that order, concatenated along dim 0.
+    """
+    generator = torch.manual_seed(seed)
+    randn_kwargs = dict(dtype=latent_image.dtype, layout=latent_image.layout, generator=generator, device="cpu")
+    if noise_inds is None:
+        return torch.randn(latent_image.size(), **randn_kwargs)
+
+    unique_inds, inverse = np.unique(noise_inds, return_inverse=True)
+    sample_shape = [1] + list(latent_image.size())[1:]
+    kept = []
+    for i in range(unique_inds[-1]+1):
+        # Draw for every index, requested or not, so the generator advances identically.
+        drawn = torch.randn(sample_shape, **randn_kwargs)
+        if i in unique_inds:
+            kept.append(drawn)
+    return torch.cat([kept[j] for j in inverse], axis=0)
+
+def fix_empty_latent_channels(model, latent_image):
+    """Adapt an all-zero latent to the channel count and rank of the model's latent format.
+
+    The channel dimension is only resized when it differs from the format's
+    latent_channels and the latent contains no non-zero values. A 4-D
+    latent gets an extra dimension at index 2 when the format has
+    latent_dimensions == 3.
+    """
+    latent_format = model.get_model_object("latent_format")
+    channels_differ = latent_format.latent_channels != latent_image.shape[1]
+    # Resize the empty latent image so it has the right number of channels
+    if channels_differ and torch.count_nonzero(latent_image) == 0:
+        latent_image = comfy.utils.repeat_to_batch_size(latent_image, latent_format.latent_channels, dim=1)
+    needs_extra_dim = latent_format.latent_dimensions == 3 and latent_image.ndim == 4
+    return latent_image.unsqueeze(2) if needs_extra_dim else latent_image
+
+def prepare_sampling(model, noise_shape, positive, negative, noise_mask):
+    """Deprecated stub: logs a warning and returns its inputs plus an empty list.
+
+    ``noise_shape`` is accepted but not used.
+    """
+    logging.warning("Warning: comfy.sample.prepare_sampling isn't used anymore and can be removed")
+    empty_models = []
+    return model, positive, negative, noise_mask, empty_models
+
+def cleanup_additional_models(models):
+    """Deprecated stub: only logs a warning; ``models`` is not used."""
+    logging.warning("Warning: comfy.sample.cleanup_additional_models isn't used anymore and can be removed")
+
+def sample(model, noise, steps, cfg, sampler_name, scheduler, positive, negative, latent_image, denoise=1.0, disable_noise=False, start_step=None, last_step=None, force_full_denoise=False, noise_mask=None, sigmas=None, callback=None, disable_pbar=False, seed=None):
+    """Run a KSampler built for ``model`` and return the samples on the intermediate device.
+
+    ``disable_noise`` is accepted but not used in this function.
+    """
+    ksampler = comfy.samplers.KSampler(
+        model,
+        steps=steps,
+        device=model.load_device,
+        sampler=sampler_name,
+        scheduler=scheduler,
+        denoise=denoise,
+        model_options=model.model_options,
+    )
+    result = ksampler.sample(
+        noise,
+        positive,
+        negative,
+        cfg=cfg,
+        latent_image=latent_image,
+        start_step=start_step,
+        last_step=last_step,
+        force_full_denoise=force_full_denoise,
+        denoise_mask=noise_mask,
+        sigmas=sigmas,
+        callback=callback,
+        disable_pbar=disable_pbar,
+        seed=seed,
+    )
+    return result.to(comfy.model_management.intermediate_device())
+
+def sample_custom(model, noise, cfg, sampler, sigmas, positive, negative, latent_image, noise_mask=None, callback=None, disable_pbar=False, seed=None):
+    """Sample with an explicit sampler and sigmas; return the samples on the intermediate device."""
+    result = comfy.samplers.sample(
+        model,
+        noise,
+        positive,
+        negative,
+        cfg,
+        model.load_device,
+        sampler,
+        sigmas,
+        model_options=model.model_options,
+        latent_image=latent_image,
+        denoise_mask=noise_mask,
+        callback=callback,
+        disable_pbar=disable_pbar,
+        seed=seed,
+    )
+    return result.to(comfy.model_management.intermediate_device())
--- a/comfy/sampler_helpers.py
+++ b/comfy/sampler_helpers.py
+from __future__ import annotations
+import uuid
+import math
+import collections
+import comfy.model_management
+import comfy.conds
+import comfy.utils
+import comfy.hooks
+import comfy.patcher_extension
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from comfy.model_patcher import ModelPatcher
+    from comfy.model_base import BaseModel
+    from comfy.controlnet import ControlBase
+
+def prepare_mask(noise_mask, shape, device):
+    """Reshape ``noise_mask`` to ``shape`` with comfy.utils.reshape_mask and move it to ``device``."""
+    reshaped = comfy.utils.reshape_mask(noise_mask, shape)
+    return reshaped.to(device)
+
+def get_models_from_cond(cond, model_type):
+    """Collect the values stored under ``model_type`` in each entry of ``cond``.
+
+    List values are flattened into the result. Any other value is
+    appended as a single item. Entries lacking the key are skipped.
+    """
+    found = []
+    for entry in cond:
+        if model_type not in entry:
+            continue
+        value = entry[model_type]
+        if isinstance(value, list):
+            found.extend(value)
+        else:
+            found.append(value)
+    return found
+
+def get_hooks_from_cond(cond, full_hooks: comfy.hooks.HookGroup):
+    """Add every hook found in ``cond`` to ``full_hooks`` and return it.
+
+    Hooks come from two places: the 'hooks' entry of each cond, and the
+    extra_hooks of each distinct 'control' object together with the chain
+    reachable through its previous_controlnet attribute.
+    """
+    # get hooks from conds, and collect cnets so they can be checked for extra_hooks
+    controlnets: list[ControlBase] = []
+    for entry in cond:
+        if 'hooks' in entry:
+            for hook in entry['hooks'].hooks:
+                full_hooks.add(hook)
+        if 'control' in entry:
+            controlnets.append(entry['control'])
+
+    collected = []
+    for base_cnet in set(controlnets):
+        # Walk the chain of previous controlnets, gathering their extra_hooks.
+        node = base_cnet
+        while node is not None:
+            if node.extra_hooks is not None:
+                collected.append(node.extra_hooks)
+            node = node.previous_controlnet
+
+    combined = comfy.hooks.HookGroup.combine_all_hooks(collected)
+    if combined is not None:
+        for hook in combined.hooks:
+            full_hooks.add(hook)
+
+    return full_hooks
+
+def convert_cond(cond):
+    """Turn ``[value, options]`` cond pairs into flat option dicts.
+
+    Each result is a shallow copy of the pair's options dict, with
+    "cross_attn" set to the first element when it is not None,
+    "model_conds" guaranteed to be present, and a fresh "uuid".
+    """
+    converted = []
+    for pair in cond:
+        cross_attn = pair[0]
+        entry = pair[1].copy()
+        model_conds = entry.get("model_conds", {})
+        if cross_attn is not None:
+            entry["cross_attn"] = cross_attn
+        entry["model_conds"] = model_conds
+        entry["uuid"] = uuid.uuid4()
+        converted.append(entry)
+    return converted
+
+def get_additional_models(conds, dtype):
+    """Gather the additional models referenced by the conditioning.
+
+    Returns ``(models, inference_memory)``. ``models`` holds the models of
+    each distinct control object, then the second element of every gligen
+    entry, then the "additional_models" entries. ``inference_memory`` is the
+    sum of the control objects' inference_memory_requirements(dtype).
+    """
+    controlnets: list[ControlBase] = []
+    gligen_entries = []
+    extra_models = []
+    for cond_list in conds.values():
+        controlnets += get_models_from_cond(cond_list, "control")
+        gligen_entries += get_models_from_cond(cond_list, "gligen")
+        extra_models += get_models_from_cond(cond_list, "additional_models")
+
+    inference_memory = 0
+    control_models = []
+    # De-duplicate so a shared control object is only counted once.
+    for cnet in set(controlnets):
+        control_models += cnet.get_models()
+        inference_memory += cnet.inference_memory_requirements(dtype)
+
+    gligen_models = [entry[1] for entry in gligen_entries]
+    return control_models + gligen_models + extra_models, inference_memory
+
+def get_additional_models_from_model_options(model_options: dict[str]=None):
+    """Return the models of all AdditionalModels hooks in model_options["registered_hooks"].
+
+    An empty list is returned when ``model_options`` is None or has no
+    "registered_hooks" entry.
+    """
+    if model_options is None or "registered_hooks" not in model_options:
+        return []
+    registered = model_options["registered_hooks"]
+    collected = []
+    for hook in registered.get_type(comfy.hooks.EnumHookType.AdditionalModels):
+        collected.extend(hook.models)
+    return collected
+
+def cleanup_additional_models(models):
+    """Call ``cleanup()`` on every model that has such an attribute."""
+    for candidate in models:
+        if not hasattr(candidate, 'cleanup'):
+            continue
+        candidate.cleanup()
+
+def estimate_memory(model, noise_shape, conds):
+    """Return ``(memory_required, minimum_memory_required)`` for sampling.
+
+    The first value is computed for twice the batch size with every extra
+    cond shape. The second is computed for the plain batch size with one
+    shape per cond key. Despite the name, ``cond_shapes_min`` keeps the
+    shape with the largest element product seen for each key.
+    """
+    cond_shapes = collections.defaultdict(list)
+    cond_shapes_min = {}
+    for cond_list in conds.values():
+        for cond in cond_list:
+            for name, shape in model.model.extra_conds_shapes(**cond).items():
+                cond_shapes[name].append(shape)
+                current = cond_shapes_min.get(name, None)
+                if current is None or math.prod(shape) > math.prod(current[0]):
+                    cond_shapes_min[name] = [shape]
+
+    trailing_dims = list(noise_shape[1:])
+    doubled_batch = [noise_shape[0] * 2] + trailing_dims
+    single_batch = [noise_shape[0]] + trailing_dims
+    memory_required = model.model.memory_required(doubled_batch, cond_shapes=cond_shapes)
+    minimum_memory_required = model.model.memory_required(single_batch, cond_shapes=cond_shapes_min)
+    return memory_required, minimum_memory_required
+
+def prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None):
+    """Run _prepare_sampling through any registered PREPARE_SAMPLING wrappers.
+
+    Wrappers are looked up in ``model_options["transformer_options"]``.
+    ``model_options`` may be left as None; the wrapper lookup then sees an
+    empty dict instead of failing on ``None.get``. The original value (None
+    included) is still what gets passed on to the wrapped call.
+
+    Returns whatever the wrapper chain returns; without wrappers that is the
+    ``(real_model, conds, models)`` tuple from _prepare_sampling.
+    """
+    lookup_options = model_options if model_options is not None else {}
+    executor = comfy.patcher_extension.WrapperExecutor.new_executor(
+        _prepare_sampling,
+        comfy.patcher_extension.get_all_wrappers(comfy.patcher_extension.WrappersMP.PREPARE_SAMPLING, lookup_options, is_model_options=True)
+    )
+    return executor.execute(model, noise_shape, conds, model_options=model_options)
+
+def _prepare_sampling(model: ModelPatcher, noise_shape, conds, model_options=None):
+    """Load the model and its additional models, then return ``(real_model, conds, models)``.
+
+    Additional models come from the conds, from registered hooks in
+    ``model_options`` and from the model's nested additional models. The
+    inference memory reported for the cond models is added to both the
+    required and the minimum required memory passed to load_models_gpu.
+    """
+    extra_models, inference_memory = get_additional_models(conds, model.model_dtype())
+    extra_models += get_additional_models_from_model_options(model_options)
+    extra_models += model.get_nested_additional_models()  # TODO: does this require inference_memory update?
+    memory_required, minimum_memory_required = estimate_memory(model, noise_shape, conds)
+    comfy.model_management.load_models_gpu(
+        [model] + extra_models,
+        memory_required=memory_required + inference_memory,
+        minimum_memory_required=minimum_memory_required + inference_memory,
+    )
+    # model.model is read only after the models have been loaded.
+    real_model: BaseModel = model.model
+    return real_model, conds, extra_models
+
+def cleanup_models(conds, models):
+    """Clean up ``models`` and then every distinct control object referenced by ``conds``."""
+    cleanup_additional_models(models)
+
+    control_cleanup = [
+        cnet
+        for cond_list in conds.values()
+        for cnet in get_models_from_cond(cond_list, "control")
+    ]
+    # De-duplicate so a shared control object is cleaned up only once.
+    cleanup_additional_models(set(control_cleanup))
+
+def prepare_model_patcher(model: ModelPatcher, conds, model_options: dict):
+    '''
+    Registers hooks from conds.
+
+    Collects hooks from every cond list, merges the ModelPatcher's wrappers and
+    callbacks into model_options["transformer_options"], registers the hooks,
+    and returns model_options["to_load_options"] with those wrappers and
+    callbacks merged in. model_options is modified in place.
+    '''
+    # check for hooks in conds - if not registered, see if can be applied
+    hooks = comfy.hooks.HookGroup()
+    for k in conds:
+        get_hooks_from_cond(conds[k], hooks)
+    # add wrappers and callbacks from ModelPatcher to transformer_options
+    # (copy_dict1=False: the dicts inside transformer_options are updated in place)
+    comfy.patcher_extension.merge_nested_dicts(model_options["transformer_options"].setdefault("wrappers", {}), model.wrappers, copy_dict1=False)
+    comfy.patcher_extension.merge_nested_dicts(model_options["transformer_options"].setdefault("callbacks", {}), model.callbacks, copy_dict1=False)
+    # begin registering hooks
+    registered = comfy.hooks.HookGroup()
+    target_dict = comfy.hooks.create_target_dict(comfy.hooks.EnumWeightTarget.Model)
+    # handle all TransformerOptionsHooks
+    for hook in hooks.get_type(comfy.hooks.EnumHookType.TransformerOptions):
+        hook: comfy.hooks.TransformerOptionsHook
+        hook.add_hook_patches(model, model_options, target_dict, registered)
+    # handle all AddModelsHooks
+    for hook in hooks.get_type(comfy.hooks.EnumHookType.AdditionalModels):
+        hook: comfy.hooks.AdditionalModelsHook
+        hook.add_hook_patches(model, model_options, target_dict, registered)
+    # handle all WeightHooks by registering on ModelPatcher
+    model.register_all_hook_patches(hooks, target_dict, model_options, registered)
+    # add registered_hooks onto model_options for further reference
+    # (only set when at least one hook was registered)
+    if len(registered) > 0:
+        model_options["registered_hooks"] = registered
+    # merge original wrappers and callbacks with hooked wrappers and callbacks
+    to_load_options: dict[str] = model_options.setdefault("to_load_options", {})
+    for wc_name in ["wrappers", "callbacks"]:
+        comfy.patcher_extension.merge_nested_dicts(to_load_options.setdefault(wc_name, {}), model_options["transformer_options"][wc_name],
+                                                    copy_dict1=False)
+    return to_load_options