Initial commit.

220afe33 · comfyanonymous · 220afe33 · 220afe33 · 220afe33 · 220afe33
Commit 220afe33 authored Jan 03, 2023 by comfyanonymous
20 changed files
--- a/comfy/ldm/modules/midas/__init__.py
+++ b/comfy/ldm/modules/midas/__init__.py
--- a/comfy/ldm/modules/midas/api.py
+++ b/comfy/ldm/modules/midas/api.py
+# based on https://github.com/isl-org/MiDaS
+import cv2
+import torch
+import torch.nn as nn
+from torchvision.transforms import Compose
+from ldm.modules.midas.midas.dpt_depth import DPTDepthModel
+from ldm.modules.midas.midas.midas_net import MidasNet
+from ldm.modules.midas.midas.midas_net_custom import MidasNet_small
+from ldm.modules.midas.midas.transforms import Resize, NormalizeImage, PrepareForNet
+ISL_PATHS = {
+    "dpt_large": "midas_models/dpt_large-midas-2f21e586.pt",
+    "dpt_hybrid": "midas_models/dpt_hybrid-midas-501f0c75.pt",
+    "midas_v21": "",
+    "midas_v21_small": "",
+}
+def disabled_train(self, mode=True):
+    """Overwrite model.train with this function to make sure train/eval mode
+    does not change anymore."""
+    return self
+def load_midas_transform(model_type):
+    # https://github.com/isl-org/MiDaS/blob/master/run.py
+    # load transform only
+    if model_type == "dpt_large":  # DPT-Large
+        net_w, net_h = 384, 384
+        resize_mode = "minimal"
+        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+    elif model_type == "dpt_hybrid":  # DPT-Hybrid
+        net_w, net_h = 384, 384
+        resize_mode = "minimal"
+        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+    elif model_type == "midas_v21":
+        net_w, net_h = 384, 384
+        resize_mode = "upper_bound"
+        normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    elif model_type == "midas_v21_small":
+        net_w, net_h = 256, 256
+        resize_mode = "upper_bound"
+        normalization = NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+    else:
+        assert False, f"model_type '{model_type}' not implemented, use: --model_type large"
+    transform = Compose(
+        [
+            Resize(
+                net_w,
+                net_h,
+                resize_target=None,
+                keep_aspect_ratio=True,
+                ensure_multiple_of=32,
+                resize_method=resize_mode,
+                image_interpolation_method=cv2.INTER_CUBIC,
+            ),
+            normalization,
+            PrepareForNet(),
+        ]
+    )
+    return transform
+def load_model(model_type):
+    # https://github.com/isl-org/MiDaS/blob/master/run.py
+    # load network
+    model_path = ISL_PATHS[model_type]
+    if model_type == "dpt_large":  # DPT-Large
+        model = DPTDepthModel(
+            path=model_path,
+            backbone="vitl16_384",
+            non_negative=True,
+        )
+        net_w, net_h = 384, 384
+        resize_mode = "minimal"
+        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+    elif model_type == "dpt_hybrid":  # DPT-Hybrid
+        model = DPTDepthModel(
+            path=model_path,
+            backbone="vitb_rn50_384",
+            non_negative=True,
+        )
+        net_w, net_h = 384, 384
+        resize_mode = "minimal"
+        normalization = NormalizeImage(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+    elif model_type == "midas_v21":
+        model = MidasNet(model_path, non_negative=True)
+        net_w, net_h = 384, 384
+        resize_mode = "upper_bound"
+        normalization = NormalizeImage(
+            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+        )
+    elif model_type == "midas_v21_small":
+        model = MidasNet_small(model_path, features=64, backbone="efficientnet_lite3", exportable=True,
+                               non_negative=True, blocks={'expand': True})
+        net_w, net_h = 256, 256
+        resize_mode = "upper_bound"
+        normalization = NormalizeImage(
+            mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]
+        )
+    else:
+        print(f"model_type '{model_type}' not implemented, use: --model_type large")
+        assert False
+    transform = Compose(
+        [
+            Resize(
+                net_w,
+                net_h,
+                resize_target=None,
+                keep_aspect_ratio=True,
+                ensure_multiple_of=32,
+                resize_method=resize_mode,
+                image_interpolation_method=cv2.INTER_CUBIC,
+            ),
+            normalization,
+            PrepareForNet(),
+        ]
+    )
+    return model.eval(), transform
+class MiDaSInference(nn.Module):
+    MODEL_TYPES_TORCH_HUB = [
+        "DPT_Large",
+        "DPT_Hybrid",
+        "MiDaS_small"
+    ]
+    MODEL_TYPES_ISL = [
+        "dpt_large",
+        "dpt_hybrid",
+        "midas_v21",
+        "midas_v21_small",
+    ]
+    def __init__(self, model_type):
+        super().__init__()
+        assert (model_type in self.MODEL_TYPES_ISL)
+        model, _ = load_model(model_type)
+        self.model = model
+        self.model.train = disabled_train
+    def forward(self, x):
+        # x in 0..1 as produced by calling self.transform on a 0..1 float64 numpy array
+        # NOTE: we expect that the correct transform has been called during dataloading.
+        with torch.no_grad():
+            prediction = self.model(x)
+            prediction = torch.nn.functional.interpolate(
+                prediction.unsqueeze(1),
+                size=x.shape[2:],
+                mode="bicubic",
+                align_corners=False,
+            )
+        assert prediction.shape == (x.shape[0], 1, x.shape[2], x.shape[3])
+        return prediction
--- a/comfy/ldm/modules/midas/midas/__init__.py
+++ b/comfy/ldm/modules/midas/midas/__init__.py
--- a/comfy/ldm/modules/midas/midas/base_model.py
+++ b/comfy/ldm/modules/midas/midas/base_model.py
+import torch
+class BaseModel(torch.nn.Module):
+    def load(self, path):
+        """Load model from file.
+        Args:
+            path (str): file path
+        """
+        parameters = torch.load(path, map_location=torch.device('cpu'))
+        if "optimizer" in parameters:
+            parameters = parameters["model"]
+        self.load_state_dict(parameters)
--- a/comfy/ldm/modules/midas/midas/blocks.py
+++ b/comfy/ldm/modules/midas/midas/blocks.py
+import torch
+import torch.nn as nn
+from .vit import (
+    _make_pretrained_vitb_rn50_384,
+    _make_pretrained_vitl16_384,
+    _make_pretrained_vitb16_384,
+    forward_vit,
+)
+def _make_encoder(backbone, features, use_pretrained, groups=1, expand=False, exportable=True, hooks=None, use_vit_only=False, use_readout="ignore",):
+    if backbone == "vitl16_384":
+        pretrained = _make_pretrained_vitl16_384(
+            use_pretrained, hooks=hooks, use_readout=use_readout
+        )
+        scratch = _make_scratch(
+            [256, 512, 1024, 1024], features, groups=groups, expand=expand
+        )  # ViT-L/16 - 85.0% Top1 (backbone)
+    elif backbone == "vitb_rn50_384":
+        pretrained = _make_pretrained_vitb_rn50_384(
+            use_pretrained,
+            hooks=hooks,
+            use_vit_only=use_vit_only,
+            use_readout=use_readout,
+        )
+        scratch = _make_scratch(
+            [256, 512, 768, 768], features, groups=groups, expand=expand
+        )  # ViT-H/16 - 85.0% Top1 (backbone)
+    elif backbone == "vitb16_384":
+        pretrained = _make_pretrained_vitb16_384(
+            use_pretrained, hooks=hooks, use_readout=use_readout
+        )
+        scratch = _make_scratch(
+            [96, 192, 384, 768], features, groups=groups, expand=expand
+        )  # ViT-B/16 - 84.6% Top1 (backbone)
+    elif backbone == "resnext101_wsl":
+        pretrained = _make_pretrained_resnext101_wsl(use_pretrained)
+        scratch = _make_scratch([256, 512, 1024, 2048], features, groups=groups, expand=expand)     # efficientnet_lite3  
+    elif backbone == "efficientnet_lite3":
+        pretrained = _make_pretrained_efficientnet_lite3(use_pretrained, exportable=exportable)
+        scratch = _make_scratch([32, 48, 136, 384], features, groups=groups, expand=expand)  # efficientnet_lite3     
+    else:
+        print(f"Backbone '{backbone}' not implemented")
+        assert False
+    return pretrained, scratch
+def _make_scratch(in_shape, out_shape, groups=1, expand=False):
+    scratch = nn.Module()
+    out_shape1 = out_shape
+    out_shape2 = out_shape
+    out_shape3 = out_shape
+    out_shape4 = out_shape
+    if expand==True:
+        out_shape1 = out_shape
+        out_shape2 = out_shape*2
+        out_shape3 = out_shape*4
+        out_shape4 = out_shape*8
+    scratch.layer1_rn = nn.Conv2d(
+        in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+    )
+    scratch.layer2_rn = nn.Conv2d(
+        in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+    )
+    scratch.layer3_rn = nn.Conv2d(
+        in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+    )
+    scratch.layer4_rn = nn.Conv2d(
+        in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False, groups=groups
+    )
+    return scratch
+def _make_pretrained_efficientnet_lite3(use_pretrained, exportable=False):
+    efficientnet = torch.hub.load(
+        "rwightman/gen-efficientnet-pytorch",
+        "tf_efficientnet_lite3",
+        pretrained=use_pretrained,
+        exportable=exportable
+    )
+    return _make_efficientnet_backbone(efficientnet)
+def _make_efficientnet_backbone(effnet):
+    pretrained = nn.Module()
+    pretrained.layer1 = nn.Sequential(
+        effnet.conv_stem, effnet.bn1, effnet.act1, *effnet.blocks[0:2]
+    )
+    pretrained.layer2 = nn.Sequential(*effnet.blocks[2:3])
+    pretrained.layer3 = nn.Sequential(*effnet.blocks[3:5])
+    pretrained.layer4 = nn.Sequential(*effnet.blocks[5:9])
+    return pretrained
+def _make_resnet_backbone(resnet):
+    pretrained = nn.Module()
+    pretrained.layer1 = nn.Sequential(
+        resnet.conv1, resnet.bn1, resnet.relu, resnet.maxpool, resnet.layer1
+    )
+    pretrained.layer2 = resnet.layer2
+    pretrained.layer3 = resnet.layer3
+    pretrained.layer4 = resnet.layer4
+    return pretrained
+def _make_pretrained_resnext101_wsl(use_pretrained):
+    resnet = torch.hub.load("facebookresearch/WSL-Images", "resnext101_32x8d_wsl")
+    return _make_resnet_backbone(resnet)
+class Interpolate(nn.Module):
+    """Interpolation module.
+    """
+    def __init__(self, scale_factor, mode, align_corners=False):
+        """Init.
+        Args:
+            scale_factor (float): scaling
+            mode (str): interpolation mode
+        """
+        super(Interpolate, self).__init__()
+        self.interp = nn.functional.interpolate
+        self.scale_factor = scale_factor
+        self.mode = mode
+        self.align_corners = align_corners
+    def forward(self, x):
+        """Forward pass.
+        Args:
+            x (tensor): input
+        Returns:
+            tensor: interpolated data
+        """
+        x = self.interp(
+            x, scale_factor=self.scale_factor, mode=self.mode, align_corners=self.align_corners
+        )
+        return x
+class ResidualConvUnit(nn.Module):
+    """Residual convolution module.
+    """
+    def __init__(self, features):
+        """Init.
+        Args:
+            features (int): number of features
+        """
+        super().__init__()
+        self.conv1 = nn.Conv2d(
+            features, features, kernel_size=3, stride=1, padding=1, bias=True
+        )
+        self.conv2 = nn.Conv2d(
+            features, features, kernel_size=3, stride=1, padding=1, bias=True
+        )
+        self.relu = nn.ReLU(inplace=True)
+    def forward(self, x):
+        """Forward pass.
+        Args:
+            x (tensor): input
+        Returns:
+            tensor: output
+        """
+        out = self.relu(x)
+        out = self.conv1(out)
+        out = self.relu(out)
+        out = self.conv2(out)
+        return out + x
+class FeatureFusionBlock(nn.Module):
+    """Feature fusion block.
+    """
+    def __init__(self, features):
+        """Init.
+        Args:
+            features (int): number of features
+        """
+        super(FeatureFusionBlock, self).__init__()
+        self.resConfUnit1 = ResidualConvUnit(features)
+        self.resConfUnit2 = ResidualConvUnit(features)
+    def forward(self, *xs):
+        """Forward pass.
+        Returns:
+            tensor: output
+        """
+        output = xs[0]
+        if len(xs) == 2:
+            output += self.resConfUnit1(xs[1])
+        output = self.resConfUnit2(output)
+        output = nn.functional.interpolate(
+            output, scale_factor=2, mode="bilinear", align_corners=True
+        )
+        return output
+class ResidualConvUnit_custom(nn.Module):
+    """Residual convolution module.
+    """
+    def __init__(self, features, activation, bn):
+        """Init.
+        Args:
+            features (int): number of features
+        """
+        super().__init__()
+        self.bn = bn
+        self.groups=1
+        self.conv1 = nn.Conv2d(
+            features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
+        )
+        self.conv2 = nn.Conv2d(
+            features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups
+        )
+        if self.bn==True:
+            self.bn1 = nn.BatchNorm2d(features)
+            self.bn2 = nn.BatchNorm2d(features)
+        self.activation = activation
+        self.skip_add = nn.quantized.FloatFunctional()
+    def forward(self, x):
+        """Forward pass.
+        Args:
+            x (tensor): input
+        Returns:
+            tensor: output
+        """
+        out = self.activation(x)
+        out = self.conv1(out)
+        if self.bn==True:
+            out = self.bn1(out)
+        out = self.activation(out)
+        out = self.conv2(out)
+        if self.bn==True:
+            out = self.bn2(out)
+        if self.groups > 1:
+            out = self.conv_merge(out)
+        return self.skip_add.add(out, x)
+        # return out + x
+class FeatureFusionBlock_custom(nn.Module):
+    """Feature fusion block.
+    """
+    def __init__(self, features, activation, deconv=False, bn=False, expand=False, align_corners=True):
+        """Init.
+        Args:
+            features (int): number of features
+        """
+        super(FeatureFusionBlock_custom, self).__init__()
+        self.deconv = deconv
+        self.align_corners = align_corners
+        self.groups=1
+        self.expand = expand
+        out_features = features
+        if self.expand==True:
+            out_features = features//2
+        self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
+        self.resConfUnit1 = ResidualConvUnit_custom(features, activation, bn)
+        self.resConfUnit2 = ResidualConvUnit_custom(features, activation, bn)
+        self.skip_add = nn.quantized.FloatFunctional()
+    def forward(self, *xs):
+        """Forward pass.
+        Returns:
+            tensor: output
+        """
+        output = xs[0]
+        if len(xs) == 2:
+            res = self.resConfUnit1(xs[1])
+            output = self.skip_add.add(output, res)
+            # output += res
+        output = self.resConfUnit2(output)
+        output = nn.functional.interpolate(
+            output, scale_factor=2, mode="bilinear", align_corners=self.align_corners
+        )
+        output = self.out_conv(output)
+        return output
--- a/comfy/ldm/modules/midas/midas/dpt_depth.py
+++ b/comfy/ldm/modules/midas/midas/dpt_depth.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from .base_model import BaseModel
+from .blocks import (
+    FeatureFusionBlock,
+    FeatureFusionBlock_custom,
+    Interpolate,
+    _make_encoder,
+    forward_vit,
+)
+def _make_fusion_block(features, use_bn):
+    return FeatureFusionBlock_custom(
+        features,
+        nn.ReLU(False),
+        deconv=False,
+        bn=use_bn,
+        expand=False,
+        align_corners=True,
+    )
+class DPT(BaseModel):
+    def __init__(
+        self,
+        head,
+        features=256,
+        backbone="vitb_rn50_384",
+        readout="project",
+        channels_last=False,
+        use_bn=False,
+    ):
+        super(DPT, self).__init__()
+        self.channels_last = channels_last
+        hooks = {
+            "vitb_rn50_384": [0, 1, 8, 11],
+            "vitb16_384": [2, 5, 8, 11],
+            "vitl16_384": [5, 11, 17, 23],
+        }
+        # Instantiate backbone and reassemble blocks
+        self.pretrained, self.scratch = _make_encoder(
+            backbone,
+            features,
+            False, # Set to true of you want to train from scratch, uses ImageNet weights
+            groups=1,
+            expand=False,
+            exportable=False,
+            hooks=hooks[backbone],
+            use_readout=readout,
+        )
+        self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
+        self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
+        self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
+        self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
+        self.scratch.output_conv = head
+    def forward(self, x):
+        if self.channels_last == True:
+            x.contiguous(memory_format=torch.channels_last)
+        layer_1, layer_2, layer_3, layer_4 = forward_vit(self.pretrained, x)
+        layer_1_rn = self.scratch.layer1_rn(layer_1)
+        layer_2_rn = self.scratch.layer2_rn(layer_2)
+        layer_3_rn = self.scratch.layer3_rn(layer_3)
+        layer_4_rn = self.scratch.layer4_rn(layer_4)
+        path_4 = self.scratch.refinenet4(layer_4_rn)
+        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
+        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
+        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
+        out = self.scratch.output_conv(path_1)
+        return out
+class DPTDepthModel(DPT):
+    def __init__(self, path=None, non_negative=True, **kwargs):
+        features = kwargs["features"] if "features" in kwargs else 256
+        head = nn.Sequential(
+            nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
+            Interpolate(scale_factor=2, mode="bilinear", align_corners=True),
+            nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(True),
+            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
+            nn.ReLU(True) if non_negative else nn.Identity(),
+            nn.Identity(),
+        )
+        super().__init__(head, **kwargs)
+        if path is not None:
+           self.load(path)
+    def forward(self, x):
+        return super().forward(x).squeeze(dim=1)
--- a/comfy/ldm/modules/midas/midas/midas_net.py
+++ b/comfy/ldm/modules/midas/midas/midas_net.py
+"""MidashNet: Network for monocular depth estimation trained by mixing several datasets.
+This file contains code that is adapted from
+https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
+"""
+import torch
+import torch.nn as nn
+from .base_model import BaseModel
+from .blocks import FeatureFusionBlock, Interpolate, _make_encoder
+class MidasNet(BaseModel):
+    """Network for monocular depth estimation.
+    """
+    def __init__(self, path=None, features=256, non_negative=True):
+        """Init.
+        Args:
+            path (str, optional): Path to saved model. Defaults to None.
+            features (int, optional): Number of features. Defaults to 256.
+            backbone (str, optional): Backbone network for encoder. Defaults to resnet50
+        """
+        print("Loading weights: ", path)
+        super(MidasNet, self).__init__()
+        use_pretrained = False if path is None else True
+        self.pretrained, self.scratch = _make_encoder(backbone="resnext101_wsl", features=features, use_pretrained=use_pretrained)
+        self.scratch.refinenet4 = FeatureFusionBlock(features)
+        self.scratch.refinenet3 = FeatureFusionBlock(features)
+        self.scratch.refinenet2 = FeatureFusionBlock(features)
+        self.scratch.refinenet1 = FeatureFusionBlock(features)
+        self.scratch.output_conv = nn.Sequential(
+            nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
+            Interpolate(scale_factor=2, mode="bilinear"),
+            nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
+            nn.ReLU(True),
+            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
+            nn.ReLU(True) if non_negative else nn.Identity(),
+        )
+        if path:
+            self.load(path)
+    def forward(self, x):
+        """Forward pass.
+        Args:
+            x (tensor): input data (image)
+        Returns:
+            tensor: depth
+        """
+        layer_1 = self.pretrained.layer1(x)
+        layer_2 = self.pretrained.layer2(layer_1)
+        layer_3 = self.pretrained.layer3(layer_2)
+        layer_4 = self.pretrained.layer4(layer_3)
+        layer_1_rn = self.scratch.layer1_rn(layer_1)
+        layer_2_rn = self.scratch.layer2_rn(layer_2)
+        layer_3_rn = self.scratch.layer3_rn(layer_3)
+        layer_4_rn = self.scratch.layer4_rn(layer_4)
+        path_4 = self.scratch.refinenet4(layer_4_rn)
+        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
+        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
+        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
+        out = self.scratch.output_conv(path_1)
+        return torch.squeeze(out, dim=1)
--- a/comfy/ldm/modules/midas/midas/midas_net_custom.py
+++ b/comfy/ldm/modules/midas/midas/midas_net_custom.py
+"""MidashNet: Network for monocular depth estimation trained by mixing several datasets.
+This file contains code that is adapted from
+https://github.com/thomasjpfan/pytorch_refinenet/blob/master/pytorch_refinenet/refinenet/refinenet_4cascade.py
+"""
+import torch
+import torch.nn as nn
+from .base_model import BaseModel
+from .blocks import FeatureFusionBlock, FeatureFusionBlock_custom, Interpolate, _make_encoder
+class MidasNet_small(BaseModel):
+    """Network for monocular depth estimation.
+    """
+    def __init__(self, path=None, features=64, backbone="efficientnet_lite3", non_negative=True, exportable=True, channels_last=False, align_corners=True,
+        blocks={'expand': True}):
+        """Init.
+        Args:
+            path (str, optional): Path to saved model. Defaults to None.
+            features (int, optional): Number of features. Defaults to 256.
+            backbone (str, optional): Backbone network for encoder. Defaults to resnet50
+        """
+        print("Loading weights: ", path)
+        super(MidasNet_small, self).__init__()
+        use_pretrained = False if path else True
+        self.channels_last = channels_last
+        self.blocks = blocks
+        self.backbone = backbone
+        self.groups = 1
+        features1=features
+        features2=features
+        features3=features
+        features4=features
+        self.expand = False
+        if "expand" in self.blocks and self.blocks['expand'] == True:
+            self.expand = True
+            features1=features
+            features2=features*2
+            features3=features*4
+            features4=features*8
+        self.pretrained, self.scratch = _make_encoder(self.backbone, features, use_pretrained, groups=self.groups, expand=self.expand, exportable=exportable)
+        self.scratch.activation = nn.ReLU(False)    
+        self.scratch.refinenet4 = FeatureFusionBlock_custom(features4, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
+        self.scratch.refinenet3 = FeatureFusionBlock_custom(features3, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
+        self.scratch.refinenet2 = FeatureFusionBlock_custom(features2, self.scratch.activation, deconv=False, bn=False, expand=self.expand, align_corners=align_corners)
+        self.scratch.refinenet1 = FeatureFusionBlock_custom(features1, self.scratch.activation, deconv=False, bn=False, align_corners=align_corners)
+        self.scratch.output_conv = nn.Sequential(
+            nn.Conv2d(features, features//2, kernel_size=3, stride=1, padding=1, groups=self.groups),
+            Interpolate(scale_factor=2, mode="bilinear"),
+            nn.Conv2d(features//2, 32, kernel_size=3, stride=1, padding=1),
+            self.scratch.activation,
+            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
+            nn.ReLU(True) if non_negative else nn.Identity(),
+            nn.Identity(),
+        )
+        if path:
+            self.load(path)
+    def forward(self, x):
+        """Forward pass.
+        Args:
+            x (tensor): input data (image)
+        Returns:
+            tensor: depth
+        """
+        if self.channels_last==True:
+            print("self.channels_last = ", self.channels_last)
+            x.contiguous(memory_format=torch.channels_last)
+        layer_1 = self.pretrained.layer1(x)
+        layer_2 = self.pretrained.layer2(layer_1)
+        layer_3 = self.pretrained.layer3(layer_2)
+        layer_4 = self.pretrained.layer4(layer_3)
+        layer_1_rn = self.scratch.layer1_rn(layer_1)
+        layer_2_rn = self.scratch.layer2_rn(layer_2)
+        layer_3_rn = self.scratch.layer3_rn(layer_3)
+        layer_4_rn = self.scratch.layer4_rn(layer_4)
+        path_4 = self.scratch.refinenet4(layer_4_rn)
+        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
+        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
+        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
+        out = self.scratch.output_conv(path_1)
+        return torch.squeeze(out, dim=1)
+def fuse_model(m):
+    prev_previous_type = nn.Identity()
+    prev_previous_name = ''
+    previous_type = nn.Identity()
+    previous_name = ''
+    for name, module in m.named_modules():
+        if prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d and type(module) == nn.ReLU:
+            # print("FUSED ", prev_previous_name, previous_name, name)
+            torch.quantization.fuse_modules(m, [prev_previous_name, previous_name, name], inplace=True)
+        elif prev_previous_type == nn.Conv2d and previous_type == nn.BatchNorm2d:
+            # print("FUSED ", prev_previous_name, previous_name)
+            torch.quantization.fuse_modules(m, [prev_previous_name, previous_name], inplace=True)
+        # elif previous_type == nn.Conv2d and type(module) == nn.ReLU:
+        #    print("FUSED ", previous_name, name)
+        #    torch.quantization.fuse_modules(m, [previous_name, name], inplace=True)
+        prev_previous_type = previous_type
+        prev_previous_name = previous_name
+        previous_type = type(module)
+        previous_name = name
\ No newline at end of file
--- a/comfy/ldm/modules/midas/midas/transforms.py
+++ b/comfy/ldm/modules/midas/midas/transforms.py
+import numpy as np
+import cv2
+import math
+def apply_min_size(sample, size, image_interpolation_method=cv2.INTER_AREA):
+    """Rezise the sample to ensure the given size. Keeps aspect ratio.
+    Args:
+        sample (dict): sample
+        size (tuple): image size
+    Returns:
+        tuple: new size
+    """
+    shape = list(sample["disparity"].shape)
+    if shape[0] >= size[0] and shape[1] >= size[1]:
+        return sample
+    scale = [0, 0]
+    scale[0] = size[0] / shape[0]
+    scale[1] = size[1] / shape[1]
+    scale = max(scale)
+    shape[0] = math.ceil(scale * shape[0])
+    shape[1] = math.ceil(scale * shape[1])
+    # resize
+    sample["image"] = cv2.resize(
+        sample["image"], tuple(shape[::-1]), interpolation=image_interpolation_method
+    )
+    sample["disparity"] = cv2.resize(
+        sample["disparity"], tuple(shape[::-1]), interpolation=cv2.INTER_NEAREST
+    )
+    sample["mask"] = cv2.resize(
+        sample["mask"].astype(np.float32),
+        tuple(shape[::-1]),
+        interpolation=cv2.INTER_NEAREST,
+    )
+    sample["mask"] = sample["mask"].astype(bool)
+    return tuple(shape)
+class Resize(object):
+    """Resize sample to given size (width, height).
+    """
+    def __init__(
+        self,
+        width,
+        height,
+        resize_target=True,
+        keep_aspect_ratio=False,
+        ensure_multiple_of=1,
+        resize_method="lower_bound",
+        image_interpolation_method=cv2.INTER_AREA,
+    ):
+        """Init.
+        Args:
+            width (int): desired output width
+            height (int): desired output height
+            resize_target (bool, optional):
+                True: Resize the full sample (image, mask, target).
+                False: Resize image only.
+                Defaults to True.
+            keep_aspect_ratio (bool, optional):
+                True: Keep the aspect ratio of the input sample.
+                Output sample might not have the given width and height, and
+                resize behaviour depends on the parameter 'resize_method'.
+                Defaults to False.
+            ensure_multiple_of (int, optional):
+                Output width and height is constrained to be multiple of this parameter.
+                Defaults to 1.
+            resize_method (str, optional):
+                "lower_bound": Output will be at least as large as the given size.
+                "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
+                "minimal": Scale as least as possible.  (Output size might be smaller than given size.)
+                Defaults to "lower_bound".
+        """
+        self.__width = width
+        self.__height = height
+        self.__resize_target = resize_target
+        self.__keep_aspect_ratio = keep_aspect_ratio
+        self.__multiple_of = ensure_multiple_of
+        self.__resize_method = resize_method
+        self.__image_interpolation_method = image_interpolation_method
+    def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
+        y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
+        if max_val is not None and y > max_val:
+            y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
+        if y < min_val:
+            y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
+        return y
+    def get_size(self, width, height):
+        # determine new height and width
+        scale_height = self.__height / height
+        scale_width = self.__width / width
+        if self.__keep_aspect_ratio:
+            if self.__resize_method == "lower_bound":
+                # scale such that output size is lower bound
+                if scale_width > scale_height:
+                    # fit width
+                    scale_height = scale_width
+                else:
+                    # fit height
+                    scale_width = scale_height
+            elif self.__resize_method == "upper_bound":
+                # scale such that output size is upper bound
+                if scale_width < scale_height:
+                    # fit width
+                    scale_height = scale_width
+                else:
+                    # fit height
+                    scale_width = scale_height
+            elif self.__resize_method == "minimal":
+                # scale as least as possbile
+                if abs(1 - scale_width) < abs(1 - scale_height):
+                    # fit width
+                    scale_height = scale_width
+                else:
+                    # fit height
+                    scale_width = scale_height
+            else:
+                raise ValueError(
+                    f"resize_method {self.__resize_method} not implemented"
+                )
+        if self.__resize_method == "lower_bound":
+            new_height = self.constrain_to_multiple_of(
+                scale_height * height, min_val=self.__height
+            )
+            new_width = self.constrain_to_multiple_of(
+                scale_width * width, min_val=self.__width
+            )
+        elif self.__resize_method == "upper_bound":
+            new_height = self.constrain_to_multiple_of(
+                scale_height * height, max_val=self.__height
+            )
+            new_width = self.constrain_to_multiple_of(
+                scale_width * width, max_val=self.__width
+            )
+        elif self.__resize_method == "minimal":
+            new_height = self.constrain_to_multiple_of(scale_height * height)
+            new_width = self.constrain_to_multiple_of(scale_width * width)
+        else:
+            raise ValueError(f"resize_method {self.__resize_method} not implemented")
+        return (new_width, new_height)
+    def __call__(self, sample):
+        width, height = self.get_size(
+            sample["image"].shape[1], sample["image"].shape[0]
+        )
+        # resize sample
+        sample["image"] = cv2.resize(
+            sample["image"],
+            (width, height),
+            interpolation=self.__image_interpolation_method,
+        )
+        if self.__resize_target:
+            if "disparity" in sample:
+                sample["disparity"] = cv2.resize(
+                    sample["disparity"],
+                    (width, height),
+                    interpolation=cv2.INTER_NEAREST,
+                )
+            if "depth" in sample:
+                sample["depth"] = cv2.resize(
+                    sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST
+                )
+            sample["mask"] = cv2.resize(
+                sample["mask"].astype(np.float32),
+                (width, height),
+                interpolation=cv2.INTER_NEAREST,
+            )
+            sample["mask"] = sample["mask"].astype(bool)
+        return sample
+class NormalizeImage(object):
+    """Normlize image by given mean and std.
+    """
+    def __init__(self, mean, std):
+        self.__mean = mean
+        self.__std = std
+    def __call__(self, sample):
+        sample["image"] = (sample["image"] - self.__mean) / self.__std
+        return sample
+class PrepareForNet(object):
+    """Prepare sample for usage as network input.
+    """
+    def __init__(self):
+        pass
+    def __call__(self, sample):
+        image = np.transpose(sample["image"], (2, 0, 1))
+        sample["image"] = np.ascontiguousarray(image).astype(np.float32)
+        if "mask" in sample:
+            sample["mask"] = sample["mask"].astype(np.float32)
+            sample["mask"] = np.ascontiguousarray(sample["mask"])
+        if "disparity" in sample:
+            disparity = sample["disparity"].astype(np.float32)
+            sample["disparity"] = np.ascontiguousarray(disparity)
+        if "depth" in sample:
+            depth = sample["depth"].astype(np.float32)
+            sample["depth"] = np.ascontiguousarray(depth)
+        return sample
--- a/comfy/ldm/modules/midas/midas/vit.py
+++ b/comfy/ldm/modules/midas/midas/vit.py
+import torch
+import torch.nn as nn
+import timm
+import types
+import math
+import torch.nn.functional as F
+class Slice(nn.Module):
+    def __init__(self, start_index=1):
+        super(Slice, self).__init__()
+        self.start_index = start_index
+    def forward(self, x):
+        return x[:, self.start_index :]
+class AddReadout(nn.Module):
+    def __init__(self, start_index=1):
+        super(AddReadout, self).__init__()
+        self.start_index = start_index
+    def forward(self, x):
+        if self.start_index == 2:
+            readout = (x[:, 0] + x[:, 1]) / 2
+        else:
+            readout = x[:, 0]
+        return x[:, self.start_index :] + readout.unsqueeze(1)
+class ProjectReadout(nn.Module):
+    def __init__(self, in_features, start_index=1):
+        super(ProjectReadout, self).__init__()
+        self.start_index = start_index
+        self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
+    def forward(self, x):
+        readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
+        features = torch.cat((x[:, self.start_index :], readout), -1)
+        return self.project(features)
+class Transpose(nn.Module):
+    def __init__(self, dim0, dim1):
+        super(Transpose, self).__init__()
+        self.dim0 = dim0
+        self.dim1 = dim1
+    def forward(self, x):
+        x = x.transpose(self.dim0, self.dim1)
+        return x
+def forward_vit(pretrained, x):
+    b, c, h, w = x.shape
+    glob = pretrained.model.forward_flex(x)
+    layer_1 = pretrained.activations["1"]
+    layer_2 = pretrained.activations["2"]
+    layer_3 = pretrained.activations["3"]
+    layer_4 = pretrained.activations["4"]
+    layer_1 = pretrained.act_postprocess1[0:2](layer_1)
+    layer_2 = pretrained.act_postprocess2[0:2](layer_2)
+    layer_3 = pretrained.act_postprocess3[0:2](layer_3)
+    layer_4 = pretrained.act_postprocess4[0:2](layer_4)
+    unflatten = nn.Sequential(
+        nn.Unflatten(
+            2,
+            torch.Size(
+                [
+                    h // pretrained.model.patch_size[1],
+                    w // pretrained.model.patch_size[0],
+                ]
+            ),
+        )
+    )
+    if layer_1.ndim == 3:
+        layer_1 = unflatten(layer_1)
+    if layer_2.ndim == 3:
+        layer_2 = unflatten(layer_2)
+    if layer_3.ndim == 3:
+        layer_3 = unflatten(layer_3)
+    if layer_4.ndim == 3:
+        layer_4 = unflatten(layer_4)
+    layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
+    layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
+    layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
+    layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
+    return layer_1, layer_2, layer_3, layer_4
+def _resize_pos_embed(self, posemb, gs_h, gs_w):
+    posemb_tok, posemb_grid = (
+        posemb[:, : self.start_index],
+        posemb[0, self.start_index :],
+    )
+    gs_old = int(math.sqrt(len(posemb_grid)))
+    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
+    posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
+    posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
+    posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
+    return posemb
+def forward_flex(self, x):
+    b, c, h, w = x.shape
+    pos_embed = self._resize_pos_embed(
+        self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
+    )
+    B = x.shape[0]
+    if hasattr(self.patch_embed, "backbone"):
+        x = self.patch_embed.backbone(x)
+        if isinstance(x, (list, tuple)):
+            x = x[-1]  # last feature if backbone outputs list/tuple of features
+    x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
+    if getattr(self, "dist_token", None) is not None:
+        cls_tokens = self.cls_token.expand(
+            B, -1, -1
+        )  # stole cls_tokens impl from Phil Wang, thanks
+        dist_token = self.dist_token.expand(B, -1, -1)
+        x = torch.cat((cls_tokens, dist_token, x), dim=1)
+    else:
+        cls_tokens = self.cls_token.expand(
+            B, -1, -1
+        )  # stole cls_tokens impl from Phil Wang, thanks
+        x = torch.cat((cls_tokens, x), dim=1)
+    x = x + pos_embed
+    x = self.pos_drop(x)
+    for blk in self.blocks:
+        x = blk(x)
+    x = self.norm(x)
+    return x
+activations = {}
+def get_activation(name):
+    def hook(model, input, output):
+        activations[name] = output
+    return hook
+def get_readout_oper(vit_features, features, use_readout, start_index=1):
+    if use_readout == "ignore":
+        readout_oper = [Slice(start_index)] * len(features)
+    elif use_readout == "add":
+        readout_oper = [AddReadout(start_index)] * len(features)
+    elif use_readout == "project":
+        readout_oper = [
+            ProjectReadout(vit_features, start_index) for out_feat in features
+        ]
+    else:
+        assert (
+            False
+        ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
+    return readout_oper
+def _make_vit_b16_backbone(
+    model,
+    features=[96, 192, 384, 768],
+    size=[384, 384],
+    hooks=[2, 5, 8, 11],
+    vit_features=768,
+    use_readout="ignore",
+    start_index=1,
+):
+    pretrained = nn.Module()
+    pretrained.model = model
+    pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
+    pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
+    pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
+    pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
+    pretrained.activations = activations
+    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
+    # 32, 48, 136, 384
+    pretrained.act_postprocess1 = nn.Sequential(
+        readout_oper[0],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[0],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.ConvTranspose2d(
+            in_channels=features[0],
+            out_channels=features[0],
+            kernel_size=4,
+            stride=4,
+            padding=0,
+            bias=True,
+            dilation=1,
+            groups=1,
+        ),
+    )
+    pretrained.act_postprocess2 = nn.Sequential(
+        readout_oper[1],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[1],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.ConvTranspose2d(
+            in_channels=features[1],
+            out_channels=features[1],
+            kernel_size=2,
+            stride=2,
+            padding=0,
+            bias=True,
+            dilation=1,
+            groups=1,
+        ),
+    )
+    pretrained.act_postprocess3 = nn.Sequential(
+        readout_oper[2],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[2],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+    )
+    pretrained.act_postprocess4 = nn.Sequential(
+        readout_oper[3],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[3],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.Conv2d(
+            in_channels=features[3],
+            out_channels=features[3],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+        ),
+    )
+    pretrained.model.start_index = start_index
+    pretrained.model.patch_size = [16, 16]
+    # We inject this function into the VisionTransformer instances so that
+    # we can use it with interpolated position embeddings without modifying the library source.
+    pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
+    pretrained.model._resize_pos_embed = types.MethodType(
+        _resize_pos_embed, pretrained.model
+    )
+    return pretrained
+def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
+    model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
+    hooks = [5, 11, 17, 23] if hooks == None else hooks
+    return _make_vit_b16_backbone(
+        model,
+        features=[256, 512, 1024, 1024],
+        hooks=hooks,
+        vit_features=1024,
+        use_readout=use_readout,
+    )
+def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None):
+    model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)
+    hooks = [2, 5, 8, 11] if hooks == None else hooks
+    return _make_vit_b16_backbone(
+        model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
+    )
+def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None):
+    model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)
+    hooks = [2, 5, 8, 11] if hooks == None else hooks
+    return _make_vit_b16_backbone(
+        model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
+    )
+def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None):
+    model = timm.create_model(
+        "vit_deit_base_distilled_patch16_384", pretrained=pretrained
+    )
+    hooks = [2, 5, 8, 11] if hooks == None else hooks
+    return _make_vit_b16_backbone(
+        model,
+        features=[96, 192, 384, 768],
+        hooks=hooks,
+        use_readout=use_readout,
+        start_index=2,
+    )
+def _make_vit_b_rn50_backbone(
+    model,
+    features=[256, 512, 768, 768],
+    size=[384, 384],
+    hooks=[0, 1, 8, 11],
+    vit_features=768,
+    use_vit_only=False,
+    use_readout="ignore",
+    start_index=1,
+):
+    pretrained = nn.Module()
+    pretrained.model = model
+    if use_vit_only == True:
+        pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
+        pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
+    else:
+        pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
+            get_activation("1")
+        )
+        pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
+            get_activation("2")
+        )
+    pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
+    pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
+    pretrained.activations = activations
+    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
+    if use_vit_only == True:
+        pretrained.act_postprocess1 = nn.Sequential(
+            readout_oper[0],
+            Transpose(1, 2),
+            nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+            nn.Conv2d(
+                in_channels=vit_features,
+                out_channels=features[0],
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            ),
+            nn.ConvTranspose2d(
+                in_channels=features[0],
+                out_channels=features[0],
+                kernel_size=4,
+                stride=4,
+                padding=0,
+                bias=True,
+                dilation=1,
+                groups=1,
+            ),
+        )
+        pretrained.act_postprocess2 = nn.Sequential(
+            readout_oper[1],
+            Transpose(1, 2),
+            nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+            nn.Conv2d(
+                in_channels=vit_features,
+                out_channels=features[1],
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            ),
+            nn.ConvTranspose2d(
+                in_channels=features[1],
+                out_channels=features[1],
+                kernel_size=2,
+                stride=2,
+                padding=0,
+                bias=True,
+                dilation=1,
+                groups=1,
+            ),
+        )
+    else:
+        pretrained.act_postprocess1 = nn.Sequential(
+            nn.Identity(), nn.Identity(), nn.Identity()
+        )
+        pretrained.act_postprocess2 = nn.Sequential(
+            nn.Identity(), nn.Identity(), nn.Identity()
+        )
+    pretrained.act_postprocess3 = nn.Sequential(
+        readout_oper[2],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[2],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+    )
+    pretrained.act_postprocess4 = nn.Sequential(
+        readout_oper[3],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[3],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.Conv2d(
+            in_channels=features[3],
+            out_channels=features[3],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+        ),
+    )
+    pretrained.model.start_index = start_index
+    pretrained.model.patch_size = [16, 16]
+    # We inject this function into the VisionTransformer instances so that
+    # we can use it with interpolated position embeddings without modifying the library source.
+    pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
+    # We inject this function into the VisionTransformer instances so that
+    # we can use it with interpolated position embeddings without modifying the library source.
+    pretrained.model._resize_pos_embed = types.MethodType(
+        _resize_pos_embed, pretrained.model
+    )
+    return pretrained
+def _make_pretrained_vitb_rn50_384(
+    pretrained, use_readout="ignore", hooks=None, use_vit_only=False
+):
+    model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)
+    hooks = [0, 1, 8, 11] if hooks == None else hooks
+    return _make_vit_b_rn50_backbone(
+        model,
+        features=[256, 512, 768, 768],
+        size=[384, 384],
+        hooks=hooks,
+        use_vit_only=use_vit_only,
+        use_readout=use_readout,
+    )
--- a/comfy/ldm/modules/midas/utils.py
+++ b/comfy/ldm/modules/midas/utils.py
+"""Utils for monoDepth."""
+import sys
+import re
+import numpy as np
+import cv2
+import torch
+def read_pfm(path):
+    """Read pfm file.
+    Args:
+        path (str): path to file
+    Returns:
+        tuple: (data, scale)
+    """
+    with open(path, "rb") as file:
+        color = None
+        width = None
+        height = None
+        scale = None
+        endian = None
+        header = file.readline().rstrip()
+        if header.decode("ascii") == "PF":
+            color = True
+        elif header.decode("ascii") == "Pf":
+            color = False
+        else:
+            raise Exception("Not a PFM file: " + path)
+        dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
+        if dim_match:
+            width, height = list(map(int, dim_match.groups()))
+        else:
+            raise Exception("Malformed PFM header.")
+        scale = float(file.readline().decode("ascii").rstrip())
+        if scale < 0:
+            # little-endian
+            endian = "<"
+            scale = -scale
+        else:
+            # big-endian
+            endian = ">"
+        data = np.fromfile(file, endian + "f")
+        shape = (height, width, 3) if color else (height, width)
+        data = np.reshape(data, shape)
+        data = np.flipud(data)
+        return data, scale
+def write_pfm(path, image, scale=1):
+    """Write pfm file.
+    Args:
+        path (str): pathto file
+        image (array): data
+        scale (int, optional): Scale. Defaults to 1.
+    """
+    with open(path, "wb") as file:
+        color = None
+        if image.dtype.name != "float32":
+            raise Exception("Image dtype must be float32.")
+        image = np.flipud(image)
+        if len(image.shape) == 3 and image.shape[2] == 3:  # color image
+            color = True
+        elif (
+            len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1
+        ):  # greyscale
+            color = False
+        else:
+            raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
+        file.write("PF\n" if color else "Pf\n".encode())
+        file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
+        endian = image.dtype.byteorder
+        if endian == "<" or endian == "=" and sys.byteorder == "little":
+            scale = -scale
+        file.write("%f\n".encode() % scale)
+        image.tofile(file)
+def read_image(path):
+    """Read image and output RGB image (0-1).
+    Args:
+        path (str): path to file
+    Returns:
+        array: RGB image (0-1)
+    """
+    img = cv2.imread(path)
+    if img.ndim == 2:
+        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0
+    return img
+def resize_image(img):
+    """Resize image and make it fit for network.
+    Args:
+        img (array): image
+    Returns:
+        tensor: data ready for network
+    """
+    height_orig = img.shape[0]
+    width_orig = img.shape[1]
+    if width_orig > height_orig:
+        scale = width_orig / 384
+    else:
+        scale = height_orig / 384
+    height = (np.ceil(height_orig / scale / 32) * 32).astype(int)
+    width = (np.ceil(width_orig / scale / 32) * 32).astype(int)
+    img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)
+    img_resized = (
+        torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float()
+    )
+    img_resized = img_resized.unsqueeze(0)
+    return img_resized
+def resize_depth(depth, width, height):
+    """Resize depth map and bring to CPU (numpy).
+    Args:
+        depth (tensor): depth
+        width (int): image width
+        height (int): image height
+    Returns:
+        array: processed depth
+    """
+    depth = torch.squeeze(depth[0, :, :, :]).to("cpu")
+    depth_resized = cv2.resize(
+        depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC
+    )
+    return depth_resized
+def write_depth(path, depth, bits=1):
+    """Write depth map to pfm and png file.
+    Args:
+        path (str): filepath without extension
+        depth (array): depth
+    """
+    write_pfm(path + ".pfm", depth.astype(np.float32))
+    depth_min = depth.min()
+    depth_max = depth.max()
+    max_val = (2**(8*bits))-1
+    if depth_max - depth_min > np.finfo("float").eps:
+        out = max_val * (depth - depth_min) / (depth_max - depth_min)
+    else:
+        out = np.zeros(depth.shape, dtype=depth.type)
+    if bits == 1:
+        cv2.imwrite(path + ".png", out.astype("uint8"))
+    elif bits == 2:
+        cv2.imwrite(path + ".png", out.astype("uint16"))
+    return
--- a/comfy/ldm/modules/sub_quadratic_attention.py
+++ b/comfy/ldm/modules/sub_quadratic_attention.py
+# original source:
+#   https://github.com/AminRezaei0x443/memory-efficient-attention/blob/1bc0d9e6ac5f82ea43a375135c4e1d3896ee1694/memory_efficient_attention/attention_torch.py
+# license:
+#   MIT
+# credit:
+#   Amin Rezaei (original author)
+#   Alex Birch (optimized algorithm for 3D tensors, at the expense of removing bias, masking and callbacks)
+# implementation of:
+#   Self-attention Does Not Need O(n2) Memory":
+#   https://arxiv.org/abs/2112.05682v2
+from functools import partial
+import torch
+from torch import Tensor
+from torch.utils.checkpoint import checkpoint
+import math
+from typing import Optional, NamedTuple, Protocol, List
+from torch import Tensor
+from typing import List
+def dynamic_slice(
+    x: Tensor,
+    starts: List[int],
+    sizes: List[int],
+) -> Tensor:
+    slicing = [slice(start, start + size) for start, size in zip(starts, sizes)]
+    return x[slicing]
+class AttnChunk(NamedTuple):
+    exp_values: Tensor
+    exp_weights_sum: Tensor
+    max_score: Tensor
+class SummarizeChunk(Protocol):
+    @staticmethod
+    def __call__(
+        query: Tensor,
+        key_t: Tensor,
+        value: Tensor,
+    ) -> AttnChunk: ...
+class ComputeQueryChunkAttn(Protocol):
+    @staticmethod
+    def __call__(
+        query: Tensor,
+        key_t: Tensor,
+        value: Tensor,
+    ) -> Tensor: ...
+def _summarize_chunk(
+    query: Tensor,
+    key_t: Tensor,
+    value: Tensor,
+    scale: float,
+) -> AttnChunk:
+    attn_weights = torch.baddbmm(
+        torch.empty(1, 1, 1, device=query.device, dtype=query.dtype),
+        query,
+        key_t,
+        alpha=scale,
+        beta=0,
+    )
+    max_score, _ = torch.max(attn_weights, -1, keepdim=True)
+    max_score = max_score.detach()
+    exp_weights = torch.exp(attn_weights - max_score)
+    exp_values = torch.bmm(exp_weights, value)
+    max_score = max_score.squeeze(-1)
+    return AttnChunk(exp_values, exp_weights.sum(dim=-1), max_score)
+def _query_chunk_attention(
+    query: Tensor,
+    key_t: Tensor,
+    value: Tensor,
+    summarize_chunk: SummarizeChunk,
+    kv_chunk_size: int,
+) -> Tensor:
+    batch_x_heads, k_channels_per_head, k_tokens = key_t.shape
+    _, _, v_channels_per_head = value.shape
+    def chunk_scanner(chunk_idx: int) -> AttnChunk:
+        key_chunk = dynamic_slice(
+            key_t,
+            (0, 0, chunk_idx),
+            (batch_x_heads, k_channels_per_head, kv_chunk_size)
+        )
+        value_chunk = dynamic_slice(
+            value,
+            (0, chunk_idx, 0),
+            (batch_x_heads, kv_chunk_size, v_channels_per_head)
+        )
+        return summarize_chunk(query, key_chunk, value_chunk)
+    chunks: List[AttnChunk] = [
+        chunk_scanner(chunk) for chunk in torch.arange(0, k_tokens, kv_chunk_size)
+    ]
+    acc_chunk = AttnChunk(*map(torch.stack, zip(*chunks)))
+    chunk_values, chunk_weights, chunk_max = acc_chunk
+    global_max, _ = torch.max(chunk_max, 0, keepdim=True)
+    max_diffs = torch.exp(chunk_max - global_max)
+    chunk_values *= torch.unsqueeze(max_diffs, -1)
+    chunk_weights *= max_diffs
+    all_values = chunk_values.sum(dim=0)
+    all_weights = torch.unsqueeze(chunk_weights, -1).sum(dim=0)
+    return all_values / all_weights
+# TODO: refactor CrossAttention#get_attention_scores to share code with this
+def _get_attention_scores_no_kv_chunking(
+    query: Tensor,
+    key_t: Tensor,
+    value: Tensor,
+    scale: float,
+) -> Tensor:
+    attn_scores = torch.baddbmm(
+        torch.empty(1, 1, 1, device=query.device, dtype=query.dtype),
+        query,
+        key_t,
+        alpha=scale,
+        beta=0,
+    )
+    attn_probs = attn_scores.softmax(dim=-1)
+    del attn_scores
+    hidden_states_slice = torch.bmm(attn_probs, value)
+    return hidden_states_slice
+class ScannedChunk(NamedTuple):
+    chunk_idx: int
+    attn_chunk: AttnChunk
+def efficient_dot_product_attention(
+    query: Tensor,
+    key_t: Tensor,
+    value: Tensor,
+    query_chunk_size=1024,
+    kv_chunk_size: Optional[int] = None,
+    kv_chunk_size_min: Optional[int] = None,
+    use_checkpoint=True,
+):
+    """Computes efficient dot-product attention given query, transposed key, and value.
+      This is efficient version of attention presented in
+      https://arxiv.org/abs/2112.05682v2 which comes with O(sqrt(n)) memory requirements.
+      Args:
+        query: queries for calculating attention with shape of
+          `[batch * num_heads, tokens, channels_per_head]`.
+        key_t: keys for calculating attention with shape of
+          `[batch * num_heads, channels_per_head, tokens]`.
+        value: values to be used in attention with shape of
+          `[batch * num_heads, tokens, channels_per_head]`.
+        query_chunk_size: int: query chunks size
+        kv_chunk_size: Optional[int]: key/value chunks size. if None: defaults to sqrt(key_tokens)
+        kv_chunk_size_min: Optional[int]: key/value minimum chunk size. only considered when kv_chunk_size is None. changes `sqrt(key_tokens)` into `max(sqrt(key_tokens), kv_chunk_size_min)`, to ensure our chunk sizes don't get too small (smaller chunks = more chunks = less concurrent work done).
+        use_checkpoint: bool: whether to use checkpointing (recommended True for training, False for inference)
+      Returns:
+        Output of shape `[batch * num_heads, query_tokens, channels_per_head]`.
+      """
+    batch_x_heads, q_tokens, q_channels_per_head = query.shape
+    _, _, k_tokens = key_t.shape
+    scale = q_channels_per_head ** -0.5
+    kv_chunk_size = min(kv_chunk_size or int(math.sqrt(k_tokens)), k_tokens)
+    if kv_chunk_size_min is not None:
+        kv_chunk_size = max(kv_chunk_size, kv_chunk_size_min)
+    def get_query_chunk(chunk_idx: int) -> Tensor:
+        return dynamic_slice(
+            query,
+            (0, chunk_idx, 0),
+            (batch_x_heads, min(query_chunk_size, q_tokens), q_channels_per_head)
+        )
+    summarize_chunk: SummarizeChunk = partial(_summarize_chunk, scale=scale)
+    summarize_chunk: SummarizeChunk = partial(checkpoint, summarize_chunk) if use_checkpoint else summarize_chunk
+    compute_query_chunk_attn: ComputeQueryChunkAttn = partial(
+        _get_attention_scores_no_kv_chunking,
+        scale=scale
+    ) if k_tokens <= kv_chunk_size else (
+        # fast-path for when there's just 1 key-value chunk per query chunk (this is just sliced attention btw)
+        partial(
+            _query_chunk_attention,
+            kv_chunk_size=kv_chunk_size,
+            summarize_chunk=summarize_chunk,
+        )
+    )
+    if q_tokens <= query_chunk_size:
+        # fast-path for when there's just 1 query chunk
+        return compute_query_chunk_attn(
+            query=query,
+            key_t=key_t,
+            value=value,
+        )
+    # TODO: maybe we should use torch.empty_like(query) to allocate storage in-advance,
+    # and pass slices to be mutated, instead of torch.cat()ing the returned slices
+    res = torch.cat([
+        compute_query_chunk_attn(
+            query=get_query_chunk(i * query_chunk_size),
+            key_t=key_t,
+            value=value,
+        ) for i in range(math.ceil(q_tokens / query_chunk_size))
+    ], dim=1)
+    return res
--- a/comfy/ldm/util.py
+++ b/comfy/ldm/util.py
+import importlib
+import torch
+from torch import optim
+import numpy as np
+from inspect import isfunction
+from PIL import Image, ImageDraw, ImageFont
+def log_txt_as_img(wh, xc, size=10):
+    # wh a tuple of (width, height)
+    # xc a list of captions to plot
+    b = len(xc)
+    txts = list()
+    for bi in range(b):
+        txt = Image.new("RGB", wh, color="white")
+        draw = ImageDraw.Draw(txt)
+        font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
+        nc = int(40 * (wh[0] / 256))
+        lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
+        try:
+            draw.text((0, 0), lines, fill="black", font=font)
+        except UnicodeEncodeError:
+            print("Cant encode string for logging. Skipping.")
+        txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
+        txts.append(txt)
+    txts = np.stack(txts)
+    txts = torch.tensor(txts)
+    return txts
+def ismap(x):
+    if not isinstance(x, torch.Tensor):
+        return False
+    return (len(x.shape) == 4) and (x.shape[1] > 3)
+def isimage(x):
+    if not isinstance(x,torch.Tensor):
+        return False
+    return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
+def exists(x):
+    return x is not None
+def default(val, d):
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+def mean_flat(tensor):
+    """
+    https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
+    Take the mean over all non-batch dimensions.
+    """
+    return tensor.mean(dim=list(range(1, len(tensor.shape))))
+def count_params(model, verbose=False):
+    total_params = sum(p.numel() for p in model.parameters())
+    if verbose:
+        print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
+    return total_params
+def instantiate_from_config(config):
+    if not "target" in config:
+        if config == '__is_first_stage__':
+            return None
+        elif config == "__is_unconditional__":
+            return None
+        raise KeyError("Expected key `target` to instantiate.")
+    return get_obj_from_str(config["target"])(**config.get("params", dict()))
+def get_obj_from_str(string, reload=False):
+    module, cls = string.rsplit(".", 1)
+    if reload:
+        module_imp = importlib.import_module(module)
+        importlib.reload(module_imp)
+    return getattr(importlib.import_module(module, package=None), cls)
+class AdamWwithEMAandWings(optim.Optimizer):
+    # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298
+    def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8,  # TODO: check hyperparameters before using
+                 weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999,   # ema decay to match previous code
+                 ema_power=1., param_names=()):
+        """AdamW that saves EMA versions of the parameters."""
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        if not 0.0 <= weight_decay:
+            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+        if not 0.0 <= ema_decay <= 1.0:
+            raise ValueError("Invalid ema_decay value: {}".format(ema_decay))
+        defaults = dict(lr=lr, betas=betas, eps=eps,
+                        weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay,
+                        ema_power=ema_power, param_names=param_names)
+        super().__init__(params, defaults)
+    def __setstate__(self, state):
+        super().__setstate__(state)
+        for group in self.param_groups:
+            group.setdefault('amsgrad', False)
+    @torch.no_grad()
+    def step(self, closure=None):
+        """Performs a single optimization step.
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            params_with_grad = []
+            grads = []
+            exp_avgs = []
+            exp_avg_sqs = []
+            ema_params_with_grad = []
+            state_sums = []
+            max_exp_avg_sqs = []
+            state_steps = []
+            amsgrad = group['amsgrad']
+            beta1, beta2 = group['betas']
+            ema_decay = group['ema_decay']
+            ema_power = group['ema_power']
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                params_with_grad.append(p)
+                if p.grad.is_sparse:
+                    raise RuntimeError('AdamW does not support sparse gradients')
+                grads.append(p.grad)
+                state = self.state[p]
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    if amsgrad:
+                        # Maintains max of all exp. moving avg. of sq. grad. values
+                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    # Exponential moving average of parameter values
+                    state['param_exp_avg'] = p.detach().float().clone()
+                exp_avgs.append(state['exp_avg'])
+                exp_avg_sqs.append(state['exp_avg_sq'])
+                ema_params_with_grad.append(state['param_exp_avg'])
+                if amsgrad:
+                    max_exp_avg_sqs.append(state['max_exp_avg_sq'])
+                # update the steps for each param group update
+                state['step'] += 1
+                # record the step after step update
+                state_steps.append(state['step'])
+            optim._functional.adamw(params_with_grad,
+                    grads,
+                    exp_avgs,
+                    exp_avg_sqs,
+                    max_exp_avg_sqs,
+                    state_steps,
+                    amsgrad=amsgrad,
+                    beta1=beta1,
+                    beta2=beta2,
+                    lr=group['lr'],
+                    weight_decay=group['weight_decay'],
+                    eps=group['eps'],
+                    maximize=False)
+            cur_ema_decay = min(ema_decay, 1 - state['step'] ** -ema_power)
+            for param, ema_param in zip(params_with_grad, ema_params_with_grad):
+                ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay)
+        return loss
\ No newline at end of file
--- a/comfy/samplers.py
+++ b/comfy/samplers.py
+import k_diffusion.sampling
+import k_diffusion.external
+import torch
+import contextlib
+class CFGDenoiser(torch.nn.Module):
+    def __init__(self, model):
+        super().__init__()
+        self.inner_model = model
+    def forward(self, x, sigma, uncond, cond, cond_scale):
+        if len(uncond[0]) == len(cond[0]) and x.shape[0] * x.shape[2] * x.shape[3] <= (96 * 96): #TODO check memory instead
+            x_in = torch.cat([x] * 2)
+            sigma_in = torch.cat([sigma] * 2)
+            cond_in = torch.cat([uncond, cond])
+            uncond, cond = self.inner_model(x_in, sigma_in, cond=cond_in).chunk(2)
+        else:
+            cond = self.inner_model(x, sigma, cond=cond)
+            uncond = self.inner_model(x, sigma, cond=uncond)
+        return uncond + (cond - uncond) * cond_scale
+def simple_scheduler(model, steps):
+    sigs = []
+    ss = len(model.sigmas) / steps
+    for x in range(steps):
+        sigs += [float(model.sigmas[-(1 + int(x * ss))])]
+    sigs += [0.0]
+    return torch.FloatTensor(sigs)
+class KSampler:
+    SCHEDULERS = ["karras", "normal", "simple"]
+    SAMPLERS = ["sample_euler", "sample_euler_ancestral", "sample_heun", "sample_dpm_2", "sample_dpm_2_ancestral",
+                "sample_lms", "sample_dpm_fast", "sample_dpm_adaptive", "sample_dpmpp_2s_ancestral", "sample_dpmpp_sde",
+                "sample_dpmpp_2m"]
+    def __init__(self, model, steps, device, sampler=None, scheduler=None, denoise=None):
+        self.model = model
+        if self.model.parameterization == "v":
+            self.model_wrap = k_diffusion.external.CompVisVDenoiser(self.model, quantize=True)
+        else:
+            self.model_wrap = k_diffusion.external.CompVisDenoiser(self.model, quantize=True)
+        self.model_k = CFGDenoiser(self.model_wrap)
+        self.device = device
+        if scheduler not in self.SCHEDULERS:
+            scheduler = self.SCHEDULERS[0]
+        if sampler not in self.SAMPLERS:
+            sampler = self.SAMPLERS[0]
+        self.scheduler = scheduler
+        self.sampler = sampler
+        self.sigma_min=float(self.model_wrap.sigmas[0])
+        self.sigma_max=float(self.model_wrap.sigmas[-1])
+        self.set_steps(steps, denoise)
+    def _calculate_sigmas(self, steps):
+        sigmas = None
+        discard_penultimate_sigma = False
+        if self.sampler in ['sample_dpm_2', 'sample_dpm_2_ancestral']:
+            steps += 1
+            discard_penultimate_sigma = True
+        if self.scheduler == "karras":
+            sigmas = k_diffusion.sampling.get_sigmas_karras(n=steps, sigma_min=self.sigma_min, sigma_max=self.sigma_max, device=self.device)
+        elif self.scheduler == "normal":
+            sigmas = self.model_wrap.get_sigmas(steps).to(self.device)
+        elif self.scheduler == "simple":
+            sigmas = simple_scheduler(self.model_wrap, steps).to(self.device)
+        else:
+            print("error invalid scheduler", self.scheduler)
+        if discard_penultimate_sigma:
+            sigmas = torch.cat([sigmas[:-2], sigmas[-1:]])
+        return sigmas
+    def set_steps(self, steps, denoise=None):
+        self.steps = steps
+        if denoise is None:
+            self.sigmas = self._calculate_sigmas(steps)
+        else:
+            new_steps = int(steps/denoise)
+            sigmas = self._calculate_sigmas(new_steps)
+            self.sigmas = sigmas[-(steps + 1):]
+    def sample(self, noise, positive, negative, cfg, latent_image=None, start_step=None, last_step=None):
+        sigmas = self.sigmas
+        sigma_min = self.sigma_min
+        if last_step is not None:
+            sigma_min = sigmas[last_step]
+            sigmas = sigmas[:last_step + 1]
+        if start_step is not None:
+            sigmas = sigmas[start_step:]
+        noise *= sigmas[0]
+        if latent_image is not None:
+            noise += latent_image
+        if self.model.model.diffusion_model.dtype == torch.float16:
+            precision_scope = torch.autocast
+        else:
+            precision_scope = contextlib.nullcontext
+        with precision_scope(self.device):
+            if self.sampler == "sample_dpm_fast":
+                samples = k_diffusion.sampling.sample_dpm_fast(self.model_k, noise, sigma_min, sigmas[0], self.steps, extra_args={"cond":positive, "uncond":negative, "cond_scale": cfg})
+            elif self.sampler == "sample_dpm_adaptive":
+                samples = k_diffusion.sampling.sample_dpm_adaptive(self.model_k, noise, sigma_min, sigmas[0], extra_args={"cond":positive, "uncond":negative, "cond_scale": cfg})
+            else:
+                samples = getattr(k_diffusion.sampling, self.sampler)(self.model_k, noise, sigmas, extra_args={"cond":positive, "uncond":negative, "cond_scale": cfg})
+        return samples.to(torch.float32)
--- a/comfy/sd.py
+++ b/comfy/sd.py
+import torch
+import sd1_clip
+import sd2_clip
+from ldm.util import instantiate_from_config
+from ldm.models.autoencoder import AutoencoderKL
+from omegaconf import OmegaConf
+def load_model_from_config(config, ckpt, verbose=False, load_state_dict_to=[]):
+    print(f"Loading model from {ckpt}")
+    if ckpt.lower().endswith(".safetensors"):
+        import safetensors.torch
+        sd = safetensors.torch.load_file(ckpt, device="cpu")
+    else:
+        pl_sd = torch.load(ckpt, map_location="cpu")
+        if "global_step" in pl_sd:
+            print(f"Global Step: {pl_sd['global_step']}")
+        sd = pl_sd["state_dict"]
+    model = instantiate_from_config(config.model)
+    m, u = model.load_state_dict(sd, strict=False)
+    k = list(sd.keys())
+    for x in k:
+        # print(x)
+        if x.startswith("cond_stage_model.transformer.") and not x.startswith("cond_stage_model.transformer.text_model."):
+            y = x.replace("cond_stage_model.transformer.", "cond_stage_model.transformer.text_model.")
+            sd[y] = sd.pop(x)
+    for x in load_state_dict_to:
+        x.load_state_dict(sd, strict=False)
+    if len(m) > 0 and verbose:
+        print("missing keys:")
+        print(m)
+    if len(u) > 0 and verbose:
+        print("unexpected keys:")
+        print(u)
+    model.eval()
+    return model
+class CLIP:
+    def __init__(self, config):
+        self.target_clip = config["target"]
+        if self.target_clip == "ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder":
+            clip = sd2_clip.SD2ClipModel
+            tokenizer = sd2_clip.SD2Tokenizer
+        elif self.target_clip == "ldm.modules.encoders.modules.FrozenCLIPEmbedder":
+            clip = sd1_clip.SD1ClipModel
+            tokenizer = sd1_clip.SD1Tokenizer
+        if "params" in config:
+            self.cond_stage_model = clip(**(config["params"]))
+        else:
+            self.cond_stage_model = clip()
+        self.tokenizer = tokenizer()
+    def encode(self, text):
+        tokens = self.tokenizer.tokenize_with_weights(text)
+        cond = self.cond_stage_model.encode_token_weights(tokens)
+        return cond
+class VAE:
+    def __init__(self, ckpt_path=None, scale_factor=0.18215, device="cuda", config=None):
+        if config is None:
+            #default SD1.x/SD2.x VAE parameters
+            ddconfig = {'double_z': True, 'z_channels': 4, 'resolution': 256, 'in_channels': 3, 'out_ch': 3, 'ch': 128, 'ch_mult': [1, 2, 4, 4], 'num_res_blocks': 2, 'attn_resolutions': [], 'dropout': 0.0}
+            self.first_stage_model = AutoencoderKL(ddconfig, {'target': 'torch.nn.Identity'}, 4, monitor="val/rec_loss", ckpt_path=ckpt_path)
+        else:
+            self.first_stage_model = AutoencoderKL(**(config['params']), ckpt_path=ckpt_path)
+        self.first_stage_model = self.first_stage_model.eval()
+        self.scale_factor = scale_factor
+        self.device = device
+    def decode(self, samples):
+        self.first_stage_model = self.first_stage_model.to(self.device)
+        samples = samples.to(self.device)
+        pixel_samples = self.first_stage_model.decode(1. / self.scale_factor * samples)
+        pixel_samples = torch.clamp((pixel_samples + 1.0) / 2.0, min=0.0, max=1.0)
+        self.first_stage_model = self.first_stage_model.cpu()
+        pixel_samples = pixel_samples.cpu().movedim(1,-1)
+        return pixel_samples
+    def encode(self, pixel_samples):
+        self.first_stage_model = self.first_stage_model.to(self.device)
+        pixel_samples = pixel_samples.movedim(-1,1).to(self.device)
+        samples = self.first_stage_model.encode(2. * pixel_samples - 1.).sample() * self.scale_factor
+        self.first_stage_model = self.first_stage_model.cpu()
+        samples = samples.cpu()
+        return samples
+def load_checkpoint(config_path, ckpt_path, output_vae=True, output_clip=True):
+    config = OmegaConf.load(config_path)
+    model_config_params = config['model']['params']
+    clip_config = model_config_params['cond_stage_config']
+    scale_factor = model_config_params['scale_factor']
+    vae_config = model_config_params['first_stage_config']
+    clip = None
+    vae = None
+    class WeightsLoader(torch.nn.Module):
+        pass
+    w = WeightsLoader()
+    load_state_dict_to = []
+    if output_vae:
+        vae = VAE(scale_factor=scale_factor, config=vae_config)
+        w.first_stage_model = vae.first_stage_model
+        load_state_dict_to = [w]
+    if output_clip:
+        clip = CLIP(config=clip_config)
+        w.cond_stage_model = clip.cond_stage_model
+        load_state_dict_to = [w]
+    model = load_model_from_config(config, ckpt_path, verbose=False, load_state_dict_to=load_state_dict_to)
+    return (model, clip, vae)
--- a/comfy/sd1_clip.py
+++ b/comfy/sd1_clip.py
+import os
+from transformers import CLIPTokenizer, CLIPTextModel, CLIPTextConfig
+import torch
+class ClipTokenWeightEncoder:
+    def encode_token_weights(self, token_weight_pairs):
+        z_empty = self.encode(self.empty_tokens)
+        output = []
+        for x in token_weight_pairs:
+            tokens = [list(map(lambda a: a[0], x))]
+            z = self.encode(tokens)
+            for i in range(len(z)):
+                for j in range(len(z[i])):
+                    weight = x[j][1]
+                    z[i][j] = (z[i][j] - z_empty[0][j]) * weight + z_empty[0][j]
+            output += [z]
+        if (len(output) == 0):
+            return self.encode(self.empty_tokens)
+        return torch.cat(output, dim=-2)
+class SD1ClipModel(torch.nn.Module, ClipTokenWeightEncoder):
+    """Uses the CLIP transformer encoder for text (from huggingface)"""
+    LAYERS = [
+        "last",
+        "pooled",
+        "hidden"
+    ]
+    def __init__(self, version="openai/clip-vit-large-patch14", device="cpu", max_length=77,
+                 freeze=True, layer="last", layer_idx=None, textmodel_json_config=None, textmodel_path=None):  # clip-vit-base-patch32
+        super().__init__()
+        assert layer in self.LAYERS
+        if textmodel_path is not None:
+            self.transformer = CLIPTextModel.from_pretrained(textmodel_path)
+        else:
+            if textmodel_json_config is None:
+                textmodel_json_config = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_clip_config.json")
+            config = CLIPTextConfig.from_json_file(textmodel_json_config)
+            self.transformer = CLIPTextModel(config)
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        self.layer_idx = None
+        self.empty_tokens = [[49406] + [49407] * 76]
+        if layer == "hidden":
+            assert layer_idx is not None
+            assert abs(layer_idx) <= 12
+            self.clip_layer(layer_idx)
+    def freeze(self):
+        self.transformer = self.transformer.eval()
+        #self.train = disabled_train
+        for param in self.parameters():
+            param.requires_grad = False
+    def clip_layer(self, layer_idx):
+        if abs(layer_idx) >= 12:
+            self.layer = "last"
+        else:
+            self.layer = "hidden"
+            self.layer_idx = layer_idx
+    def forward(self, tokens):
+        tokens = torch.LongTensor(tokens).to(self.device)
+        outputs = self.transformer(input_ids=tokens, output_hidden_states=self.layer=="hidden")
+        if self.layer == "last":
+            z = outputs.last_hidden_state
+        elif self.layer == "pooled":
+            z = outputs.pooler_output[:, None, :]
+        else:
+            z = outputs.hidden_states[self.layer_idx]
+            z = self.transformer.text_model.final_layer_norm(z)
+        return z
+    def encode(self, tokens):
+        return self(tokens)
+def parse_parentheses(string):
+    result = []
+    current_item = ""
+    nesting_level = 0
+    for char in string:
+        if char == "(":
+            if nesting_level == 0:
+                if current_item:
+                    result.append(current_item)
+                    current_item = "("
+                else:
+                    current_item = "("
+            else:
+                current_item += char
+            nesting_level += 1
+        elif char == ")":
+            nesting_level -= 1
+            if nesting_level == 0:
+                result.append(current_item + ")")
+                current_item = ""
+            else:
+                current_item += char
+        else:
+            current_item += char
+    if current_item:
+        result.append(current_item)
+    return result
+def token_weights(string, current_weight):
+    a = parse_parentheses(string)
+    out = []
+    for x in a:
+        weight = current_weight
+        if len(x) >= 2 and x[-1] == ')' and x[0] == '(':
+            x = x[1:-1]
+            xx = x.rfind(":")
+            weight *= 1.1
+            if xx > 0:
+                try:
+                    weight = float(x[xx+1:])
+                    x = x[:xx]
+                except:
+                    pass
+            out += token_weights(x, weight)
+        else:
+            out += [(x, current_weight)]
+    return out
+def escape_important(text):
+    text = text.replace("\\)", "\0\1")
+    text = text.replace("\\(", "\0\2")
+    return text
+def unescape_important(text):
+    text = text.replace("\0\1", ")")
+    text = text.replace("\0\2", "(")
+    return text
+class SD1Tokenizer:
+    def __init__(self, tokenizer_path=None, max_length=77, pad_with_end=True):
+        if tokenizer_path is None:
+            tokenizer_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "sd1_tokenizer")
+        self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer_path)
+        self.max_length = max_length
+        empty = self.tokenizer('')["input_ids"]
+        self.start_token = empty[0]
+        self.end_token = empty[1]
+        self.pad_with_end = pad_with_end
+        vocab = self.tokenizer.get_vocab()
+        self.inv_vocab = {v: k for k, v in vocab.items()}
+    def tokenize_with_weights(self, text):
+        text = escape_important(text)
+        parsed_weights = token_weights(text, 1.0)
+        tokens = []
+        for t in parsed_weights:
+            tt = self.tokenizer(unescape_important(t[0]))["input_ids"][1:-1]
+            for x in tt:
+                tokens += [(x, t[1])]
+        out_tokens = []
+        for x in range(0, len(tokens), self.max_length - 2):
+            o_token = [(self.start_token, 1.0)] + tokens[x:min(self.max_length - 2 + x, len(tokens))]
+            o_token += [(self.end_token, 1.0)]
+            if self.pad_with_end:
+                o_token +=[(self.end_token, 1.0)] * (self.max_length - len(o_token))
+            else:
+                o_token +=[(0, 1.0)] * (self.max_length - len(o_token))
+            out_tokens += [o_token]
+        return out_tokens
+    def untokenize(self, token_weight_pair):
+        return list(map(lambda a: (a, self.inv_vocab[a[0]]), token_weight_pair))
--- a/comfy/sd1_clip_config.json
+++ b/comfy/sd1_clip_config.json
+{
+  "_name_or_path": "openai/clip-vit-large-patch14",
+  "architectures": [
+    "CLIPTextModel"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 0,
+  "dropout": 0.0,
+  "eos_token_id": 2,
+  "hidden_act": "quick_gelu",
+  "hidden_size": 768,
+  "initializer_factor": 1.0,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 77,
+  "model_type": "clip_text_model",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "projection_dim": 768,
+  "torch_dtype": "float32",
+  "transformers_version": "4.24.0",
+  "vocab_size": 49408
+}
--- a/comfy/sd1_tokenizer/merges.txt
+++ b/comfy/sd1_tokenizer/merges.txt
--- a/comfy/sd1_tokenizer/special_tokens_map.json
+++ b/comfy/sd1_tokenizer/special_tokens_map.json
+{
+  "bos_token": {
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|endoftext|>",
+  "unk_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}
--- a/comfy/sd1_tokenizer/tokenizer_config.json
+++ b/comfy/sd1_tokenizer/tokenizer_config.json
+{
+  "add_prefix_space": false,
+  "bos_token": {
+    "__type": "AddedToken",
+    "content": "<|startoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "do_lower_case": true,
+  "eos_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  },
+  "errors": "replace",
+  "model_max_length": 77,
+  "name_or_path": "openai/clip-vit-large-patch14",
+  "pad_token": "<|endoftext|>",
+  "special_tokens_map_file": "./special_tokens_map.json",
+  "tokenizer_class": "CLIPTokenizer",
+  "unk_token": {
+    "__type": "AddedToken",
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": true,
+    "rstrip": false,
+    "single_word": false
+  }
+}