Initial commit

4007efdd · lijian6 · 4007efdd · 4007efdd · 4007efdd · 4007efdd
Commit 4007efdd authored May 12, 2024 by lijian6
18 changed files
--- a/ldm/modules/midas/midas/vit.py
+++ b/ldm/modules/midas/midas/vit.py
+import torch
+import torch.nn as nn
+import timm
+import types
+import math
+import torch.nn.functional as F
+class Slice(nn.Module):
+    def __init__(self, start_index=1):
+        super(Slice, self).__init__()
+        self.start_index = start_index
+    def forward(self, x):
+        return x[:, self.start_index :]
+class AddReadout(nn.Module):
+    def __init__(self, start_index=1):
+        super(AddReadout, self).__init__()
+        self.start_index = start_index
+    def forward(self, x):
+        if self.start_index == 2:
+            readout = (x[:, 0] + x[:, 1]) / 2
+        else:
+            readout = x[:, 0]
+        return x[:, self.start_index :] + readout.unsqueeze(1)
+class ProjectReadout(nn.Module):
+    def __init__(self, in_features, start_index=1):
+        super(ProjectReadout, self).__init__()
+        self.start_index = start_index
+        self.project = nn.Sequential(nn.Linear(2 * in_features, in_features), nn.GELU())
+    def forward(self, x):
+        readout = x[:, 0].unsqueeze(1).expand_as(x[:, self.start_index :])
+        features = torch.cat((x[:, self.start_index :], readout), -1)
+        return self.project(features)
+class Transpose(nn.Module):
+    def __init__(self, dim0, dim1):
+        super(Transpose, self).__init__()
+        self.dim0 = dim0
+        self.dim1 = dim1
+    def forward(self, x):
+        x = x.transpose(self.dim0, self.dim1)
+        return x
+def forward_vit(pretrained, x):
+    b, c, h, w = x.shape
+    glob = pretrained.model.forward_flex(x)
+    layer_1 = pretrained.activations["1"]
+    layer_2 = pretrained.activations["2"]
+    layer_3 = pretrained.activations["3"]
+    layer_4 = pretrained.activations["4"]
+    layer_1 = pretrained.act_postprocess1[0:2](layer_1)
+    layer_2 = pretrained.act_postprocess2[0:2](layer_2)
+    layer_3 = pretrained.act_postprocess3[0:2](layer_3)
+    layer_4 = pretrained.act_postprocess4[0:2](layer_4)
+    unflatten = nn.Sequential(
+        nn.Unflatten(
+            2,
+            torch.Size(
+                [
+                    h // pretrained.model.patch_size[1],
+                    w // pretrained.model.patch_size[0],
+                ]
+            ),
+        )
+    )
+    if layer_1.ndim == 3:
+        layer_1 = unflatten(layer_1)
+    if layer_2.ndim == 3:
+        layer_2 = unflatten(layer_2)
+    if layer_3.ndim == 3:
+        layer_3 = unflatten(layer_3)
+    if layer_4.ndim == 3:
+        layer_4 = unflatten(layer_4)
+    layer_1 = pretrained.act_postprocess1[3 : len(pretrained.act_postprocess1)](layer_1)
+    layer_2 = pretrained.act_postprocess2[3 : len(pretrained.act_postprocess2)](layer_2)
+    layer_3 = pretrained.act_postprocess3[3 : len(pretrained.act_postprocess3)](layer_3)
+    layer_4 = pretrained.act_postprocess4[3 : len(pretrained.act_postprocess4)](layer_4)
+    return layer_1, layer_2, layer_3, layer_4
+def _resize_pos_embed(self, posemb, gs_h, gs_w):
+    posemb_tok, posemb_grid = (
+        posemb[:, : self.start_index],
+        posemb[0, self.start_index :],
+    )
+    gs_old = int(math.sqrt(len(posemb_grid)))
+    posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
+    posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
+    posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
+    posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
+    return posemb
+def forward_flex(self, x):
+    b, c, h, w = x.shape
+    pos_embed = self._resize_pos_embed(
+        self.pos_embed, h // self.patch_size[1], w // self.patch_size[0]
+    )
+    B = x.shape[0]
+    if hasattr(self.patch_embed, "backbone"):
+        x = self.patch_embed.backbone(x)
+        if isinstance(x, (list, tuple)):
+            x = x[-1]  # last feature if backbone outputs list/tuple of features
+    x = self.patch_embed.proj(x).flatten(2).transpose(1, 2)
+    if getattr(self, "dist_token", None) is not None:
+        cls_tokens = self.cls_token.expand(
+            B, -1, -1
+        )  # stole cls_tokens impl from Phil Wang, thanks
+        dist_token = self.dist_token.expand(B, -1, -1)
+        x = torch.cat((cls_tokens, dist_token, x), dim=1)
+    else:
+        cls_tokens = self.cls_token.expand(
+            B, -1, -1
+        )  # stole cls_tokens impl from Phil Wang, thanks
+        x = torch.cat((cls_tokens, x), dim=1)
+    x = x + pos_embed
+    x = self.pos_drop(x)
+    for blk in self.blocks:
+        x = blk(x)
+    x = self.norm(x)
+    return x
+activations = {}
+def get_activation(name):
+    def hook(model, input, output):
+        activations[name] = output
+    return hook
+def get_readout_oper(vit_features, features, use_readout, start_index=1):
+    if use_readout == "ignore":
+        readout_oper = [Slice(start_index)] * len(features)
+    elif use_readout == "add":
+        readout_oper = [AddReadout(start_index)] * len(features)
+    elif use_readout == "project":
+        readout_oper = [
+            ProjectReadout(vit_features, start_index) for out_feat in features
+        ]
+    else:
+        assert (
+            False
+        ), "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
+    return readout_oper
+def _make_vit_b16_backbone(
+    model,
+    features=[96, 192, 384, 768],
+    size=[384, 384],
+    hooks=[2, 5, 8, 11],
+    vit_features=768,
+    use_readout="ignore",
+    start_index=1,
+):
+    pretrained = nn.Module()
+    pretrained.model = model
+    pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
+    pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
+    pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
+    pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
+    pretrained.activations = activations
+    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
+    # 32, 48, 136, 384
+    pretrained.act_postprocess1 = nn.Sequential(
+        readout_oper[0],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[0],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.ConvTranspose2d(
+            in_channels=features[0],
+            out_channels=features[0],
+            kernel_size=4,
+            stride=4,
+            padding=0,
+            bias=True,
+            dilation=1,
+            groups=1,
+        ),
+    )
+    pretrained.act_postprocess2 = nn.Sequential(
+        readout_oper[1],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[1],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.ConvTranspose2d(
+            in_channels=features[1],
+            out_channels=features[1],
+            kernel_size=2,
+            stride=2,
+            padding=0,
+            bias=True,
+            dilation=1,
+            groups=1,
+        ),
+    )
+    pretrained.act_postprocess3 = nn.Sequential(
+        readout_oper[2],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[2],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+    )
+    pretrained.act_postprocess4 = nn.Sequential(
+        readout_oper[3],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[3],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.Conv2d(
+            in_channels=features[3],
+            out_channels=features[3],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+        ),
+    )
+    pretrained.model.start_index = start_index
+    pretrained.model.patch_size = [16, 16]
+    # We inject this function into the VisionTransformer instances so that
+    # we can use it with interpolated position embeddings without modifying the library source.
+    pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
+    pretrained.model._resize_pos_embed = types.MethodType(
+        _resize_pos_embed, pretrained.model
+    )
+    return pretrained
+def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
+    model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)
+    hooks = [5, 11, 17, 23] if hooks == None else hooks
+    return _make_vit_b16_backbone(
+        model,
+        features=[256, 512, 1024, 1024],
+        hooks=hooks,
+        vit_features=1024,
+        use_readout=use_readout,
+    )
+def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None):
+    model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)
+    hooks = [2, 5, 8, 11] if hooks == None else hooks
+    return _make_vit_b16_backbone(
+        model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
+    )
+def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None):
+    model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)
+    hooks = [2, 5, 8, 11] if hooks == None else hooks
+    return _make_vit_b16_backbone(
+        model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
+    )
+def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None):
+    model = timm.create_model(
+        "vit_deit_base_distilled_patch16_384", pretrained=pretrained
+    )
+    hooks = [2, 5, 8, 11] if hooks == None else hooks
+    return _make_vit_b16_backbone(
+        model,
+        features=[96, 192, 384, 768],
+        hooks=hooks,
+        use_readout=use_readout,
+        start_index=2,
+    )
+def _make_vit_b_rn50_backbone(
+    model,
+    features=[256, 512, 768, 768],
+    size=[384, 384],
+    hooks=[0, 1, 8, 11],
+    vit_features=768,
+    use_vit_only=False,
+    use_readout="ignore",
+    start_index=1,
+):
+    pretrained = nn.Module()
+    pretrained.model = model
+    if use_vit_only == True:
+        pretrained.model.blocks[hooks[0]].register_forward_hook(get_activation("1"))
+        pretrained.model.blocks[hooks[1]].register_forward_hook(get_activation("2"))
+    else:
+        pretrained.model.patch_embed.backbone.stages[0].register_forward_hook(
+            get_activation("1")
+        )
+        pretrained.model.patch_embed.backbone.stages[1].register_forward_hook(
+            get_activation("2")
+        )
+    pretrained.model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
+    pretrained.model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
+    pretrained.activations = activations
+    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
+    if use_vit_only == True:
+        pretrained.act_postprocess1 = nn.Sequential(
+            readout_oper[0],
+            Transpose(1, 2),
+            nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+            nn.Conv2d(
+                in_channels=vit_features,
+                out_channels=features[0],
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            ),
+            nn.ConvTranspose2d(
+                in_channels=features[0],
+                out_channels=features[0],
+                kernel_size=4,
+                stride=4,
+                padding=0,
+                bias=True,
+                dilation=1,
+                groups=1,
+            ),
+        )
+        pretrained.act_postprocess2 = nn.Sequential(
+            readout_oper[1],
+            Transpose(1, 2),
+            nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+            nn.Conv2d(
+                in_channels=vit_features,
+                out_channels=features[1],
+                kernel_size=1,
+                stride=1,
+                padding=0,
+            ),
+            nn.ConvTranspose2d(
+                in_channels=features[1],
+                out_channels=features[1],
+                kernel_size=2,
+                stride=2,
+                padding=0,
+                bias=True,
+                dilation=1,
+                groups=1,
+            ),
+        )
+    else:
+        pretrained.act_postprocess1 = nn.Sequential(
+            nn.Identity(), nn.Identity(), nn.Identity()
+        )
+        pretrained.act_postprocess2 = nn.Sequential(
+            nn.Identity(), nn.Identity(), nn.Identity()
+        )
+    pretrained.act_postprocess3 = nn.Sequential(
+        readout_oper[2],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[2],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+    )
+    pretrained.act_postprocess4 = nn.Sequential(
+        readout_oper[3],
+        Transpose(1, 2),
+        nn.Unflatten(2, torch.Size([size[0] // 16, size[1] // 16])),
+        nn.Conv2d(
+            in_channels=vit_features,
+            out_channels=features[3],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+        ),
+        nn.Conv2d(
+            in_channels=features[3],
+            out_channels=features[3],
+            kernel_size=3,
+            stride=2,
+            padding=1,
+        ),
+    )
+    pretrained.model.start_index = start_index
+    pretrained.model.patch_size = [16, 16]
+    # We inject this function into the VisionTransformer instances so that
+    # we can use it with interpolated position embeddings without modifying the library source.
+    pretrained.model.forward_flex = types.MethodType(forward_flex, pretrained.model)
+    # We inject this function into the VisionTransformer instances so that
+    # we can use it with interpolated position embeddings without modifying the library source.
+    pretrained.model._resize_pos_embed = types.MethodType(
+        _resize_pos_embed, pretrained.model
+    )
+    return pretrained
+def _make_pretrained_vitb_rn50_384(
+    pretrained, use_readout="ignore", hooks=None, use_vit_only=False
+):
+    model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)
+    hooks = [0, 1, 8, 11] if hooks == None else hooks
+    return _make_vit_b_rn50_backbone(
+        model,
+        features=[256, 512, 768, 768],
+        size=[384, 384],
+        hooks=hooks,
+        use_vit_only=use_vit_only,
+        use_readout=use_readout,
+    )
--- a/ldm/modules/midas/utils.py
+++ b/ldm/modules/midas/utils.py
+"""Utils for monoDepth."""
+import sys
+import re
+import numpy as np
+import cv2
+import torch
+def read_pfm(path):
+    """Read pfm file.
+    Args:
+        path (str): path to file
+    Returns:
+        tuple: (data, scale)
+    """
+    with open(path, "rb") as file:
+        color = None
+        width = None
+        height = None
+        scale = None
+        endian = None
+        header = file.readline().rstrip()
+        if header.decode("ascii") == "PF":
+            color = True
+        elif header.decode("ascii") == "Pf":
+            color = False
+        else:
+            raise Exception("Not a PFM file: " + path)
+        dim_match = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
+        if dim_match:
+            width, height = list(map(int, dim_match.groups()))
+        else:
+            raise Exception("Malformed PFM header.")
+        scale = float(file.readline().decode("ascii").rstrip())
+        if scale < 0:
+            # little-endian
+            endian = "<"
+            scale = -scale
+        else:
+            # big-endian
+            endian = ">"
+        data = np.fromfile(file, endian + "f")
+        shape = (height, width, 3) if color else (height, width)
+        data = np.reshape(data, shape)
+        data = np.flipud(data)
+        return data, scale
+def write_pfm(path, image, scale=1):
+    """Write pfm file.
+    Args:
+        path (str): pathto file
+        image (array): data
+        scale (int, optional): Scale. Defaults to 1.
+    """
+    with open(path, "wb") as file:
+        color = None
+        if image.dtype.name != "float32":
+            raise Exception("Image dtype must be float32.")
+        image = np.flipud(image)
+        if len(image.shape) == 3 and image.shape[2] == 3:  # color image
+            color = True
+        elif (
+            len(image.shape) == 2 or len(image.shape) == 3 and image.shape[2] == 1
+        ):  # greyscale
+            color = False
+        else:
+            raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")
+        file.write("PF\n" if color else "Pf\n".encode())
+        file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
+        endian = image.dtype.byteorder
+        if endian == "<" or endian == "=" and sys.byteorder == "little":
+            scale = -scale
+        file.write("%f\n".encode() % scale)
+        image.tofile(file)
+def read_image(path):
+    """Read image and output RGB image (0-1).
+    Args:
+        path (str): path to file
+    Returns:
+        array: RGB image (0-1)
+    """
+    img = cv2.imread(path)
+    if img.ndim == 2:
+        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0
+    return img
+def resize_image(img):
+    """Resize image and make it fit for network.
+    Args:
+        img (array): image
+    Returns:
+        tensor: data ready for network
+    """
+    height_orig = img.shape[0]
+    width_orig = img.shape[1]
+    if width_orig > height_orig:
+        scale = width_orig / 384
+    else:
+        scale = height_orig / 384
+    height = (np.ceil(height_orig / scale / 32) * 32).astype(int)
+    width = (np.ceil(width_orig / scale / 32) * 32).astype(int)
+    img_resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)
+    img_resized = (
+        torch.from_numpy(np.transpose(img_resized, (2, 0, 1))).contiguous().float()
+    )
+    img_resized = img_resized.unsqueeze(0)
+    return img_resized
+def resize_depth(depth, width, height):
+    """Resize depth map and bring to CPU (numpy).
+    Args:
+        depth (tensor): depth
+        width (int): image width
+        height (int): image height
+    Returns:
+        array: processed depth
+    """
+    depth = torch.squeeze(depth[0, :, :, :]).to("cpu")
+    depth_resized = cv2.resize(
+        depth.numpy(), (width, height), interpolation=cv2.INTER_CUBIC
+    )
+    return depth_resized
+def write_depth(path, depth, bits=1):
+    """Write depth map to pfm and png file.
+    Args:
+        path (str): filepath without extension
+        depth (array): depth
+    """
+    write_pfm(path + ".pfm", depth.astype(np.float32))
+    depth_min = depth.min()
+    depth_max = depth.max()
+    max_val = (2**(8*bits))-1
+    if depth_max - depth_min > np.finfo("float").eps:
+        out = max_val * (depth - depth_min) / (depth_max - depth_min)
+    else:
+        out = np.zeros(depth.shape, dtype=depth.type)
+    if bits == 1:
+        cv2.imwrite(path + ".png", out.astype("uint8"))
+    elif bits == 2:
+        cv2.imwrite(path + ".png", out.astype("uint16"))
+    return
--- a/ldm/util.py
+++ b/ldm/util.py
+import importlib
+import torch
+from torch import optim
+import numpy as np
+from inspect import isfunction
+from PIL import Image, ImageDraw, ImageFont
+def autocast(f):
+    def do_autocast(*args, **kwargs):
+        with torch.cuda.amp.autocast(enabled=True,
+                                     dtype=torch.get_autocast_gpu_dtype(),
+                                     cache_enabled=torch.is_autocast_cache_enabled()):
+            return f(*args, **kwargs)
+    return do_autocast
+def log_txt_as_img(wh, xc, size=10):
+    # wh a tuple of (width, height)
+    # xc a list of captions to plot
+    b = len(xc)
+    txts = list()
+    for bi in range(b):
+        txt = Image.new("RGB", wh, color="white")
+        draw = ImageDraw.Draw(txt)
+        font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)
+        nc = int(40 * (wh[0] / 256))
+        lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc))
+        try:
+            draw.text((0, 0), lines, fill="black", font=font)
+        except UnicodeEncodeError:
+            print("Cant encode string for logging. Skipping.")
+        txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
+        txts.append(txt)
+    txts = np.stack(txts)
+    txts = torch.tensor(txts)
+    return txts
+def ismap(x):
+    if not isinstance(x, torch.Tensor):
+        return False
+    return (len(x.shape) == 4) and (x.shape[1] > 3)
+def isimage(x):
+    if not isinstance(x,torch.Tensor):
+        return False
+    return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
+def exists(x):
+    return x is not None
+def default(val, d):
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+def mean_flat(tensor):
+    """
+    https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
+    Take the mean over all non-batch dimensions.
+    """
+    return tensor.mean(dim=list(range(1, len(tensor.shape))))
+def count_params(model, verbose=False):
+    total_params = sum(p.numel() for p in model.parameters())
+    if verbose:
+        print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
+    return total_params
+def instantiate_from_config(config):
+    if not "target" in config:
+        if config == '__is_first_stage__':
+            return None
+        elif config == "__is_unconditional__":
+            return None
+        raise KeyError("Expected key `target` to instantiate.")
+    return get_obj_from_str(config["target"])(**config.get("params", dict()))
+def get_obj_from_str(string, reload=False):
+    module, cls = string.rsplit(".", 1)
+    if reload:
+        module_imp = importlib.import_module(module)
+        importlib.reload(module_imp)
+    return getattr(importlib.import_module(module, package=None), cls)
+class AdamWwithEMAandWings(optim.Optimizer):
+    # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298
+    def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8,  # TODO: check hyperparameters before using
+                 weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999,   # ema decay to match previous code
+                 ema_power=1., param_names=()):
+        """AdamW that saves EMA versions of the parameters."""
+        if not 0.0 <= lr:
+            raise ValueError("Invalid learning rate: {}".format(lr))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {}".format(eps))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
+        if not 0.0 <= betas[1] < 1.0:
+            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
+        if not 0.0 <= weight_decay:
+            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
+        if not 0.0 <= ema_decay <= 1.0:
+            raise ValueError("Invalid ema_decay value: {}".format(ema_decay))
+        defaults = dict(lr=lr, betas=betas, eps=eps,
+                        weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay,
+                        ema_power=ema_power, param_names=param_names)
+        super().__init__(params, defaults)
+    def __setstate__(self, state):
+        super().__setstate__(state)
+        for group in self.param_groups:
+            group.setdefault('amsgrad', False)
+    @torch.no_grad()
+    def step(self, closure=None):
+        """Performs a single optimization step.
+        Args:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+        """
+        loss = None
+        if closure is not None:
+            with torch.enable_grad():
+                loss = closure()
+        for group in self.param_groups:
+            params_with_grad = []
+            grads = []
+            exp_avgs = []
+            exp_avg_sqs = []
+            ema_params_with_grad = []
+            state_sums = []
+            max_exp_avg_sqs = []
+            state_steps = []
+            amsgrad = group['amsgrad']
+            beta1, beta2 = group['betas']
+            ema_decay = group['ema_decay']
+            ema_power = group['ema_power']
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                params_with_grad.append(p)
+                if p.grad.is_sparse:
+                    raise RuntimeError('AdamW does not support sparse gradients')
+                grads.append(p.grad)
+                state = self.state[p]
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    if amsgrad:
+                        # Maintains max of all exp. moving avg. of sq. grad. values
+                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
+                    # Exponential moving average of parameter values
+                    state['param_exp_avg'] = p.detach().float().clone()
+                exp_avgs.append(state['exp_avg'])
+                exp_avg_sqs.append(state['exp_avg_sq'])
+                ema_params_with_grad.append(state['param_exp_avg'])
+                if amsgrad:
+                    max_exp_avg_sqs.append(state['max_exp_avg_sq'])
+                # update the steps for each param group update
+                state['step'] += 1
+                # record the step after step update
+                state_steps.append(state['step'])
+            optim._functional.adamw(params_with_grad,
+                    grads,
+                    exp_avgs,
+                    exp_avg_sqs,
+                    max_exp_avg_sqs,
+                    state_steps,
+                    amsgrad=amsgrad,
+                    beta1=beta1,
+                    beta2=beta2,
+                    lr=group['lr'],
+                    weight_decay=group['weight_decay'],
+                    eps=group['eps'],
+                    maximize=False)
+            cur_ema_decay = min(ema_decay, 1 - state['step'] ** -ema_power)
+            for param, ema_param in zip(params_with_grad, ema_params_with_grad):
+                ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay)
+        return loss
\ No newline at end of file
--- a/modelcard.md
+++ b/modelcard.md
+# Stable Diffusion v2 Model Card
+This model card focuses on the models associated with the Stable Diffusion v2, available [here](https://github.com/Stability-AI/stablediffusion/).
+## Model Details
+- **Developed by:** Robin Rombach, Patrick Esser
+- **Model type:** Diffusion-based text-to-image generation model
+- **Language(s):** English
+- **License:** CreativeML Open RAIL++-M License
+- **Model Description:** This is a model that can be used to generate and modify images based on text prompts. It is a [Latent Diffusion Model](https://arxiv.org/abs/2112.10752) that uses a fixed, pretrained text encoder ([OpenCLIP-ViT/H](https://github.com/mlfoundations/open_clip)).
+- **Resources for more information:** [GitHub Repository](https://github.com/Stability-AI/).
+- **Cite as:**
+      @InProceedings{Rombach_2022_CVPR,
+          author    = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn},
+          title     = {High-Resolution Image Synthesis With Latent Diffusion Models},
+          booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+          month     = {June},
+          year      = {2022},
+          pages     = {10684-10695}
+      }
+# Uses
+## Direct Use 
+The model is intended for research purposes only. Possible research areas and tasks include
+- Safe deployment of models which have the potential to generate harmful content.
+- Probing and understanding the limitations and biases of generative models.
+- Generation of artworks and use in design and other artistic processes.
+- Applications in educational or creative tools.
+- Research on generative models.
+Excluded uses are described below.
+ ### Misuse, Malicious Use, and Out-of-Scope Use
+_Note: This section is originally taken from the [DALLE-MINI model card](https://huggingface.co/dalle-mini/dalle-mini), was used for Stable Diffusion v1, but applies in the same way to Stable Diffusion v2_.
+The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes.
+#### Out-of-Scope Use
+The model was not trained to be factual or true representations of people or events, and therefore using the model to generate such content is out-of-scope for the abilities of this model.
+#### Misuse and Malicious Use
+Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to:
+- Generating demeaning, dehumanizing, or otherwise harmful representations of people or their environments, cultures, religions, etc.
+- Intentionally promoting or propagating discriminatory content or harmful stereotypes.
+- Impersonating individuals without their consent.
+- Sexual content without consent of the people who might see it.
+- Mis- and disinformation
+- Representations of egregious violence and gore
+- Sharing of copyrighted or licensed material in violation of its terms of use.
+- Sharing content that is an alteration of copyrighted or licensed material in violation of its terms of use.
+## Limitations and Bias
+### Limitations
+- The model does not achieve perfect photorealism
+- The model cannot render legible text
+- The model does not perform well on more difficult tasks which involve compositionality, such as rendering an image corresponding to “A red cube on top of a blue sphere”
+- Faces and people in general may not be generated properly.
+- The model was trained mainly with English captions and will not work as well in other languages.
+- The autoencoding part of the model is lossy
+- The model was trained on a subset of the large-scale dataset
+  [LAION-5B](https://laion.ai/blog/laion-5b/), which contains adult, violent and sexual content. To partially mitigate this, we have filtered the dataset using LAION's NSFW detector (see Training section).
+### Bias
+While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases. 
+Stable Diffusion v2 was primarily trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/), 
+which consists of images that are limited to English descriptions. 
+Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for. 
+This affects the overall output of the model, as white and western cultures are often set as the default. Further, the 
+ability of the model to generate content with non-English prompts is significantly worse than with English-language prompts.
+Stable Diffusion v2 mirrors and exacerbates biases to such a degree that viewer discretion must be advised irrespective of the input or its intent.
+## Training
+**Training Data**
+The model developers used the following dataset for training the model:
+- LAION-5B and subsets (details below). The training data is further filtered using LAION's NSFW detector. For more details, please refer to LAION-5B's [NeurIPS 2022](https://openreview.net/forum?id=M3Y74vmsMcY) paper and reviewer discussions on the topic.
+**Training Procedure**
+Stable Diffusion v2 is a latent diffusion model which combines an autoencoder with a diffusion model that is trained in the latent space of the autoencoder. During training, 
+- Images are encoded through an encoder, which turns images into latent representations. The autoencoder uses a relative downsampling factor of 8 and maps images of shape H x W x 3 to latents of shape H/f x W/f x 4
+- Text prompts are encoded through the OpenCLIP-ViT/H text-encoder.
+- The output of the text encoder is fed into the UNet backbone of the latent diffusion model via cross-attention.
+- The loss is a reconstruction objective between the noise that was added to the latent and the prediction made by the UNet. We also use the so-called _v-objective_, see https://arxiv.org/abs/2202.00512.
+We currently provide the following checkpoints, for various versions:
+### Version 2.1
+- `512-base-ema.ckpt`: Fine-tuned on `512-base-ema.ckpt` 2.0 with 220k extra steps taken, with `punsafe=0.98` on the same dataset.
+- `768-v-ema.ckpt`: Resumed from `768-v-ema.ckpt` 2.0 with an additional 55k steps on the same dataset (`punsafe=0.1`), and then fine-tuned for another 155k extra steps with `punsafe=0.98`.
+**SD-unCLIP 2.1** is a finetuned version of Stable Diffusion 2.1, modified to accept (noisy) CLIP image embedding in addition to the text prompt, and can be used to create image variations ([Examples](https://github.com/Stability-AI/stablediffusion/blob/main/doc/UNCLIP.MD)) or can be chained with text-to-image CLIP priors. The amount of noise added to the image embedding can be specified via the `noise_level` (0 means no noise, 1000 full noise).
+If you plan on building applications on top of the model that the general public may use, you are responsible for adding the guardrails to minimize or prevent misuse of the application, especially for use-cases highlighted in the earlier section, Misuse, Malicious Use, and Out-of-Scope Use.
+A public demo of SD-unCLIP is already available at [clipdrop.co/stable-diffusion-reimagine](https://clipdrop.co/stable-diffusion-reimagine)
+### Version 2.0
+- `512-base-ema.ckpt`: 550k steps at resolution `256x256` on a subset of [LAION-5B](https://laion.ai/blog/laion-5b/) filtered for explicit pornographic material, using the [LAION-NSFW classifier](https://github.com/LAION-AI/CLIP-based-NSFW-Detector) with `punsafe=0.1` and an [aesthetic score](https://github.com/christophschuhmann/improved-aesthetic-predictor) >= `4.5`.
+  850k steps at resolution `512x512` on the same dataset with resolution `>= 512x512`.
+- `768-v-ema.ckpt`: Resumed from `512-base-ema.ckpt` and trained for 150k steps using a [v-objective](https://arxiv.org/abs/2202.00512) on the same dataset. Resumed for another 140k steps on a `768x768` subset of our dataset.
+- `512-depth-ema.ckpt`: Resumed from `512-base-ema.ckpt` and finetuned for 200k steps. Added an extra input channel to process the (relative) depth prediction produced by [MiDaS](https://github.com/isl-org/MiDaS) (`dpt_hybrid`) which is used as an additional conditioning.
+The additional input channels of the U-Net which process this extra information were zero-initialized.
+- `512-inpainting-ema.ckpt`: Resumed from `512-base-ema.ckpt` and trained for another 200k steps. Follows the mask-generation strategy presented in [LAMA](https://github.com/saic-mdal/lama) which, in combination with the latent VAE representations of the masked image, are used as an additional conditioning.
+The additional input channels of the U-Net which process this extra information were zero-initialized. The same strategy was used to train the [1.5-inpainting checkpoint](https://github.com/saic-mdal/lama).
+- `x4-upscaling-ema.ckpt`: Trained for 1.25M steps on a 10M subset of LAION containing images `>2048x2048`. The model was trained on crops of size `512x512` and is a text-guided [latent upscaling diffusion model](https://arxiv.org/abs/2112.10752).
+In addition to the textual input, it receives a `noise_level` as an input parameter, which can be used to add noise to the low-resolution input according to a [predefined diffusion schedule](configs/stable-diffusion/x4-upscaling.yaml). 
+- **Hardware:** 32 x 8 x A100 GPUs
+- **Optimizer:** AdamW
+- **Gradient Accumulations**: 1
+- **Batch:** 32 x 8 x 2 x 4 = 2048
+- **Learning rate:** warmup to 0.0001 for 10,000 steps and then kept constant
+## Evaluation Results 
+Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
+5.0, 6.0, 7.0, 8.0) and 50 DDIM sampling steps show the relative improvements of the checkpoints:
+![pareto](assets/model-variants.jpg) 
+Evaluated using 50 DDIM steps and 10000 random prompts from the COCO2017 validation set, evaluated at 512x512 resolution.  Not optimized for FID scores.
+## Environmental Impact
+**Stable Diffusion v1** **Estimated Emissions**
+Based on that information, we estimate the following CO2 emissions using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). The hardware, runtime, cloud provider, and compute region were utilized to estimate the carbon impact.
+- **Hardware Type:** A100 PCIe 40GB
+- **Hours used:** 200000
+- **Cloud Provider:** AWS
+- **Compute Region:** US-east
+- **Carbon Emitted (Power consumption x Time x Carbon produced based on location of power grid):** 15000 kg CO2 eq.
+## Citation
+    @InProceedings{Rombach_2022_CVPR,
+        author    = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn},
+        title     = {High-Resolution Image Synthesis With Latent Diffusion Models},
+        booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+        month     = {June},
+        year      = {2022},
+        pages     = {10684-10695}
+    }
+*This model card was written by: Robin Rombach, Patrick Esser and David Ha and is based on the [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion/blob/main/Stable_Diffusion_v1_Model_Card.md) and [DALL-E Mini model card](https://huggingface.co/dalle-mini/dalle-mini).*
--- a/readme.txt
+++ b/readme.txt
+install dtk and hyhal
+python scripts/txt2img.py --device cuda --ckpt /public/home/lijian/model/v2-1_512-ema-pruned.ckpt --config ./configs/stable-diffusion/v2-inference.yaml --n_iter 1 --n_samples 1
+pip install diffusers==0.27.0
+change model path
+python test_diffusers.py
--- a/requirements.txt
+++ b/requirements.txt
+albumentations==0.4.3
+opencv-python
+pudb==2019.2
+imageio==2.9.0
+imageio-ffmpeg==0.4.2
+pytorch-lightning==1.4.2
+torchmetrics==0.6
+omegaconf==2.1.1
+test-tube>=0.7.5
+streamlit>=0.73.1
+einops==0.3.0
+transformers==4.19.2
+webdataset==0.2.5
+open-clip-torch==2.7.0
+gradio==3.13.2
+kornia==0.6
+invisible-watermark>=0.1.5
+streamlit-drawable-canvas==0.8.0
+-e .
--- a/scripts/gradio/depth2img.py
+++ b/scripts/gradio/depth2img.py
+import sys
+import torch
+import numpy as np
+import gradio as gr
+from PIL import Image
+from omegaconf import OmegaConf
+from einops import repeat, rearrange
+from pytorch_lightning import seed_everything
+from imwatermark import WatermarkEncoder
+from scripts.txt2img import put_watermark
+from ldm.util import instantiate_from_config
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.data.util import AddMiDaS
+torch.set_grad_enabled(False)
+def initialize_model(config, ckpt):
+    config = OmegaConf.load(config)
+    model = instantiate_from_config(config.model)
+    model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
+    device = torch.device(
+        "cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = model.to(device)
+    sampler = DDIMSampler(model)
+    return sampler
+def make_batch_sd(
+        image,
+        txt,
+        device,
+        num_samples=1,
+        model_type="dpt_hybrid"
+):
+    image = np.array(image.convert("RGB"))
+    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+    # sample['jpg'] is tensor hwc in [-1, 1] at this point
+    midas_trafo = AddMiDaS(model_type=model_type)
+    batch = {
+        "jpg": image,
+        "txt": num_samples * [txt],
+    }
+    batch = midas_trafo(batch)
+    batch["jpg"] = rearrange(batch["jpg"], 'h w c -> 1 c h w')
+    batch["jpg"] = repeat(batch["jpg"].to(device=device),
+                          "1 ... -> n ...", n=num_samples)
+    batch["midas_in"] = repeat(torch.from_numpy(batch["midas_in"][None, ...]).to(
+        device=device), "1 ... -> n ...", n=num_samples)
+    return batch
+def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=None,
+          do_full_sample=False):
+    """Run depth-conditioned image-to-image sampling and return the resulting images.
+
+    Args:
+        sampler: sampler object exposing ``model``, ``stochastic_encode`` and ``decode``.
+        image: input image; it is handed to ``make_batch_sd``, which calls ``.convert("RGB")`` on it.
+        prompt: text prompt used for the cross-attention conditioning.
+        t_enc: step index passed to ``stochastic_encode`` and ``decode``.
+        seed: value passed to ``seed_everything``.
+        scale: unconditional guidance scale forwarded to ``sampler.decode``.
+        num_samples: number of samples in the batch.
+        callback: forwarded unchanged to ``sampler.decode``.
+        do_full_sample: when true, start from pure Gaussian noise instead of the
+            noised encoding of the input image.
+
+    Returns:
+        A list whose first element is a PIL image of the predicted depth map
+        (first sample, rescaled to 0..255), followed by one watermarked PIL
+        image per generated sample.
+    """
+    device = torch.device(
+        "cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = sampler.model
+    seed_everything(seed)
+    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
+    wm = "SDV2"
+    wm_encoder = WatermarkEncoder()
+    wm_encoder.set_watermark('bytes', wm.encode('utf-8'))
+    # NOTE(review): autocast is hard-coded to "cuda" even when `device` above
+    # resolved to the CPU - confirm the intended behaviour on CPU-only hosts.
+    with torch.no_grad(),\
+            torch.autocast("cuda"):
+        batch = make_batch_sd(
+            image, txt=prompt, device=device, num_samples=num_samples)
+        z = model.get_first_stage_encoding(model.encode_first_stage(
+            batch[model.first_stage_key]))  # move to latent space
+        c = model.cond_stage_model.encode(batch["txt"])
+        c_cat = list()
+        # NOTE(review): `depth_image` is only bound inside this loop, so the
+        # return statement relies on model.concat_keys being non-empty.
+        for ck in model.concat_keys:
+            cc = batch[ck]
+            cc = model.depth_model(cc)
+            # Per-sample min/max over all non-batch dimensions.
+            depth_min, depth_max = torch.amin(cc, dim=[1, 2, 3], keepdim=True), torch.amax(cc, dim=[1, 2, 3],
+                                                                                           keepdim=True)
+            # Rescale to [0, 1] for display only; a constant depth map would
+            # make the denominator zero.
+            display_depth = (cc - depth_min) / (depth_max - depth_min)
+            depth_image = Image.fromarray(
+                (display_depth[0, 0, ...].cpu().numpy() * 255.).astype(np.uint8))
+            # Resize the depth prediction to the spatial size of the latent.
+            cc = torch.nn.functional.interpolate(
+                cc,
+                size=z.shape[2:],
+                mode="bicubic",
+                align_corners=False,
+            )
+            # Min/max are recomputed because the values changed with the resize;
+            # the conditioning is rescaled to [-1, 1].
+            depth_min, depth_max = torch.amin(cc, dim=[1, 2, 3], keepdim=True), torch.amax(cc, dim=[1, 2, 3],
+                                                                                           keepdim=True)
+            cc = 2. * (cc - depth_min) / (depth_max - depth_min) - 1.
+            c_cat.append(cc)
+        c_cat = torch.cat(c_cat, dim=1)
+        # cond
+        cond = {"c_concat": [c_cat], "c_crossattn": [c]}
+        # uncond cond
+        # The unconditional branch keeps the same concat conditioning and only
+        # swaps the text embedding for that of the empty prompt.
+        uc_cross = model.get_unconditional_conditioning(num_samples, "")
+        uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]}
+        if not do_full_sample:
+            # encode (scaled latent)
+            z_enc = sampler.stochastic_encode(
+                z, torch.tensor([t_enc] * num_samples).to(model.device))
+        else:
+            z_enc = torch.randn_like(z)
+        # decode it
+        samples = sampler.decode(z_enc, cond, t_enc, unconditional_guidance_scale=scale,
+                                 unconditional_conditioning=uc_full, callback=callback)
+        x_samples_ddim = model.decode_first_stage(samples)
+        # Map from [-1, 1] to [0, 1], then to channel-last arrays scaled to 0..255.
+        result = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+        result = result.cpu().numpy().transpose(0, 2, 3, 1) * 255
+    return [depth_image] + [put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder) for img in result]
+def pad_image(input_image):
+    pad_w, pad_h = np.max(((2, 2), np.ceil(
+        np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
+    im_padded = Image.fromarray(
+        np.pad(np.array(input_image), ((0, pad_h), (0, pad_w), (0, 0)), mode='edge'))
+    return im_padded
+def predict(input_image, prompt, steps, num_samples, scale, seed, eta, strength):
+    init_image = input_image.convert("RGB")
+    image = pad_image(init_image)  # resize to integer multiple of 32
+    sampler.make_schedule(steps, ddim_eta=eta, verbose=True)
+    assert 0. <= strength <= 1., 'can only work with strength in [0.0, 1.0]'
+    do_full_sample = strength == 1.
+    t_enc = min(int(strength * steps), steps-1)
+    result = paint(
+        sampler=sampler,
+        image=image,
+        prompt=prompt,
+        t_enc=t_enc,
+        seed=seed,
+        scale=scale,
+        num_samples=num_samples,
+        callback=None,
+        do_full_sample=do_full_sample
+    )
+    return result
+# Script entry point: argv[1] is the config path, argv[2] the checkpoint path.
+# `sampler` is a module-level global that predict() reads.
+sampler = initialize_model(sys.argv[1], sys.argv[2])
+# Build the Gradio UI; the nesting of the `with` blocks defines the layout.
+block = gr.Blocks().queue()
+with block:
+    with gr.Row():
+        gr.Markdown("## Stable Diffusion Depth2Img")
+    with gr.Row():
+        # Left column: inputs and advanced options.
+        with gr.Column():
+            input_image = gr.Image(source='upload', type="pil")
+            prompt = gr.Textbox(label="Prompt")
+            run_button = gr.Button(label="Run")
+            with gr.Accordion("Advanced options", open=False):
+                num_samples = gr.Slider(
+                    label="Images", minimum=1, maximum=4, value=1, step=1)
+                ddim_steps = gr.Slider(label="Steps", minimum=1,
+                                       maximum=50, value=50, step=1)
+                scale = gr.Slider(
+                    label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1
+                )
+                strength = gr.Slider(
+                    label="Strength", minimum=0.0, maximum=1.0, value=0.9, step=0.01
+                )
+                seed = gr.Slider(
+                    label="Seed",
+                    minimum=0,
+                    maximum=2147483647,
+                    step=1,
+                    randomize=True,
+                )
+                eta = gr.Number(label="eta (DDIM)", value=0.0)
+        # Right column: output gallery.
+        with gr.Column():
+            gallery = gr.Gallery(label="Generated images", show_label=False).style(
+                grid=[2], height="auto")
+    # The order of `inputs` must match the positional parameters of predict().
+    run_button.click(fn=predict, inputs=[
+                     input_image, prompt, ddim_steps, num_samples, scale, seed, eta, strength], outputs=[gallery])
+block.launch()
--- a/scripts/gradio/inpainting.py
+++ b/scripts/gradio/inpainting.py
+import sys
+import cv2
+import torch
+import numpy as np
+import gradio as gr
+from PIL import Image
+from omegaconf import OmegaConf
+from einops import repeat
+from imwatermark import WatermarkEncoder
+from pathlib import Path
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.util import instantiate_from_config
+torch.set_grad_enabled(False)
+def put_watermark(img, wm_encoder=None):
+    if wm_encoder is not None:
+        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+        img = wm_encoder.encode(img, 'dwtDct')
+        img = Image.fromarray(img[:, :, ::-1])
+    return img
+def initialize_model(config, ckpt):
+    config = OmegaConf.load(config)
+    model = instantiate_from_config(config.model)
+    model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
+    device = torch.device(
+        "cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = model.to(device)
+    sampler = DDIMSampler(model)
+    return sampler
+def make_batch_sd(
+        image,
+        mask,
+        txt,
+        device,
+        num_samples=1):
+    image = np.array(image.convert("RGB"))
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+    mask = np.array(mask.convert("L"))
+    mask = mask.astype(np.float32) / 255.0
+    mask = mask[None, None]
+    mask[mask < 0.5] = 0
+    mask[mask >= 0.5] = 1
+    mask = torch.from_numpy(mask)
+    masked_image = image * (mask < 0.5)
+    batch = {
+        "image": repeat(image.to(device=device), "1 ... -> n ...", n=num_samples),
+        "txt": num_samples * [txt],
+        "mask": repeat(mask.to(device=device), "1 ... -> n ...", n=num_samples),
+        "masked_image": repeat(masked_image.to(device=device), "1 ... -> n ...", n=num_samples),
+    }
+    return batch
+def inpaint(sampler, image, mask, prompt, seed, scale, ddim_steps, num_samples=1, w=512, h=512):
+    """Run mask-conditioned DDIM sampling and return the watermarked results.
+
+    Args:
+        sampler: sampler object exposing ``model`` and ``sample``.
+        image: input image, handed to ``make_batch_sd``.
+        mask: mask image, handed to ``make_batch_sd``.
+        prompt: text prompt used for the cross-attention conditioning.
+        seed: seeds the numpy ``RandomState`` that draws the starting noise.
+        scale: unconditional guidance scale forwarded to ``sampler.sample``.
+        ddim_steps: number of sampling steps forwarded to ``sampler.sample``.
+        num_samples: number of samples in the batch.
+        w, h: image width and height in pixels; the latent is ``h // 8`` x ``w // 8``.
+
+    Returns:
+        A list with one watermarked PIL image per generated sample.
+    """
+    device = torch.device(
+        "cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = sampler.model
+    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
+    wm = "SDV2"
+    wm_encoder = WatermarkEncoder()
+    wm_encoder.set_watermark('bytes', wm.encode('utf-8'))
+    # Only the starting noise is drawn from the seeded numpy generator; no
+    # torch seed is set in this function.
+    prng = np.random.RandomState(seed)
+    start_code = prng.randn(num_samples, 4, h // 8, w // 8)
+    start_code = torch.from_numpy(start_code).to(
+        device=device, dtype=torch.float32)
+    # NOTE(review): autocast is hard-coded to "cuda" even when `device` above
+    # resolved to the CPU - confirm the intended behaviour on CPU-only hosts.
+    with torch.no_grad(), \
+            torch.autocast("cuda"):
+        batch = make_batch_sd(image, mask, txt=prompt,
+                              device=device, num_samples=num_samples)
+        c = model.cond_stage_model.encode(batch["txt"])
+        c_cat = list()
+        for ck in model.concat_keys:
+            cc = batch[ck].float()
+            if ck != model.masked_image_key:
+                # Every key other than the masked image is resized to the latent resolution.
+                bchw = [num_samples, 4, h // 8, w // 8]
+                cc = torch.nn.functional.interpolate(cc, size=bchw[-2:])
+            else:
+                # The masked image is encoded into latent space by the first stage.
+                cc = model.get_first_stage_encoding(
+                    model.encode_first_stage(cc))
+            c_cat.append(cc)
+        c_cat = torch.cat(c_cat, dim=1)
+        # cond
+        cond = {"c_concat": [c_cat], "c_crossattn": [c]}
+        # uncond cond
+        # The unconditional branch keeps the same concat conditioning and only
+        # swaps the text embedding for that of the empty prompt.
+        uc_cross = model.get_unconditional_conditioning(num_samples, "")
+        uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]}
+        shape = [model.channels, h // 8, w // 8]
+        # eta is fixed at 1.0 here rather than being exposed to the caller.
+        samples_cfg, intermediates = sampler.sample(
+            ddim_steps,
+            num_samples,
+            shape,
+            cond,
+            verbose=False,
+            eta=1.0,
+            unconditional_guidance_scale=scale,
+            unconditional_conditioning=uc_full,
+            x_T=start_code,
+        )
+        x_samples_ddim = model.decode_first_stage(samples_cfg)
+        # Map from [-1, 1] to [0, 1], then to channel-last arrays scaled to 0..255.
+        result = torch.clamp((x_samples_ddim + 1.0) / 2.0,
+                             min=0.0, max=1.0)
+        result = result.cpu().numpy().transpose(0, 2, 3, 1) * 255
+    return [put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder) for img in result]
+def pad_image(input_image):
+    pad_w, pad_h = np.max(((2, 2), np.ceil(
+        np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
+    im_padded = Image.fromarray(
+        np.pad(np.array(input_image), ((0, pad_h), (0, pad_w), (0, 0)), mode='edge'))
+    return im_padded
+def predict(input_image, prompt, ddim_steps, num_samples, scale, seed):
+    init_image = input_image["image"].convert("RGB")
+    init_mask = input_image["mask"].convert("RGB")
+    image = pad_image(init_image) # resize to integer multiple of 32
+    mask = pad_image(init_mask) # resize to integer multiple of 32
+    width, height = image.size
+    print("Inpainting...", width, height)
+    result = inpaint(
+        sampler=sampler,
+        image=image,
+        mask=mask,
+        prompt=prompt,
+        seed=seed,
+        scale=scale,
+        ddim_steps=ddim_steps,
+        num_samples=num_samples,
+        h=height, w=width
+    )
+    return result
+# Script entry point: argv[1] is the config path, argv[2] the checkpoint path.
+# `sampler` is a module-level global that predict() reads.
+sampler = initialize_model(sys.argv[1], sys.argv[2])
+# Build the Gradio UI; the nesting of the `with` blocks defines the layout.
+block = gr.Blocks().queue()
+with block:
+    with gr.Row():
+        gr.Markdown("## Stable Diffusion Inpainting")
+    with gr.Row():
+        # Left column: inputs and advanced options.
+        with gr.Column():
+            # predict() reads the "image" and "mask" entries of this component's value.
+            input_image = gr.Image(source='upload', tool='sketch', type="pil")
+            prompt = gr.Textbox(label="Prompt")
+            run_button = gr.Button(label="Run")
+            with gr.Accordion("Advanced options", open=False):
+                num_samples = gr.Slider(
+                    label="Images", minimum=1, maximum=4, value=4, step=1)
+                ddim_steps = gr.Slider(label="Steps", minimum=1,
+                                       maximum=50, value=45, step=1)
+                scale = gr.Slider(
+                    label="Guidance Scale", minimum=0.1, maximum=30.0, value=10, step=0.1
+                )
+                seed = gr.Slider(
+                    label="Seed",
+                    minimum=0,
+                    maximum=2147483647,
+                    step=1,
+                    randomize=True,
+                )
+        # Right column: output gallery.
+        with gr.Column():
+            gallery = gr.Gallery(label="Generated images", show_label=False).style(
+                grid=[2], height="auto")
+    # The order of `inputs` must match the positional parameters of predict().
+    run_button.click(fn=predict, inputs=[
+                     input_image, prompt, ddim_steps, num_samples, scale, seed], outputs=[gallery])
+block.launch()
--- a/scripts/gradio/superresolution.py
+++ b/scripts/gradio/superresolution.py
+import sys
+import torch
+import numpy as np
+import gradio as gr
+from PIL import Image
+from omegaconf import OmegaConf
+from einops import repeat, rearrange
+from pytorch_lightning import seed_everything
+from imwatermark import WatermarkEncoder
+from scripts.txt2img import put_watermark
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.models.diffusion.ddpm import LatentUpscaleDiffusion, LatentUpscaleFinetuneDiffusion
+from ldm.util import exists, instantiate_from_config
+torch.set_grad_enabled(False)
+def initialize_model(config, ckpt):
+    config = OmegaConf.load(config)
+    model = instantiate_from_config(config.model)
+    model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
+    device = torch.device(
+        "cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = model.to(device)
+    sampler = DDIMSampler(model)
+    return sampler
+def make_batch_sd(
+        image,
+        txt,
+        device,
+        num_samples=1,
+):
+    image = np.array(image.convert("RGB"))
+    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+    batch = {
+        "lr": rearrange(image, 'h w c -> 1 c h w'),
+        "txt": num_samples * [txt],
+    }
+    batch["lr"] = repeat(batch["lr"].to(device=device),
+                         "1 ... -> n ...", n=num_samples)
+    return batch
+def make_noise_augmentation(model, batch, noise_level=None):
+    x_low = batch[model.low_scale_key]
+    x_low = x_low.to(memory_format=torch.contiguous_format).float()
+    x_aug, noise_level = model.low_scale_model(x_low, noise_level)
+    return x_aug, noise_level
+def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callback=None, eta=0., noise_level=None):
+    """Run text-guided upscaling with DDIM sampling and return the watermarked results.
+
+    Args:
+        sampler: sampler object exposing ``model`` and ``sample``.
+        image: low-resolution input image, handed to ``make_batch_sd``.
+        prompt: text prompt used for the cross-attention conditioning.
+        seed: passed to ``seed_everything`` and to the numpy generator that
+            draws the starting noise.
+        scale: unconditional guidance scale forwarded to ``sampler.sample``.
+        h, w: spatial size used both for the starting noise and the sample shape.
+        steps: number of sampling steps forwarded to ``sampler.sample``.
+        num_samples: number of samples in the batch.
+        callback: forwarded unchanged to ``sampler.sample``.
+        eta: DDIM eta forwarded to ``sampler.sample``.
+        noise_level: forwarded to ``make_noise_augmentation``; only used for
+            ``LatentUpscaleDiffusion`` models.
+
+    Returns:
+        A list with one watermarked PIL image per generated sample.
+
+    Raises:
+        NotImplementedError: if the model is neither a
+            ``LatentUpscaleFinetuneDiffusion`` nor a ``LatentUpscaleDiffusion``.
+    """
+    device = torch.device(
+        "cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = sampler.model
+    seed_everything(seed)
+    prng = np.random.RandomState(seed)
+    start_code = prng.randn(num_samples, model.channels, h, w)
+    start_code = torch.from_numpy(start_code).to(
+        device=device, dtype=torch.float32)
+    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
+    wm = "SDV2"
+    wm_encoder = WatermarkEncoder()
+    wm_encoder.set_watermark('bytes', wm.encode('utf-8'))
+    # NOTE(review): autocast is hard-coded to "cuda" even when `device` above
+    # resolved to the CPU - confirm the intended behaviour on CPU-only hosts.
+    with torch.no_grad(),\
+            torch.autocast("cuda"):
+        batch = make_batch_sd(
+            image, txt=prompt, device=device, num_samples=num_samples)
+        c = model.cond_stage_model.encode(batch["txt"])
+        c_cat = list()
+        # The subclass check comes first: the isinstance order matters if
+        # LatentUpscaleFinetuneDiffusion derives from LatentUpscaleDiffusion
+        # (not visible here - TODO confirm).
+        if isinstance(model, LatentUpscaleFinetuneDiffusion):
+            for ck in model.concat_keys:
+                cc = batch[ck]
+                if exists(model.reshuffle_patch_size):
+                    assert isinstance(model.reshuffle_patch_size, int)
+                    # Fold each p1 x p2 spatial patch grid into the channel axis.
+                    cc = rearrange(cc, 'b c (p1 h) (p2 w) -> b (p1 p2 c) h w',
+                                   p1=model.reshuffle_patch_size, p2=model.reshuffle_patch_size)
+                c_cat.append(cc)
+            c_cat = torch.cat(c_cat, dim=1)
+            # cond
+            cond = {"c_concat": [c_cat], "c_crossattn": [c]}
+            # uncond cond
+            uc_cross = model.get_unconditional_conditioning(num_samples, "")
+            uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]}
+        elif isinstance(model, LatentUpscaleDiffusion):
+            # The low-scale model returns the augmented input together with the
+            # noise level, which is passed on as the "c_adm" conditioning.
+            x_augment, noise_level = make_noise_augmentation(
+                model, batch, noise_level)
+            cond = {"c_concat": [x_augment],
+                    "c_crossattn": [c], "c_adm": noise_level}
+            # uncond cond
+            uc_cross = model.get_unconditional_conditioning(num_samples, "")
+            uc_full = {"c_concat": [x_augment], "c_crossattn": [
+                uc_cross], "c_adm": noise_level}
+        else:
+            raise NotImplementedError()
+        shape = [model.channels, h, w]
+        samples, intermediates = sampler.sample(
+            steps,
+            num_samples,
+            shape,
+            cond,
+            verbose=False,
+            eta=eta,
+            unconditional_guidance_scale=scale,
+            unconditional_conditioning=uc_full,
+            x_T=start_code,
+            callback=callback
+        )
+    # Decoding happens outside the autocast context, under no_grad only.
+    with torch.no_grad():
+        x_samples_ddim = model.decode_first_stage(samples)
+    # Map from [-1, 1] to [0, 1], then to channel-last arrays scaled to 0..255.
+    result = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+    result = result.cpu().numpy().transpose(0, 2, 3, 1) * 255
+    return [put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder) for img in result]
+def pad_image(input_image):
+    pad_w, pad_h = np.max(((2, 2), np.ceil(
+        np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
+    im_padded = Image.fromarray(
+        np.pad(np.array(input_image), ((0, pad_h), (0, pad_w), (0, 0)), mode='edge'))
+    return im_padded
+def predict(input_image, prompt, steps, num_samples, scale, seed, eta, noise_level):
+    init_image = input_image.convert("RGB")
+    image = pad_image(init_image)  # resize to integer multiple of 32
+    width, height = image.size
+    noise_level = torch.Tensor(
+        num_samples * [noise_level]).to(sampler.model.device).long()
+    sampler.make_schedule(steps, ddim_eta=eta, verbose=True)
+    result = paint(
+        sampler=sampler,
+        image=image,
+        prompt=prompt,
+        seed=seed,
+        scale=scale,
+        h=height, w=width, steps=steps,
+        num_samples=num_samples,
+        callback=None,
+        noise_level=noise_level
+    )
+    return result
+# Script entry point: argv[1] is the config path, argv[2] the checkpoint path.
+# `sampler` is a module-level global that predict() reads.
+sampler = initialize_model(sys.argv[1], sys.argv[2])
+# Build the Gradio UI; the nesting of the `with` blocks defines the layout.
+block = gr.Blocks().queue()
+with block:
+    with gr.Row():
+        gr.Markdown("## Stable Diffusion Upscaling")
+    with gr.Row():
+        # Left column: inputs and advanced options.
+        with gr.Column():
+            input_image = gr.Image(source='upload', type="pil")
+            gr.Markdown(
+                "Tip: Add a description of the object that should be upscaled, e.g.: 'a professional photograph of a cat")
+            prompt = gr.Textbox(label="Prompt")
+            run_button = gr.Button(label="Run")
+            with gr.Accordion("Advanced options", open=False):
+                num_samples = gr.Slider(
+                    label="Number of Samples", minimum=1, maximum=4, value=1, step=1)
+                steps = gr.Slider(label="DDIM Steps", minimum=2,
+                                  maximum=200, value=75, step=1)
+                scale = gr.Slider(
+                    label="Scale", minimum=0.1, maximum=30.0, value=10, step=0.1
+                )
+                seed = gr.Slider(
+                    label="Seed",
+                    minimum=0,
+                    maximum=2147483647,
+                    step=1,
+                    randomize=True,
+                )
+                eta = gr.Number(label="eta (DDIM)",
+                                value=0.0, min=0.0, max=1.0)
+                # NOTE(review): for any model that is not a LatentUpscaleDiffusion
+                # this stays None and is still listed in `inputs` below - confirm
+                # that Gradio accepts a None entry there.
+                noise_level = None
+                if isinstance(sampler.model, LatentUpscaleDiffusion):
+                    # TODO: make this work for all models
+                    noise_level = gr.Number(
+                        label="Noise Augmentation", min=0, max=350, value=20, step=1)
+        # Right column: output gallery.
+        with gr.Column():
+            gallery = gr.Gallery(label="Generated images", show_label=False).style(
+                grid=[2], height="auto")
+    # The order of `inputs` must match the positional parameters of predict().
+    run_button.click(fn=predict, inputs=[
+                     input_image, prompt, steps, num_samples, scale, seed, eta, noise_level], outputs=[gallery])
+block.launch()
--- a/scripts/img2img.py
+++ b/scripts/img2img.py
+"""make variations of input image"""
+import argparse, os
+import PIL
+import torch
+import numpy as np
+from omegaconf import OmegaConf
+from PIL import Image
+from tqdm import tqdm, trange
+from itertools import islice
+from einops import rearrange, repeat
+from torchvision.utils import make_grid
+from torch import autocast
+from contextlib import nullcontext
+from pytorch_lightning import seed_everything
+from imwatermark import WatermarkEncoder
+from scripts.txt2img import put_watermark
+from ldm.util import instantiate_from_config
+from ldm.models.diffusion.ddim import DDIMSampler
+def chunk(it, size):
+    it = iter(it)
+    return iter(lambda: tuple(islice(it, size)), ())
+def load_model_from_config(config, ckpt, verbose=False):
+    print(f"Loading model from {ckpt}")
+    pl_sd = torch.load(ckpt, map_location="cpu")
+    if "global_step" in pl_sd:
+        print(f"Global Step: {pl_sd['global_step']}")
+    sd = pl_sd["state_dict"]
+    model = instantiate_from_config(config.model)
+    m, u = model.load_state_dict(sd, strict=False)
+    if len(m) > 0 and verbose:
+        print("missing keys:")
+        print(m)
+    if len(u) > 0 and verbose:
+        print("unexpected keys:")
+        print(u)
+    model.cuda()
+    model.eval()
+    return model
+def load_img(path):
+    image = Image.open(path).convert("RGB")
+    w, h = image.size
+    print(f"loaded input image of size ({w}, {h}) from {path}")
+    w, h = map(lambda x: x - x % 64, (w, h))  # resize to integer multiple of 64
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2. * image - 1.
+def main():
+    """Command-line entry point for image-to-image sampling.
+
+    Parses the options below, loads the model and a DDIM sampler, encodes the
+    init image into latent space, noises it for ``t_enc`` steps, decodes it
+    under the prompt conditioning, and writes every sample plus one grid image
+    (both watermarked) below ``--outdir``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        nargs="?",
+        default="a painting of a virus monster playing guitar",
+        help="the prompt to render"
+    )
+    parser.add_argument(
+        "--init-img",
+        type=str,
+        nargs="?",
+        help="path to the input image"
+    )
+    parser.add_argument(
+        "--outdir",
+        type=str,
+        nargs="?",
+        help="dir to write results to",
+        default="outputs/img2img-samples"
+    )
+    parser.add_argument(
+        "--ddim_steps",
+        type=int,
+        default=50,
+        help="number of ddim sampling steps",
+    )
+    parser.add_argument(
+        "--fixed_code",
+        action='store_true',
+        help="if enabled, uses the same starting code across all samples ",
+    )
+    parser.add_argument(
+        "--ddim_eta",
+        type=float,
+        default=0.0,
+        help="ddim eta (eta=0.0 corresponds to deterministic sampling",
+    )
+    parser.add_argument(
+        "--n_iter",
+        type=int,
+        default=1,
+        help="sample this often",
+    )
+    parser.add_argument(
+        "--C",
+        type=int,
+        default=4,
+        help="latent channels",
+    )
+    parser.add_argument(
+        "--f",
+        type=int,
+        default=8,
+        help="downsampling factor, most often 8 or 16",
+    )
+    parser.add_argument(
+        "--n_samples",
+        type=int,
+        default=2,
+        help="how many samples to produce for each given prompt. A.k.a batch size",
+    )
+    parser.add_argument(
+        "--n_rows",
+        type=int,
+        default=0,
+        help="rows in the grid (default: n_samples)",
+    )
+    parser.add_argument(
+        "--scale",
+        type=float,
+        default=9.0,
+        help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
+    )
+    parser.add_argument(
+        "--strength",
+        type=float,
+        default=0.8,
+        help="strength for noising/unnoising. 1.0 corresponds to full destruction of information in init image",
+    )
+    parser.add_argument(
+        "--from-file",
+        type=str,
+        help="if specified, load prompts from this file",
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        default="configs/stable-diffusion/v2-inference.yaml",
+        help="path to config which constructs model",
+    )
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        help="path to checkpoint of model",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="the seed (for reproducible sampling)",
+    )
+    parser.add_argument(
+        "--precision",
+        type=str,
+        help="evaluate at this precision",
+        choices=["full", "autocast"],
+        default="autocast"
+    )
+    opt = parser.parse_args()
+    seed_everything(opt.seed)
+    config = OmegaConf.load(f"{opt.config}")
+    model = load_model_from_config(config, f"{opt.ckpt}")
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = model.to(device)
+    sampler = DDIMSampler(model)
+    os.makedirs(opt.outdir, exist_ok=True)
+    outpath = opt.outdir
+    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
+    wm = "SDV2"
+    wm_encoder = WatermarkEncoder()
+    wm_encoder.set_watermark('bytes', wm.encode('utf-8'))
+    batch_size = opt.n_samples
+    n_rows = opt.n_rows if opt.n_rows > 0 else batch_size
+    if not opt.from_file:
+        prompt = opt.prompt
+        assert prompt is not None
+        # A single batch that repeats the same prompt batch_size times.
+        data = [batch_size * [prompt]]
+    else:
+        print(f"reading prompts from {opt.from_file}")
+        with open(opt.from_file, "r") as f:
+            # One prompt per line, grouped into tuples of up to batch_size prompts.
+            data = f.read().splitlines()
+            data = list(chunk(data, batch_size))
+    sample_path = os.path.join(outpath, "samples")
+    os.makedirs(sample_path, exist_ok=True)
+    # Continue numbering after whatever already exists in the output folders.
+    base_count = len(os.listdir(sample_path))
+    # outpath contains the "samples" directory created above, hence the -1.
+    grid_count = len(os.listdir(outpath)) - 1
+    # NOTE(review): --init-img has no default, so omitting it passes None to
+    # os.path.isfile here — confirm whether the option should be required.
+    assert os.path.isfile(opt.init_img)
+    init_image = load_img(opt.init_img).to(device)
+    init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
+    init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))  # move to latent space
+    sampler.make_schedule(ddim_num_steps=opt.ddim_steps, ddim_eta=opt.ddim_eta, verbose=False)
+    assert 0. <= opt.strength <= 1., 'can only work with strength in [0.0, 1.0]'
+    # Number of DDIM steps the init latent is noised for (and later denoised from).
+    t_enc = int(opt.strength * opt.ddim_steps)
+    print(f"target t_enc is {t_enc} steps")
+    # Both autocast and nullcontext are called with "cuda" below.
+    precision_scope = autocast if opt.precision == "autocast" else nullcontext
+    with torch.no_grad():
+        with precision_scope("cuda"):
+            with model.ema_scope():
+                all_samples = list()
+                for n in trange(opt.n_iter, desc="Sampling"):
+                    for prompts in tqdm(data, desc="data"):
+                        uc = None
+                        if opt.scale != 1.0:
+                            # Empty-prompt conditioning used for classifier-free guidance.
+                            uc = model.get_learned_conditioning(batch_size * [""])
+                        if isinstance(prompts, tuple):
+                            prompts = list(prompts)
+                        # NOTE(review): with --from-file the last chunk may hold fewer than
+                        # batch_size prompts while uc/init_latent stay at batch_size — confirm.
+                        c = model.get_learned_conditioning(prompts)
+                        # encode (scaled latent)
+                        z_enc = sampler.stochastic_encode(init_latent, torch.tensor([t_enc] * batch_size).to(device))
+                        # decode it
+                        samples = sampler.decode(z_enc, c, t_enc, unconditional_guidance_scale=opt.scale,
+                                                 unconditional_conditioning=uc, )
+                        x_samples = model.decode_first_stage(samples)
+                        # Map the decoder output from [-1, 1] to [0, 1].
+                        x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)
+                        for x_sample in x_samples:
+                            x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
+                            img = Image.fromarray(x_sample.astype(np.uint8))
+                            img = put_watermark(img, wm_encoder)
+                            img.save(os.path.join(sample_path, f"{base_count:05}.png"))
+                            base_count += 1
+                        all_samples.append(x_samples)
+                # additionally, save as grid
+                grid = torch.stack(all_samples, 0)
+                grid = rearrange(grid, 'n b c h w -> (n b) c h w')
+                grid = make_grid(grid, nrow=n_rows)
+                # to image
+                grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()
+                grid = Image.fromarray(grid.astype(np.uint8))
+                grid = put_watermark(grid, wm_encoder)
+                grid.save(os.path.join(outpath, f'grid-{grid_count:04}.png'))
+                grid_count += 1
+    print(f"Your samples are ready and waiting for you here: \n{outpath} \nEnjoy.")
+# Run the command-line interface only when executed as a script, not on import.
+if __name__ == "__main__":
+    main()
--- a/scripts/streamlit/depth2img.py
+++ b/scripts/streamlit/depth2img.py
+import sys
+import torch
+import numpy as np
+import streamlit as st
+from PIL import Image
+from omegaconf import OmegaConf
+from einops import repeat, rearrange
+from pytorch_lightning import seed_everything
+from imwatermark import WatermarkEncoder
+from scripts.txt2img import put_watermark
+from ldm.util import instantiate_from_config
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.data.util import AddMiDaS
+torch.set_grad_enabled(False)
+@st.cache(allow_output_mutation=True)
+def initialize_model(config, ckpt):
+    """Build the model from the ``config`` file, load ``ckpt`` and wrap it in a DDIM sampler.
+
+    The model is placed on CUDA when available, otherwise on the CPU. The
+    result is cached by Streamlit via the decorator above.
+    """
+    cfg = OmegaConf.load(config)
+    net = instantiate_from_config(cfg.model)
+    checkpoint = torch.load(ckpt)
+    # strict=False tolerates key mismatches between checkpoint and model.
+    net.load_state_dict(checkpoint["state_dict"], strict=False)
+    if torch.cuda.is_available():
+        target = torch.device("cuda")
+    else:
+        target = torch.device("cpu")
+    net = net.to(target)
+    return DDIMSampler(net)
+def make_batch_sd(
+        image,
+        txt,
+        device,
+        num_samples=1,
+        model_type="dpt_hybrid"
+):
+    """Assemble the conditioning batch for depth-to-image sampling.
+
+    Returns the dict produced by ``AddMiDaS`` with ``"jpg"`` and ``"midas_in"``
+    moved to ``device`` and repeated ``num_samples`` times along a new leading
+    axis; ``"txt"`` holds ``num_samples`` copies of ``txt``.
+    """
+    rgb = np.array(image.convert("RGB"))
+    # uint8 HWC pixels -> float32 HWC tensor in [-1, 1]
+    rgb = torch.from_numpy(rgb).to(dtype=torch.float32) / 127.5 - 1.0
+    add_midas = AddMiDaS(model_type=model_type)
+    batch = add_midas({"jpg": rgb, "txt": num_samples * [txt]})
+    jpg = rearrange(batch["jpg"], 'h w c -> 1 c h w')
+    batch["jpg"] = jpg
+    batch["jpg"] = repeat(jpg.to(device=device), "1 ... -> n ...", n=num_samples)
+    midas_in = torch.from_numpy(batch["midas_in"][None, ...]).to(device=device)
+    batch["midas_in"] = repeat(midas_in, "1 ... -> n ...", n=num_samples)
+    return batch
+def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=None,
+          do_full_sample=False):
+    """Generate ``num_samples`` depth-conditioned variations of ``image`` for ``prompt``.
+
+    The image is encoded to latent space, a depth map is predicted by
+    ``model.depth_model`` and concatenated as conditioning, and the latent is
+    decoded by ``sampler.decode`` for ``t_enc`` steps with guidance ``scale``.
+    With ``do_full_sample`` the start latent is pure noise instead of the
+    noised encoding of ``image``.
+
+    Returns a list of watermarked PIL images.
+    """
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = sampler.model
+    seed_everything(seed)
+    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
+    wm = "SDV2"
+    wm_encoder = WatermarkEncoder()
+    wm_encoder.set_watermark('bytes', wm.encode('utf-8'))
+    with torch.no_grad(),\
+            torch.autocast("cuda"):
+        batch = make_batch_sd(image, txt=prompt, device=device, num_samples=num_samples)
+        z = model.get_first_stage_encoding(model.encode_first_stage(batch[model.first_stage_key]))  # move to latent space
+        c = model.cond_stage_model.encode(batch["txt"])
+        c_cat = list()
+        for ck in model.concat_keys:
+            cc = batch[ck]
+            cc = model.depth_model(cc)
+            # Per-sample min/max, used to scale the depth map to [0, 1] for display only.
+            depth_min, depth_max = torch.amin(cc, dim=[1, 2, 3], keepdim=True), torch.amax(cc, dim=[1, 2, 3],
+                                                                                           keepdim=True)
+            display_depth = (cc - depth_min) / (depth_max - depth_min)
+            st.image(Image.fromarray((display_depth[0, 0, ...].cpu().numpy() * 255.).astype(np.uint8)))
+            # Resize the depth prediction to the spatial size of the latent.
+            cc = torch.nn.functional.interpolate(
+                cc,
+                size=z.shape[2:],
+                mode="bicubic",
+                align_corners=False,
+            )
+            # Min/max are recomputed after resizing; the conditioning is scaled to [-1, 1].
+            depth_min, depth_max = torch.amin(cc, dim=[1, 2, 3], keepdim=True), torch.amax(cc, dim=[1, 2, 3],
+                                                                                           keepdim=True)
+            cc = 2. * (cc - depth_min) / (depth_max - depth_min) - 1.
+            c_cat.append(cc)
+        c_cat = torch.cat(c_cat, dim=1)
+        # cond
+        cond = {"c_concat": [c_cat], "c_crossattn": [c]}
+        # uncond cond
+        # Same concat conditioning, but with the empty-prompt cross-attention input.
+        uc_cross = model.get_unconditional_conditioning(num_samples, "")
+        uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]}
+        if not do_full_sample:
+            # encode (scaled latent)
+            z_enc = sampler.stochastic_encode(z, torch.tensor([t_enc] * num_samples).to(model.device))
+        else:
+            z_enc = torch.randn_like(z)
+        # decode it
+        samples = sampler.decode(z_enc, cond, t_enc, unconditional_guidance_scale=scale,
+                                 unconditional_conditioning=uc_full, callback=callback)
+        x_samples_ddim = model.decode_first_stage(samples)
+        # Map decoder output from [-1, 1] to [0, 1], then to 0..255 channels-last arrays.
+        result = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+        result = result.cpu().numpy().transpose(0, 2, 3, 1) * 255
+    return [put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder) for img in result]
+def run():
+    """Streamlit page: upload an image, choose sampling options and show depth2img results."""
+    st.title("Stable Diffusion Depth2Img")
+    # run via: streamlit run scripts/streamlit/depth2img.py <path-to-config> <path-to-ckpt>
+    sampler = initialize_model(sys.argv[1], sys.argv[2])
+    image = st.file_uploader("Image", ["jpg", "png"])
+    if image:
+        image = Image.open(image)
+        w, h = image.size
+        st.text(f"loaded input image of size ({w}, {h})")
+        width, height = map(lambda x: x - x % 64, (w, h))  # resize to integer multiple of 64
+        image = image.resize((width, height))
+        st.text(f"resized input image to size ({width}, {height} (w, h))")
+        st.image(image)
+        prompt = st.text_input("Prompt")
+        seed = st.number_input("Seed", min_value=0, max_value=1000000, value=0)
+        num_samples = st.number_input("Number of Samples", min_value=1, max_value=64, value=1)
+        scale = st.slider("Scale", min_value=0.1, max_value=30.0, value=9.0, step=0.1)
+        steps = st.slider("DDIM Steps", min_value=0, max_value=50, value=50, step=1)
+        strength = st.slider("Strength", min_value=0., max_value=1., value=0.9)
+        t_progress = st.progress(0)
+        # t_enc is assigned further down; the closure only reads it when the
+        # sampler invokes the callback.
+        def t_callback(t):
+            t_progress.progress(min((t + 1) / t_enc, 1.))
+        assert 0. <= strength <= 1., 'can only work with strength in [0.0, 1.0]'
+        # Strength 1.0 means sampling from pure noise instead of the encoded image.
+        do_full_sample = strength == 1.
+        # Capped at steps-1.
+        # NOTE(review): steps == 0 is allowed by the slider and yields t_enc == -1 — confirm.
+        t_enc = min(int(strength * steps), steps-1)
+        sampler.make_schedule(steps, ddim_eta=0., verbose=True)
+        if st.button("Sample"):
+            result = paint(
+                sampler=sampler,
+                image=image,
+                prompt=prompt,
+                t_enc=t_enc,
+                seed=seed,
+                scale=scale,
+                num_samples=num_samples,
+                callback=t_callback,
+                do_full_sample=do_full_sample,
+            )
+            st.write("Result")
+            for image in result:
+                st.image(image, output_format='PNG')
+# Build the Streamlit page when this file is executed as the main script.
+if __name__ == "__main__":
+    run()
--- a/scripts/streamlit/inpainting.py
+++ b/scripts/streamlit/inpainting.py
+import sys
+import cv2
+import torch
+import numpy as np
+import streamlit as st
+from PIL import Image
+from omegaconf import OmegaConf
+from einops import repeat
+from streamlit_drawable_canvas import st_canvas
+from imwatermark import WatermarkEncoder
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.util import instantiate_from_config
+torch.set_grad_enabled(False)
+def put_watermark(img, wm_encoder=None):
+    """Embed the encoder's watermark into ``img``; return ``img`` unchanged when no encoder is given."""
+    if wm_encoder is None:
+        return img
+    # The encoder is fed BGR data, so convert from RGB first.
+    bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+    marked = wm_encoder.encode(bgr, 'dwtDct')
+    # Reverse the channel axis (BGR -> RGB) before wrapping in a PIL image.
+    return Image.fromarray(marked[:, :, ::-1])
+@st.cache(allow_output_mutation=True)
+def initialize_model(config, ckpt):
+    """Create the model from the ``config`` file, load ``ckpt`` and return a DDIM sampler around it.
+
+    The model runs on CUDA when available and on the CPU otherwise. Streamlit
+    caches the returned sampler via the decorator above.
+    """
+    parsed = OmegaConf.load(config)
+    model = instantiate_from_config(parsed.model)
+    weights = torch.load(ckpt)["state_dict"]
+    # strict=False tolerates key mismatches between checkpoint and model.
+    model.load_state_dict(weights, strict=False)
+    device_name = "cuda" if torch.cuda.is_available() else "cpu"
+    model = model.to(torch.device(device_name))
+    return DDIMSampler(model)
+def make_batch_sd(
+        image,
+        mask,
+        txt,
+        device,
+        num_samples=1):
+    """Assemble the inpainting batch: image, binary mask, masked image and prompts.
+
+    Every tensor is moved to ``device`` and repeated ``num_samples`` times
+    along the leading axis; ``"txt"`` holds ``num_samples`` copies of ``txt``.
+    """
+    def tile(tensor):
+        return repeat(tensor.to(device=device), "1 ... -> n ...", n=num_samples)
+    # uint8 HWC pixels -> float32 1CHW tensor in [-1, 1]
+    pixels = np.array(image.convert("RGB"))[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(pixels).to(dtype=torch.float32) / 127.5 - 1.0
+    # Greyscale mask scaled to [0, 1], then thresholded at 0.5 into {0, 1}.
+    gray = np.array(mask.convert("L")).astype(np.float32) / 255.0
+    binary = (gray[None, None] >= 0.5).astype(np.float32)
+    mask = torch.from_numpy(binary)
+    # Zero out the pixels selected by the mask.
+    masked_image = image * (mask < 0.5)
+    return {
+        "image": tile(image),
+        "txt": num_samples * [txt],
+        "mask": tile(mask),
+        "masked_image": tile(masked_image),
+    }
+def inpaint(sampler, image, mask, prompt, seed, scale, ddim_steps, num_samples=1, w=512, h=512, eta=1.):
+    """Inpaint the masked region of ``image`` according to ``prompt``.
+
+    The start noise is drawn from a NumPy ``RandomState(seed)`` with shape
+    ``(num_samples, 4, h // 8, w // 8)``. Conditioning is the text encoding
+    plus the concatenation of the entries named in ``model.concat_keys``.
+
+    Returns a list of watermarked PIL images.
+    """
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = sampler.model
+    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
+    wm = "SDV2"
+    wm_encoder = WatermarkEncoder()
+    wm_encoder.set_watermark('bytes', wm.encode('utf-8'))
+    # Seeded start noise, passed to the sampler as x_T below.
+    prng = np.random.RandomState(seed)
+    start_code = prng.randn(num_samples, 4, h // 8, w // 8)
+    start_code = torch.from_numpy(start_code).to(device=device, dtype=torch.float32)
+    with torch.no_grad(), \
+            torch.autocast("cuda"):
+            batch = make_batch_sd(image, mask, txt=prompt, device=device, num_samples=num_samples)
+            c = model.cond_stage_model.encode(batch["txt"])
+            c_cat = list()
+            for ck in model.concat_keys:
+                cc = batch[ck].float()
+                if ck != model.masked_image_key:
+                    # Entries other than the masked image are resized to the latent resolution.
+                    bchw = [num_samples, 4, h // 8, w // 8]
+                    cc = torch.nn.functional.interpolate(cc, size=bchw[-2:])
+                else:
+                    # The masked image is encoded by the first-stage model.
+                    cc = model.get_first_stage_encoding(model.encode_first_stage(cc))
+                c_cat.append(cc)
+            c_cat = torch.cat(c_cat, dim=1)
+            # cond
+            cond = {"c_concat": [c_cat], "c_crossattn": [c]}
+            # uncond cond
+            # Same concat conditioning, but with the empty-prompt cross-attention input.
+            uc_cross = model.get_unconditional_conditioning(num_samples, "")
+            uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]}
+            shape = [model.channels, h // 8, w // 8]
+            samples_cfg, intermediates = sampler.sample(
+                ddim_steps,
+                num_samples,
+                shape,
+                cond,
+                verbose=False,
+                eta=eta,
+                unconditional_guidance_scale=scale,
+                unconditional_conditioning=uc_full,
+                x_T=start_code,
+            )
+            x_samples_ddim = model.decode_first_stage(samples_cfg)
+            # Map decoder output from [-1, 1] to [0, 1], then to 0..255 channels-last arrays.
+            result = torch.clamp((x_samples_ddim + 1.0) / 2.0,
+                                 min=0.0, max=1.0)
+            result = result.cpu().numpy().transpose(0, 2, 3, 1) * 255
+    return [put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder) for img in result]
+def run():
+    """Streamlit page: upload an image, draw a mask on a canvas and show the inpainted results."""
+    st.title("Stable Diffusion Inpainting")
+    # Config path and checkpoint path are taken from the command line.
+    sampler = initialize_model(sys.argv[1], sys.argv[2])
+    image = st.file_uploader("Image", ["jpg", "png"])
+    if image:
+        image = Image.open(image)
+        w, h = image.size
+        print(f"loaded input image of size ({w}, {h})")
+        width, height = map(lambda x: x - x % 64, (w, h))  # resize to integer multiple of 64
+        image = image.resize((width, height))
+        prompt = st.text_input("Prompt")
+        seed = st.number_input("Seed", min_value=0, max_value=1000000, value=0)
+        num_samples = st.number_input("Number of Samples", min_value=1, max_value=64, value=1)
+        scale = st.slider("Scale", min_value=0.1, max_value=30.0, value=10., step=0.1)
+        ddim_steps = st.slider("DDIM Steps", min_value=0, max_value=50, value=50, step=1)
+        eta = st.sidebar.number_input("eta (DDIM)", value=0., min_value=0., max_value=1.)
+        fill_color = "rgba(255, 255, 255, 0.0)"
+        stroke_width = st.number_input("Brush Size",
+                                       value=64,
+                                       min_value=1,
+                                       max_value=100)
+        stroke_color = "rgba(255, 255, 255, 1.0)"
+        bg_color = "rgba(0, 0, 0, 1.0)"
+        drawing_mode = "freedraw"
+        st.write("Canvas")
+        st.caption(
+            "Draw a mask to inpaint, then click the 'Send to Streamlit' button (bottom left, with an arrow on it).")
+        canvas_result = st_canvas(
+            fill_color=fill_color,
+            stroke_width=stroke_width,
+            stroke_color=stroke_color,
+            background_color=bg_color,
+            background_image=image,
+            update_streamlit=False,
+            height=height,
+            width=width,
+            drawing_mode=drawing_mode,
+            key="canvas",
+        )
+        if canvas_result:
+            # NOTE(review): image_data may be unset before the canvas is first sent — confirm.
+            mask = canvas_result.image_data
+            # The last channel (presumably alpha) is non-zero wherever the user painted.
+            mask = mask[:, :, -1] > 0
+            # Only sample once something has actually been drawn.
+            if mask.sum() > 0:
+                mask = Image.fromarray(mask)
+                result = inpaint(
+                    sampler=sampler,
+                    image=image,
+                    mask=mask,
+                    prompt=prompt,
+                    seed=seed,
+                    scale=scale,
+                    ddim_steps=ddim_steps,
+                    num_samples=num_samples,
+                    h=height, w=width, eta=eta
+                )
+                st.write("Inpainted")
+                for image in result:
+                    st.image(image, output_format='PNG')
+# Build the Streamlit page when this file is executed as the main script.
+if __name__ == "__main__":
+    run()
\ No newline at end of file
--- a/scripts/streamlit/stableunclip.py
+++ b/scripts/streamlit/stableunclip.py
+import importlib
+import streamlit as st
+import torch
+import cv2
+import numpy as np
+import PIL
+from omegaconf import OmegaConf
+from PIL import Image
+from tqdm import trange
+import io, os
+from torch import autocast
+from einops import rearrange, repeat
+from torchvision.utils import make_grid
+from pytorch_lightning import seed_everything
+from contextlib import nullcontext
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.models.diffusion.plms import PLMSSampler
+from ldm.models.diffusion.dpm_solver import DPMSolverSampler
+torch.set_grad_enabled(False)
+# Directory of prompt files.
+PROMPTS_ROOT = "scripts/prompts/"
+# Base output directory; the __main__ section appends the selected version name.
+SAVE_PATH = "outputs/demo/stable-unclip/"
+# Per-version sampling specs: default height/width (H, W), latent channels (C)
+# and downsampling factor (f). "Full Karlo" has no entries.
+VERSION2SPECS = {
+    "Stable unCLIP-L": {"H": 768, "W": 768, "C": 4, "f": 8},
+    "Stable unOpenCLIP-H": {"H": 768, "W": 768, "C": 4, "f": 8},
+    "Full Karlo": {}
+}
+def get_obj_from_str(string, reload=False):
+    """Resolve a dotted path such as ``"package.module.Name"`` to the object it names.
+
+    With ``reload=True`` the module is re-imported via ``importlib.reload``
+    before the attribute is looked up.
+    """
+    module_name, attr_name = string.rsplit(".", 1)
+    importlib.invalidate_caches()
+    if reload:
+        importlib.reload(importlib.import_module(module_name))
+    module_obj = importlib.import_module(module_name, package=None)
+    return getattr(module_obj, attr_name)
+def instantiate_from_config(config):
+    """Instantiate the object named by ``config["target"]`` with ``config["params"]`` as keyword arguments.
+
+    Raises:
+        KeyError: if ``config`` has no ``"target"`` entry.
+    """
+    if "target" not in config:
+        raise KeyError("Expected key `target` to instantiate.")
+    factory = get_obj_from_str(config["target"])
+    kwargs = config.get("params", dict())
+    return factory(**kwargs)
+def get_interactive_image(key=None):
+    """Show a file uploader and return the uploaded picture as an RGB PIL image.
+
+    Returns ``None`` while nothing has been uploaded.
+    """
+    uploaded = st.file_uploader("Input", type=["jpg", "JPEG", "png"], key=key)
+    if uploaded is None:
+        return None
+    picture = Image.open(uploaded)
+    if picture.mode != "RGB":
+        picture = picture.convert("RGB")
+    return picture
+def load_img(display=True, key=None):
+    """Fetch the uploaded image and return it as a ``1 x C x H x W`` float tensor in [-1, 1].
+
+    Args:
+        display: when true, show the uploaded image on the page.
+        key: widget key forwarded to the file uploader.
+
+    Width and height are rounded down to a multiple of 64 before conversion.
+    While no file has been uploaded the current script run is halted with
+    ``st.stop()`` instead of failing on a ``None`` image.
+    """
+    image = get_interactive_image(key=key)
+    if image is None:
+        # The uploader returns None until the user provides a file; nothing
+        # below can work without an image.
+        st.stop()
+    if display:
+        st.image(image)
+    w, h = image.size
+    print(f"loaded input image of size ({w}, {h})")
+    # Round each side down to an integer multiple of 64.
+    w, h = map(lambda x: x - x % 64, (w, h))
+    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
+    image = np.array(image).astype(np.float32) / 255.0
+    # Add a batch axis and move channels first: HWC -> 1CHW.
+    image = image[None].transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return 2. * image - 1.
+def get_init_img(batch_size=1, key=None):
+    """Load the uploaded image onto the GPU and repeat it ``batch_size`` times along the batch axis."""
+    single = load_img(key=key).cuda()
+    return repeat(single, '1 ... -> b ...', b=batch_size)
+def sample(
+        model,
+        prompt,
+        n_runs=3,
+        n_samples=2,
+        H=512,
+        W=512,
+        C=4,
+        f=8,
+        scale=10.0,
+        ddim_steps=50,
+        ddim_eta=0.0,
+        callback=None,
+        skip_single_save=False,
+        save_grid=True,
+        ucg_schedule=None,
+        negative_prompt="",
+        adm_cond=None,
+        adm_uc=None,
+        use_full_precision=False,
+        only_adm_cond=False
+):
+    """Run ``n_runs`` sampling passes of ``n_samples`` images each and display them as a grid.
+
+    Conditioning is built from ``prompt`` (unless ``only_adm_cond``) and,
+    when given, from ``adm_cond``/``adm_uc``. Individual images are written
+    below ``SAVE_PATH/samples`` unless ``skip_single_save``; the final grid is
+    written to ``SAVE_PATH`` when ``save_grid`` is set.
+
+    NOTE: ``sampler`` is not a parameter; it is read from module globals (the
+    ``__main__`` section assigns it).
+
+    Returns the decoded samples of the last run only.
+    """
+    batch_size = n_samples
+    precision_scope = autocast if not use_full_precision else nullcontext
+    # decoderscope = autocast if not use_full_precision else nullcontext
+    if use_full_precision: st.warning(f"Running {model.__class__.__name__} at full precision.")
+    if isinstance(prompt, str):
+        prompt = [prompt]
+    # List repetition: the prompt list is repeated batch_size times.
+    prompts = batch_size * prompt
+    # Placeholder that is re-rendered with the growing grid after every run.
+    outputs = st.empty()
+    with precision_scope("cuda"):
+        with model.ema_scope():
+            all_samples = list()
+            for n in trange(n_runs, desc="Sampling"):
+                shape = [C, H // f, W // f]
+                # NOTE(review): with only_adm_cond=True and adm_cond=None, c and uc
+                # are never assigned before sampler.sample — confirm callers avoid this.
+                if not only_adm_cond:
+                    uc = None
+                    if scale != 1.0:
+                        uc = model.get_learned_conditioning(batch_size * [negative_prompt])
+                    if isinstance(prompts, tuple):
+                        prompts = list(prompts)
+                    c = model.get_learned_conditioning(prompts)
+                if adm_cond is not None:
+                    # Broadcast a single embedding to the whole batch.
+                    if adm_cond.shape[0] == 1:
+                        adm_cond = repeat(adm_cond, '1 ... -> b ...', b=batch_size)
+                    if adm_uc is None:
+                        st.warning("Not guiding via c_adm")
+                        adm_uc = adm_cond
+                    else:
+                        if adm_uc.shape[0] == 1:
+                            adm_uc = repeat(adm_uc, '1 ... -> b ...', b=batch_size)
+                    if not only_adm_cond:
+                        c = {"c_crossattn": [c], "c_adm": adm_cond}
+                        uc = {"c_crossattn": [uc], "c_adm": adm_uc}
+                    else:
+                        c = adm_cond
+                        uc = adm_uc
+                samples_ddim, _ = sampler.sample(S=ddim_steps,
+                                                 conditioning=c,
+                                                 batch_size=batch_size,
+                                                 shape=shape,
+                                                 verbose=False,
+                                                 unconditional_guidance_scale=scale,
+                                                 unconditional_conditioning=uc,
+                                                 eta=ddim_eta,
+                                                 x_T=None,
+                                                 callback=callback,
+                                                 ucg_schedule=ucg_schedule
+                                                 )
+                x_samples = model.decode_first_stage(samples_ddim)
+                # Map decoder output from [-1, 1] to [0, 1].
+                x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)
+                if not skip_single_save:
+                    # Continue numbering after the files already in the samples folder.
+                    base_count = len(os.listdir(os.path.join(SAVE_PATH, "samples")))
+                    for x_sample in x_samples:
+                        x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
+                        Image.fromarray(x_sample.astype(np.uint8)).save(
+                            os.path.join(SAVE_PATH, "samples", f"{base_count:09}.png"))
+                        base_count += 1
+                all_samples.append(x_samples)
+                # get grid of all samples
+                # One grid row per run, one column per sample in the batch.
+                grid = torch.stack(all_samples, 0)
+                grid = rearrange(grid, 'n b c h w -> (n h) (b w) c')
+                outputs.image(grid.cpu().numpy())
+            # additionally, save grid
+            # Uses the grid from the last loop iteration, so n_runs must be at least 1.
+            grid = Image.fromarray((255. * grid.cpu().numpy()).astype(np.uint8))
+            if save_grid:
+                grid_count = len(os.listdir(SAVE_PATH)) - 1
+                grid.save(os.path.join(SAVE_PATH, f'grid-{grid_count:06}.png'))
+    return x_samples
+def make_oscillating_guidance_schedule(num_steps, max_weight=15., min_weight=1.):
+    """Return a per-step guidance weight list of length ``num_steps``.
+
+    Steps in the first tenth of the schedule and all odd steps get
+    ``max_weight``; the remaining even steps get ``min_weight``. The schedule
+    is also printed.
+    """
+    def weight_at(step):
+        in_first_tenth = float(step / num_steps) < 0.1
+        return max_weight if in_first_tenth or step % 2 else min_weight
+    schedule = [weight_at(step) for step in range(num_steps)]
+    print(f"OSCILLATING GUIDANCE SCHEDULE: \n {schedule}")
+    return schedule
+def torch2np(x):
+    """Convert a channels-first tensor in [-1, 1] to a channels-last uint8 NumPy array in [0, 255]."""
+    scaled = (x + 1.0) * 127.5
+    as_bytes = scaled.clamp(0, 255).to(dtype=torch.uint8)
+    # Move the channel axis last: (b, c, h, w) -> (b, h, w, c).
+    channels_last = as_bytes.permute(0, 2, 3, 1)
+    return channels_last.detach().cpu().numpy()
+@st.cache(allow_output_mutation=True, suppress_st_warning=True)
+def init(version="Stable unCLIP-L", load_karlo_prior=False):
+    """Load everything needed for ``version`` and return it in a state dict.
+
+    For "Full Karlo" the dict holds ``"karlo_prior"`` (a ``T2ISampler``) and
+    ``"msg"``. For the Stable unCLIP versions it holds ``"msg"``, optionally
+    ``"karlo_prior"`` (a ``PriorSampler`` when ``load_karlo_prior``), and
+    ``"model"``, ``"ckpt"``, ``"config"``.
+
+    Raises:
+        ValueError: if ``version`` is not one of the known names.
+    """
+    # Both Karlo samplers are loaded from the same pretrained files.
+    karlo_kwargs = dict(
+        root_dir="checkpoints/karlo_models",
+        clip_model_path="ViT-L-14.pt",
+        clip_stat_path="ViT-L-14_stats.th",
+        sampling_type="default",
+    )
+    if version == "Full Karlo":
+        from ldm.modules.karlo.kakao.sampler import T2ISampler
+        st.info("Loading full KARLO..")
+        karlo = T2ISampler.from_pretrained(**karlo_kwargs)
+        return {"karlo_prior": karlo, "msg": "loaded full Karlo"}
+    if version == "Stable unCLIP-L":
+        config_path = "configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml"
+        ckpt = "checkpoints/sd21-unclip-l.ckpt"
+    elif version == "Stable unOpenCLIP-H":
+        config_path = "configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml"
+        ckpt = "checkpoints/sd21-unclip-h.ckpt"
+    else:
+        raise ValueError(f"version {version} unknown!")
+    config = OmegaConf.load(config_path)
+    model, msg = load_model_from_config(config, ckpt, vae_sd=None)
+    state = {"msg": msg}
+    if load_karlo_prior:
+        from ldm.modules.karlo.kakao.sampler import PriorSampler
+        st.info("Loading KARLO CLIP prior...")
+        state["karlo_prior"] = PriorSampler.from_pretrained(**karlo_kwargs)
+    state.update(model=model, ckpt=ckpt, config=config)
+    return state
+def load_model_from_config(config, ckpt, verbose=False, vae_sd=None):
+    """Instantiate the model from ``config.model``, load ``ckpt`` and return ``(model, msg)``.
+
+    Args:
+        config: configuration object exposing a ``model`` entry.
+        ckpt: path of a checkpoint readable by ``torch.load``; it must contain
+            a ``"state_dict"`` entry.
+        verbose: when true, print the missing/unexpected keys reported by
+            ``load_state_dict``.
+        vae_sd: optional state dict whose entries replace every checkpoint
+            key containing ``"first_stage"``.
+
+    Returns:
+        The model (on CUDA, in eval mode) and a status message describing the
+        global step and/or EMA update count, or ``None`` when the checkpoint
+        provides neither.
+    """
+    print(f"Loading model from {ckpt}")
+    pl_sd = torch.load(ckpt, map_location="cpu")
+    # Collect message fragments in a list: the previous `msg = None` followed
+    # by `msg += ...` raised TypeError for checkpoints that have EMA updates
+    # but no "global_step".
+    msg_parts = []
+    if "global_step" in pl_sd:
+        msg_parts.append(f"This is global step {pl_sd['global_step']}. ")
+    if "model_ema.num_updates" in pl_sd["state_dict"]:
+        msg_parts.append(f"And we got {pl_sd['state_dict']['model_ema.num_updates']} EMA updates.")
+    msg = "".join(msg_parts) if msg_parts else None
+    global_step = pl_sd.get("global_step", "?")
+    sd = pl_sd["state_dict"]
+    if vae_sd is not None:
+        # Only values are replaced, so iterating over the keys stays safe.
+        for k in sd.keys():
+            if "first_stage" in k:
+                # Strip the "first_stage_model." prefix to find the key in vae_sd.
+                sd[k] = vae_sd[k[len("first_stage_model."):]]
+    model = instantiate_from_config(config.model)
+    # strict=False: key mismatches are reported below instead of raising.
+    m, u = model.load_state_dict(sd, strict=False)
+    if len(m) > 0 and verbose:
+        print("missing keys:")
+        print(m)
+    if len(u) > 0 and verbose:
+        print("unexpected keys:")
+        print(u)
+    model.cuda()
+    model.eval()
+    print(f"Loaded global step {global_step}")
+    return model, msg
+# Streamlit entry point of the "Stable unCLIP" demo: builds the UI widgets, prepares the
+# CLIP-embedding conditioning (adm_cond / adm_uc) and runs sampling once "Sample" is pressed.
+# `init`, `sample`, `get_init_img`, `VERSION2SPECS` and `SAVE_PATH` are defined earlier in the file.
+if __name__ == "__main__":
+    st.title("Stable unCLIP")
+    mode = "txt2img"
+    version = st.selectbox("Model Version", list(VERSION2SPECS.keys()), 0)
+    # The KARLO-prior checkbox is only rendered for "Stable unCLIP-L" (short-circuiting `and`).
+    use_karlo_prior = version in ["Stable unCLIP-L"] and st.checkbox("Use KARLO prior", False)
+    state = init(version=version, load_karlo_prior=use_karlo_prior)
+    prompt = st.text_input("Prompt", "a professional photograph")
+    negative_prompt = st.text_input("Negative Prompt", "")
+    scale = st.number_input("cfg-scale", value=10., min_value=-100., max_value=100.)
+    # number_rows -> number of sampling runs, number_cols -> batch size per run (see sample() call below).
+    number_rows = st.number_input("num rows", value=2, min_value=1, max_value=10)
+    number_cols = st.number_input("num cols", value=2, min_value=1, max_value=10)
+    steps = st.sidebar.number_input("steps", value=20, min_value=1, max_value=1000)
+    eta = st.sidebar.number_input("eta (DDIM)", value=0., min_value=0., max_value=1.)
+    force_full_precision = st.sidebar.checkbox("Force FP32", False)  # TODO: check if/where things break.
+    # H, W, C and f are only defined for non-"Full Karlo" versions; they are only read in the
+    # non-Karlo sampling branch further down.
+    if version != "Full Karlo":
+        H = st.sidebar.number_input("H", value=VERSION2SPECS[version]["H"], min_value=64, max_value=2048)
+        W = st.sidebar.number_input("W", value=VERSION2SPECS[version]["W"], min_value=64, max_value=2048)
+        C = VERSION2SPECS[version]["C"]
+        f = VERSION2SPECS[version]["f"]
+    # Rebind SAVE_PATH to a per-version sub-directory and make sure its "samples" folder exists.
+    SAVE_PATH = os.path.join(SAVE_PATH, version)
+    os.makedirs(os.path.join(SAVE_PATH, "samples"), exist_ok=True)
+    seed = st.sidebar.number_input("seed", value=42, min_value=0, max_value=int(1e9))
+    seed_everything(seed)
+    ucg_schedule = None
+    # `sampler` starts as the selected name and is replaced by a sampler object below
+    # (it stays a plain string for "Full Karlo").
+    # NOTE(review): `sampler` is not referenced again in the visible part of this block.
+    sampler = st.sidebar.selectbox("Sampler", ["DDIM", "DPM"], 0)
+    if version == "Full Karlo":
+        pass
+    else:
+        if sampler == "DPM":
+            sampler = DPMSolverSampler(state["model"])
+        elif sampler == "DDIM":
+            sampler = DDIMSampler(state["model"])
+        else:
+            raise ValueError(f"unknown sampler {sampler}!")
+    adm_cond, adm_uc = None, None
+    if use_karlo_prior:
+        # uses the prior
+        karlo_sampler = state["karlo_prior"]
+        noise_level = None
+        if state["model"].noise_augmentor is not None:
+            noise_level = st.number_input("Noise Augmentation for CLIP embeddings", min_value=0,
+                                          max_value=state["model"].noise_augmentor.max_noise_level - 1, value=0)
+        with torch.no_grad():
+            # Only the first item produced by the prior sampler is consumed.
+            karlo_prediction = iter(
+                karlo_sampler(
+                    prompt=prompt,
+                    bsz=number_cols,
+                    progressive_mode="final",
+                )
+            ).__next__()
+            adm_cond = karlo_prediction
+            if noise_level is not None:
+                # The scalar noise level is broadcast to one entry per batch element.
+                c_adm, noise_level_emb = state["model"].noise_augmentor(adm_cond, noise_level=repeat(
+                    torch.tensor([noise_level]).to(state["model"].device), '1 -> b', b=number_cols))
+                adm_cond = torch.cat((c_adm, noise_level_emb), 1)
+            # The unconditional counterpart is an all-zero tensor of the same shape.
+            adm_uc = torch.zeros_like(adm_cond)
+    elif version == "Full Karlo":
+        pass
+    else:
+        # NOTE(review): the positional `1` is presumably streamlit's `min_value` - confirm.
+        num_inputs = st.number_input("Number of Input Images", 1)
+        def make_conditionings_from_input(num=1, key=None):
+            # Embeds one user-provided image and returns (adm_cond, adm_uc, weight).
+            # NOTE(review): `weight` is only multiplied into adm_cond inside the
+            # noise_augmentor branch; without an augmentor the embedding stays unweighted
+            # although the caller still divides by the sum of weights - confirm intent.
+            init_img = get_init_img(batch_size=number_cols, key=key)
+            with torch.no_grad():
+                adm_cond = state["model"].embedder(init_img)
+                weight = st.slider(f"Weight for Input {num}", min_value=-10., max_value=10., value=1.)
+                if state["model"].noise_augmentor is not None:
+                    noise_level = st.number_input(f"Noise Augmentation for CLIP embedding of input #{num}", min_value=0,
+                                                  max_value=state["model"].noise_augmentor.max_noise_level - 1,
+                                                  value=0, )
+                    c_adm, noise_level_emb = state["model"].noise_augmentor(adm_cond, noise_level=repeat(
+                        torch.tensor([noise_level]).to(state["model"].device), '1 -> b', b=number_cols))
+                    adm_cond = torch.cat((c_adm, noise_level_emb), 1) * weight
+                adm_uc = torch.zeros_like(adm_cond)
+            return adm_cond, adm_uc, weight
+        adm_inputs = list()
+        weights = list()
+        for n in range(num_inputs):
+            # adm_uc keeps the value from the last iteration.
+            adm_cond, adm_uc, w = make_conditionings_from_input(num=n + 1, key=n)
+            weights.append(w)
+            adm_inputs.append(adm_cond)
+        # Sum of the per-input embeddings, normalised by the sum of the slider weights.
+        # NOTE(review): the sliders allow values in [-10, 10], so the weights can sum to zero.
+        adm_cond = torch.stack(adm_inputs).sum(0) / sum(weights)
+        if num_inputs > 1:
+            if st.checkbox("Apply Noise to Embedding Mix", True):
+                # NOTE(review): unlike the branches above, noise_augmentor is used here
+                # without an `is not None` check.
+                noise_level = st.number_input(f"Noise Augmentation for averaged CLIP embeddings", min_value=0,
+                                              max_value=state["model"].noise_augmentor.max_noise_level - 1, value=50, )
+                # Only the first `time_embed.dim` columns of the mixed embedding are re-augmented.
+                c_adm, noise_level_emb = state["model"].noise_augmentor(
+                    adm_cond[:, :state["model"].noise_augmentor.time_embed.dim],
+                    noise_level=repeat(
+                        torch.tensor([noise_level]).to(state["model"].device), '1 -> b', b=number_cols))
+                adm_cond = torch.cat((c_adm, noise_level_emb), 1)
+    if st.button("Sample"):
+        print("running prompt:", prompt)
+        st.text("Sampling")
+        t_progress = st.progress(0)
+        result = st.empty()
+        def t_callback(t):
+            # Progress fraction (t + 1) / steps, clamped to at most 1.
+            t_progress.progress(min((t + 1) / steps, 1.))
+        if version == "Full Karlo":
+            outputs = st.empty()
+            karlo_sampler = state["karlo_prior"]
+            all_samples = list()
+            with torch.no_grad():
+                # One prior-sampler call per grid row; each call yields a batch of number_cols items.
+                for _ in range(number_rows):
+                    karlo_prediction = iter(
+                        karlo_sampler(
+                            prompt=prompt,
+                            bsz=number_cols,
+                            progressive_mode="final",
+                        )
+                    ).__next__()
+                    all_samples.append(karlo_prediction)
+            # Tile the (n, b, c, h, w) stack into one channels-last image grid.
+            grid = torch.stack(all_samples, 0)
+            grid = rearrange(grid, 'n b c h w -> (n h) (b w) c')
+            outputs.image(grid.cpu().numpy())
+        else:
+            # NOTE(review): `samples` is not used further in the visible part of this block.
+            samples = sample(
+                state["model"],
+                prompt,
+                n_runs=number_rows,
+                n_samples=number_cols,
+                H=H, W=W, C=C, f=f,
+                scale=scale,
+                ddim_steps=steps,
+                ddim_eta=eta,
+                callback=t_callback,
+                ucg_schedule=ucg_schedule,
+                negative_prompt=negative_prompt,
+                adm_cond=adm_cond, adm_uc=adm_uc,
+                use_full_precision=force_full_precision,
+                only_adm_cond=False
+            )
--- a/scripts/streamlit/superresolution.py
+++ b/scripts/streamlit/superresolution.py
+import sys
+import torch
+import numpy as np
+import streamlit as st
+from PIL import Image
+from omegaconf import OmegaConf
+from einops import repeat, rearrange
+from pytorch_lightning import seed_everything
+from imwatermark import WatermarkEncoder
+from scripts.txt2img import put_watermark
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.models.diffusion.ddpm import LatentUpscaleDiffusion, LatentUpscaleFinetuneDiffusion
+from ldm.util import exists, instantiate_from_config
+torch.set_grad_enabled(False)
+@st.cache(allow_output_mutation=True)
+def initialize_model(config, ckpt):
+    config = OmegaConf.load(config)
+    model = instantiate_from_config(config.model)
+    model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = model.to(device)
+    sampler = DDIMSampler(model)
+    return sampler
+def make_batch_sd(
+        image,
+        txt,
+        device,
+        num_samples=1,
+):
+    image = np.array(image.convert("RGB"))
+    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
+    batch = {
+        "lr": rearrange(image, 'h w c -> 1 c h w'),
+        "txt": num_samples * [txt],
+    }
+    batch["lr"] = repeat(batch["lr"].to(device=device), "1 ... -> n ...", n=num_samples)
+    return batch
+def make_noise_augmentation(model, batch, noise_level=None):
+    x_low = batch[model.low_scale_key]
+    x_low = x_low.to(memory_format=torch.contiguous_format).float()
+    x_aug, noise_level = model.low_scale_model(x_low, noise_level)
+    return x_aug, noise_level
+def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callback=None, eta=0., noise_level=None):
+    """Upscale ``image`` with the sampler's model and return watermarked PIL images.
+
+    Args:
+        sampler: sampler object exposing ``.model`` and ``.sample(...)``.
+        image: PIL image used as low-resolution conditioning (see ``make_batch_sd``).
+        prompt: text prompt for the cross-attention conditioning.
+        seed: seed for ``seed_everything`` and for the NumPy RNG that draws the start code.
+        scale: unconditional guidance scale passed to the sampler.
+        h, w: spatial size used directly for the start code and the sampling shape.
+        steps: number of sampling steps.
+        num_samples: batch size.
+        callback: optional callback forwarded to ``sampler.sample``.
+        eta: eta value forwarded to ``sampler.sample``.
+        noise_level: noise level forwarded to ``make_noise_augmentation``
+            (only used for ``LatentUpscaleDiffusion`` models).
+
+    Returns:
+        list with one watermarked image per sample.
+
+    Raises:
+        NotImplementedError: if the model is neither a ``LatentUpscaleFinetuneDiffusion``
+            nor a ``LatentUpscaleDiffusion``.
+    """
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model = sampler.model
+    seed_everything(seed)
+    # The start code is drawn from a dedicated NumPy RNG seeded with the same seed.
+    prng = np.random.RandomState(seed)
+    start_code = prng.randn(num_samples, model.channels, h , w)
+    start_code = torch.from_numpy(start_code).to(device=device, dtype=torch.float32)
+    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
+    wm = "SDV2"
+    wm_encoder = WatermarkEncoder()
+    wm_encoder.set_watermark('bytes', wm.encode('utf-8'))
+    # NOTE(review): autocast is requested for "cuda" even when `device` above resolved to the CPU.
+    with torch.no_grad(),\
+            torch.autocast("cuda"):
+        batch = make_batch_sd(image, txt=prompt, device=device, num_samples=num_samples)
+        c = model.cond_stage_model.encode(batch["txt"])
+        c_cat = list()
+        if isinstance(model, LatentUpscaleFinetuneDiffusion):
+            # Concatenation conditioning is gathered from every key in model.concat_keys.
+            for ck in model.concat_keys:
+                cc = batch[ck]
+                if exists(model.reshuffle_patch_size):
+                    assert isinstance(model.reshuffle_patch_size, int)
+                    # Fold p1 x p2 spatial patches into the channel axis.
+                    cc = rearrange(cc, 'b c (p1 h) (p2 w) -> b (p1 p2 c) h w',
+                                   p1=model.reshuffle_patch_size, p2=model.reshuffle_patch_size)
+                c_cat.append(cc)
+            c_cat = torch.cat(c_cat, dim=1)
+            # cond
+            cond = {"c_concat": [c_cat], "c_crossattn": [c]}
+            # uncond cond
+            uc_cross = model.get_unconditional_conditioning(num_samples, "")
+            uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]}
+        elif isinstance(model, LatentUpscaleDiffusion):
+            # noise_level is replaced by the value returned from the augmentation step.
+            x_augment, noise_level = make_noise_augmentation(model, batch, noise_level)
+            cond = {"c_concat": [x_augment], "c_crossattn": [c], "c_adm": noise_level}
+            # uncond cond
+            uc_cross = model.get_unconditional_conditioning(num_samples, "")
+            uc_full = {"c_concat": [x_augment], "c_crossattn": [uc_cross], "c_adm": noise_level}
+        else:
+            raise NotImplementedError()
+        shape = [model.channels, h, w]
+        # `intermediates` is returned by the sampler but not used afterwards.
+        samples, intermediates = sampler.sample(
+            steps,
+            num_samples,
+            shape,
+            cond,
+            verbose=False,
+            eta=eta,
+            unconditional_guidance_scale=scale,
+            unconditional_conditioning=uc_full,
+            x_T=start_code,
+            callback=callback
+        )
+    # Decoding happens outside the autocast context.
+    with torch.no_grad():
+        x_samples_ddim = model.decode_first_stage(samples)
+    # Map from [-1, 1] to [0, 1] (clamped), then to channels-last arrays scaled by 255.
+    result = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0)
+    result = result.cpu().numpy().transpose(0, 2, 3, 1) * 255
+    st.text(f"upscaled image shape: {result.shape}")
+    return [put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder) for img in result]
+def run():
+    """Streamlit UI of the upscaling demo.
+
+    Reads the config and checkpoint paths from ``sys.argv[1]`` / ``sys.argv[2]``, lets the
+    user upload an image and choose sampling parameters, and displays the results of
+    ``paint`` once the "Sample" button is pressed.
+    """
+    st.title("Stable Diffusion Upscaling")
+    # run via streamlit run scripts/streamlit/superresolution.py <path-to-config> <path-to-ckpt>
+    sampler = initialize_model(sys.argv[1], sys.argv[2])
+    image = st.file_uploader("Image", ["jpg", "png"])
+    if image:
+        image = Image.open(image)
+        w, h = image.size
+        st.text(f"loaded input image of size ({w}, {h})")
+        # Each dimension is rounded DOWN to a multiple of 64 (a side shorter than 64 becomes 0).
+        width, height = map(lambda x: x - x % 64, (w, h))  # resize to integer multiple of 64
+        image = image.resize((width, height))
+        st.text(f"resized input image to size ({width}, {height} (w, h))")
+        st.image(image)
+        st.write(f"\n Tip: Add a description of the object that should be upscaled, e.g.: 'a professional photograph of a cat'")
+        prompt = st.text_input("Prompt", "a high quality professional photograph")
+        seed = st.number_input("Seed", min_value=0, max_value=1000000, value=0)
+        num_samples = st.number_input("Number of Samples", min_value=1, max_value=64, value=1)
+        scale = st.slider("Scale", min_value=0.1, max_value=30.0, value=9.0, step=0.1)
+        steps = st.slider("DDIM Steps", min_value=2, max_value=250, value=50, step=1)
+        eta = st.sidebar.number_input("eta (DDIM)", value=0., min_value=0., max_value=1.)
+        noise_level = None
+        if isinstance(sampler.model, LatentUpscaleDiffusion):
+            # TODO: make this work for all models
+            noise_level = st.sidebar.number_input("Noise Augmentation", min_value=0, max_value=350, value=20)
+            # One integer noise level per sample, placed on the model's device.
+            noise_level = torch.Tensor(num_samples * [noise_level]).to(sampler.model.device).long()
+        t_progress = st.progress(0)
+        def t_callback(t):
+            # Progress fraction (t + 1) / steps, clamped to at most 1.
+            t_progress.progress(min((t + 1) / steps, 1.))
+        # The schedule is rebuilt whenever an image is loaded, before the button is checked.
+        sampler.make_schedule(steps, ddim_eta=eta, verbose=True)
+        if st.button("Sample"):
+            # The full (resized) image size is passed as h / w.
+            result = paint(
+                sampler=sampler,
+                image=image,
+                prompt=prompt,
+                seed=seed,
+                scale=scale,
+                h=height, w=width, steps=steps,
+                num_samples=num_samples,
+                callback=t_callback,
+                noise_level=noise_level,
+                eta=eta
+            )
+            st.write("Result")
+            # Note: the loop variable rebinds `image` (the uploaded input) to each result.
+            for image in result:
+                st.image(image, output_format='PNG')
+# Script entry point: build the UI when the file is executed directly.
+if __name__ == "__main__":
+    run()
--- a/scripts/tests/test_watermark.py
+++ b/scripts/tests/test_watermark.py
+import cv2
+import fire
+from imwatermark import WatermarkDecoder
+def testit(img_path):
+    bgr = cv2.imread(img_path)
+    decoder = WatermarkDecoder('bytes', 136)
+    watermark = decoder.decode(bgr, 'dwtDct')
+    try:
+        dec = watermark.decode('utf-8')
+    except:
+        dec = "null"
+    print(dec)
+# Expose `testit` as a command-line interface through python-fire.
+if __name__ == "__main__":
+    fire.Fire(testit)
\ No newline at end of file
--- a/scripts/txt2img.py
+++ b/scripts/txt2img.py
+import argparse, os
+import cv2
+import torch
+import numpy as np
+from omegaconf import OmegaConf
+from PIL import Image
+from tqdm import tqdm, trange
+from itertools import islice
+from einops import rearrange
+from torchvision.utils import make_grid
+from pytorch_lightning import seed_everything
+from torch import autocast
+from contextlib import nullcontext
+from imwatermark import WatermarkEncoder
+from ldm.util import instantiate_from_config
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.models.diffusion.plms import PLMSSampler
+from ldm.models.diffusion.dpm_solver import DPMSolverSampler
+torch.set_grad_enabled(False)
+def chunk(it, size):
+    it = iter(it)
+    return iter(lambda: tuple(islice(it, size)), ())
+def load_model_from_config(config, ckpt, device=torch.device("cuda"), verbose=False):
+    print(f"Loading model from {ckpt}")
+    pl_sd = torch.load(ckpt, map_location="cpu")
+    if "global_step" in pl_sd:
+        print(f"Global Step: {pl_sd['global_step']}")
+    sd = pl_sd["state_dict"]
+    model = instantiate_from_config(config.model)
+    m, u = model.load_state_dict(sd, strict=False)
+    if len(m) > 0 and verbose:
+        print("missing keys:")
+        print(m)
+    if len(u) > 0 and verbose:
+        print("unexpected keys:")
+        print(u)
+    if device == torch.device("cuda"):
+        model.cuda()
+    elif device == torch.device("cpu"):
+        model.cpu()
+        model.cond_stage_model.device = "cpu"
+    else:
+        raise ValueError(f"Incorrect device name. Received: {device}")
+    model.eval()
+    return model
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--prompt",
+        type=str,
+        nargs="?",
+        default="a professional photograph of an astronaut riding a triceratops",
+        help="the prompt to render"
+    )
+    parser.add_argument(
+        "--outdir",
+        type=str,
+        nargs="?",
+        help="dir to write results to",
+        default="outputs/txt2img-samples"
+    )
+    parser.add_argument(
+        "--steps",
+        type=int,
+        default=50,
+        help="number of ddim sampling steps",
+    )
+    parser.add_argument(
+        "--plms",
+        action='store_true',
+        help="use plms sampling",
+    )
+    parser.add_argument(
+        "--dpm",
+        action='store_true',
+        help="use DPM (2) sampler",
+    )
+    parser.add_argument(
+        "--fixed_code",
+        action='store_true',
+        help="if enabled, uses the same starting code across all samples ",
+    )
+    parser.add_argument(
+        "--ddim_eta",
+        type=float,
+        default=0.0,
+        help="ddim eta (eta=0.0 corresponds to deterministic sampling",
+    )
+    parser.add_argument(
+        "--n_iter",
+        type=int,
+        default=3,
+        help="sample this often",
+    )
+    parser.add_argument(
+        "--H",
+        type=int,
+        default=512,
+        help="image height, in pixel space",
+    )
+    parser.add_argument(
+        "--W",
+        type=int,
+        default=512,
+        help="image width, in pixel space",
+    )
+    parser.add_argument(
+        "--C",
+        type=int,
+        default=4,
+        help="latent channels",
+    )
+    parser.add_argument(
+        "--f",
+        type=int,
+        default=8,
+        help="downsampling factor, most often 8 or 16",
+    )
+    parser.add_argument(
+        "--n_samples",
+        type=int,
+        default=3,
+        help="how many samples to produce for each given prompt. A.k.a batch size",
+    )
+    parser.add_argument(
+        "--n_rows",
+        type=int,
+        default=0,
+        help="rows in the grid (default: n_samples)",
+    )
+    parser.add_argument(
+        "--scale",
+        type=float,
+        default=9.0,
+        help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
+    )
+    parser.add_argument(
+        "--from-file",
+        type=str,
+        help="if specified, load prompts from this file, separated by newlines",
+    )
+    parser.add_argument(
+        "--config",
+        type=str,
+        default="configs/stable-diffusion/v2-inference.yaml",
+        help="path to config which constructs model",
+    )
+    parser.add_argument(
+        "--ckpt",
+        type=str,
+        help="path to checkpoint of model",
+    )
+    parser.add_argument(
+        "--seed",
+        type=int,
+        default=42,
+        help="the seed (for reproducible sampling)",
+    )
+    parser.add_argument(
+        "--precision",
+        type=str,
+        help="evaluate at this precision",
+        choices=["full", "autocast"],
+        default="autocast"
+    )
+    parser.add_argument(
+        "--repeat",
+        type=int,
+        default=1,
+        help="repeat each prompt in file this often",
+    )
+    parser.add_argument(
+        "--device",
+        type=str,
+        help="Device on which Stable Diffusion will be run",
+        choices=["cpu", "cuda"],
+        default="cpu"
+    )
+    parser.add_argument(
+        "--torchscript",
+        action='store_true',
+        help="Use TorchScript",
+    )
+    parser.add_argument(
+        "--ipex",
+        action='store_true',
+        help="Use Intel® Extension for PyTorch*",
+    )
+    parser.add_argument(
+        "--bf16",
+        action='store_true',
+        help="Use bfloat16",
+    )
+    opt = parser.parse_args()
+    return opt
+def put_watermark(img, wm_encoder=None):
+    if wm_encoder is not None:
+        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
+        img = wm_encoder.encode(img, 'dwtDct')
+        img = Image.fromarray(img[:, :, ::-1])
+    return img
+def main(opt):
+    """Run text-to-image sampling as configured by the parsed options ``opt``.
+
+    Loads the model, optionally prepares TorchScript / IPEX optimized modules (including a
+    warm-up pass), samples ``opt.n_iter`` times for every prompt batch, writes each
+    watermarked sample to ``<outdir>/samples`` and finally writes one grid image to ``<outdir>``.
+    """
+    seed_everything(opt.seed)
+    config = OmegaConf.load(f"{opt.config}")
+    device = torch.device("cuda") if opt.device == "cuda" else torch.device("cpu")
+    model = load_model_from_config(config, f"{opt.ckpt}", device)
+    # Sampler precedence: --plms, then --dpm, otherwise DDIM.
+    if opt.plms:
+        sampler = PLMSSampler(model, device=device)
+    elif opt.dpm:
+        sampler = DPMSolverSampler(model, device=device)
+    else:
+        sampler = DDIMSampler(model, device=device)
+    os.makedirs(opt.outdir, exist_ok=True)
+    outpath = opt.outdir
+    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
+    wm = "SDV2"
+    wm_encoder = WatermarkEncoder()
+    wm_encoder.set_watermark('bytes', wm.encode('utf-8'))
+    batch_size = opt.n_samples
+    n_rows = opt.n_rows if opt.n_rows > 0 else batch_size
+    # `data` is a list of prompt batches: one batch repeating --prompt, or the prompts from
+    # --from-file (each repeated opt.repeat times) split into batches of batch_size.
+    if not opt.from_file:
+        prompt = opt.prompt
+        assert prompt is not None
+        data = [batch_size * [prompt]]
+    else:
+        print(f"reading prompts from {opt.from_file}")
+        with open(opt.from_file, "r") as f:
+            data = f.read().splitlines()
+            data = [p for p in data for i in range(opt.repeat)]
+            data = list(chunk(data, batch_size))
+    sample_path = os.path.join(outpath, "samples")
+    os.makedirs(sample_path, exist_ok=True)
+    sample_count = 0
+    # Numbering continues after the files already present in the output directories.
+    base_count = len(os.listdir(sample_path))
+    # The "- 1" presumably discounts the "samples" sub-directory created above.
+    grid_count = len(os.listdir(outpath)) - 1
+    start_code = None
+    if opt.fixed_code:
+        # One start code drawn up front and reused for every sampler call.
+        start_code = torch.randn([opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f], device=device)
+    if opt.torchscript or opt.ipex:
+        transformer = model.cond_stage_model.model
+        unet = model.model.diffusion_model
+        decoder = model.first_stage_model.decoder
+        additional_context = torch.cpu.amp.autocast() if opt.bf16 else nullcontext()
+        shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
+        # NOTE(review): this condition cannot be true inside `if opt.torchscript or opt.ipex`.
+        if opt.bf16 and not opt.torchscript and not opt.ipex:
+            raise ValueError('Bfloat16 is supported only for torchscript+ipex')
+        if opt.bf16 and unet.dtype != torch.bfloat16:
+            raise ValueError("Use configs/stable-diffusion/intel/ configs with bf16 enabled if " +
+                             "you'd like to use bfloat16 with CPU.")
+        if unet.dtype == torch.float16 and device == torch.device("cpu"):
+            raise ValueError("Use configs/stable-diffusion/intel/ configs for your model if you'd like to run it on CPU.")
+        if opt.ipex:
+            # Imported lazily so the extension is only required when --ipex is given.
+            import intel_extension_for_pytorch as ipex
+            bf16_dtype = torch.bfloat16 if opt.bf16 else None
+            transformer = transformer.to(memory_format=torch.channels_last)
+            transformer = ipex.optimize(transformer, level="O1", inplace=True)
+            unet = unet.to(memory_format=torch.channels_last)
+            unet = ipex.optimize(unet, level="O1", auto_kernel_selection=True, inplace=True, dtype=bf16_dtype)
+            decoder = decoder.to(memory_format=torch.channels_last)
+            decoder = ipex.optimize(decoder, level="O1", auto_kernel_selection=True, inplace=True, dtype=bf16_dtype)
+        if opt.torchscript:
+            with torch.no_grad(), additional_context:
+                # get UNET scripted
+                if unet.use_checkpoint:
+                    raise ValueError("Gradient checkpoint won't work with tracing. " +
+                    "Use configs/stable-diffusion/intel/ configs for your model or disable checkpoint in your config.")
+                # Tracing uses fixed example inputs; these shapes are hard-coded and do not
+                # depend on opt.C / opt.H / opt.W.
+                img_in = torch.ones(2, 4, 96, 96, dtype=torch.float32)
+                t_in = torch.ones(2, dtype=torch.int64)
+                context = torch.ones(2, 77, 1024, dtype=torch.float32)
+                scripted_unet = torch.jit.trace(unet, (img_in, t_in, context))
+                scripted_unet = torch.jit.optimize_for_inference(scripted_unet)
+                print(type(scripted_unet))
+                model.model.scripted_diffusion_model = scripted_unet
+                # get Decoder for first stage model scripted
+                samples_ddim = torch.ones(1, 4, 96, 96, dtype=torch.float32)
+                scripted_decoder = torch.jit.trace(decoder, (samples_ddim))
+                scripted_decoder = torch.jit.optimize_for_inference(scripted_decoder)
+                print(type(scripted_decoder))
+                model.first_stage_model.decoder = scripted_decoder
+        # Warm-up on the first prompt batch: conditioning is computed three times, followed by
+        # a 5-step sampling run and three decoder passes.
+        prompts = data[0]
+        print("Running a forward pass to initialize optimizations")
+        uc = None
+        if opt.scale != 1.0:
+            uc = model.get_learned_conditioning(batch_size * [""])
+        if isinstance(prompts, tuple):
+            prompts = list(prompts)
+        with torch.no_grad(), additional_context:
+            for _ in range(3):
+                c = model.get_learned_conditioning(prompts)
+            samples_ddim, _ = sampler.sample(S=5,
+                                             conditioning=c,
+                                             batch_size=batch_size,
+                                             shape=shape,
+                                             verbose=False,
+                                             unconditional_guidance_scale=opt.scale,
+                                             unconditional_conditioning=uc,
+                                             eta=opt.ddim_eta,
+                                             x_T=start_code)
+            print("Running a forward pass for decoder")
+            for _ in range(3):
+                x_samples_ddim = model.decode_first_stage(samples_ddim)
+    # Autocast is used when --precision autocast (the default) or --bf16 is set.
+    precision_scope = autocast if opt.precision=="autocast" or opt.bf16 else nullcontext
+    with torch.no_grad(), \
+        precision_scope(opt.device), \
+        model.ema_scope():
+            all_samples = list()
+            for n in trange(opt.n_iter, desc="Sampling"):
+                for prompts in tqdm(data, desc="data"):
+                    # An empty-prompt conditioning is only needed when guidance is active.
+                    uc = None
+                    if opt.scale != 1.0:
+                        uc = model.get_learned_conditioning(batch_size * [""])
+                    if isinstance(prompts, tuple):
+                        prompts = list(prompts)
+                    c = model.get_learned_conditioning(prompts)
+                    shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
+                    # NOTE(review): batch_size is always opt.n_samples, while the last batch read
+                    # from --from-file can hold fewer prompts - confirm the sampler copes with that.
+                    samples, _ = sampler.sample(S=opt.steps,
+                                                     conditioning=c,
+                                                     batch_size=opt.n_samples,
+                                                     shape=shape,
+                                                     verbose=False,
+                                                     unconditional_guidance_scale=opt.scale,
+                                                     unconditional_conditioning=uc,
+                                                     eta=opt.ddim_eta,
+                                                     x_T=start_code)
+                    x_samples = model.decode_first_stage(samples)
+                    # Map decoder output from [-1, 1] to [0, 1] (clamped).
+                    x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)
+                    for x_sample in x_samples:
+                        x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
+                        img = Image.fromarray(x_sample.astype(np.uint8))
+                        img = put_watermark(img, wm_encoder)
+                        img.save(os.path.join(sample_path, f"{base_count:05}.png"))
+                        base_count += 1
+                        # sample_count is incremented but not read anywhere in this function.
+                        sample_count += 1
+                    all_samples.append(x_samples)
+            # additionally, save as grid
+            grid = torch.stack(all_samples, 0)
+            grid = rearrange(grid, 'n b c h w -> (n b) c h w')
+            grid = make_grid(grid, nrow=n_rows)
+            # to image
+            grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()
+            grid = Image.fromarray(grid.astype(np.uint8))
+            grid = put_watermark(grid, wm_encoder)
+            grid.save(os.path.join(outpath, f'grid-{grid_count:04}.png'))
+            grid_count += 1
+    print(f"Your samples are ready and waiting for you here: \n{outpath} \n"
+          f" \nEnjoy.")
+# Script entry point: parse the command line and run sampling.
+if __name__ == "__main__":
+    opt = parse_args()
+    main(opt)
--- a/setup.py
+++ b/setup.py
+from setuptools import setup, find_packages
+# Packaging metadata: installs every package discovered by find_packages() under the
+# distribution name "stable-diffusion".
+# NOTE(review): install_requires lists only torch, numpy and tqdm; scripts in this commit
+# also import other third-party packages (e.g. omegaconf, einops) - confirm whether those
+# should be declared here.
+setup(
+    name='stable-diffusion',
+    version='0.0.1',
+    description='',
+    packages=find_packages(),
+    install_requires=[
+        'torch',
+        'numpy',
+        'tqdm',
+    ],
+)
\ No newline at end of file
--- a/test_diffusers.py
+++ b/test_diffusers.py
+import torch
+import time
+import os
+import pandas as pd
+from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler, EulerDiscreteScheduler, DDIMScheduler, DiffusionPipeline
+# Timing script: generates images for the first 20 prompts of PartiPrompts.tsv with a
+# diffusers StableDiffusionPipeline and prints the wall-clock time of every call.
+# The model directory below is a hard-coded, machine-specific path.
+model_id = "/public/home/lijian/model/stable-diffusion-2-1-base/"
+text_file = "PartiPrompts.tsv"
+# Tab-separated file; only its 'Prompt' column is used.
+df = pd.read_csv(text_file, sep='\t')
+prompts = df['Prompt']
+num_inference_steps = 50
+pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+# Replace the pipeline's scheduler with a DPM-Solver multistep scheduler built from the
+# existing scheduler's config.
+pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+base_count = 0
+print("======================================start DPM ==================================")
+for prompt in prompts:
+    start = time.time()
+    # The two positional 512s are presumably height and width - TODO confirm against the pipeline signature.
+    image = pipe(prompt, 512, 512, num_inference_steps=num_inference_steps, num_images_per_prompt=1).images[0]
+    # The index printed here is zero-based (counter is incremented afterwards) ...
+    print(f"the {base_count} text-to-image use time {time.time()-start}")
+    base_count += 1
+    # ... while saved files are numbered from 00001.png, in the current working directory.
+    image.save(f"{base_count:05}.png")
+    # Stop after 20 images.
+    if base_count == 20:
+        break
+# Note: the message below does not include an output path.
+print(f"Your samples are ready and waiting for you here\n \n"
+          f" \nEnjoy.")