Commit 4007efdd authored by lijian6's avatar lijian6
Browse files

Initial commit

parents
Pipeline #994 canceled with stages
import torch
import torch.nn as nn
import timm
import types
import math
import torch.nn.functional as F
class Slice(nn.Module):
    """Drop the leading tokens of a token sequence.

    Args:
        start_index: Index along dim 1 of the first token that is kept.
    """

    def __init__(self, start_index=1):
        super().__init__()
        self.start_index = start_index

    def forward(self, x):
        """Return ``x`` without its first ``start_index`` entries along dim 1."""
        kept = slice(self.start_index, None)
        return x[:, kept]
class AddReadout(nn.Module):
    """Add the readout token(s) to every remaining token and drop them.

    Args:
        start_index: Index along dim 1 of the first non-readout token. When it
            is 2, the readout is the mean of tokens 0 and 1; otherwise it is
            token 0 alone.
    """

    def __init__(self, start_index=1):
        super().__init__()
        self.start_index = start_index

    def forward(self, x):
        """Return ``x[:, start_index:]`` with the readout broadcast-added."""
        readout = (x[:, 0] + x[:, 1]) / 2 if self.start_index == 2 else x[:, 0]
        remaining = x[:, self.start_index:]
        # unsqueeze(1) lets the readout broadcast over the token dimension.
        return remaining + readout.unsqueeze(1)
class ProjectReadout(nn.Module):
    """Concatenate token 0 onto every remaining token and project back.

    Args:
        in_features: Size of the last dimension of the input tokens.
        start_index: Index along dim 1 of the first token that is kept.
    """

    def __init__(self, in_features, start_index=1):
        super().__init__()
        self.start_index = start_index
        self.project = nn.Sequential(
            nn.Linear(2 * in_features, in_features),
            nn.GELU(),
        )

    def forward(self, x):
        """Return the projected ``[token, readout]`` pairs for the kept tokens."""
        kept = x[:, self.start_index:]
        # Only token 0 is used as the readout, whatever start_index is.
        readout = x[:, 0].unsqueeze(1).expand_as(kept)
        paired = torch.cat((kept, readout), -1)
        return self.project(paired)
class Transpose(nn.Module):
    """Module wrapper around ``Tensor.transpose`` for use in ``nn.Sequential``.

    Args:
        dim0: First dimension to swap.
        dim1: Second dimension to swap.
    """

    def __init__(self, dim0, dim1):
        super().__init__()
        self.dim0 = dim0
        self.dim1 = dim1

    def forward(self, x):
        """Return ``x`` with ``dim0`` and ``dim1`` swapped."""
        return x.transpose(self.dim0, self.dim1)
def forward_vit(pretrained, x):
    """Run the wrapped backbone and post-process the four hooked activations.

    Args:
        pretrained: Object exposing ``model`` (with ``forward_flex`` and
            ``patch_size``), ``activations`` (mapping with keys "1".."4") and
            ``act_postprocess1`` .. ``act_postprocess4`` (sliceable sequences
            of modules).
        x: 4-D input tensor; its last two dimensions are used as height and
            width.

    Returns:
        tuple: The four post-processed feature tensors, in hook order.
    """
    _, _, height, width = x.shape

    # The forward pass is run for its side effect of filling
    # ``pretrained.activations``; the returned tokens are not used.
    pretrained.model.forward_flex(x)

    patch_size = pretrained.model.patch_size
    to_grid = nn.Unflatten(
        2, torch.Size([height // patch_size[1], width // patch_size[0]])
    )

    heads = (
        pretrained.act_postprocess1,
        pretrained.act_postprocess2,
        pretrained.act_postprocess3,
        pretrained.act_postprocess4,
    )
    outputs = []
    for key, head in zip(("1", "2", "3", "4"), heads):
        feature = head[0:2](pretrained.activations[key])
        # Module 2 of each head is skipped; token sequences are unflattened
        # here with a grid derived from the actual input size instead.
        if feature.ndim == 3:
            feature = to_grid(feature)
        outputs.append(head[3:len(head)](feature))
    return tuple(outputs)
def _resize_pos_embed(self, posemb, gs_h, gs_w):
posemb_tok, posemb_grid = (
posemb[:, : self.start_index],
posemb[0, self.start_index :],
)
gs_old = int(math.sqrt(len(posemb_grid)))
posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2)
posemb_grid = F.interpolate(posemb_grid, size=(gs_h, gs_w), mode="bilinear")
posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_h * gs_w, -1)
posemb = torch.cat([posemb_tok, posemb_grid], dim=1)
return posemb
def forward_flex(self, x):
    """Forward pass that resizes the position embedding to the input size.

    Args:
        x: 4-D input tensor; its last two dimensions are used as height and
            width.

    Returns:
        The normalised token sequence after all blocks, including the prefix
        token(s).
    """
    batch_size, _, height, width = x.shape

    pos_embed = self._resize_pos_embed(
        self.pos_embed, height // self.patch_size[1], width // self.patch_size[0]
    )

    embed = self.patch_embed
    if hasattr(embed, "backbone"):
        x = embed.backbone(x)
        if isinstance(x, (list, tuple)):
            x = x[-1]  # last feature if backbone outputs list/tuple of features
    x = embed.proj(x).flatten(2).transpose(1, 2)

    # Prefix tokens: always the class token, plus the distillation token when
    # the model has one.
    prefix = [self.cls_token.expand(batch_size, -1, -1)]
    dist_token = getattr(self, "dist_token", None)
    if dist_token is not None:
        prefix.append(dist_token.expand(batch_size, -1, -1))
    x = torch.cat(prefix + [x], dim=1)

    x = self.pos_drop(x + pos_embed)
    for block in self.blocks:
        x = block(x)
    return self.norm(x)
# Module-level store written by every hook created through get_activation.
activations = {}


def get_activation(name):
    """Build a forward hook that saves a module's output in ``activations[name]``.

    Args:
        name: Key under which the hooked output is stored.

    Returns:
        A callable with the ``(module, input, output)`` forward-hook signature.
    """

    def _store_output(module, inputs, output):
        activations[name] = output

    return _store_output
def get_readout_oper(vit_features, features, use_readout, start_index=1):
    """Create one readout-handling module per entry of ``features``.

    Args:
        vit_features: Token width, passed to ``ProjectReadout``.
        features: Sequence whose length determines how many modules are
            returned.
        use_readout: ``"ignore"`` (``Slice``), ``"add"`` (``AddReadout``) or
            ``"project"`` (``ProjectReadout``).
        start_index: Index of the first non-readout token.

    Returns:
        list: ``len(features)`` modules. For ``"ignore"`` and ``"add"`` the
        list repeats one shared instance; for ``"project"`` every entry is a
        separate instance.

    Raises:
        AssertionError: If ``use_readout`` is not one of the supported values.
    """
    if use_readout == "ignore":
        return [Slice(start_index)] * len(features)
    if use_readout == "add":
        return [AddReadout(start_index)] * len(features)
    if use_readout == "project":
        return [ProjectReadout(vit_features, start_index) for _ in features]

    # Raised explicitly rather than via ``assert False`` so that the check
    # still fires when Python runs with -O; the exception type is unchanged.
    raise AssertionError(
        "wrong operation for readout token, use_readout can be 'ignore', 'add', or 'project'"
    )
def _make_vit_b16_backbone(
    model,
    features=[96, 192, 384, 768],
    size=[384, 384],
    hooks=[2, 5, 8, 11],
    vit_features=768,
    use_readout="ignore",
    start_index=1,
):
    """Wrap a patch-16 ViT with activation hooks and four post-processing heads.

    Args:
        model: Vision transformer exposing ``blocks``.
        features: Output channel count of each of the four heads.
        size: Nominal input ``[height, width]`` used for the heads' built-in
            ``nn.Unflatten``.
        hooks: Indices into ``model.blocks`` whose outputs are captured.
        vit_features: Token width of the transformer.
        use_readout: Readout handling, forwarded to ``get_readout_oper``.
        start_index: Index of the first non-readout token.

    Returns:
        nn.Module: Container with ``model``, ``activations`` and
        ``act_postprocess1`` .. ``act_postprocess4``.
    """
    pretrained = nn.Module()
    pretrained.model = model

    # Capture the outputs of the four selected blocks under keys "1".."4".
    for slot in range(4):
        model.blocks[hooks[slot]].register_forward_hook(get_activation(str(slot + 1)))
    pretrained.activations = activations

    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
    token_grid = torch.Size([size[0] // 16, size[1] // 16])

    def _stem(stage):
        # Layers common to every head: readout handling, tokens to a
        # channels-first grid, then a 1x1 projection to the stage width.
        return [
            readout_oper[stage],
            Transpose(1, 2),
            nn.Unflatten(2, token_grid),
            nn.Conv2d(
                in_channels=vit_features,
                out_channels=features[stage],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
        ]

    def _upsample(stage, factor):
        # Transposed convolution whose kernel equals its stride.
        channels = features[stage]
        return nn.ConvTranspose2d(
            in_channels=channels,
            out_channels=channels,
            kernel_size=factor,
            stride=factor,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        )

    # 32, 48, 136, 384
    # Module order inside each Sequential is kept so that state-dict keys and
    # parameter initialisation order are unchanged.
    pretrained.act_postprocess1 = nn.Sequential(*_stem(0), _upsample(0, 4))
    pretrained.act_postprocess2 = nn.Sequential(*_stem(1), _upsample(1, 2))
    pretrained.act_postprocess3 = nn.Sequential(*_stem(2))
    pretrained.act_postprocess4 = nn.Sequential(
        *_stem(3),
        nn.Conv2d(
            in_channels=features[3],
            out_channels=features[3],
            kernel_size=3,
            stride=2,
            padding=1,
        ),
    )

    model.start_index = start_index
    model.patch_size = [16, 16]

    # We inject these functions into the VisionTransformer instance so that
    # we can use it with interpolated position embeddings without modifying
    # the library source.
    model.forward_flex = types.MethodType(forward_flex, model)
    model._resize_pos_embed = types.MethodType(_resize_pos_embed, model)

    return pretrained
def _make_pretrained_vitl16_384(pretrained, use_readout="ignore", hooks=None):
    """Build the hooked backbone around timm's ``vit_large_patch16_384``.

    Args:
        pretrained: Forwarded to ``timm.create_model(..., pretrained=...)``.
        use_readout: Readout handling, forwarded to the backbone builder.
        hooks: Block indices to hook; defaults to ``[5, 11, 17, 23]``.

    Returns:
        The module produced by ``_make_vit_b16_backbone``.
    """
    model = timm.create_model("vit_large_patch16_384", pretrained=pretrained)

    # Identity test: ``== None`` would invoke ``__eq__`` on array-like hooks.
    if hooks is None:
        hooks = [5, 11, 17, 23]
    return _make_vit_b16_backbone(
        model,
        features=[256, 512, 1024, 1024],
        hooks=hooks,
        vit_features=1024,
        use_readout=use_readout,
    )
def _make_pretrained_vitb16_384(pretrained, use_readout="ignore", hooks=None):
    """Build the hooked backbone around timm's ``vit_base_patch16_384``.

    Args:
        pretrained: Forwarded to ``timm.create_model(..., pretrained=...)``.
        use_readout: Readout handling, forwarded to the backbone builder.
        hooks: Block indices to hook; defaults to ``[2, 5, 8, 11]``.

    Returns:
        The module produced by ``_make_vit_b16_backbone``.
    """
    model = timm.create_model("vit_base_patch16_384", pretrained=pretrained)

    # Identity test: ``== None`` would invoke ``__eq__`` on array-like hooks.
    if hooks is None:
        hooks = [2, 5, 8, 11]
    return _make_vit_b16_backbone(
        model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
    )
def _make_pretrained_deitb16_384(pretrained, use_readout="ignore", hooks=None):
    """Build the hooked backbone around timm's ``vit_deit_base_patch16_384``.

    Args:
        pretrained: Forwarded to ``timm.create_model(..., pretrained=...)``.
        use_readout: Readout handling, forwarded to the backbone builder.
        hooks: Block indices to hook; defaults to ``[2, 5, 8, 11]``.

    Returns:
        The module produced by ``_make_vit_b16_backbone``.
    """
    model = timm.create_model("vit_deit_base_patch16_384", pretrained=pretrained)

    # Identity test: ``== None`` would invoke ``__eq__`` on array-like hooks.
    if hooks is None:
        hooks = [2, 5, 8, 11]
    return _make_vit_b16_backbone(
        model, features=[96, 192, 384, 768], hooks=hooks, use_readout=use_readout
    )
def _make_pretrained_deitb16_distil_384(pretrained, use_readout="ignore", hooks=None):
    """Build the hooked backbone around timm's distilled DeiT base model.

    The backbone is created with ``start_index=2``, i.e. two leading tokens
    are treated as non-patch tokens.

    Args:
        pretrained: Forwarded to ``timm.create_model(..., pretrained=...)``.
        use_readout: Readout handling, forwarded to the backbone builder.
        hooks: Block indices to hook; defaults to ``[2, 5, 8, 11]``.

    Returns:
        The module produced by ``_make_vit_b16_backbone``.
    """
    model = timm.create_model(
        "vit_deit_base_distilled_patch16_384", pretrained=pretrained
    )

    # Identity test: ``== None`` would invoke ``__eq__`` on array-like hooks.
    if hooks is None:
        hooks = [2, 5, 8, 11]
    return _make_vit_b16_backbone(
        model,
        features=[96, 192, 384, 768],
        hooks=hooks,
        use_readout=use_readout,
        start_index=2,
    )
def _make_vit_b_rn50_backbone(
    model,
    features=[256, 512, 768, 768],
    size=[384, 384],
    hooks=[0, 1, 8, 11],
    vit_features=768,
    use_vit_only=False,
    use_readout="ignore",
    start_index=1,
):
    """Wrap a hybrid (backbone + ViT) model with hooks and four heads.

    Args:
        model: Model exposing ``blocks`` and, unless ``use_vit_only`` is
            ``True``, ``patch_embed.backbone.stages``.
        features: Output channel count of each of the four heads.
        size: Nominal input ``[height, width]`` used for the heads' built-in
            ``nn.Unflatten``.
        hooks: Indices into ``model.blocks``. Entries 0 and 1 are only used
            when ``use_vit_only`` is ``True``.
        vit_features: Token width of the transformer.
        use_vit_only: When ``True``, activations "1" and "2" come from
            transformer blocks; otherwise from the first two backbone stages,
            whose heads are then identities.
        use_readout: Readout handling, forwarded to ``get_readout_oper``.
        start_index: Index of the first non-readout token.

    Returns:
        nn.Module: Container with ``model``, ``activations`` and
        ``act_postprocess1`` .. ``act_postprocess4``.
    """
    pretrained = nn.Module()
    pretrained.model = model

    # Same equality test as before (not plain truthiness), evaluated once.
    vit_only = use_vit_only == True  # noqa: E712

    if vit_only:
        early_sources = (model.blocks[hooks[0]], model.blocks[hooks[1]])
    else:
        stages = model.patch_embed.backbone.stages
        early_sources = (stages[0], stages[1])
    early_sources[0].register_forward_hook(get_activation("1"))
    early_sources[1].register_forward_hook(get_activation("2"))
    model.blocks[hooks[2]].register_forward_hook(get_activation("3"))
    model.blocks[hooks[3]].register_forward_hook(get_activation("4"))
    pretrained.activations = activations

    readout_oper = get_readout_oper(vit_features, features, use_readout, start_index)
    token_grid = torch.Size([size[0] // 16, size[1] // 16])

    def _stem(stage):
        # Layers common to every transformer-fed head: readout handling,
        # tokens to a channels-first grid, then a 1x1 projection.
        return [
            readout_oper[stage],
            Transpose(1, 2),
            nn.Unflatten(2, token_grid),
            nn.Conv2d(
                in_channels=vit_features,
                out_channels=features[stage],
                kernel_size=1,
                stride=1,
                padding=0,
            ),
        ]

    def _upsample(stage, factor):
        # Transposed convolution whose kernel equals its stride.
        channels = features[stage]
        return nn.ConvTranspose2d(
            in_channels=channels,
            out_channels=channels,
            kernel_size=factor,
            stride=factor,
            padding=0,
            bias=True,
            dilation=1,
            groups=1,
        )

    if vit_only:
        pretrained.act_postprocess1 = nn.Sequential(*_stem(0), _upsample(0, 4))
        pretrained.act_postprocess2 = nn.Sequential(*_stem(1), _upsample(1, 2))
    else:
        # Three identities keep the [0:2] / [3:] slicing done by forward_vit
        # valid for heads that receive backbone feature maps.
        pretrained.act_postprocess1 = nn.Sequential(
            nn.Identity(), nn.Identity(), nn.Identity()
        )
        pretrained.act_postprocess2 = nn.Sequential(
            nn.Identity(), nn.Identity(), nn.Identity()
        )

    pretrained.act_postprocess3 = nn.Sequential(*_stem(2))
    pretrained.act_postprocess4 = nn.Sequential(
        *_stem(3),
        nn.Conv2d(
            in_channels=features[3],
            out_channels=features[3],
            kernel_size=3,
            stride=2,
            padding=1,
        ),
    )

    model.start_index = start_index
    model.patch_size = [16, 16]

    # We inject these functions into the VisionTransformer instance so that
    # we can use it with interpolated position embeddings without modifying
    # the library source.
    model.forward_flex = types.MethodType(forward_flex, model)
    model._resize_pos_embed = types.MethodType(_resize_pos_embed, model)

    return pretrained
def _make_pretrained_vitb_rn50_384(
    pretrained, use_readout="ignore", hooks=None, use_vit_only=False
):
    """Build the hooked backbone around timm's ``vit_base_resnet50_384``.

    Args:
        pretrained: Forwarded to ``timm.create_model(..., pretrained=...)``.
        use_readout: Readout handling, forwarded to the backbone builder.
        hooks: Block indices to hook; defaults to ``[0, 1, 8, 11]``.
        use_vit_only: Forwarded to ``_make_vit_b_rn50_backbone``.

    Returns:
        The module produced by ``_make_vit_b_rn50_backbone``.
    """
    model = timm.create_model("vit_base_resnet50_384", pretrained=pretrained)

    # Identity test: ``== None`` would invoke ``__eq__`` on array-like hooks.
    if hooks is None:
        hooks = [0, 1, 8, 11]
    return _make_vit_b_rn50_backbone(
        model,
        features=[256, 512, 768, 768],
        size=[384, 384],
        hooks=hooks,
        use_vit_only=use_vit_only,
        use_readout=use_readout,
    )
"""Utils for monoDepth."""
import sys
import re
import numpy as np
import cv2
import torch
def read_pfm(path):
    """Read pfm file.

    Args:
        path (str): path to file

    Returns:
        tuple: (data, scale) where ``data`` is the image array with rows
        flipped vertically relative to file order and ``scale`` is the
        absolute value of the header's scale field.

    Raises:
        Exception: If the magic line is neither "PF" nor "Pf", or the
            dimension line is malformed.
    """
    with open(path, "rb") as file:
        magic = file.readline().rstrip().decode("ascii")
        if magic == "PF":
            color = True
        elif magic == "Pf":
            color = False
        else:
            raise Exception("Not a PFM file: " + path)

        dims = re.match(r"^(\d+)\s(\d+)\s$", file.readline().decode("ascii"))
        if not dims:
            raise Exception("Malformed PFM header.")
        width, height = (int(group) for group in dims.groups())

        # The sign of the scale field encodes byte order: negative means
        # little-endian, otherwise big-endian.
        scale = float(file.readline().decode("ascii").rstrip())
        endian = "<" if scale < 0 else ">"
        if scale < 0:
            scale = -scale

        raw = np.fromfile(file, endian + "f")
        shape = (height, width, 3) if color else (height, width)
        data = np.flipud(np.reshape(raw, shape))

        return data, scale
def write_pfm(path, image, scale=1):
    """Write pfm file.

    Args:
        path (str): path to file
        image (array): float32 data of shape H x W x 3, H x W x 1 or H x W
        scale (int, optional): Scale. Defaults to 1.

    Raises:
        Exception: If ``image`` is not float32 or has an unsupported shape.
    """
    # Validate before opening so that invalid input does not create or
    # truncate the target file.
    if image.dtype.name != "float32":
        raise Exception("Image dtype must be float32.")

    if len(image.shape) == 3 and image.shape[2] == 3:  # color image
        color = True
    elif len(image.shape) == 2 or (
        len(image.shape) == 3 and image.shape[2] == 1
    ):  # greyscale
        color = False
    else:
        raise Exception("Image must have H x W x 3, H x W x 1 or H x W dimensions.")

    # Rows are stored in reverse vertical order.
    image = np.flipud(image)

    # A negative scale marks little-endian data.
    endian = image.dtype.byteorder
    if endian == "<" or (endian == "=" and sys.byteorder == "little"):
        scale = -scale

    with open(path, "wb") as file:
        # Both magic strings must be bytes: the file is opened in binary mode.
        file.write(b"PF\n" if color else b"Pf\n")
        file.write("%d %d\n".encode() % (image.shape[1], image.shape[0]))
        file.write("%f\n".encode() % scale)
        image.tofile(file)
def read_image(path):
    """Read image and output RGB image (0-1).

    Args:
        path (str): path to file

    Returns:
        array: RGB image (0-1)

    Raises:
        OSError: If OpenCV cannot read an image from ``path``.
    """
    img = cv2.imread(path)
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise OSError(f"Could not read image: {path}")

    if img.ndim == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

    # OpenCV loads channels as BGR; convert and scale 8-bit values to [0, 1].
    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) / 255.0
    return img
def resize_image(img):
    """Resize image and make it fit for network.

    The longer side is scaled to 384 and both sides are then rounded up to a
    multiple of 32.

    Args:
        img (array): image indexed as (height, width, channel)

    Returns:
        tensor: float tensor of shape (1, channel, height, width)
    """
    height_orig, width_orig = img.shape[0], img.shape[1]

    longer_side = width_orig if width_orig > height_orig else height_orig
    scale = longer_side / 384

    height = (np.ceil(height_orig / scale / 32) * 32).astype(int)
    width = (np.ceil(width_orig / scale / 32) * 32).astype(int)

    resized = cv2.resize(img, (width, height), interpolation=cv2.INTER_AREA)

    # HWC -> CHW, then add the batch dimension.
    chw = np.transpose(resized, (2, 0, 1))
    return torch.from_numpy(chw).contiguous().float().unsqueeze(0)
def resize_depth(depth, width, height):
    """Resize depth map and bring to CPU (numpy).

    Args:
        depth (tensor): 4-D depth tensor; only the first batch entry is used
        width (int): image width
        height (int): image height

    Returns:
        array: processed depth
    """
    first = depth[0, :, :, :].squeeze().to("cpu")
    return cv2.resize(first.numpy(), (width, height), interpolation=cv2.INTER_CUBIC)
def write_depth(path, depth, bits=1):
    """Write depth map to pfm and png file.

    The PNG holds the depth linearly rescaled to the full range of the chosen
    integer width; a constant depth map is written as all zeros.

    Args:
        path (str): filepath without extension
        depth (array): depth
        bits (int, optional): Bytes per PNG sample; 1 writes uint8, 2 writes
            uint16. Any other value writes no PNG. Defaults to 1.
    """
    write_pfm(path + ".pfm", depth.astype(np.float32))

    depth_min = depth.min()
    depth_max = depth.max()
    max_val = (2 ** (8 * bits)) - 1

    if depth_max - depth_min > np.finfo("float").eps:
        out = max_val * (depth - depth_min) / (depth_max - depth_min)
    else:
        # ndarray exposes ``dtype``; ``depth.type`` raised AttributeError here.
        out = np.zeros(depth.shape, dtype=depth.dtype)

    if bits == 1:
        cv2.imwrite(path + ".png", out.astype("uint8"))
    elif bits == 2:
        cv2.imwrite(path + ".png", out.astype("uint16"))

    return
import importlib
import torch
from torch import optim
import numpy as np
from inspect import isfunction
from PIL import Image, ImageDraw, ImageFont
def autocast(f):
    """Decorator that runs ``f`` inside a CUDA autocast context.

    The context is always enabled and uses the current autocast GPU dtype and
    the current autocast cache setting.

    Args:
        f: Callable to wrap.

    Returns:
        A wrapper with the same call signature and metadata as ``f``.
    """
    import functools

    # wraps() keeps f's __name__/__doc__ visible on the decorated function.
    @functools.wraps(f)
    def do_autocast(*args, **kwargs):
        with torch.cuda.amp.autocast(enabled=True,
                                     dtype=torch.get_autocast_gpu_dtype(),
                                     cache_enabled=torch.is_autocast_cache_enabled()):
            return f(*args, **kwargs)

    return do_autocast
def log_txt_as_img(wh, xc, size=10):
    """Render each caption as black text on a white image.

    Args:
        wh: Tuple of (width, height) of each rendered image.
        xc: List of captions to plot.
        size: Font size passed to ``ImageFont.truetype``.

    Returns:
        Tensor stacking one channels-first image per caption, with pixel
        values mapped from [0, 255] to [-1, 1].
    """
    rendered = []
    for caption in xc:
        canvas = Image.new("RGB", wh, color="white")
        pen = ImageDraw.Draw(canvas)
        font = ImageFont.truetype('data/DejaVuSans.ttf', size=size)

        # Hard-wrap the caption; the line length scales with image width.
        chars_per_line = int(40 * (wh[0] / 256))
        pieces = [
            caption[start:start + chars_per_line]
            for start in range(0, len(caption), chars_per_line)
        ]
        wrapped = "\n".join(pieces)

        try:
            pen.text((0, 0), wrapped, fill="black", font=font)
        except UnicodeEncodeError:
            print("Cant encode string for logging. Skipping.")

        # HWC uint8 -> CHW float.
        rendered.append(np.array(canvas).transpose(2, 0, 1) / 127.5 - 1.0)

    return torch.tensor(np.stack(rendered))
def ismap(x):
    """Return True for a 4-D tensor whose dim 1 has more than 3 entries."""
    return isinstance(x, torch.Tensor) and len(x.shape) == 4 and x.shape[1] > 3
def isimage(x):
    """Return True for a 4-D tensor whose dim 1 has exactly 3 or 1 entries."""
    if isinstance(x, torch.Tensor) and len(x.shape) == 4:
        return x.shape[1] in (3, 1)
    return False
def exists(x):
    """Return True unless ``x`` is ``None``."""
    if x is None:
        return False
    return True
def default(val, d):
    """Return ``val`` unless it is ``None``, else the fallback ``d``.

    When ``d`` is a plain Python function it is called and its result is
    returned; any other value is returned as is.
    """
    if val is not None:
        return val
    if isfunction(d):
        return d()
    return d
def mean_flat(tensor):
    """
    https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
    Take the mean over all non-batch dimensions.
    """
    non_batch_dims = list(range(1, len(tensor.shape)))
    return tensor.mean(dim=non_batch_dims)
def count_params(model, verbose=False):
    """Return the total number of elements over ``model.parameters()``.

    Args:
        model: Object exposing ``parameters()``.
        verbose: When true, also print the count in millions.
    """
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()
    if verbose:
        print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
    return total_params
def instantiate_from_config(config):
    """Instantiate the object described by ``config``.

    Args:
        config: Mapping with a ``target`` dotted path and optional ``params``
            keyword arguments, or one of the sentinel strings
            ``'__is_first_stage__'`` / ``"__is_unconditional__"``.

    Returns:
        The constructed object, or ``None`` for the sentinel strings.

    Raises:
        KeyError: If ``config`` has no ``target`` and is not a sentinel.
    """
    if "target" in config:
        factory = get_obj_from_str(config["target"])
        return factory(**config.get("params", dict()))

    if config == '__is_first_stage__' or config == "__is_unconditional__":
        return None
    raise KeyError("Expected key `target` to instantiate.")
def get_obj_from_str(string, reload=False):
    """Resolve a dotted ``module.attr`` path to the attribute it names.

    Args:
        string: Dotted path; everything before the last dot is the module.
        reload: When true, the module is reloaded before the lookup.

    Returns:
        The attribute fetched from the imported module.
    """
    module_path, attr_name = string.rsplit(".", 1)
    if reload:
        importlib.reload(importlib.import_module(module_path))
    resolved_module = importlib.import_module(module_path, package=None)
    return getattr(resolved_module, attr_name)
class AdamWwithEMAandWings(optim.Optimizer):
    """AdamW that additionally keeps an EMA copy of every updated parameter.

    The EMA copy is stored as float in ``state[p]['param_exp_avg']``.
    """

    # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298
    def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8,  # TODO: check hyperparameters before using
                 weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999,  # ema decay to match previous code
                 ema_power=1., param_names=()):
        """AdamW that saves EMA versions of the parameters.

        Args:
            params: Iterable of parameters or parameter-group dicts.
            lr: Learning rate (must be >= 0).
            betas: Adam coefficients, each in [0, 1).
            eps: Denominator epsilon (must be >= 0).
            weight_decay: Weight-decay coefficient (must be >= 0).
            amsgrad: Whether to track the running maximum of the second moment.
            ema_decay: Upper bound on the EMA decay, in [0, 1].
            ema_power: Exponent of the EMA warm-up ``1 - step ** -ema_power``.
            param_names: Stored in the group defaults; not used by ``step``.

        Raises:
            ValueError: If a hyperparameter is outside its valid range.
        """
        if not 0.0 <= lr:
            raise ValueError("Invalid learning rate: {}".format(lr))
        if not 0.0 <= eps:
            raise ValueError("Invalid epsilon value: {}".format(eps))
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0]))
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1]))
        if not 0.0 <= weight_decay:
            raise ValueError("Invalid weight_decay value: {}".format(weight_decay))
        if not 0.0 <= ema_decay <= 1.0:
            raise ValueError("Invalid ema_decay value: {}".format(ema_decay))
        defaults = dict(lr=lr, betas=betas, eps=eps,
                        weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay,
                        ema_power=ema_power, param_names=param_names)
        super().__init__(params, defaults)

    def __setstate__(self, state):
        """Restore state, defaulting ``amsgrad`` to False in every group."""
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault('amsgrad', False)

    @torch.no_grad()
    def step(self, closure=None):
        """Performs a single optimization step.

        Args:
            closure (callable, optional): A closure that reevaluates the model
                and returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` without a closure.

        Raises:
            RuntimeError: If a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad = []
            grads = []
            exp_avgs = []
            exp_avg_sqs = []
            ema_params_with_grad = []
            max_exp_avg_sqs = []
            state_steps = []
            amsgrad = group['amsgrad']
            beta1, beta2 = group['betas']
            ema_decay = group['ema_decay']
            ema_power = group['ema_power']

            for p in group['params']:
                if p.grad is None:
                    continue
                params_with_grad.append(p)
                if p.grad.is_sparse:
                    raise RuntimeError('AdamW does not support sparse gradients')
                grads.append(p.grad)

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['step'] = 0
                    # Exponential moving average of gradient values
                    state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of squared gradient values
                    state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    if amsgrad:
                        # Maintains max of all exp. moving avg. of sq. grad. values
                        state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format)
                    # Exponential moving average of parameter values
                    state['param_exp_avg'] = p.detach().float().clone()

                exp_avgs.append(state['exp_avg'])
                exp_avg_sqs.append(state['exp_avg_sq'])
                ema_params_with_grad.append(state['param_exp_avg'])

                if amsgrad:
                    max_exp_avg_sqs.append(state['max_exp_avg_sq'])

                # update the steps for each param group update
                state['step'] += 1
                # record the step after step update
                state_steps.append(state['step'])

            # A group without gradients has nothing to update. Skipping it also
            # avoids reading a step count that is undefined (first group) or
            # left over from a previous group.
            if not params_with_grad:
                continue

            # NOTE(review): private torch API; its expected argument types
            # differ between torch releases - confirm against the pinned one.
            optim._functional.adamw(params_with_grad,
                                    grads,
                                    exp_avgs,
                                    exp_avg_sqs,
                                    max_exp_avg_sqs,
                                    state_steps,
                                    amsgrad=amsgrad,
                                    beta1=beta1,
                                    beta2=beta2,
                                    lr=group['lr'],
                                    weight_decay=group['weight_decay'],
                                    eps=group['eps'],
                                    maximize=False)

            # EMA warm-up: the decay grows with the step count of the last
            # updated parameter, capped at ema_decay.
            cur_ema_decay = min(ema_decay, 1 - state_steps[-1] ** -ema_power)
            for param, ema_param in zip(params_with_grad, ema_params_with_grad):
                ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay)

        return loss
\ No newline at end of file
# Stable Diffusion v2 Model Card
This model card focuses on the models associated with the Stable Diffusion v2, available [here](https://github.com/Stability-AI/stablediffusion/).
## Model Details
- **Developed by:** Robin Rombach, Patrick Esser
- **Model type:** Diffusion-based text-to-image generation model
- **Language(s):** English
- **License:** CreativeML Open RAIL++-M License
- **Model Description:** This is a model that can be used to generate and modify images based on text prompts. It is a [Latent Diffusion Model](https://arxiv.org/abs/2112.10752) that uses a fixed, pretrained text encoder ([OpenCLIP-ViT/H](https://github.com/mlfoundations/open_clip)).
- **Resources for more information:** [GitHub Repository](https://github.com/Stability-AI/).
- **Cite as:**
@InProceedings{Rombach_2022_CVPR,
author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn},
title = {High-Resolution Image Synthesis With Latent Diffusion Models},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {June},
year = {2022},
pages = {10684-10695}
}
# Uses
## Direct Use
The model is intended for research purposes only. Possible research areas and tasks include
- Safe deployment of models which have the potential to generate harmful content.
- Probing and understanding the limitations and biases of generative models.
- Generation of artworks and use in design and other artistic processes.
- Applications in educational or creative tools.
- Research on generative models.
Excluded uses are described below.
### Misuse, Malicious Use, and Out-of-Scope Use
_Note: This section is originally taken from the [DALLE-MINI model card](https://huggingface.co/dalle-mini/dalle-mini), was used for Stable Diffusion v1, but applies in the same way to Stable Diffusion v2_.
The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes.
#### Out-of-Scope Use
The model was not trained to be factual or true representations of people or events, and therefore using the model to generate such content is out-of-scope for the abilities of this model.
#### Misuse and Malicious Use
Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to:
- Generating demeaning, dehumanizing, or otherwise harmful representations of people or their environments, cultures, religions, etc.
- Intentionally promoting or propagating discriminatory content or harmful stereotypes.
- Impersonating individuals without their consent.
- Sexual content without consent of the people who might see it.
- Mis- and disinformation
- Representations of egregious violence and gore
- Sharing of copyrighted or licensed material in violation of its terms of use.
- Sharing content that is an alteration of copyrighted or licensed material in violation of its terms of use.
## Limitations and Bias
### Limitations
- The model does not achieve perfect photorealism
- The model cannot render legible text
- The model does not perform well on more difficult tasks which involve compositionality, such as rendering an image corresponding to “A red cube on top of a blue sphere”
- Faces and people in general may not be generated properly.
- The model was trained mainly with English captions and will not work as well in other languages.
- The autoencoding part of the model is lossy
- The model was trained on a subset of the large-scale dataset
[LAION-5B](https://laion.ai/blog/laion-5b/), which contains adult, violent and sexual content. To partially mitigate this, we have filtered the dataset using LAION's NSFW detector (see Training section).
### Bias
While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
Stable Diffusion v2 was primarily trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/),
which consists of images that are limited to English descriptions.
Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for.
This affects the overall output of the model, as white and western cultures are often set as the default. Further, the
ability of the model to generate content with non-English prompts is significantly worse than with English-language prompts.
Stable Diffusion v2 mirrors and exacerbates biases to such a degree that viewer discretion must be advised irrespective of the input or its intent.
## Training
**Training Data**
The model developers used the following dataset for training the model:
- LAION-5B and subsets (details below). The training data is further filtered using LAION's NSFW detector. For more details, please refer to LAION-5B's [NeurIPS 2022](https://openreview.net/forum?id=M3Y74vmsMcY) paper and reviewer discussions on the topic.
**Training Procedure**
Stable Diffusion v2 is a latent diffusion model which combines an autoencoder with a diffusion model that is trained in the latent space of the autoencoder. During training,
- Images are encoded through an encoder, which turns images into latent representations. The autoencoder uses a relative downsampling factor of 8 and maps images of shape H x W x 3 to latents of shape H/f x W/f x 4
- Text prompts are encoded through the OpenCLIP-ViT/H text-encoder.
- The output of the text encoder is fed into the UNet backbone of the latent diffusion model via cross-attention.
- The loss is a reconstruction objective between the noise that was added to the latent and the prediction made by the UNet. We also use the so-called _v-objective_, see https://arxiv.org/abs/2202.00512.
We currently provide the following checkpoints, for various versions:
### Version 2.1
- `512-base-ema.ckpt`: Fine-tuned on `512-base-ema.ckpt` 2.0 with 220k extra steps taken, with `punsafe=0.98` on the same dataset.
- `768-v-ema.ckpt`: Resumed from `768-v-ema.ckpt` 2.0 with an additional 55k steps on the same dataset (`punsafe=0.1`), and then fine-tuned for another 155k extra steps with `punsafe=0.98`.
**SD-unCLIP 2.1** is a finetuned version of Stable Diffusion 2.1, modified to accept (noisy) CLIP image embedding in addition to the text prompt, and can be used to create image variations ([Examples](https://github.com/Stability-AI/stablediffusion/blob/main/doc/UNCLIP.MD)) or can be chained with text-to-image CLIP priors. The amount of noise added to the image embedding can be specified via the `noise_level` (0 means no noise, 1000 full noise).
If you plan on building applications on top of the model that the general public may use, you are responsible for adding the guardrails to minimize or prevent misuse of the application, especially for use-cases highlighted in the earlier section, Misuse, Malicious Use, and Out-of-Scope Use.
A public demo of SD-unCLIP is already available at [clipdrop.co/stable-diffusion-reimagine](https://clipdrop.co/stable-diffusion-reimagine)
### Version 2.0
- `512-base-ema.ckpt`: 550k steps at resolution `256x256` on a subset of [LAION-5B](https://laion.ai/blog/laion-5b/) filtered for explicit pornographic material, using the [LAION-NSFW classifier](https://github.com/LAION-AI/CLIP-based-NSFW-Detector) with `punsafe=0.1` and an [aesthetic score](https://github.com/christophschuhmann/improved-aesthetic-predictor) >= `4.5`.
850k steps at resolution `512x512` on the same dataset with resolution `>= 512x512`.
- `768-v-ema.ckpt`: Resumed from `512-base-ema.ckpt` and trained for 150k steps using a [v-objective](https://arxiv.org/abs/2202.00512) on the same dataset. Resumed for another 140k steps on a `768x768` subset of our dataset.
- `512-depth-ema.ckpt`: Resumed from `512-base-ema.ckpt` and finetuned for 200k steps. Added an extra input channel to process the (relative) depth prediction produced by [MiDaS](https://github.com/isl-org/MiDaS) (`dpt_hybrid`) which is used as an additional conditioning.
The additional input channels of the U-Net which process this extra information were zero-initialized.
- `512-inpainting-ema.ckpt`: Resumed from `512-base-ema.ckpt` and trained for another 200k steps. Follows the mask-generation strategy presented in [LAMA](https://github.com/saic-mdal/lama) which, in combination with the latent VAE representations of the masked image, are used as an additional conditioning.
The additional input channels of the U-Net which process this extra information were zero-initialized. The same strategy was used to train the [1.5-inpainting checkpoint](https://github.com/saic-mdal/lama).
- `x4-upscaling-ema.ckpt`: Trained for 1.25M steps on a 10M subset of LAION containing images `>2048x2048`. The model was trained on crops of size `512x512` and is a text-guided [latent upscaling diffusion model](https://arxiv.org/abs/2112.10752).
In addition to the textual input, it receives a `noise_level` as an input parameter, which can be used to add noise to the low-resolution input according to a [predefined diffusion schedule](configs/stable-diffusion/x4-upscaling.yaml).
- **Hardware:** 32 x 8 x A100 GPUs
- **Optimizer:** AdamW
- **Gradient Accumulations**: 1
- **Batch:** 32 x 8 x 2 x 4 = 2048
- **Learning rate:** warmup to 0.0001 for 10,000 steps and then kept constant
## Evaluation Results
Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
5.0, 6.0, 7.0, 8.0) and 50 DDIM sampling steps show the relative improvements of the checkpoints:
![pareto](assets/model-variants.jpg)
Evaluated using 50 DDIM steps and 10000 random prompts from the COCO2017 validation set, evaluated at 512x512 resolution. Not optimized for FID scores.
## Environmental Impact
**Stable Diffusion v1** **Estimated Emissions**
Based on that information, we estimate the following CO2 emissions using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). The hardware, runtime, cloud provider, and compute region were utilized to estimate the carbon impact.
- **Hardware Type:** A100 PCIe 40GB
- **Hours used:** 200000
- **Cloud Provider:** AWS
- **Compute Region:** US-east
- **Carbon Emitted (Power consumption x Time x Carbon produced based on location of power grid):** 15000 kg CO2 eq.
## Citation
@InProceedings{Rombach_2022_CVPR,
author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn},
title = {High-Resolution Image Synthesis With Latent Diffusion Models},
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
month = {June},
year = {2022},
pages = {10684-10695}
}
*This model card was written by: Robin Rombach, Patrick Esser and David Ha and is based on the [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion/blob/main/Stable_Diffusion_v1_Model_Card.md) and [DALL-E Mini model card](https://huggingface.co/dalle-mini/dalle-mini).*
install dtk and hyhal
python scripts/txt2img.py --device cuda --ckpt /public/home/lijian/model/v2-1_512-ema-pruned.ckpt --config ./configs/stable-diffusion/v2-inference.yaml --n_iter 1 --n_samples 1
pip install diffusers==0.27.0
change model path
python test_diffusers.py
import sys
import torch
import numpy as np
import gradio as gr
from PIL import Image
from omegaconf import OmegaConf
from einops import repeat, rearrange
from pytorch_lightning import seed_everything
from imwatermark import WatermarkEncoder
from scripts.txt2img import put_watermark
from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.data.util import AddMiDaS
# Inference-only script: switch gradient computation off for the whole process.
torch.set_grad_enabled(False)
def initialize_model(config, ckpt):
    """Instantiate the configured model, load its weights and wrap it in a DDIM sampler.

    Args:
        config: Path passed to ``OmegaConf.load``; its ``model`` entry is handed
            to ``instantiate_from_config``.
        ckpt: Path to a checkpoint readable by ``torch.load`` that contains a
            ``"state_dict"`` entry.

    Returns:
        A ``DDIMSampler`` around the model, which lives on CUDA when available
        and on the CPU otherwise.
    """
    config = OmegaConf.load(config)
    model = instantiate_from_config(config.model)

    # Deserialize onto the CPU first so a checkpoint saved from a GPU can still
    # be opened on a CPU-only host; the model is moved to its device below.
    state_dict = torch.load(ckpt, map_location="cpu")["state_dict"]
    # strict=False: key mismatches between checkpoint and model are tolerated.
    model.load_state_dict(state_dict, strict=False)

    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(device)
    return DDIMSampler(model)
def make_batch_sd(
        image,
        txt,
        device,
        num_samples=1,
        model_type="dpt_hybrid"
):
    """Assemble the depth2img conditioning batch for one image and one prompt.

    Args:
        image: Image object exposing ``convert("RGB")`` (a PIL image in this script).
        txt: Prompt; repeated ``num_samples`` times under ``"txt"``.
        device: Device the tensors are moved to.
        num_samples: Number of copies along the leading batch axis.
        model_type: Forwarded to ``AddMiDaS``.

    Returns:
        The dict produced by the ``AddMiDaS`` transform, with ``"jpg"`` and
        ``"midas_in"`` turned into ``num_samples``-sized batches on ``device``.
    """
    rgb = np.array(image.convert("RGB"))
    # uint8 pixels mapped to [-1, 1]; still laid out (h, w, c) here.
    pixels = torch.from_numpy(rgb).to(dtype=torch.float32) / 127.5 - 1.0

    # The transform is expected to add a NumPy "midas_in" entry (it is read below).
    batch = AddMiDaS(model_type=model_type)({
        "jpg": pixels,
        "txt": num_samples * [txt],
    })

    jpg = rearrange(batch["jpg"], 'h w c -> 1 c h w').to(device=device)
    batch["jpg"] = repeat(jpg, "1 ... -> n ...", n=num_samples)

    midas_in = torch.from_numpy(batch["midas_in"][None, ...]).to(device=device)
    batch["midas_in"] = repeat(midas_in, "1 ... -> n ...", n=num_samples)
    return batch
def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=None,
          do_full_sample=False):
    """Run depth-conditioned img2img sampling and return the depth preview plus results.

    Args:
        sampler: Sampler exposing ``model``, ``stochastic_encode`` and ``decode``.
        image: Input image handed to ``make_batch_sd``.
        prompt: Text prompt.
        t_enc: Number of encode/decode steps passed to the sampler.
        seed: Seed given to ``seed_everything``.
        scale: Unconditional guidance scale.
        num_samples: Batch size.
        callback: Optional callback forwarded to ``sampler.decode``.
        do_full_sample: When true, start from pure Gaussian noise instead of a
            noised encoding of the input latent.

    Returns:
        ``[depth_image] + samples`` where ``depth_image`` is an 8-bit preview of
        the predicted depth and every sample is a watermarked PIL image.
    """
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = sampler.model
    seed_everything(seed)

    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
    wm_encoder = WatermarkEncoder()
    wm_encoder.set_watermark('bytes', "SDV2".encode('utf-8'))

    def _depth_range(depth):
        # Per-sample min/max, reduced over every non-batch dimension.
        lowest = torch.amin(depth, dim=[1, 2, 3], keepdim=True)
        highest = torch.amax(depth, dim=[1, 2, 3], keepdim=True)
        return lowest, highest

    with torch.no_grad(), torch.autocast("cuda"):
        batch = make_batch_sd(
            image, txt=prompt, device=device, num_samples=num_samples)
        # Move the input image to latent space.
        first_stage = model.encode_first_stage(batch[model.first_stage_key])
        z = model.get_first_stage_encoding(first_stage)
        c = model.cond_stage_model.encode(batch["txt"])

        concat_parts = []
        for key in model.concat_keys:
            depth = model.depth_model(batch[key])

            # 0..1 normalized copy used only for the preview image.
            # NOTE(review): a constant depth map gives a zero range here.
            lowest, highest = _depth_range(depth)
            preview = (depth - lowest) / (highest - lowest)
            depth_image = Image.fromarray(
                (preview[0, 0, ...].cpu().numpy() * 255.).astype(np.uint8))

            # Resize to the latent resolution, then rescale to [-1, 1].
            depth = torch.nn.functional.interpolate(
                depth,
                size=z.shape[2:],
                mode="bicubic",
                align_corners=False,
            )
            lowest, highest = _depth_range(depth)
            concat_parts.append(2. * (depth - lowest) / (highest - lowest) - 1.)
        c_cat = torch.cat(concat_parts, dim=1)

        # Conditional and unconditional inputs share the same depth conditioning.
        cond = {"c_concat": [c_cat], "c_crossattn": [c]}
        uc_cross = model.get_unconditional_conditioning(num_samples, "")
        uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]}

        if do_full_sample:
            z_enc = torch.randn_like(z)
        else:
            # Noise the (scaled) latent up to step t_enc.
            timesteps = torch.tensor([t_enc] * num_samples).to(model.device)
            z_enc = sampler.stochastic_encode(z, timesteps)

        samples = sampler.decode(z_enc, cond, t_enc, unconditional_guidance_scale=scale,
                                 unconditional_conditioning=uc_full, callback=callback)
        decoded = model.decode_first_stage(samples)
        result = torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0)
        # (b, c, h, w) -> (b, h, w, c), scaled to 0..255.
        result = result.cpu().numpy().transpose(0, 2, 3, 1) * 255

    watermarked = [
        put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder)
        for img in result
    ]
    return [depth_image] + watermarked
def pad_image(input_image):
    """Edge-pad ``input_image`` on the right and bottom to a multiple of 64.

    Each side is rounded up to the next multiple of 64, with a minimum of
    two 64-pixel blocks (128) per side. The pixel array is padded with three
    axis specs, so a 3-D (rows, cols, channels) image is expected.

    Returns:
        A new PIL image built from the padded array.
    """
    size = input_image.size  # unpacked as (width, height) below
    blocks = np.ceil(np.array(size) / 64).astype(int)
    target = np.max(((2, 2), blocks), axis=0) * 64
    pad_w, pad_h = target - size

    pixels = np.array(input_image)
    padded = np.pad(pixels, ((0, pad_h), (0, pad_w), (0, 0)), mode='edge')
    return Image.fromarray(padded)
def predict(input_image, prompt, steps, num_samples, scale, seed, eta, strength):
    """Gradio callback: pad the upload, build the DDIM schedule and run ``paint``.

    Args:
        input_image: Uploaded image; converted to RGB.
        prompt: Text prompt.
        steps: Number of DDIM steps for the schedule.
        num_samples: Number of images to generate.
        scale: Guidance scale.
        seed: Random seed.
        eta: DDIM eta used for the schedule.
        strength: Value in [0, 1]; 1.0 selects a full sample from noise.

    Returns:
        The list returned by ``paint``.
    """
    # pad_image pads (it does not resize) each side up to a multiple of 64.
    image = pad_image(input_image.convert("RGB"))

    sampler.make_schedule(steps, ddim_eta=eta, verbose=True)
    assert 0. <= strength <= 1., 'can only work with strength in [0.0, 1.0]'

    return paint(
        sampler=sampler,
        image=image,
        prompt=prompt,
        # Never encode past the last scheduled step.
        t_enc=min(int(strength * steps), steps-1),
        seed=seed,
        scale=scale,
        num_samples=num_samples,
        callback=None,
        do_full_sample=(strength == 1.),
    )
# Build the sampler once at start-up; the config path and the checkpoint path
# are read from the first two command-line arguments.
sampler = initialize_model(sys.argv[1], sys.argv[2])
# Gradio Blocks app with request queueing enabled.
# NOTE(review): the nesting below was reconstructed from context — confirm layout.
block = gr.Blocks().queue()
with block:
    with gr.Row():
        gr.Markdown("## Stable Diffusion Depth2Img")
    with gr.Row():
        with gr.Column():
            # type="pil": predict() calls .convert("RGB") on the received value.
            input_image = gr.Image(source='upload', type="pil")
            prompt = gr.Textbox(label="Prompt")
            run_button = gr.Button(label="Run")
            with gr.Accordion("Advanced options", open=False):
                num_samples = gr.Slider(
                    label="Images", minimum=1, maximum=4, value=1, step=1)
                ddim_steps = gr.Slider(label="Steps", minimum=1,
                                       maximum=50, value=50, step=1)
                scale = gr.Slider(
                    label="Guidance Scale", minimum=0.1, maximum=30.0, value=9.0, step=0.1
                )
                strength = gr.Slider(
                    label="Strength", minimum=0.0, maximum=1.0, value=0.9, step=0.01
                )
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=2147483647,
                    step=1,
                    randomize=True,
                )
                eta = gr.Number(label="eta (DDIM)", value=0.0)
        with gr.Column():
            gallery = gr.Gallery(label="Generated images", show_label=False).style(
                grid=[2], height="auto")
    # The order of `inputs` must match predict()'s positional parameters.
    run_button.click(fn=predict, inputs=[
        input_image, prompt, ddim_steps, num_samples, scale, seed, eta, strength], outputs=[gallery])
block.launch()
import sys
import cv2
import torch
import numpy as np
import gradio as gr
from PIL import Image
from omegaconf import OmegaConf
from einops import repeat
from imwatermark import WatermarkEncoder
from pathlib import Path
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.util import instantiate_from_config
# Inference-only script: switch gradient computation off for the whole process.
torch.set_grad_enabled(False)
def put_watermark(img, wm_encoder=None):
    """Embed an invisible watermark into ``img`` when an encoder is supplied.

    Args:
        img: Image to mark; converted to an array and from RGB to BGR for the encoder.
        wm_encoder: Object exposing ``encode(array, 'dwtDct')``, or ``None``.

    Returns:
        ``img`` itself when ``wm_encoder`` is ``None``; otherwise a new PIL image
        built from the encoder output with its channel order reversed again.
    """
    if wm_encoder is None:
        return img

    bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    marked = wm_encoder.encode(bgr, 'dwtDct')
    # Reverse the channel axis to undo the BGR ordering.
    return Image.fromarray(marked[:, :, ::-1])
def initialize_model(config, ckpt):
    """Instantiate the configured model, load its weights and wrap it in a DDIM sampler.

    Args:
        config: Path passed to ``OmegaConf.load``; its ``model`` entry is handed
            to ``instantiate_from_config``.
        ckpt: Path to a checkpoint readable by ``torch.load`` that contains a
            ``"state_dict"`` entry.

    Returns:
        A ``DDIMSampler`` around the model, which lives on CUDA when available
        and on the CPU otherwise.
    """
    config = OmegaConf.load(config)
    model = instantiate_from_config(config.model)

    # Load onto the CPU so GPU-saved checkpoints also open on CPU-only hosts;
    # the model is moved to the selected device afterwards.
    state_dict = torch.load(ckpt, map_location="cpu")["state_dict"]
    # strict=False: key mismatches between checkpoint and model are tolerated.
    model.load_state_dict(state_dict, strict=False)

    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(device)
    return DDIMSampler(model)
def make_batch_sd(
        image,
        mask,
        txt,
        device,
        num_samples=1):
    """Build the inpainting batch: image, binary mask, masked image and prompts.

    Args:
        image: Image exposing ``convert("RGB")``.
        mask: Image exposing ``convert("L")``; values >= 0.5 (after scaling to
            0..1) mark the region to inpaint.
        txt: Prompt; repeated ``num_samples`` times.
        device: Device the tensors are moved to.
        num_samples: Number of copies along the batch axis.

    Returns:
        Dict with ``"image"``, ``"txt"``, ``"mask"`` and ``"masked_image"``.
    """
    # (h, w, c) uint8 -> (1, c, h, w) float32 in [-1, 1].
    rgb = np.array(image.convert("RGB"))[None].transpose(0, 3, 1, 2)
    image_t = torch.from_numpy(rgb).to(dtype=torch.float32) / 127.5 - 1.0

    # Binarize the mask at 0.5 and give it (1, 1, h, w) layout.
    gray = np.array(mask.convert("L")).astype(np.float32) / 255.0
    binary = (gray >= 0.5).astype(np.float32)[None, None]
    mask_t = torch.from_numpy(binary)

    # Zero out the pixels that are to be inpainted.
    masked_image = image_t * (mask_t < 0.5)

    def _tile(tensor):
        return repeat(tensor.to(device=device), "1 ... -> n ...", n=num_samples)

    return {
        "image": _tile(image_t),
        "txt": num_samples * [txt],
        "mask": _tile(mask_t),
        "masked_image": _tile(masked_image),
    }
def inpaint(sampler, image, mask, prompt, seed, scale, ddim_steps, num_samples=1, w=512, h=512, eta=1.0):
    """Sample inpainted images for ``image`` inside the region marked by ``mask``.

    Args:
        sampler: Sampler exposing ``model`` and ``sample``.
        image: Input image handed to ``make_batch_sd``.
        mask: Mask image handed to ``make_batch_sd``.
        prompt: Text prompt.
        seed: Seed for the NumPy generator that draws the start code.
        scale: Unconditional guidance scale.
        ddim_steps: Number of DDIM steps.
        num_samples: Batch size.
        w: Image width in pixels; the latent width is ``w // 8``.
        h: Image height in pixels; the latent height is ``h // 8``.
        eta: DDIM eta forwarded to ``sampler.sample``. Defaults to 1.0, the
            value that was previously hard-coded.

    Returns:
        A list of watermarked PIL images.
    """
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = sampler.model

    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
    wm = "SDV2"
    wm_encoder = WatermarkEncoder()
    wm_encoder.set_watermark('bytes', wm.encode('utf-8'))

    latent_h, latent_w = h // 8, w // 8
    # Use the model's channel count so the start code always agrees with the
    # `shape` passed to the sampler below (it was a literal 4 before).
    shape = [model.channels, latent_h, latent_w]

    prng = np.random.RandomState(seed)
    start_code = prng.randn(num_samples, *shape)
    start_code = torch.from_numpy(start_code).to(
        device=device, dtype=torch.float32)

    with torch.no_grad(), torch.autocast("cuda"):
        batch = make_batch_sd(image, mask, txt=prompt,
                              device=device, num_samples=num_samples)
        c = model.cond_stage_model.encode(batch["txt"])

        c_cat = []
        for ck in model.concat_keys:
            cc = batch[ck].float()
            if ck != model.masked_image_key:
                # e.g. the mask: resize directly to the latent resolution.
                cc = torch.nn.functional.interpolate(cc, size=[latent_h, latent_w])
            else:
                # The masked image is encoded into latent space instead.
                cc = model.get_first_stage_encoding(
                    model.encode_first_stage(cc))
            c_cat.append(cc)
        c_cat = torch.cat(c_cat, dim=1)

        # Conditional and unconditional inputs share the concat conditioning.
        cond = {"c_concat": [c_cat], "c_crossattn": [c]}
        uc_cross = model.get_unconditional_conditioning(num_samples, "")
        uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]}

        samples_cfg, _ = sampler.sample(
            ddim_steps,
            num_samples,
            shape,
            cond,
            verbose=False,
            eta=eta,
            unconditional_guidance_scale=scale,
            unconditional_conditioning=uc_full,
            x_T=start_code,
        )
        x_samples_ddim = model.decode_first_stage(samples_cfg)
        result = torch.clamp((x_samples_ddim + 1.0) / 2.0,
                             min=0.0, max=1.0)
        # (b, c, h, w) -> (b, h, w, c), scaled to 0..255.
        result = result.cpu().numpy().transpose(0, 2, 3, 1) * 255
    return [put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder) for img in result]
def pad_image(input_image):
    """Return a copy of ``input_image`` edge-padded to a multiple of 64 per side.

    Padding is added only after the last row and column. Each side is rounded
    up to a multiple of 64 and is at least 128. Three axis specs are given to
    ``np.pad``, so the image array must be 3-D (callers convert to RGB first).
    """
    current = np.array(input_image.size)
    n_blocks = np.ceil(current / 64).astype(int)
    # At least two 64-pixel blocks per side.
    wanted = np.max(((2, 2), n_blocks), axis=0) * 64
    pad_w, pad_h = wanted - input_image.size

    padded = np.pad(
        np.array(input_image),
        ((0, pad_h), (0, pad_w), (0, 0)),
        mode='edge',
    )
    return Image.fromarray(padded)
def predict(input_image, prompt, ddim_steps, num_samples, scale, seed):
    """Gradio callback: pad image and sketch mask, then run ``inpaint``.

    Args:
        input_image: Mapping with ``"image"`` and ``"mask"`` entries (both are
            converted to RGB).
        prompt: Text prompt.
        ddim_steps: Number of DDIM steps.
        num_samples: Number of images to generate.
        scale: Guidance scale.
        seed: Random seed.

    Returns:
        The list of images returned by ``inpaint``.
    """
    # pad_image pads (it does not resize) each side up to a multiple of 64.
    image = pad_image(input_image["image"].convert("RGB"))
    mask = pad_image(input_image["mask"].convert("RGB"))

    width, height = image.size
    print("Inpainting...", width, height)

    return inpaint(
        sampler=sampler,
        image=image,
        mask=mask,
        prompt=prompt,
        seed=seed,
        scale=scale,
        ddim_steps=ddim_steps,
        num_samples=num_samples,
        h=height,
        w=width,
    )
# Build the sampler once at start-up; the config path and the checkpoint path
# are read from the first two command-line arguments.
sampler = initialize_model(sys.argv[1], sys.argv[2])
# Gradio Blocks app with request queueing enabled.
# NOTE(review): the nesting below was reconstructed from context — confirm layout.
block = gr.Blocks().queue()
with block:
    with gr.Row():
        gr.Markdown("## Stable Diffusion Inpainting")
    with gr.Row():
        with gr.Column():
            # Sketch tool: predict() reads the "image" and "mask" entries of the value.
            input_image = gr.Image(source='upload', tool='sketch', type="pil")
            prompt = gr.Textbox(label="Prompt")
            run_button = gr.Button(label="Run")
            with gr.Accordion("Advanced options", open=False):
                num_samples = gr.Slider(
                    label="Images", minimum=1, maximum=4, value=4, step=1)
                ddim_steps = gr.Slider(label="Steps", minimum=1,
                                       maximum=50, value=45, step=1)
                scale = gr.Slider(
                    label="Guidance Scale", minimum=0.1, maximum=30.0, value=10, step=0.1
                )
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=2147483647,
                    step=1,
                    randomize=True,
                )
        with gr.Column():
            gallery = gr.Gallery(label="Generated images", show_label=False).style(
                grid=[2], height="auto")
    # The order of `inputs` must match predict()'s positional parameters.
    run_button.click(fn=predict, inputs=[
        input_image, prompt, ddim_steps, num_samples, scale, seed], outputs=[gallery])
block.launch()
import sys
import torch
import numpy as np
import gradio as gr
from PIL import Image
from omegaconf import OmegaConf
from einops import repeat, rearrange
from pytorch_lightning import seed_everything
from imwatermark import WatermarkEncoder
from scripts.txt2img import put_watermark
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.models.diffusion.ddpm import LatentUpscaleDiffusion, LatentUpscaleFinetuneDiffusion
from ldm.util import exists, instantiate_from_config
# Inference-only script: switch gradient computation off for the whole process.
torch.set_grad_enabled(False)
def initialize_model(config, ckpt):
    """Instantiate the configured model, load its weights and wrap it in a DDIM sampler.

    Args:
        config: Path passed to ``OmegaConf.load``; its ``model`` entry is handed
            to ``instantiate_from_config``.
        ckpt: Path to a checkpoint readable by ``torch.load`` that contains a
            ``"state_dict"`` entry.

    Returns:
        A ``DDIMSampler`` around the model, which lives on CUDA when available
        and on the CPU otherwise.
    """
    config = OmegaConf.load(config)
    model = instantiate_from_config(config.model)

    # map_location="cpu" keeps GPU-saved checkpoints loadable without CUDA;
    # the model is moved to the selected device right after.
    state_dict = torch.load(ckpt, map_location="cpu")["state_dict"]
    # strict=False: key mismatches between checkpoint and model are tolerated.
    model.load_state_dict(state_dict, strict=False)

    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(device)
    return DDIMSampler(model)
def make_batch_sd(
        image,
        txt,
        device,
        num_samples=1,
):
    """Build the upscaling batch: the low-resolution image and the prompts.

    Args:
        image: Image exposing ``convert("RGB")``.
        txt: Prompt; repeated ``num_samples`` times under ``"txt"``.
        device: Device the image tensor is moved to.
        num_samples: Number of copies along the batch axis.

    Returns:
        Dict with ``"lr"`` (``num_samples`` copies of the image, channels
        first, values in [-1, 1]) and ``"txt"``.
    """
    rgb = np.array(image.convert("RGB"))
    pixels = torch.from_numpy(rgb).to(dtype=torch.float32) / 127.5 - 1.0

    low_res = rearrange(pixels, 'h w c -> 1 c h w').to(device=device)
    return {
        "lr": repeat(low_res, "1 ... -> n ...", n=num_samples),
        "txt": num_samples * [txt],
    }
def make_noise_augmentation(model, batch, noise_level=None):
    """Apply the model's low-scale (noise augmentation) module to the low-res input.

    Args:
        model: Object exposing ``low_scale_key`` and a callable ``low_scale_model``.
        batch: Mapping that holds the low-resolution tensor under ``model.low_scale_key``.
        noise_level: Passed through unchanged to ``model.low_scale_model``.

    Returns:
        The ``(augmented, noise_level)`` pair produced by ``model.low_scale_model``.
    """
    low_res = batch[model.low_scale_key]
    # Contiguous float32 copy of the input before augmentation.
    low_res = low_res.to(memory_format=torch.contiguous_format).float()
    augmented, level = model.low_scale_model(low_res, noise_level)
    return augmented, level
def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callback=None, eta=0., noise_level=None):
    """Run text-guided upscaling with DDIM sampling and return watermarked images.

    Args:
        sampler: Sampler exposing ``model`` and ``sample``.
        image: Low-resolution input handed to ``make_batch_sd``.
        prompt: Text prompt.
        seed: Seed for ``seed_everything`` and for the start-code generator.
        scale: Unconditional guidance scale.
        h: Height of the sampling shape.
        w: Width of the sampling shape.
        steps: Number of DDIM steps.
        num_samples: Batch size.
        callback: Optional callback forwarded to ``sampler.sample``.
        eta: DDIM eta forwarded to ``sampler.sample``.
        noise_level: Noise-augmentation level; only used for
            ``LatentUpscaleDiffusion`` models.

    Returns:
        A list of watermarked PIL images.

    Raises:
        NotImplementedError: If the model is neither a
            ``LatentUpscaleFinetuneDiffusion`` nor a ``LatentUpscaleDiffusion``.
    """
    device = torch.device(
        "cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = sampler.model
    seed_everything(seed)

    # The start code comes from a dedicated NumPy generator seeded with `seed`.
    shape = [model.channels, h, w]
    rng = np.random.RandomState(seed)
    noise = rng.randn(num_samples, model.channels, h, w)
    start_code = torch.from_numpy(noise).to(
        device=device, dtype=torch.float32)

    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
    wm_encoder = WatermarkEncoder()
    wm_encoder.set_watermark('bytes', "SDV2".encode('utf-8'))

    with torch.no_grad(), torch.autocast("cuda"):
        batch = make_batch_sd(
            image, txt=prompt, device=device, num_samples=num_samples)
        c = model.cond_stage_model.encode(batch["txt"])

        if isinstance(model, LatentUpscaleFinetuneDiffusion):
            parts = []
            for key in model.concat_keys:
                part = batch[key]
                if exists(model.reshuffle_patch_size):
                    assert isinstance(model.reshuffle_patch_size, int)
                    # Fold p1 x p2 spatial patches into the channel axis.
                    part = rearrange(part, 'b c (p1 h) (p2 w) -> b (p1 p2 c) h w',
                                     p1=model.reshuffle_patch_size, p2=model.reshuffle_patch_size)
                parts.append(part)
            concat = torch.cat(parts, dim=1)
            extra = {}
        elif isinstance(model, LatentUpscaleDiffusion):
            concat, noise_level = make_noise_augmentation(
                model, batch, noise_level)
            # The noise level is passed to the model under "c_adm".
            extra = {"c_adm": noise_level}
        else:
            raise NotImplementedError()

        # Conditional and unconditional inputs differ only in the text embedding.
        uc_cross = model.get_unconditional_conditioning(num_samples, "")
        cond = {"c_concat": [concat], "c_crossattn": [c], **extra}
        uc_full = {"c_concat": [concat], "c_crossattn": [uc_cross], **extra}

        samples, _ = sampler.sample(
            steps,
            num_samples,
            shape,
            cond,
            verbose=False,
            eta=eta,
            unconditional_guidance_scale=scale,
            unconditional_conditioning=uc_full,
            x_T=start_code,
            callback=callback
        )

    # Decoding happens outside the autocast context.
    with torch.no_grad():
        decoded = model.decode_first_stage(samples)
    result = torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0)
    # (b, c, h, w) -> (b, h, w, c), scaled to 0..255.
    result = result.cpu().numpy().transpose(0, 2, 3, 1) * 255
    return [put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder) for img in result]
def pad_image(input_image):
    """Grow ``input_image`` to a multiple of 64 per side by repeating its edge pixels.

    Padding goes after the last column and the last row only. Each side ends
    up as a multiple of 64 that is at least 128. The array is padded with three
    axis specs, so it must be 3-D (callers convert to RGB beforehand).
    """
    width_height = input_image.size
    min_blocks = (2, 2)
    needed_blocks = np.ceil(np.array(width_height) / 64).astype(int)
    padded_size = np.max((min_blocks, needed_blocks), axis=0) * 64
    pad_w, pad_h = padded_size - width_height

    data = np.pad(np.array(input_image), ((0, pad_h), (0, pad_w), (0, 0)), mode='edge')
    return Image.fromarray(data)
def predict(input_image, prompt, steps, num_samples, scale, seed, eta, noise_level):
    """Gradio callback: pad the upload and run the upscaling sampler.

    Args:
        input_image: Uploaded image; converted to RGB.
        prompt: Text prompt.
        steps: Number of DDIM steps.
        num_samples: Number of images to generate.
        scale: Guidance scale.
        seed: Random seed.
        eta: DDIM eta; used for the schedule and forwarded to ``paint``.
        noise_level: Noise-augmentation level, or ``None`` to let ``paint``
            fall back to its own default handling.

    Returns:
        The list of images returned by ``paint``.
    """
    # pad_image pads (it does not resize) each side up to a multiple of 64.
    image = pad_image(input_image.convert("RGB"))
    width, height = image.size

    # Building a tensor from [None, ...] raises, so only convert real values;
    # paint() already accepts noise_level=None.
    if noise_level is not None:
        noise_level = torch.Tensor(
            num_samples * [noise_level]).to(sampler.model.device).long()

    sampler.make_schedule(steps, ddim_eta=eta, verbose=True)
    return paint(
        sampler=sampler,
        image=image,
        prompt=prompt,
        seed=seed,
        scale=scale,
        h=height, w=width, steps=steps,
        num_samples=num_samples,
        callback=None,
        # paint() hands its own `eta` to sampler.sample; without forwarding, the
        # UI value never reached that call and paint's default 0.0 was used.
        eta=eta,
        noise_level=noise_level
    )
# Build the sampler once at start-up; the config path and the checkpoint path
# are read from the first two command-line arguments.
sampler = initialize_model(sys.argv[1], sys.argv[2])
# Gradio Blocks app with request queueing enabled.
# NOTE(review): the nesting below was reconstructed from context — confirm layout.
block = gr.Blocks().queue()
with block:
    with gr.Row():
        gr.Markdown("## Stable Diffusion Upscaling")
    with gr.Row():
        with gr.Column():
            # type="pil": predict() calls .convert("RGB") on the received value.
            input_image = gr.Image(source='upload', type="pil")
            gr.Markdown(
                "Tip: Add a description of the object that should be upscaled, e.g.: 'a professional photograph of a cat")
            prompt = gr.Textbox(label="Prompt")
            run_button = gr.Button(label="Run")
            with gr.Accordion("Advanced options", open=False):
                num_samples = gr.Slider(
                    label="Number of Samples", minimum=1, maximum=4, value=1, step=1)
                steps = gr.Slider(label="DDIM Steps", minimum=2,
                                  maximum=200, value=75, step=1)
                scale = gr.Slider(
                    label="Scale", minimum=0.1, maximum=30.0, value=10, step=0.1
                )
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=2147483647,
                    step=1,
                    randomize=True,
                )
                eta = gr.Number(label="eta (DDIM)",
                                value=0.0, min=0.0, max=1.0)
                # The noise-level input only exists for LatentUpscaleDiffusion models.
                # NOTE(review): for any other model `noise_level` stays None and is
                # still listed in `inputs` of run_button.click below — confirm that
                # gradio accepts that, or see the TODO.
                noise_level = None
                if isinstance(sampler.model, LatentUpscaleDiffusion):
                    # TODO: make this work for all models
                    noise_level = gr.Number(
                        label="Noise Augmentation", min=0, max=350, value=20, step=1)
        with gr.Column():
            gallery = gr.Gallery(label="Generated images", show_label=False).style(
                grid=[2], height="auto")
    # The order of `inputs` must match predict()'s positional parameters.
    run_button.click(fn=predict, inputs=[
        input_image, prompt, steps, num_samples, scale, seed, eta, noise_level], outputs=[gallery])
block.launch()
"""make variations of input image"""
import argparse, os
import PIL
import torch
import numpy as np
from omegaconf import OmegaConf
from PIL import Image
from tqdm import tqdm, trange
from itertools import islice
from einops import rearrange, repeat
from torchvision.utils import make_grid
from torch import autocast
from contextlib import nullcontext
from pytorch_lightning import seed_everything
from imwatermark import WatermarkEncoder
from scripts.txt2img import put_watermark
from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
def chunk(it, size):
    """Return an iterator over consecutive tuples of up to ``size`` items from ``it``.

    The last tuple may be shorter; iteration ends at the first empty tuple.
    """
    source = iter(it)

    def _batches():
        while True:
            batch = tuple(islice(source, size))
            if not batch:
                return
            yield batch

    return _batches()
def load_model_from_config(config, ckpt, verbose=False):
    """Instantiate ``config.model``, load weights from ``ckpt`` and set eval mode.

    Args:
        config: Config object whose ``model`` entry is passed to
            ``instantiate_from_config``.
        ckpt: Path to a checkpoint containing a ``"state_dict"`` entry (and
            optionally ``"global_step"``).
        verbose: When true, print the missing and unexpected state-dict keys.

    Returns:
        The model in eval mode, on CUDA when it is available and on the CPU
        otherwise.
    """
    print(f"Loading model from {ckpt}")
    pl_sd = torch.load(ckpt, map_location="cpu")
    if "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    sd = pl_sd["state_dict"]

    model = instantiate_from_config(config.model)
    # strict=False: mismatching keys are reported (if verbose) instead of raising.
    missing, unexpected = model.load_state_dict(sd, strict=False)
    if verbose and missing:
        print("missing keys:")
        print(missing)
    if verbose and unexpected:
        print("unexpected keys:")
        print(unexpected)

    # Only move to the GPU when one exists; an unconditional .cuda() raises on
    # CPU-only hosts even though the caller has a CPU fallback.
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    return model
def load_img(path):
    """Load an RGB image as a ``(1, 3, H, W)`` float tensor scaled to [-1, 1].

    Width and height are each rounded down to a multiple of 64 and the image
    is resized to that size with Lanczos resampling.
    """
    source = Image.open(path).convert("RGB")
    w, h = source.size
    print(f"loaded input image of size ({w}, {h}) from {path}")

    # Round each side down to a multiple of 64.
    target = (w - w % 64, h - h % 64)
    resized = source.resize(target, resample=PIL.Image.LANCZOS)

    pixels = np.array(resized).astype(np.float32) / 255.0
    # (h, w, c) -> (1, c, h, w)
    tensor = torch.from_numpy(pixels[None].transpose(0, 3, 1, 2))
    return 2. * tensor - 1.
def main():
    """Command-line entry point: generate prompt-guided variations of an input image.

    Parses the options, loads the model and the init image, noises the image
    latent up to ``strength * ddim_steps`` DDIM steps and decodes it under the
    prompt(s). Every sample is written to ``<outdir>/samples`` and a grid of all
    samples to ``<outdir>``; all outputs are watermarked.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--prompt",
        type=str,
        nargs="?",
        default="a painting of a virus monster playing guitar",
        help="the prompt to render"
    )
    parser.add_argument(
        "--init-img",
        type=str,
        nargs="?",
        help="path to the input image"
    )
    parser.add_argument(
        "--outdir",
        type=str,
        nargs="?",
        help="dir to write results to",
        default="outputs/img2img-samples"
    )
    parser.add_argument(
        "--ddim_steps",
        type=int,
        default=50,
        help="number of ddim sampling steps",
    )
    parser.add_argument(
        "--fixed_code",
        action='store_true',
        help="if enabled, uses the same starting code across all samples ",
    )
    parser.add_argument(
        "--ddim_eta",
        type=float,
        default=0.0,
        help="ddim eta (eta=0.0 corresponds to deterministic sampling",
    )
    parser.add_argument(
        "--n_iter",
        type=int,
        default=1,
        help="sample this often",
    )
    parser.add_argument(
        "--C",
        type=int,
        default=4,
        help="latent channels",
    )
    parser.add_argument(
        "--f",
        type=int,
        default=8,
        help="downsampling factor, most often 8 or 16",
    )
    parser.add_argument(
        "--n_samples",
        type=int,
        default=2,
        help="how many samples to produce for each given prompt. A.k.a batch size",
    )
    parser.add_argument(
        "--n_rows",
        type=int,
        default=0,
        help="rows in the grid (default: n_samples)",
    )
    parser.add_argument(
        "--scale",
        type=float,
        default=9.0,
        help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
    )
    parser.add_argument(
        "--strength",
        type=float,
        default=0.8,
        help="strength for noising/unnoising. 1.0 corresponds to full destruction of information in init image",
    )
    parser.add_argument(
        "--from-file",
        type=str,
        help="if specified, load prompts from this file",
    )
    parser.add_argument(
        "--config",
        type=str,
        default="configs/stable-diffusion/v2-inference.yaml",
        help="path to config which constructs model",
    )
    parser.add_argument(
        "--ckpt",
        type=str,
        help="path to checkpoint of model",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="the seed (for reproducible sampling)",
    )
    parser.add_argument(
        "--precision",
        type=str,
        help="evaluate at this precision",
        choices=["full", "autocast"],
        default="autocast"
    )
    opt = parser.parse_args()
    seed_everything(opt.seed)

    config = OmegaConf.load(f"{opt.config}")
    model = load_model_from_config(config, f"{opt.ckpt}")
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(device)
    sampler = DDIMSampler(model)

    os.makedirs(opt.outdir, exist_ok=True)
    outpath = opt.outdir

    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
    wm = "SDV2"
    wm_encoder = WatermarkEncoder()
    wm_encoder.set_watermark('bytes', wm.encode('utf-8'))

    batch_size = opt.n_samples
    n_rows = opt.n_rows if opt.n_rows > 0 else batch_size
    if not opt.from_file:
        prompt = opt.prompt
        assert prompt is not None
        data = [batch_size * [prompt]]
    else:
        print(f"reading prompts from {opt.from_file}")
        with open(opt.from_file, "r") as f:
            data = f.read().splitlines()
            # One tuple of up to batch_size prompts per sampling call; the last
            # tuple may be shorter.
            data = list(chunk(data, batch_size))

    sample_path = os.path.join(outpath, "samples")
    os.makedirs(sample_path, exist_ok=True)
    # Continue numbering after whatever is already in the output directories.
    base_count = len(os.listdir(sample_path))
    grid_count = len(os.listdir(outpath)) - 1

    assert os.path.isfile(opt.init_img)
    init_image = load_img(opt.init_img).to(device)
    init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
    init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))  # move to latent space

    sampler.make_schedule(ddim_num_steps=opt.ddim_steps, ddim_eta=opt.ddim_eta, verbose=False)

    assert 0. <= opt.strength <= 1., 'can only work with strength in [0.0, 1.0]'
    t_enc = int(opt.strength * opt.ddim_steps)
    print(f"target t_enc is {t_enc} steps")

    precision_scope = autocast if opt.precision == "autocast" else nullcontext
    with torch.no_grad():
        with precision_scope("cuda"):
            with model.ema_scope():
                all_samples = list()
                for n in trange(opt.n_iter, desc="Sampling"):
                    for prompts in tqdm(data, desc="data"):
                        if isinstance(prompts, tuple):
                            prompts = list(prompts)
                        # A prompt file whose length is not a multiple of
                        # batch_size yields a shorter final chunk; size every
                        # per-batch tensor from the chunk actually in hand so
                        # the conditioning and the latent batch always agree.
                        n_prompts = len(prompts)

                        uc = None
                        if opt.scale != 1.0:
                            uc = model.get_learned_conditioning(n_prompts * [""])
                        c = model.get_learned_conditioning(prompts)

                        # encode (scaled latent)
                        z_enc = sampler.stochastic_encode(
                            init_latent[:n_prompts],
                            torch.tensor([t_enc] * n_prompts).to(device))
                        # decode it
                        samples = sampler.decode(z_enc, c, t_enc, unconditional_guidance_scale=opt.scale,
                                                 unconditional_conditioning=uc, )

                        x_samples = model.decode_first_stage(samples)
                        x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)

                        for x_sample in x_samples:
                            x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
                            img = Image.fromarray(x_sample.astype(np.uint8))
                            img = put_watermark(img, wm_encoder)
                            img.save(os.path.join(sample_path, f"{base_count:05}.png"))
                            base_count += 1
                        all_samples.append(x_samples)

                # additionally, save as grid
                # Concatenating along the batch axis gives the same layout as
                # stack + flatten for equal batches and also accepts a shorter
                # final batch.
                grid = torch.cat(all_samples, 0)
                grid = make_grid(grid, nrow=n_rows)

                # to image
                grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()
                grid = Image.fromarray(grid.astype(np.uint8))
                grid = put_watermark(grid, wm_encoder)
                grid.save(os.path.join(outpath, f'grid-{grid_count:04}.png'))
                grid_count += 1

    print(f"Your samples are ready and waiting for you here: \n{outpath} \nEnjoy.")
# Run the CLI only when this file is executed as a script.
if __name__ == "__main__":
    main()
import sys
import torch
import numpy as np
import streamlit as st
from PIL import Image
from omegaconf import OmegaConf
from einops import repeat, rearrange
from pytorch_lightning import seed_everything
from imwatermark import WatermarkEncoder
from scripts.txt2img import put_watermark
from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.data.util import AddMiDaS
# Inference-only script: switch gradient computation off for the whole process.
torch.set_grad_enabled(False)
@st.cache(allow_output_mutation=True)
def initialize_model(config, ckpt):
    """Instantiate the configured model, load its weights and wrap it in a DDIM sampler.

    The result is cached by streamlit through the ``st.cache`` decorator.

    Args:
        config: Path passed to ``OmegaConf.load``; its ``model`` entry is handed
            to ``instantiate_from_config``.
        ckpt: Path to a checkpoint readable by ``torch.load`` that contains a
            ``"state_dict"`` entry.

    Returns:
        A ``DDIMSampler`` around the model, which lives on CUDA when available
        and on the CPU otherwise.
    """
    config = OmegaConf.load(config)
    model = instantiate_from_config(config.model)

    # Deserialize onto the CPU first so a checkpoint saved from a GPU can still
    # be opened on a CPU-only host; the model is moved to its device below.
    state_dict = torch.load(ckpt, map_location="cpu")["state_dict"]
    # strict=False: key mismatches between checkpoint and model are tolerated.
    model.load_state_dict(state_dict, strict=False)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(device)
    return DDIMSampler(model)
def make_batch_sd(
        image,
        txt,
        device,
        num_samples=1,
        model_type="dpt_hybrid"
):
    """Prepare the depth2img batch (image, prompts, MiDaS input) for sampling.

    Args:
        image: Image exposing ``convert("RGB")``.
        txt: Prompt; repeated ``num_samples`` times under ``"txt"``.
        device: Device the tensors are moved to.
        num_samples: Number of copies along the batch axis.
        model_type: Forwarded to ``AddMiDaS``.

    Returns:
        The dict returned by the ``AddMiDaS`` transform, with ``"jpg"`` and
        ``"midas_in"`` replaced by ``num_samples``-sized tensors on ``device``.
    """
    as_array = np.array(image.convert("RGB"))
    # (h, w, c) float tensor in [-1, 1].
    scaled = torch.from_numpy(as_array).to(dtype=torch.float32) / 127.5 - 1.0

    transform = AddMiDaS(model_type=model_type)
    # The transform is expected to add a NumPy "midas_in" entry (read below).
    batch = transform({"jpg": scaled, "txt": num_samples * [txt]})

    def _tile(tensor):
        return repeat(tensor.to(device=device), "1 ... -> n ...", n=num_samples)

    batch["jpg"] = _tile(rearrange(batch["jpg"], 'h w c -> 1 c h w'))
    batch["midas_in"] = _tile(torch.from_numpy(batch["midas_in"][None, ...]))
    return batch
def paint(sampler, image, prompt, t_enc, seed, scale, num_samples=1, callback=None,
          do_full_sample=False):
    """Run depth-conditioned img2img sampling, showing the depth map in streamlit.

    Args:
        sampler: Sampler exposing ``model``, ``stochastic_encode`` and ``decode``.
        image: Input image handed to ``make_batch_sd``.
        prompt: Text prompt.
        t_enc: Number of encode/decode steps passed to the sampler.
        seed: Seed given to ``seed_everything``.
        scale: Unconditional guidance scale.
        num_samples: Batch size.
        callback: Optional callback forwarded to ``sampler.decode``.
        do_full_sample: When true, start from pure Gaussian noise instead of a
            noised encoding of the input latent.

    Returns:
        A list of watermarked PIL images.
    """
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = sampler.model
    seed_everything(seed)

    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
    wm_encoder = WatermarkEncoder()
    wm_encoder.set_watermark('bytes', "SDV2".encode('utf-8'))

    def _depth_range(depth):
        # Per-sample min/max, reduced over every non-batch dimension.
        lowest = torch.amin(depth, dim=[1, 2, 3], keepdim=True)
        highest = torch.amax(depth, dim=[1, 2, 3], keepdim=True)
        return lowest, highest

    with torch.no_grad(), torch.autocast("cuda"):
        batch = make_batch_sd(image, txt=prompt, device=device, num_samples=num_samples)
        # Move the input image to latent space.
        first_stage = model.encode_first_stage(batch[model.first_stage_key])
        z = model.get_first_stage_encoding(first_stage)
        c = model.cond_stage_model.encode(batch["txt"])

        concat_parts = []
        for key in model.concat_keys:
            depth = model.depth_model(batch[key])

            # Show a 0..1 normalized preview of the first depth map in the app.
            # NOTE(review): a constant depth map gives a zero range here.
            lowest, highest = _depth_range(depth)
            preview = (depth - lowest) / (highest - lowest)
            st.image(Image.fromarray((preview[0, 0, ...].cpu().numpy() * 255.).astype(np.uint8)))

            # Resize to the latent resolution, then rescale to [-1, 1].
            depth = torch.nn.functional.interpolate(
                depth,
                size=z.shape[2:],
                mode="bicubic",
                align_corners=False,
            )
            lowest, highest = _depth_range(depth)
            concat_parts.append(2. * (depth - lowest) / (highest - lowest) - 1.)
        c_cat = torch.cat(concat_parts, dim=1)

        # Conditional and unconditional inputs share the same depth conditioning.
        cond = {"c_concat": [c_cat], "c_crossattn": [c]}
        uc_cross = model.get_unconditional_conditioning(num_samples, "")
        uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]}

        if do_full_sample:
            z_enc = torch.randn_like(z)
        else:
            # Noise the (scaled) latent up to step t_enc.
            timesteps = torch.tensor([t_enc] * num_samples).to(model.device)
            z_enc = sampler.stochastic_encode(z, timesteps)

        samples = sampler.decode(z_enc, cond, t_enc, unconditional_guidance_scale=scale,
                                 unconditional_conditioning=uc_full, callback=callback)
        decoded = model.decode_first_stage(samples)
        result = torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0)
        # (b, c, h, w) -> (b, h, w, c), scaled to 0..255.
        result = result.cpu().numpy().transpose(0, 2, 3, 1) * 255
    return [put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder) for img in result]
def run():
    """Streamlit page for depth2img: upload an image, set options, sample on click.

    Config and checkpoint paths come from ``sys.argv[1]`` and ``sys.argv[2]``.
    """
    st.title("Stable Diffusion Depth2Img")
    # run via streamlit run scripts/demo/depth2img.py <path-to-config> <path-to-ckpt>
    sampler = initialize_model(sys.argv[1], sys.argv[2])

    image = st.file_uploader("Image", ["jpg", "png"])
    if not image:
        # Nothing uploaded yet: the rest of the page needs an image.
        return

    image = Image.open(image)
    w, h = image.size
    st.text(f"loaded input image of size ({w}, {h})")
    # Round each side down to a multiple of 64 and resize to that.
    width, height = w - w % 64, h - h % 64
    image = image.resize((width, height))
    st.text(f"resized input image to size ({width}, {height} (w, h))")
    st.image(image)

    prompt = st.text_input("Prompt")
    seed = st.number_input("Seed", min_value=0, max_value=1000000, value=0)
    num_samples = st.number_input("Number of Samples", min_value=1, max_value=64, value=1)
    scale = st.slider("Scale", min_value=0.1, max_value=30.0, value=9.0, step=0.1)
    steps = st.slider("DDIM Steps", min_value=0, max_value=50, value=50, step=1)
    strength = st.slider("Strength", min_value=0., max_value=1., value=0.9)
    t_progress = st.progress(0)

    assert 0. <= strength <= 1., 'can only work with strength in [0.0, 1.0]'
    do_full_sample = strength == 1.
    # Never encode past the last scheduled step.
    t_enc = min(int(strength * steps), steps-1)

    def t_callback(t):
        # Progress as the fraction of t_enc, capped at 1.
        t_progress.progress(min((t + 1) / t_enc, 1.))

    sampler.make_schedule(steps, ddim_eta=0., verbose=True)

    if st.button("Sample"):
        result = paint(
            sampler=sampler,
            image=image,
            prompt=prompt,
            t_enc=t_enc,
            seed=seed,
            scale=scale,
            num_samples=num_samples,
            callback=t_callback,
            do_full_sample=do_full_sample,
        )
        st.write("Result")
        for sample in result:
            st.image(sample, output_format='PNG')
# Render the page only when this file is executed as a script.
if __name__ == "__main__":
    run()
import sys
import cv2
import torch
import numpy as np
import streamlit as st
from PIL import Image
from omegaconf import OmegaConf
from einops import repeat
from streamlit_drawable_canvas import st_canvas
from imwatermark import WatermarkEncoder
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.util import instantiate_from_config
# Inference-only script: switch gradient computation off for the whole process.
torch.set_grad_enabled(False)
def put_watermark(img, wm_encoder=None):
    """Return ``img`` with an invisible watermark, or unchanged without an encoder.

    Args:
        img: Image to mark.
        wm_encoder: Object exposing ``encode(array, 'dwtDct')``, or ``None`` to
            skip watermarking.

    Returns:
        ``img`` itself when no encoder is given; otherwise a new PIL image.
    """
    if wm_encoder is None:
        return img

    # The encoder is fed a BGR array, so convert from RGB first ...
    as_bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    encoded = wm_encoder.encode(as_bgr, 'dwtDct')
    # ... and reverse the channel axis again for the PIL image.
    as_rgb = encoded[:, :, ::-1]
    return Image.fromarray(as_rgb)
@st.cache(allow_output_mutation=True)
def initialize_model(config, ckpt):
    """Instantiate the configured model, load its weights and wrap it in a DDIM sampler.

    The result is cached by streamlit through the ``st.cache`` decorator.

    Args:
        config: Path passed to ``OmegaConf.load``; its ``model`` entry is handed
            to ``instantiate_from_config``.
        ckpt: Path to a checkpoint readable by ``torch.load`` that contains a
            ``"state_dict"`` entry.

    Returns:
        A ``DDIMSampler`` around the model, which lives on CUDA when available
        and on the CPU otherwise.
    """
    config = OmegaConf.load(config)
    model = instantiate_from_config(config.model)

    # Load onto the CPU so GPU-saved checkpoints also open on CPU-only hosts;
    # the model is moved to the selected device afterwards.
    state_dict = torch.load(ckpt, map_location="cpu")["state_dict"]
    # strict=False: key mismatches between checkpoint and model are tolerated.
    model.load_state_dict(state_dict, strict=False)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(device)
    return DDIMSampler(model)
def make_batch_sd(
        image,
        mask,
        txt,
        device,
        num_samples=1):
    """Assemble the inpainting batch from an image, a mask and a prompt.

    Args:
        image: Image exposing ``convert("RGB")``.
        mask: Image exposing ``convert("L")``; after scaling to 0..1, values
            >= 0.5 become 1 (inpaint) and all others 0 (keep).
        txt: Prompt; repeated ``num_samples`` times.
        device: Device the tensors are moved to.
        num_samples: Number of copies along the batch axis.

    Returns:
        Dict with ``"image"``, ``"txt"``, ``"mask"`` and ``"masked_image"``.
    """
    # (h, w, c) uint8 -> (1, c, h, w) float32 in [-1, 1].
    channels_first = np.array(image.convert("RGB"))[None].transpose(0, 3, 1, 2)
    image_t = torch.from_numpy(channels_first).to(dtype=torch.float32) / 127.5 - 1.0

    # Hard-threshold the mask at 0.5; layout (1, 1, h, w).
    soft = np.array(mask.convert("L")).astype(np.float32) / 255.0
    hard = (soft >= 0.5).astype(np.float32)[None, None]
    mask_t = torch.from_numpy(hard)

    # Keep only the pixels outside the masked region.
    masked_image = image_t * (mask_t < 0.5)

    batch = {}
    batch["image"] = repeat(image_t.to(device=device), "1 ... -> n ...", n=num_samples)
    batch["txt"] = num_samples * [txt]
    batch["mask"] = repeat(mask_t.to(device=device), "1 ... -> n ...", n=num_samples)
    batch["masked_image"] = repeat(masked_image.to(device=device), "1 ... -> n ...", n=num_samples)
    return batch
def inpaint(sampler, image, mask, prompt, seed, scale, ddim_steps, num_samples=1, w=512, h=512, eta=1.):
    """Sample inpainted images for ``image`` inside the region marked by ``mask``.

    Args:
        sampler: Sampler exposing ``model`` and ``sample``.
        image: Input image handed to ``make_batch_sd``.
        mask: Mask image handed to ``make_batch_sd``.
        prompt: Text prompt.
        seed: Seed for the NumPy generator that draws the start code.
        scale: Unconditional guidance scale.
        ddim_steps: Number of DDIM steps.
        num_samples: Batch size.
        w: Image width in pixels; the latent width is ``w // 8``.
        h: Image height in pixels; the latent height is ``h // 8``.
        eta: DDIM eta forwarded to ``sampler.sample``.

    Returns:
        A list of watermarked PIL images.
    """
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = sampler.model

    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
    wm_encoder = WatermarkEncoder()
    wm_encoder.set_watermark('bytes', "SDV2".encode('utf-8'))

    latent_h, latent_w = h // 8, w // 8
    # Seeded start code with 4 latent channels.
    rng = np.random.RandomState(seed)
    noise = rng.randn(num_samples, 4, latent_h, latent_w)
    start_code = torch.from_numpy(noise).to(device=device, dtype=torch.float32)

    with torch.no_grad(), torch.autocast("cuda"):
        batch = make_batch_sd(image, mask, txt=prompt, device=device, num_samples=num_samples)
        c = model.cond_stage_model.encode(batch["txt"])

        concat_parts = []
        for key in model.concat_keys:
            part = batch[key].float()
            if key == model.masked_image_key:
                # The masked image is encoded into latent space.
                part = model.get_first_stage_encoding(model.encode_first_stage(part))
            else:
                # Everything else is resized directly to the latent resolution.
                part = torch.nn.functional.interpolate(part, size=[latent_h, latent_w])
            concat_parts.append(part)
        c_cat = torch.cat(concat_parts, dim=1)

        # Conditional and unconditional inputs share the concat conditioning.
        cond = {"c_concat": [c_cat], "c_crossattn": [c]}
        uc_cross = model.get_unconditional_conditioning(num_samples, "")
        uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]}

        samples_cfg, _ = sampler.sample(
            ddim_steps,
            num_samples,
            [model.channels, latent_h, latent_w],
            cond,
            verbose=False,
            eta=eta,
            unconditional_guidance_scale=scale,
            unconditional_conditioning=uc_full,
            x_T=start_code,
        )
        decoded = model.decode_first_stage(samples_cfg)
        result = torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0)
        # (b, c, h, w) -> (b, h, w, c), scaled to 0..255.
        result = result.cpu().numpy().transpose(0, 2, 3, 1) * 255

    return [put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder) for img in result]
# Streamlit entry point for the inpainting demo.
# Builds the sampler from the config/checkpoint paths in sys.argv[1] and
# sys.argv[2], lets the user upload an image, draw a mask on a canvas and set
# the sampling options, then calls inpaint() and displays the returned images.
def run():
st.title("Stable Diffusion Inpainting")
sampler = initialize_model(sys.argv[1], sys.argv[2])
image = st.file_uploader("Image", ["jpg", "png"])
if image:
image = Image.open(image)
w, h = image.size
print(f"loaded input image of size ({w}, {h})")
width, height = map(lambda x: x - x % 64, (w, h)) # round each side down to an integer multiple of 64
image = image.resize((width, height))
prompt = st.text_input("Prompt")
seed = st.number_input("Seed", min_value=0, max_value=1000000, value=0)
num_samples = st.number_input("Number of Samples", min_value=1, max_value=64, value=1)
scale = st.slider("Scale", min_value=0.1, max_value=30.0, value=10., step=0.1)
# NOTE(review): the slider allows 0 steps; whether sampler.sample accepts that is not visible here.
ddim_steps = st.slider("DDIM Steps", min_value=0, max_value=50, value=50, step=1)
eta = st.sidebar.number_input("eta (DDIM)", value=0., min_value=0., max_value=1.)
# Canvas configuration: opaque white strokes on an opaque black background,
# with a fully transparent fill colour.
fill_color = "rgba(255, 255, 255, 0.0)"
stroke_width = st.number_input("Brush Size",
value=64,
min_value=1,
max_value=100)
stroke_color = "rgba(255, 255, 255, 1.0)"
bg_color = "rgba(0, 0, 0, 1.0)"
drawing_mode = "freedraw"
st.write("Canvas")
st.caption(
"Draw a mask to inpaint, then click the 'Send to Streamlit' button (bottom left, with an arrow on it).")
canvas_result = st_canvas(
fill_color=fill_color,
stroke_width=stroke_width,
stroke_color=stroke_color,
background_color=bg_color,
background_image=image,
update_streamlit=False,
height=height,
width=width,
drawing_mode=drawing_mode,
key="canvas",
)
if canvas_result:
# The mask is taken from the last channel of the canvas image data:
# every pixel with a non-zero value there counts as "to be inpainted".
mask = canvas_result.image_data
mask = mask[:, :, -1] > 0
# Only sample when at least one pixel has been marked.
if mask.sum() > 0:
mask = Image.fromarray(mask)
result = inpaint(
sampler=sampler,
image=image,
mask=mask,
prompt=prompt,
seed=seed,
scale=scale,
ddim_steps=ddim_steps,
num_samples=num_samples,
h=height, w=width, eta=eta
)
st.write("Inpainted")
# NOTE: the loop variable reuses (and overwrites) the name `image`.
for image in result:
st.image(image, output_format='PNG')
# Start the Streamlit inpainting UI when this file is executed as a script.
if __name__ == "__main__":
run()
\ No newline at end of file
import importlib
import streamlit as st
import torch
import cv2
import numpy as np
import PIL
from omegaconf import OmegaConf
from PIL import Image
from tqdm import trange
import io, os
from torch import autocast
from einops import rearrange, repeat
from torchvision.utils import make_grid
from pytorch_lightning import seed_everything
from contextlib import nullcontext
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.models.diffusion.plms import PLMSSampler
from ldm.models.diffusion.dpm_solver import DPMSolverSampler
# Gradients are switched off for the whole process at import time.
torch.set_grad_enabled(False)
# Directory prefix for prompt files.
PROMPTS_ROOT = "scripts/prompts/"
# Base output directory; the __main__ section appends the selected version to it.
SAVE_PATH = "outputs/demo/stable-unclip/"
# Per-version defaults read by the __main__ section: H and W seed the height/width
# inputs, C and f are passed to sample() as latent channels and downsampling factor.
# "Full Karlo" has no entries because the __main__ section skips those inputs for it.
VERSION2SPECS = {
"Stable unCLIP-L": {"H": 768, "W": 768, "C": 4, "f": 8},
"Stable unOpenCLIP-H": {"H": 768, "W": 768, "C": 4, "f": 8},
"Full Karlo": {}
}
def get_obj_from_str(string, reload=False):
    """Resolve a dotted path such as ``"package.module.Name"`` to an object.

    Args:
        string: Dotted path; everything before the last dot is imported as a
            module and the part after it is looked up as an attribute.
        reload: When true, the module is reloaded before the lookup.

    Returns:
        The attribute named by the last path component.

    Raises:
        ValueError: If ``string`` contains no dot (previously this surfaced as
            an opaque tuple-unpacking ``ValueError``).
        ImportError: If the module part cannot be imported.
        AttributeError: If the module has no attribute with the given name.
    """
    module_name, dot, attr_name = string.rpartition(".")
    if not dot:
        raise ValueError(
            f"Expected a dotted path like 'package.module.Name', got {string!r}"
        )
    importlib.invalidate_caches()
    module_obj = importlib.import_module(module_name)
    if reload:
        # importlib.reload returns the reloaded module, so no second import is needed.
        module_obj = importlib.reload(module_obj)
    return getattr(module_obj, attr_name)
def instantiate_from_config(config):
    """Build the object described by ``config``.

    ``config["target"]`` is a dotted path resolved with ``get_obj_from_str``;
    the resulting callable is invoked with ``config["params"]`` as keyword
    arguments (no arguments when ``params`` is absent).

    Raises:
        KeyError: If ``config`` has no ``target`` entry.
    """
    if "target" in config:
        factory = get_obj_from_str(config["target"])
        kwargs = config.get("params", dict())
        return factory(**kwargs)
    raise KeyError("Expected key `target` to instantiate.")
def get_interactive_image(key=None):
    """Show a file uploader and return the uploaded picture in RGB mode.

    Args:
        key: Widget key forwarded to ``st.file_uploader``.

    Returns:
        A PIL image in ``"RGB"`` mode, or ``None`` while nothing is uploaded.
    """
    uploaded = st.file_uploader("Input", type=["jpg", "JPEG", "png"], key=key)
    if uploaded is None:
        return None
    picture = Image.open(uploaded)
    if picture.mode != "RGB":
        picture = picture.convert("RGB")
    return picture
def load_img(display=True, key=None):
    """Load the uploaded image as a float tensor scaled to ``[-1, 1]``.

    The picture is obtained from ``get_interactive_image``, optionally shown,
    resized so both sides are multiples of 64 (rounded down, LANCZOS
    resampling) and converted to a tensor laid out as ``(1, C, H, W)``.

    Args:
        display: When true, the uploaded image is shown with ``st.image``.
        key: Widget key forwarded to the uploader.

    Returns:
        ``2 * image - 1`` where ``image`` holds the pixel values divided by 255.

    Behaviour when nothing has been uploaded yet: a hint is shown and the
    current script run is halted with ``st.stop()``. Previously the function
    went on to use the ``None`` returned by the uploader and failed with an
    error on ``st.image(None)`` / ``image.size``.
    """
    image = get_interactive_image(key=key)
    if image is None:
        st.info("Upload an input image to continue.")
        st.stop()
    if display:
        st.image(image)
    w, h = image.size
    print(f"loaded input image of size ({w}, {h})")
    # Round each side down to a multiple of 64.
    # NOTE(review): a side shorter than 64 px becomes 0 here; the resize below
    # is then expected to fail -- confirm whether such inputs must be supported.
    w, h = (side - side % 64 for side in (w, h))
    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
    image = np.array(image).astype(np.float32) / 255.0
    # (H, W, C) -> (1, C, H, W)
    image = image[None].transpose(0, 3, 1, 2)
    image = torch.from_numpy(image)
    return 2. * image - 1.
def get_init_img(batch_size=1, key=None):
    """Load the uploaded image onto the GPU and tile it ``batch_size`` times.

    Args:
        batch_size: Number of copies along the leading dimension.
        key: Widget key forwarded to ``load_img``.

    Returns:
        The tensor from ``load_img`` moved with ``.cuda()`` and repeated along
        its first (size-1) dimension.
    """
    single = load_img(key=key).cuda()
    return repeat(single, '1 ... -> b ...', b=batch_size)
# Draw `n_runs` batches of `n_samples` images for `prompt`, show them as one
# grid in a Streamlit placeholder and optionally save single images and the grid
# under SAVE_PATH.
#
# Arguments as used by the code below:
#   model               -- provides ema_scope(), get_learned_conditioning() and
#                          decode_first_stage().
#   prompt              -- a string (wrapped into a list) or a list of strings.
#   n_runs, n_samples   -- number of sampling rounds / batch size per round.
#   H, W, C, f          -- the latent shape is [C, H // f, W // f].
#   scale               -- guidance scale; the negative-prompt conditioning is
#                          only computed when scale != 1.0.
#   ddim_steps, ddim_eta, callback, ucg_schedule -- forwarded to sampler.sample.
#   skip_single_save    -- when false, each image is written to SAVE_PATH/samples.
#   save_grid           -- when true, the grid is written to SAVE_PATH.
#   negative_prompt     -- text used for the unconditional branch.
#   adm_cond, adm_uc    -- optional extra conditioning passed as "c_adm".
#   use_full_precision  -- when true, autocast is replaced by nullcontext.
#   only_adm_cond       -- when true, text conditioning is skipped and
#                          adm_cond / adm_uc are used directly.
#
# NOTE(review): `sampler` is not a parameter; it is read from module scope and
# must have been assigned before this function is called.
# NOTE(review): the indentation of this block was lost in the source, so the
# nesting of the statements after the sampler call cannot be confirmed here.
def sample(
model,
prompt,
n_runs=3,
n_samples=2,
H=512,
W=512,
C=4,
f=8,
scale=10.0,
ddim_steps=50,
ddim_eta=0.0,
callback=None,
skip_single_save=False,
save_grid=True,
ucg_schedule=None,
negative_prompt="",
adm_cond=None,
adm_uc=None,
use_full_precision=False,
only_adm_cond=False
):
batch_size = n_samples
precision_scope = autocast if not use_full_precision else nullcontext
# decoderscope = autocast if not use_full_precision else nullcontext
if use_full_precision: st.warning(f"Running {model.__class__.__name__} at full precision.")
if isinstance(prompt, str):
prompt = [prompt]
# List repetition: the prompt list is repeated batch_size times.
prompts = batch_size * prompt
# Placeholder that later receives the image grid.
outputs = st.empty()
with precision_scope("cuda"):
with model.ema_scope():
all_samples = list()
for n in trange(n_runs, desc="Sampling"):
shape = [C, H // f, W // f]
# Text conditioning (skipped when only_adm_cond is set).
if not only_adm_cond:
uc = None
if scale != 1.0:
uc = model.get_learned_conditioning(batch_size * [negative_prompt])
if isinstance(prompts, tuple):
prompts = list(prompts)
c = model.get_learned_conditioning(prompts)
# Optional "c_adm" conditioning: single-row tensors are repeated to the
# batch size; without adm_uc the conditional tensor is reused, which the
# warning below reports as "not guiding".
if adm_cond is not None:
if adm_cond.shape[0] == 1:
adm_cond = repeat(adm_cond, '1 ... -> b ...', b=batch_size)
if adm_uc is None:
st.warning("Not guiding via c_adm")
adm_uc = adm_cond
else:
if adm_uc.shape[0] == 1:
adm_uc = repeat(adm_uc, '1 ... -> b ...', b=batch_size)
if not only_adm_cond:
c = {"c_crossattn": [c], "c_adm": adm_cond}
uc = {"c_crossattn": [uc], "c_adm": adm_uc}
else:
c = adm_cond
uc = adm_uc
samples_ddim, _ = sampler.sample(S=ddim_steps,
conditioning=c,
batch_size=batch_size,
shape=shape,
verbose=False,
unconditional_guidance_scale=scale,
unconditional_conditioning=uc,
eta=ddim_eta,
x_T=None,
callback=callback,
ucg_schedule=ucg_schedule
)
# Decode and map the result from [-1, 1] to [0, 1].
x_samples = model.decode_first_stage(samples_ddim)
x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)
if not skip_single_save:
# File names continue from the number of entries already in the samples folder.
base_count = len(os.listdir(os.path.join(SAVE_PATH, "samples")))
for x_sample in x_samples:
x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
Image.fromarray(x_sample.astype(np.uint8)).save(
os.path.join(SAVE_PATH, "samples", f"{base_count:09}.png"))
base_count += 1
all_samples.append(x_samples)
# get grid of all samples
# One grid row per run, one column per batch element.
grid = torch.stack(all_samples, 0)
grid = rearrange(grid, 'n b c h w -> (n h) (b w) c')
outputs.image(grid.cpu().numpy())
# additionally, save grid
grid = Image.fromarray((255. * grid.cpu().numpy()).astype(np.uint8))
if save_grid:
# The grid index is the entry count of SAVE_PATH minus one
# (presumably to discount the "samples" sub-folder -- confirm).
grid_count = len(os.listdir(SAVE_PATH)) - 1
grid.save(os.path.join(SAVE_PATH, f'grid-{grid_count:06}.png'))
# Returns the most recently decoded batch, not all_samples.
return x_samples
def make_oscillating_guidance_schedule(num_steps, max_weight=15., min_weight=1.):
    """Return a per-step guidance schedule that alternates between two weights.

    Steps in the first tenth of the schedule (``i / num_steps < 0.1``) always
    use ``max_weight``. After that, even step indices use ``min_weight`` and
    odd step indices use ``max_weight``. The schedule is also printed.

    Args:
        num_steps: Number of entries to generate.
        max_weight: Weight for the initial phase and for odd steps.
        min_weight: Weight for even steps after the initial phase.

    Returns:
        A list with ``num_steps`` weights.
    """
    def weight_at(step):
        in_initial_phase = float(step / num_steps) < 0.1
        if in_initial_phase or step % 2:
            return max_weight
        return min_weight

    schedule = [weight_at(step) for step in range(num_steps)]
    print(f"OSCILLATING GUIDANCE SCHEDULE: \n {schedule}")
    return schedule
def torch2np(x):
    """Convert a tensor with values around ``[-1, 1]`` to a uint8 NumPy array.

    Values are mapped with ``(x + 1) * 127.5``, clamped to ``[0, 255]`` and
    cast to ``uint8``; dimensions are reordered with ``permute(0, 2, 3, 1)``
    before the array is moved to the CPU.

    Args:
        x: A 4-dimensional tensor.

    Returns:
        A ``uint8`` NumPy array with dimension 1 moved to the end.
    """
    scaled = (x + 1.0) * 127.5
    as_bytes = scaled.clamp(0, 255).to(dtype=torch.uint8)
    reordered = as_bytes.permute(0, 2, 3, 1)
    return reordered.detach().cpu().numpy()
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def init(version="Stable unCLIP-L", load_karlo_prior=False):
    """Load the models for the selected demo version into a state dict.

    For ``"Full Karlo"`` only the Karlo ``T2ISampler`` is loaded and the state
    holds ``karlo_prior`` and ``msg``. For the two Stable unCLIP versions the
    diffusion model is loaded from its config and checkpoint and the state
    holds ``msg``, ``model``, ``ckpt`` and ``config``, plus ``karlo_prior``
    when ``load_karlo_prior`` is true.

    Raises:
        ValueError: If ``version`` is not one of the known names.
    """
    state = dict()

    if version == "Full Karlo":
        from ldm.modules.karlo.kakao.sampler import T2ISampler
        st.info("Loading full KARLO..")
        state["karlo_prior"] = T2ISampler.from_pretrained(
            root_dir="checkpoints/karlo_models",
            clip_model_path="ViT-L-14.pt",
            clip_stat_path="ViT-L-14_stats.th",
            sampling_type="default",
        )
        state["msg"] = "loaded full Karlo"
        return state

    if version == "Stable unCLIP-L":
        config_path = "configs/stable-diffusion/v2-1-stable-unclip-l-inference.yaml"
        ckpt = "checkpoints/sd21-unclip-l.ckpt"
    elif version == "Stable unOpenCLIP-H":
        config_path = "configs/stable-diffusion/v2-1-stable-unclip-h-inference.yaml"
        ckpt = "checkpoints/sd21-unclip-h.ckpt"
    else:
        raise ValueError(f"version {version} unknown!")

    config = OmegaConf.load(config_path)
    model, msg = load_model_from_config(config, ckpt, vae_sd=None)
    state["msg"] = msg

    if load_karlo_prior:
        from ldm.modules.karlo.kakao.sampler import PriorSampler
        st.info("Loading KARLO CLIP prior...")
        state["karlo_prior"] = PriorSampler.from_pretrained(
            root_dir="checkpoints/karlo_models",
            clip_model_path="ViT-L-14.pt",
            clip_stat_path="ViT-L-14_stats.th",
            sampling_type="default",
        )

    state["model"] = model
    state["ckpt"] = ckpt
    state["config"] = config
    return state
def load_model_from_config(config, ckpt, verbose=False, vae_sd=None):
    """Instantiate the model from ``config`` and load weights from ``ckpt``.

    Args:
        config: Config object whose ``model`` entry describes the model.
        ckpt: Path of a checkpoint containing a ``"state_dict"`` entry.
        verbose: When true, missing and unexpected keys are printed.
        vae_sd: Optional replacement weights; every checkpoint key containing
            ``"first_stage"`` is overwritten with the entry of ``vae_sd`` whose
            name is that key with the ``"first_stage_model."`` prefix removed.

    Returns:
        ``(model, msg)``: the model moved to CUDA in eval mode, and a status
        string (``None`` when the checkpoint records no ``global_step``).
    """
    print(f"Loading model from {ckpt}")
    # SECURITY NOTE: torch.load unpickles the file; only load checkpoints from
    # sources you trust.
    pl_sd = torch.load(ckpt, map_location="cpu")
    sd = pl_sd["state_dict"]

    msg = None
    if "global_step" in pl_sd:
        msg = f"This is global step {pl_sd['global_step']}. "
        # The EMA note is only appended to an existing message, so a checkpoint
        # without "global_step" can no longer trigger `None += str`.
        if "model_ema.num_updates" in sd:
            msg += f"And we got {sd['model_ema.num_updates']} EMA updates."
    global_step = pl_sd.get("global_step", "?")

    if vae_sd is not None:
        prefix_len = len("first_stage_model.")
        # Iterate over a snapshot of the keys while assigning into the dict.
        for k in list(sd.keys()):
            if "first_stage" in k:
                sd[k] = vae_sd[k[prefix_len:]]

    model = instantiate_from_config(config.model)
    m, u = model.load_state_dict(sd, strict=False)
    if verbose and len(m) > 0:
        print("missing keys:")
        print(m)
    if verbose and len(u) > 0:
        print("unexpected keys:")
        print(u)

    model.cuda()
    model.eval()
    print(f"Loaded global step {global_step}")
    return model, msg
# Streamlit UI of the Stable unCLIP demo: choose a model version, collect the
# sampling options, build the "c_adm" conditioning (from the KARLO prior or
# from uploaded images) and run sampling when the "Sample" button is pressed.
# NOTE(review): the indentation of this section was lost in the source; the
# comments below describe individual statements and do not assert their nesting.
if __name__ == "__main__":
st.title("Stable unCLIP")
mode = "txt2img"
version = st.selectbox("Model Version", list(VERSION2SPECS.keys()), 0)
# The KARLO prior checkbox is only evaluated for "Stable unCLIP-L".
use_karlo_prior = version in ["Stable unCLIP-L"] and st.checkbox("Use KARLO prior", False)
state = init(version=version, load_karlo_prior=use_karlo_prior)
prompt = st.text_input("Prompt", "a professional photograph")
negative_prompt = st.text_input("Negative Prompt", "")
scale = st.number_input("cfg-scale", value=10., min_value=-100., max_value=100.)
number_rows = st.number_input("num rows", value=2, min_value=1, max_value=10)
number_cols = st.number_input("num cols", value=2, min_value=1, max_value=10)
steps = st.sidebar.number_input("steps", value=20, min_value=1, max_value=1000)
eta = st.sidebar.number_input("eta (DDIM)", value=0., min_value=0., max_value=1.)
force_full_precision = st.sidebar.checkbox("Force FP32", False) # TODO: check if/where things break.
# H, W, C and f are only defined for versions other than "Full Karlo".
if version != "Full Karlo":
H = st.sidebar.number_input("H", value=VERSION2SPECS[version]["H"], min_value=64, max_value=2048)
W = st.sidebar.number_input("W", value=VERSION2SPECS[version]["W"], min_value=64, max_value=2048)
C = VERSION2SPECS[version]["C"]
f = VERSION2SPECS[version]["f"]
# Outputs go to a per-version sub-folder; this rebinds the module-level SAVE_PATH.
SAVE_PATH = os.path.join(SAVE_PATH, version)
os.makedirs(os.path.join(SAVE_PATH, "samples"), exist_ok=True)
seed = st.sidebar.number_input("seed", value=42, min_value=0, max_value=int(1e9))
seed_everything(seed)
ucg_schedule = None
# `sampler` first holds the selected name and is then replaced by the sampler
# object; sample() reads this module-level name.
sampler = st.sidebar.selectbox("Sampler", ["DDIM", "DPM"], 0)
if version == "Full Karlo":
pass
else:
if sampler == "DPM":
sampler = DPMSolverSampler(state["model"])
elif sampler == "DDIM":
sampler = DDIMSampler(state["model"])
else:
raise ValueError(f"unknown sampler {sampler}!")
adm_cond, adm_uc = None, None
if use_karlo_prior:
# uses the prior
karlo_sampler = state["karlo_prior"]
noise_level = None
if state["model"].noise_augmentor is not None:
noise_level = st.number_input("Noise Augmentation for CLIP embeddings", min_value=0,
max_value=state["model"].noise_augmentor.max_noise_level - 1, value=0)
with torch.no_grad():
# Take the first item produced by the KARLO sampler for this prompt.
karlo_prediction = iter(
karlo_sampler(
prompt=prompt,
bsz=number_cols,
progressive_mode="final",
)
).__next__()
adm_cond = karlo_prediction
if noise_level is not None:
# Noise-augment the embedding and append the noise-level embedding along dim 1.
c_adm, noise_level_emb = state["model"].noise_augmentor(adm_cond, noise_level=repeat(
torch.tensor([noise_level]).to(state["model"].device), '1 -> b', b=number_cols))
adm_cond = torch.cat((c_adm, noise_level_emb), 1)
# The unconditional counterpart is an all-zero tensor of the same shape.
adm_uc = torch.zeros_like(adm_cond)
elif version == "Full Karlo":
pass
else:
num_inputs = st.number_input("Number of Input Images", 1)
# Builds the conditioning for one uploaded image: embeds it, optionally applies
# noise augmentation, scales by the user-chosen weight, and returns
# (adm_cond, adm_uc, weight) with adm_uc all zeros.
def make_conditionings_from_input(num=1, key=None):
init_img = get_init_img(batch_size=number_cols, key=key)
with torch.no_grad():
adm_cond = state["model"].embedder(init_img)
weight = st.slider(f"Weight for Input {num}", min_value=-10., max_value=10., value=1.)
if state["model"].noise_augmentor is not None:
noise_level = st.number_input(f"Noise Augmentation for CLIP embedding of input #{num}", min_value=0,
max_value=state["model"].noise_augmentor.max_noise_level - 1,
value=0, )
c_adm, noise_level_emb = state["model"].noise_augmentor(adm_cond, noise_level=repeat(
torch.tensor([noise_level]).to(state["model"].device), '1 -> b', b=number_cols))
adm_cond = torch.cat((c_adm, noise_level_emb), 1) * weight
adm_uc = torch.zeros_like(adm_cond)
return adm_cond, adm_uc, weight
adm_inputs = list()
weights = list()
for n in range(num_inputs):
adm_cond, adm_uc, w = make_conditionings_from_input(num=n + 1, key=n)
weights.append(w)
adm_inputs.append(adm_cond)
# Weighted average of the per-image conditionings.
# NOTE(review): the sliders allow weights that sum to 0, which makes this a division by zero.
adm_cond = torch.stack(adm_inputs).sum(0) / sum(weights)
if num_inputs > 1:
if st.checkbox("Apply Noise to Embedding Mix", True):
noise_level = st.number_input(f"Noise Augmentation for averaged CLIP embeddings", min_value=0,
max_value=state["model"].noise_augmentor.max_noise_level - 1, value=50, )
# Only the first `time_embed.dim` columns of the mix are passed to the augmentor;
# the result is again concatenated with the noise-level embedding.
c_adm, noise_level_emb = state["model"].noise_augmentor(
adm_cond[:, :state["model"].noise_augmentor.time_embed.dim],
noise_level=repeat(
torch.tensor([noise_level]).to(state["model"].device), '1 -> b', b=number_cols))
adm_cond = torch.cat((c_adm, noise_level_emb), 1)
if st.button("Sample"):
print("running prompt:", prompt)
st.text("Sampling")
t_progress = st.progress(0)
result = st.empty()
# Progress callback: maps step t to a fraction of `steps`, capped at 1.
def t_callback(t):
t_progress.progress(min((t + 1) / steps, 1.))
if version == "Full Karlo":
# Full KARLO path: one sampler call per grid row, images shown as a grid.
outputs = st.empty()
karlo_sampler = state["karlo_prior"]
all_samples = list()
with torch.no_grad():
for _ in range(number_rows):
karlo_prediction = iter(
karlo_sampler(
prompt=prompt,
bsz=number_cols,
progressive_mode="final",
)
).__next__()
all_samples.append(karlo_prediction)
grid = torch.stack(all_samples, 0)
grid = rearrange(grid, 'n b c h w -> (n h) (b w) c')
outputs.image(grid.cpu().numpy())
else:
# Stable unCLIP path: rows map to sampling runs, columns to the batch size.
samples = sample(
state["model"],
prompt,
n_runs=number_rows,
n_samples=number_cols,
H=H, W=W, C=C, f=f,
scale=scale,
ddim_steps=steps,
ddim_eta=eta,
callback=t_callback,
ucg_schedule=ucg_schedule,
negative_prompt=negative_prompt,
adm_cond=adm_cond, adm_uc=adm_uc,
use_full_precision=force_full_precision,
only_adm_cond=False
)
import sys
import torch
import numpy as np
import streamlit as st
from PIL import Image
from omegaconf import OmegaConf
from einops import repeat, rearrange
from pytorch_lightning import seed_everything
from imwatermark import WatermarkEncoder
from scripts.txt2img import put_watermark
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.models.diffusion.ddpm import LatentUpscaleDiffusion, LatentUpscaleFinetuneDiffusion
from ldm.util import exists, instantiate_from_config
# Gradients are switched off for the whole process at import time.
torch.set_grad_enabled(False)
@st.cache(allow_output_mutation=True)
def initialize_model(config, ckpt):
    """Build the model from ``config``, load ``ckpt`` and wrap it in a DDIM sampler.

    Args:
        config: Path of the config file; its ``model`` entry is instantiated.
        ckpt: Path of a checkpoint containing a ``"state_dict"`` entry, loaded
            non-strictly.

    Returns:
        A ``DDIMSampler`` around the model, which is moved to CUDA when
        available and to the CPU otherwise.
    """
    model_config = OmegaConf.load(config).model
    model = instantiate_from_config(model_config)

    weights = torch.load(ckpt)["state_dict"]
    model.load_state_dict(weights, strict=False)

    if torch.cuda.is_available():
        target = torch.device("cuda")
    else:
        target = torch.device("cpu")
    return DDIMSampler(model.to(target))
def make_batch_sd(
        image,
        txt,
        device,
        num_samples=1,
):
    """Build the sampling batch for the upscaling demo.

    Args:
        image: PIL image; converted to RGB and scaled with ``/ 127.5 - 1.0``.
        txt: Prompt text.
        device: Device the image tensor is moved to.
        num_samples: Number of copies of the image and the prompt.

    Returns:
        A dict with ``"lr"`` (the image rearranged from ``h w c`` to
        ``1 c h w`` and repeated ``num_samples`` times along the first
        dimension) and ``"txt"`` (a list with ``num_samples`` copies of ``txt``).
    """
    pixels = torch.from_numpy(np.array(image.convert("RGB")))
    pixels = pixels.to(dtype=torch.float32) / 127.5 - 1.0
    low_res = rearrange(pixels, 'h w c -> 1 c h w').to(device=device)
    return {
        "lr": repeat(low_res, "1 ... -> n ...", n=num_samples),
        "txt": num_samples * [txt],
    }
def make_noise_augmentation(model, batch, noise_level=None):
    """Run the model's low-scale stage on the low-resolution batch entry.

    Args:
        model: Provides ``low_scale_key`` (key into ``batch``) and the callable
            ``low_scale_model``.
        batch: Mapping holding the low-resolution tensor.
        noise_level: Forwarded unchanged to ``low_scale_model``.

    Returns:
        The pair ``(x_aug, noise_level)`` produced by ``low_scale_model``.
    """
    low_res = batch[model.low_scale_key]
    low_res = low_res.to(memory_format=torch.contiguous_format).float()
    augmented, level = model.low_scale_model(low_res, noise_level)
    return augmented, level
def paint(sampler, image, prompt, seed, scale, h, w, steps, num_samples=1, callback=None, eta=0., noise_level=None):
    """Upscale ``image`` with the sampler's model and return watermarked PIL images.

    The start noise has shape ``(num_samples, model.channels, h, w)`` and is
    drawn from a NumPy ``RandomState`` seeded with ``seed``. Conditioning
    depends on the model class:

    * ``LatentUpscaleFinetuneDiffusion``: the batch entries named by
      ``model.concat_keys`` (patch-reshuffled when
      ``model.reshuffle_patch_size`` is set) are concatenated along dim 1.
    * ``LatentUpscaleDiffusion``: the noise-augmented low-resolution image is
      used, with the noise level passed as ``c_adm``.

    NOTE(review): the source of this function had lost its indentation; the
    nesting below is a reconstruction -- confirm.

    Raises:
        NotImplementedError: For any other model class.

    Returns:
        A list with one watermarked ``PIL.Image`` per decoded sample.
    """
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    model = sampler.model
    seed_everything(seed)

    rng = np.random.RandomState(seed)
    noise = rng.randn(num_samples, model.channels, h, w)
    start_code = torch.from_numpy(noise).to(device=device, dtype=torch.float32)

    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
    watermark_text = "SDV2"
    wm_encoder = WatermarkEncoder()
    wm_encoder.set_watermark('bytes', watermark_text.encode('utf-8'))

    with torch.no_grad(), torch.autocast("cuda"):
        batch = make_batch_sd(image, txt=prompt, device=device, num_samples=num_samples)
        text_cond = model.cond_stage_model.encode(batch["txt"])

        if isinstance(model, LatentUpscaleFinetuneDiffusion):
            concat_parts = []
            for key in model.concat_keys:
                part = batch[key]
                patch = model.reshuffle_patch_size
                if exists(patch):
                    assert isinstance(model.reshuffle_patch_size, int)
                    part = rearrange(part, 'b c (p1 h) (p2 w) -> b (p1 p2 c) h w',
                                     p1=model.reshuffle_patch_size, p2=model.reshuffle_patch_size)
                concat_parts.append(part)
            c_cat = torch.cat(concat_parts, dim=1)
            cond = {"c_concat": [c_cat], "c_crossattn": [text_cond]}
            uc_cross = model.get_unconditional_conditioning(num_samples, "")
            uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]}
        elif isinstance(model, LatentUpscaleDiffusion):
            x_augment, noise_level = make_noise_augmentation(model, batch, noise_level)
            cond = {"c_concat": [x_augment], "c_crossattn": [text_cond], "c_adm": noise_level}
            uc_cross = model.get_unconditional_conditioning(num_samples, "")
            uc_full = {"c_concat": [x_augment], "c_crossattn": [uc_cross], "c_adm": noise_level}
        else:
            raise NotImplementedError()

        latent_shape = [model.channels, h, w]
        samples, _intermediates = sampler.sample(
            steps,
            num_samples,
            latent_shape,
            cond,
            verbose=False,
            eta=eta,
            unconditional_guidance_scale=scale,
            unconditional_conditioning=uc_full,
            x_T=start_code,
            callback=callback
        )

    # Decoding happens outside the autocast context.
    with torch.no_grad():
        decoded = model.decode_first_stage(samples)

    pixels = torch.clamp((decoded + 1.0) / 2.0, min=0.0, max=1.0)
    pixels = pixels.cpu().numpy().transpose(0, 2, 3, 1) * 255
    st.text(f"upscaled image shape: {pixels.shape}")
    return [put_watermark(Image.fromarray(frame.astype(np.uint8)), wm_encoder) for frame in pixels]
# Streamlit entry point for the upscaling demo.
# Builds the sampler from the config/checkpoint paths in sys.argv[1] and
# sys.argv[2], lets the user upload an image and set the sampling options, and
# calls paint() when the "Sample" button is pressed.
def run():
st.title("Stable Diffusion Upscaling")
# run via: streamlit run <this script> <path-to-config> <path-to-ckpt>
sampler = initialize_model(sys.argv[1], sys.argv[2])
image = st.file_uploader("Image", ["jpg", "png"])
if image:
image = Image.open(image)
w, h = image.size
st.text(f"loaded input image of size ({w}, {h})")
width, height = map(lambda x: x - x % 64, (w, h)) # resize to integer multiple of 64
image = image.resize((width, height))
st.text(f"resized input image to size ({width}, {height} (w, h))")
st.image(image)
st.write(f"\n Tip: Add a description of the object that should be upscaled, e.g.: 'a professional photograph of a cat'")
prompt = st.text_input("Prompt", "a high quality professional photograph")
seed = st.number_input("Seed", min_value=0, max_value=1000000, value=0)
num_samples = st.number_input("Number of Samples", min_value=1, max_value=64, value=1)
scale = st.slider("Scale", min_value=0.1, max_value=30.0, value=9.0, step=0.1)
steps = st.slider("DDIM Steps", min_value=2, max_value=250, value=50, step=1)
eta = st.sidebar.number_input("eta (DDIM)", value=0., min_value=0., max_value=1.)
# The noise-augmentation input is only offered for LatentUpscaleDiffusion models;
# the chosen level is turned into a long tensor with one entry per sample.
noise_level = None
if isinstance(sampler.model, LatentUpscaleDiffusion):
# TODO: make this work for all models
noise_level = st.sidebar.number_input("Noise Augmentation", min_value=0, max_value=350, value=20)
noise_level = torch.Tensor(num_samples * [noise_level]).to(sampler.model.device).long()
t_progress = st.progress(0)
# Progress callback: maps step t to a fraction of `steps`, capped at 1.
def t_callback(t):
t_progress.progress(min((t + 1) / steps, 1.))
sampler.make_schedule(steps, ddim_eta=eta, verbose=True)
if st.button("Sample"):
# The pixel size of the resized input is passed as h/w.
result = paint(
sampler=sampler,
image=image,
prompt=prompt,
seed=seed,
scale=scale,
h=height, w=width, steps=steps,
num_samples=num_samples,
callback=t_callback,
noise_level=noise_level,
eta=eta
)
st.write("Result")
# NOTE: the loop variable reuses (and overwrites) the name `image`.
for image in result:
st.image(image, output_format='PNG')
# Start the Streamlit upscaling UI when this file is executed as a script.
if __name__ == "__main__":
run()
import cv2
import fire
from imwatermark import WatermarkDecoder
def testit(img_path):
    """Decode the invisible watermark of an image file and print it.

    The image is read with OpenCV and decoded with a ``WatermarkDecoder``
    configured for a 136-bit ``'bytes'`` payload using the ``'dwtDct'``
    method. The payload is printed as UTF-8 text, or ``null`` when it is not
    valid UTF-8.

    Args:
        img_path: Path of the image to inspect.

    Raises:
        ValueError: If OpenCV cannot read an image from ``img_path``
            (``cv2.imread`` returns ``None`` in that case).
    """
    bgr = cv2.imread(img_path)
    if bgr is None:
        raise ValueError(f"could not read an image from {img_path!r}")
    decoder = WatermarkDecoder('bytes', 136)
    watermark = decoder.decode(bgr, 'dwtDct')
    try:
        dec = watermark.decode('utf-8')
    except UnicodeDecodeError:
        # Only an undecodable payload maps to "null"; other errors (and
        # KeyboardInterrupt) are no longer swallowed by a bare except.
        dec = "null"
    print(dec)
# Expose testit as a command-line interface via python-fire.
if __name__ == "__main__":
fire.Fire(testit)
\ No newline at end of file
import argparse, os
import cv2
import torch
import numpy as np
from omegaconf import OmegaConf
from PIL import Image
from tqdm import tqdm, trange
from itertools import islice
from einops import rearrange
from torchvision.utils import make_grid
from pytorch_lightning import seed_everything
from torch import autocast
from contextlib import nullcontext
from imwatermark import WatermarkEncoder
from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.models.diffusion.plms import PLMSSampler
from ldm.models.diffusion.dpm_solver import DPMSolverSampler
# Gradients are switched off for the whole process at import time.
torch.set_grad_enabled(False)
def chunk(it, size):
    """Lazily split an iterable into tuples of at most ``size`` items.

    The last tuple may be shorter. Iteration ends as soon as an empty tuple
    would be produced, so an empty input (or ``size == 0``) yields nothing.

    Args:
        it: Any iterable.
        size: Maximum number of items per tuple.

    Returns:
        An iterator over tuples.
    """
    source = iter(it)

    def batches():
        while True:
            group = tuple(islice(source, size))
            if group == ():
                return
            yield group

    return batches()
def load_model_from_config(config, ckpt, device=torch.device("cuda"), verbose=False):
    """Instantiate the model from ``config`` and load weights from ``ckpt``.

    Args:
        config: Config object whose ``model`` entry describes the model.
        ckpt: Path of a checkpoint containing a ``"state_dict"`` entry; it is
            loaded onto the CPU first and applied non-strictly.
        device: Must compare equal to ``torch.device("cuda")`` or
            ``torch.device("cpu")``. On the CPU path
            ``model.cond_stage_model.device`` is additionally set to ``"cpu"``.
        verbose: When true, missing and unexpected keys are printed.

    Returns:
        The model on the requested device, in eval mode.

    Raises:
        ValueError: For any other ``device``.
    """
    print(f"Loading model from {ckpt}")
    checkpoint = torch.load(ckpt, map_location="cpu")
    if "global_step" in checkpoint:
        print(f"Global Step: {checkpoint['global_step']}")

    model = instantiate_from_config(config.model)
    missing, unexpected = model.load_state_dict(checkpoint["state_dict"], strict=False)
    if verbose and len(missing) > 0:
        print("missing keys:")
        print(missing)
    if verbose and len(unexpected) > 0:
        print("unexpected keys:")
        print(unexpected)

    if device == torch.device("cuda"):
        model.cuda()
    elif device == torch.device("cpu"):
        model.cpu()
        model.cond_stage_model.device = "cpu"
    else:
        raise ValueError(f"Incorrect device name. Received: {device}")

    model.eval()
    return model
def parse_args():
    """Parse the txt2img command-line options from ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` holding all options; note that
        ``--from-file`` is exposed as ``from_file``.
    """
    parser = argparse.ArgumentParser()
    add = parser.add_argument

    # Prompt and output location.
    add("--prompt", type=str, nargs="?",
        default="a professional photograph of an astronaut riding a triceratops",
        help="the prompt to render")
    add("--outdir", type=str, nargs="?", default="outputs/txt2img-samples",
        help="dir to write results to")

    # Sampler selection and sampling parameters.
    add("--steps", type=int, default=50, help="number of ddim sampling steps")
    add("--plms", action='store_true', help="use plms sampling")
    add("--dpm", action='store_true', help="use DPM (2) sampler")
    add("--fixed_code", action='store_true',
        help="if enabled, uses the same starting code across all samples ")
    add("--ddim_eta", type=float, default=0.0,
        help="ddim eta (eta=0.0 corresponds to deterministic sampling")
    add("--n_iter", type=int, default=3, help="sample this often")

    # Image and latent geometry.
    add("--H", type=int, default=512, help="image height, in pixel space")
    add("--W", type=int, default=512, help="image width, in pixel space")
    add("--C", type=int, default=4, help="latent channels")
    add("--f", type=int, default=8, help="downsampling factor, most often 8 or 16")

    # Batch layout and guidance.
    add("--n_samples", type=int, default=3,
        help="how many samples to produce for each given prompt. A.k.a batch size")
    add("--n_rows", type=int, default=0, help="rows in the grid (default: n_samples)")
    add("--scale", type=float, default=9.0,
        help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))")

    # Inputs: prompt file, model config and checkpoint.
    add("--from-file", type=str,
        help="if specified, load prompts from this file, separated by newlines")
    add("--config", type=str, default="configs/stable-diffusion/v2-inference.yaml",
        help="path to config which constructs model")
    add("--ckpt", type=str, help="path to checkpoint of model")

    # Reproducibility, precision and device.
    add("--seed", type=int, default=42, help="the seed (for reproducible sampling)")
    add("--precision", type=str, choices=["full", "autocast"], default="autocast",
        help="evaluate at this precision")
    add("--repeat", type=int, default=1, help="repeat each prompt in file this often")
    add("--device", type=str, choices=["cpu", "cuda"], default="cpu",
        help="Device on which Stable Diffusion will be run")

    # Optional CPU optimisations.
    add("--torchscript", action='store_true', help="Use TorchScript")
    add("--ipex", action='store_true', help="Use Intel® Extension for PyTorch*")
    add("--bf16", action='store_true', help="Use bfloat16")

    return parser.parse_args()
def put_watermark(img, wm_encoder=None):
    """Embed an invisible watermark into ``img`` when an encoder is given.

    Args:
        img: Image to mark; converted to a NumPy array and from RGB to BGR
            channel order before encoding.
        wm_encoder: Encoder whose ``encode(array, 'dwtDct')`` is called, or
            ``None`` to skip watermarking.

    Returns:
        ``img`` unchanged when ``wm_encoder`` is ``None``; otherwise a new
        ``PIL.Image`` built from the encoded array with its last axis reversed.
    """
    if wm_encoder is None:
        return img
    bgr = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    marked = wm_encoder.encode(bgr, 'dwtDct')
    return Image.fromarray(marked[:, :, ::-1])
# txt2img entry point: load the model, pick a sampler, optionally prepare the
# TorchScript / IPEX optimised modules, then sample `opt.n_iter` times over all
# prompt batches, saving every image and a final grid under `opt.outdir`.
# `opt` is the namespace returned by parse_args().
# NOTE(review): the indentation of this function was lost in the source; the
# comments below describe individual statements and do not assert their nesting.
def main(opt):
seed_everything(opt.seed)
config = OmegaConf.load(f"{opt.config}")
device = torch.device("cuda") if opt.device == "cuda" else torch.device("cpu")
model = load_model_from_config(config, f"{opt.ckpt}", device)
# Sampler choice: --plms takes precedence over --dpm; DDIM is the fallback.
if opt.plms:
sampler = PLMSSampler(model, device=device)
elif opt.dpm:
sampler = DPMSolverSampler(model, device=device)
else:
sampler = DDIMSampler(model, device=device)
os.makedirs(opt.outdir, exist_ok=True)
outpath = opt.outdir
print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
wm = "SDV2"
wm_encoder = WatermarkEncoder()
wm_encoder.set_watermark('bytes', wm.encode('utf-8'))
batch_size = opt.n_samples
n_rows = opt.n_rows if opt.n_rows > 0 else batch_size
# Prompt data: either one batch repeating --prompt, or the lines of --from-file
# (each repeated opt.repeat times) grouped into tuples of batch_size.
if not opt.from_file:
prompt = opt.prompt
assert prompt is not None
data = [batch_size * [prompt]]
else:
print(f"reading prompts from {opt.from_file}")
with open(opt.from_file, "r") as f:
data = f.read().splitlines()
data = [p for p in data for i in range(opt.repeat)]
data = list(chunk(data, batch_size))
sample_path = os.path.join(outpath, "samples")
os.makedirs(sample_path, exist_ok=True)
sample_count = 0
# File numbering continues from what already exists in the output folders.
base_count = len(os.listdir(sample_path))
grid_count = len(os.listdir(outpath)) - 1
# With --fixed_code the same start noise is reused for every sampler call.
start_code = None
if opt.fixed_code:
start_code = torch.randn([opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f], device=device)
# Optional CPU optimisation path (TorchScript tracing and/or IPEX).
if opt.torchscript or opt.ipex:
transformer = model.cond_stage_model.model
unet = model.model.diffusion_model
decoder = model.first_stage_model.decoder
additional_context = torch.cpu.amp.autocast() if opt.bf16 else nullcontext()
shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
# NOTE(review): if this check sits inside the `opt.torchscript or opt.ipex`
# branch above, its condition can never be true there -- confirm the nesting.
if opt.bf16 and not opt.torchscript and not opt.ipex:
raise ValueError('Bfloat16 is supported only for torchscript+ipex')
if opt.bf16 and unet.dtype != torch.bfloat16:
raise ValueError("Use configs/stable-diffusion/intel/ configs with bf16 enabled if " +
"you'd like to use bfloat16 with CPU.")
if unet.dtype == torch.float16 and device == torch.device("cpu"):
raise ValueError("Use configs/stable-diffusion/intel/ configs for your model if you'd like to run it on CPU.")
if opt.ipex:
import intel_extension_for_pytorch as ipex
bf16_dtype = torch.bfloat16 if opt.bf16 else None
transformer = transformer.to(memory_format=torch.channels_last)
transformer = ipex.optimize(transformer, level="O1", inplace=True)
unet = unet.to(memory_format=torch.channels_last)
unet = ipex.optimize(unet, level="O1", auto_kernel_selection=True, inplace=True, dtype=bf16_dtype)
decoder = decoder.to(memory_format=torch.channels_last)
decoder = ipex.optimize(decoder, level="O1", auto_kernel_selection=True, inplace=True, dtype=bf16_dtype)
if opt.torchscript:
with torch.no_grad(), additional_context:
# get UNET scripted
if unet.use_checkpoint:
raise ValueError("Gradient checkpoint won't work with tracing. " +
"Use configs/stable-diffusion/intel/ configs for your model or disable checkpoint in your config.")
# Example inputs for tracing use fixed sizes (4x96x96 latents, 77x1024 context)
# that do not depend on opt.C / opt.H / opt.W.
img_in = torch.ones(2, 4, 96, 96, dtype=torch.float32)
t_in = torch.ones(2, dtype=torch.int64)
context = torch.ones(2, 77, 1024, dtype=torch.float32)
scripted_unet = torch.jit.trace(unet, (img_in, t_in, context))
scripted_unet = torch.jit.optimize_for_inference(scripted_unet)
print(type(scripted_unet))
model.model.scripted_diffusion_model = scripted_unet
# get Decoder for first stage model scripted
samples_ddim = torch.ones(1, 4, 96, 96, dtype=torch.float32)
scripted_decoder = torch.jit.trace(decoder, (samples_ddim))
scripted_decoder = torch.jit.optimize_for_inference(scripted_decoder)
print(type(scripted_decoder))
model.first_stage_model.decoder = scripted_decoder
# Warm-up: three short (S=5) sampling passes on the first prompt batch, then
# three decoder passes; their results are not saved.
prompts = data[0]
print("Running a forward pass to initialize optimizations")
uc = None
if opt.scale != 1.0:
uc = model.get_learned_conditioning(batch_size * [""])
if isinstance(prompts, tuple):
prompts = list(prompts)
with torch.no_grad(), additional_context:
for _ in range(3):
c = model.get_learned_conditioning(prompts)
samples_ddim, _ = sampler.sample(S=5,
conditioning=c,
batch_size=batch_size,
shape=shape,
verbose=False,
unconditional_guidance_scale=opt.scale,
unconditional_conditioning=uc,
eta=opt.ddim_eta,
x_T=start_code)
print("Running a forward pass for decoder")
for _ in range(3):
x_samples_ddim = model.decode_first_stage(samples_ddim)
# Main sampling: autocast is used for --precision autocast or --bf16,
# otherwise a no-op context.
precision_scope = autocast if opt.precision=="autocast" or opt.bf16 else nullcontext
with torch.no_grad(), \
precision_scope(opt.device), \
model.ema_scope():
all_samples = list()
for n in trange(opt.n_iter, desc="Sampling"):
for prompts in tqdm(data, desc="data"):
# The empty-prompt conditioning is only computed when scale != 1.0.
uc = None
if opt.scale != 1.0:
uc = model.get_learned_conditioning(batch_size * [""])
if isinstance(prompts, tuple):
prompts = list(prompts)
c = model.get_learned_conditioning(prompts)
shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
samples, _ = sampler.sample(S=opt.steps,
conditioning=c,
batch_size=opt.n_samples,
shape=shape,
verbose=False,
unconditional_guidance_scale=opt.scale,
unconditional_conditioning=uc,
eta=opt.ddim_eta,
x_T=start_code)
# Decode and map the result from [-1, 1] to [0, 1].
x_samples = model.decode_first_stage(samples)
x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)
# Save every image of the batch with a watermark.
for x_sample in x_samples:
x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
img = Image.fromarray(x_sample.astype(np.uint8))
img = put_watermark(img, wm_encoder)
img.save(os.path.join(sample_path, f"{base_count:05}.png"))
base_count += 1
sample_count += 1
all_samples.append(x_samples)
# additionally, save as grid
grid = torch.stack(all_samples, 0)
grid = rearrange(grid, 'n b c h w -> (n b) c h w')
grid = make_grid(grid, nrow=n_rows)
# to image
grid = 255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()
grid = Image.fromarray(grid.astype(np.uint8))
grid = put_watermark(grid, wm_encoder)
grid.save(os.path.join(outpath, f'grid-{grid_count:04}.png'))
grid_count += 1
print(f"Your samples are ready and waiting for you here: \n{outpath} \n"
f" \nEnjoy.")
# Command-line entry point: parse the options and run sampling.
if __name__ == "__main__":
opt = parse_args()
main(opt)
from setuptools import setup, find_packages
# Package metadata for setuptools: every package found by find_packages() is
# included, and only torch, numpy and tqdm are declared as install requirements.
setup(
name='stable-diffusion',
version='0.0.1',
description='',
packages=find_packages(),
install_requires=[
'torch',
'numpy',
'tqdm',
],
)
\ No newline at end of file
import torch
import time
import os
import pandas as pd
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler, EulerDiscreteScheduler, DDIMScheduler, DiffusionPipeline
# Timing script: generates one 512x512 image for each of the first 20 prompts of
# PartiPrompts.tsv with a diffusers StableDiffusionPipeline (fp16, CUDA, DPM-Solver
# multistep scheduler) and prints the wall-clock time per image.
# The model directory is a hard-coded absolute path; adjust it for your machine.
model_id = "/public/home/lijian/model/stable-diffusion-2-1-base/"
# Tab-separated prompt file, read relative to the working directory; only its
# 'Prompt' column is used.
text_file = "PartiPrompts.tsv"
df = pd.read_csv(text_file, sep='\t')
prompts = df['Prompt']
num_inference_steps = 50
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
# Replace the pipeline's scheduler, reusing the existing scheduler's config.
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")
base_count = 0
print("======================================start DPM ==================================")
for prompt in prompts:
start = time.time()
image = pipe(prompt, 512, 512, num_inference_steps=num_inference_steps, num_images_per_prompt=1).images[0]
# The index printed here is the value before the increment, while the file
# name below uses the value after it (so files start at 00001.png).
print(f"the {base_count} text-to-image use time {time.time()-start}")
base_count += 1
# Images are written to the current working directory.
image.save(f"{base_count:05}.png")
# Stop after 20 images.
if base_count == 20:
break
print(f"Your samples are ready and waiting for you here\n \n"
f" \nEnjoy.")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment