test commit

5a92374a · suily · 5a92374a · 5a92374a · 5a92374a · 5a92374a
Commit 5a92374a authored Feb 03, 2026 by suily
11 changed files
--- a/sat/sgm/modules/diffusionmodules/sampling.py
+++ b/sat/sgm/modules/diffusionmodules/sampling.py
+"""
+    Partially ported from https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/sampling.py
+"""
+
+
+from typing import Dict, Union
+
+import torch
+from omegaconf import ListConfig, OmegaConf
+from tqdm import tqdm
+
+from ...modules.diffusionmodules.sampling_utils import (
+    get_ancestral_step,
+    linear_multistep_coeff,
+    to_d,
+    to_neg_log_sigma,
+    to_sigma,
+)
+from ...modules.diffusionmodules.discretizer import generate_roughly_equally_spaced_steps
+from ...util import append_dims, default, instantiate_from_config
+
+DEFAULT_GUIDER = {"target": "sgm.modules.diffusionmodules.guiders.IdentityGuider"}
+
+def guidance_scale_embedding(w, embedding_dim=512, dtype=torch.float32):
+    """
+    See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
+
+    Args:
+        timesteps (`torch.Tensor`):
+            generate embedding vectors at these timesteps
+        embedding_dim (`int`, *optional*, defaults to 512):
+            dimension of the embeddings to generate
+        dtype:
+            data type of the generated embeddings
+
+    Returns:
+        `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
+    """
+    assert len(w.shape) == 1
+    w = w * 1000.0
+
+    half_dim = embedding_dim // 2
+    emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
+    emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb).to(w.device).to(w.dtype)
+    emb = w.to(dtype)[:, None] * emb[None, :]
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
+    if embedding_dim % 2 == 1:  # zero pad
+        emb = torch.nn.functional.pad(emb, (0, 1))
+    assert emb.shape == (w.shape[0], embedding_dim)
+    return emb
+
+class BaseDiffusionSampler:
+    def __init__(
+        self,
+        discretization_config: Union[Dict, ListConfig, OmegaConf],
+        num_steps: Union[int, None] = None,
+        guider_config: Union[Dict, ListConfig, OmegaConf, None] = None,
+        cfg_cond_scale: Union[int, None] = None,
+        cfg_cond_embed_dim: Union[int, None] = 256,
+        verbose: bool = False,
+        device: str = "cuda",
+    ):
+        self.num_steps = num_steps
+        self.discretization = instantiate_from_config(discretization_config)
+        self.guider = instantiate_from_config(
+            default(
+                guider_config,
+                DEFAULT_GUIDER,
+            )
+        )
+
+        # pd params
+        self.cfg_cond_scale = cfg_cond_scale
+        self.cfg_cond_embed_dim = cfg_cond_embed_dim
+        
+        self.verbose = verbose
+        self.device = device
+
+    def prepare_sampling_loop(self, x, cond, uc=None, num_steps=None):
+        sigmas = self.discretization(
+            self.num_steps if num_steps is None else num_steps, device=self.device
+        )
+        uc = default(uc, cond)
+
+        x *= torch.sqrt(1.0 + sigmas[0] ** 2.0)
+        num_sigmas = len(sigmas)
+
+        s_in = x.new_ones([x.shape[0]]).float()
+
+        return x, s_in, sigmas, num_sigmas, cond, uc
+
+    def denoise(self, x, denoiser, sigma, cond, uc):
+        if self.cfg_cond_scale is not None:
+            batch_size = x.shape[0]
+            scale_emb = guidance_scale_embedding(torch.ones(batch_size, device=x.device) * self.cfg_cond_scale, embedding_dim=self.cfg_cond_embed_dim, dtype=x.dtype)
+            denoised = denoiser(*self.guider.prepare_inputs(x, sigma, cond, uc), scale_emb=scale_emb)
+        else:
+            denoised = denoiser(*self.guider.prepare_inputs(x, sigma, cond, uc))
+        denoised = self.guider(denoised, sigma)
+        return denoised
+
+    def get_sigma_gen(self, num_sigmas):
+        sigma_generator = range(num_sigmas - 1)
+        if self.verbose:
+            print("#" * 30, " Sampling setting ", "#" * 30)
+            print(f"Sampler: {self.__class__.__name__}")
+            print(f"Discretization: {self.discretization.__class__.__name__}")
+            print(f"Guider: {self.guider.__class__.__name__}")
+            sigma_generator = tqdm(
+                sigma_generator,
+                total=num_sigmas,
+                desc=f"Sampling with {self.__class__.__name__} for {num_sigmas} steps",
+            )
+        return sigma_generator
+
+
+class SingleStepDiffusionSampler(BaseDiffusionSampler):
+    def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc, *args, **kwargs):
+        raise NotImplementedError
+
+    def euler_step(self, x, d, dt):
+        return x + dt * d
+
+
+class EDMSampler(SingleStepDiffusionSampler):
+    def __init__(
+        self, s_churn=0.0, s_tmin=0.0, s_tmax=float("inf"), s_noise=1.0, *args, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+
+        self.s_churn = s_churn
+        self.s_tmin = s_tmin
+        self.s_tmax = s_tmax
+        self.s_noise = s_noise
+
+    def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc=None, gamma=0.0):
+        sigma_hat = sigma * (gamma + 1.0)
+        if gamma > 0:
+            eps = torch.randn_like(x) * self.s_noise
+            x = x + eps * append_dims(sigma_hat**2 - sigma**2, x.ndim) ** 0.5
+
+        denoised = self.denoise(x, denoiser, sigma_hat, cond, uc)
+        d = to_d(x, sigma_hat, denoised)
+        dt = append_dims(next_sigma - sigma_hat, x.ndim)
+
+        euler_step = self.euler_step(x, d, dt)
+        x = self.possible_correction_step(
+            euler_step, x, d, dt, next_sigma, denoiser, cond, uc
+        )
+        return x
+
+    def __call__(self, denoiser, x, cond, uc=None, num_steps=None):
+        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
+            x, cond, uc, num_steps
+        )
+
+        for i in self.get_sigma_gen(num_sigmas):
+            gamma = (
+                min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1)
+                if self.s_tmin <= sigmas[i] <= self.s_tmax
+                else 0.0
+            )
+            x = self.sampler_step(
+                s_in * sigmas[i],
+                s_in * sigmas[i + 1],
+                denoiser,
+                x,
+                cond,
+                uc,
+                gamma,
+            )
+
+        return x
+
+
+class DDIMSampler(SingleStepDiffusionSampler):
+    def __init__(
+        self, s_noise=0.1, *args, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+
+        self.s_noise = s_noise
+
+    def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc=None, s_noise=0.0):
+
+        denoised = self.denoise(x, denoiser, sigma, cond, uc)
+        d = to_d(x, sigma, denoised)
+        dt = append_dims(next_sigma * (1 - s_noise**2)**0.5 - sigma, x.ndim)
+
+        euler_step = x + dt * d + s_noise * append_dims(next_sigma, x.ndim) * torch.randn_like(x)
+
+        x = self.possible_correction_step(
+            euler_step, x, d, dt, next_sigma, denoiser, cond, uc
+        )
+        return x
+
+    def __call__(self, denoiser, x, cond, uc=None, num_steps=None):
+        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
+            x, cond, uc, num_steps
+        )
+
+        for i in self.get_sigma_gen(num_sigmas):
+            x = self.sampler_step(
+                s_in * sigmas[i],
+                s_in * sigmas[i + 1],
+                denoiser,
+                x,
+                cond,
+                uc,
+                self.s_noise,
+            )
+
+        return x
+
+
+class AncestralSampler(SingleStepDiffusionSampler):
+    def __init__(self, eta=1.0, s_noise=1.0, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.eta = eta
+        self.s_noise = s_noise
+        self.noise_sampler = lambda x: torch.randn_like(x)
+
+    def ancestral_euler_step(self, x, denoised, sigma, sigma_down):
+        d = to_d(x, sigma, denoised)
+        dt = append_dims(sigma_down - sigma, x.ndim)
+
+        return self.euler_step(x, d, dt)
+
+    def ancestral_step(self, x, sigma, next_sigma, sigma_up):
+        x = torch.where(
+            append_dims(next_sigma, x.ndim) > 0.0,
+            x + self.noise_sampler(x) * self.s_noise * append_dims(sigma_up, x.ndim),
+            x,
+        )
+        return x
+
+    def __call__(self, denoiser, x, cond, uc=None, num_steps=None):
+        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
+            x, cond, uc, num_steps
+        )
+
+        for i in self.get_sigma_gen(num_sigmas):
+            x = self.sampler_step(
+                s_in * sigmas[i],
+                s_in * sigmas[i + 1],
+                denoiser,
+                x,
+                cond,
+                uc,
+            )
+
+        return x
+
+
+class LinearMultistepSampler(BaseDiffusionSampler):
+    def __init__(
+        self,
+        order=4,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+
+        self.order = order
+
+    def __call__(self, denoiser, x, cond, uc=None, num_steps=None, **kwargs):
+        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
+            x, cond, uc, num_steps
+        )
+
+        ds = []
+        sigmas_cpu = sigmas.detach().cpu().numpy()
+        for i in self.get_sigma_gen(num_sigmas):
+            sigma = s_in * sigmas[i]
+            denoised = denoiser(
+                *self.guider.prepare_inputs(x, sigma, cond, uc), **kwargs
+            )
+            denoised = self.guider(denoised, sigma)
+            d = to_d(x, sigma, denoised)
+            ds.append(d)
+            if len(ds) > self.order:
+                ds.pop(0)
+            cur_order = min(i + 1, self.order)
+            coeffs = [
+                linear_multistep_coeff(cur_order, sigmas_cpu, i, j)
+                for j in range(cur_order)
+            ]
+            x = x + sum(coeff * d for coeff, d in zip(coeffs, reversed(ds)))
+
+        return x
+
+
+class EulerEDMSampler(EDMSampler):
+    def possible_correction_step(
+        self, euler_step, x, d, dt, next_sigma, denoiser, cond, uc
+    ):
+        return euler_step
+
+
+class HeunEDMSampler(EDMSampler):
+    def possible_correction_step(
+        self, euler_step, x, d, dt, next_sigma, denoiser, cond, uc
+    ):
+        if torch.sum(next_sigma) < 1e-14:
+            # Save a network evaluation if all noise levels are 0
+            return euler_step
+        else:
+            denoised = self.denoise(euler_step, denoiser, next_sigma, cond, uc)
+            d_new = to_d(euler_step, next_sigma, denoised)
+            d_prime = (d + d_new) / 2.0
+
+            # apply correction if noise level is not 0
+            x = torch.where(
+                append_dims(next_sigma, x.ndim) > 0.0, x + d_prime * dt, euler_step
+            )
+            return x
+
+
+class EulerAncestralSampler(AncestralSampler):
+    def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc):
+        sigma_down, sigma_up = get_ancestral_step(sigma, next_sigma, eta=self.eta)
+        denoised = self.denoise(x, denoiser, sigma, cond, uc)
+        x = self.ancestral_euler_step(x, denoised, sigma, sigma_down)
+        x = self.ancestral_step(x, sigma, next_sigma, sigma_up)
+
+        return x
+
+
+class DPMPP2SAncestralSampler(AncestralSampler):
+    def get_variables(self, sigma, sigma_down):
+        t, t_next = [to_neg_log_sigma(s) for s in (sigma, sigma_down)]
+        h = t_next - t
+        s = t + 0.5 * h
+        return h, s, t, t_next
+
+    def get_mult(self, h, s, t, t_next):
+        mult1 = to_sigma(s) / to_sigma(t)
+        mult2 = (-0.5 * h).expm1()
+        mult3 = to_sigma(t_next) / to_sigma(t)
+        mult4 = (-h).expm1()
+
+        return mult1, mult2, mult3, mult4
+
+    def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc=None, **kwargs):
+        sigma_down, sigma_up = get_ancestral_step(sigma, next_sigma, eta=self.eta)
+        denoised = self.denoise(x, denoiser, sigma, cond, uc)
+        x_euler = self.ancestral_euler_step(x, denoised, sigma, sigma_down)
+
+        if torch.sum(sigma_down) < 1e-14:
+            # Save a network evaluation if all noise levels are 0
+            x = x_euler
+        else:
+            h, s, t, t_next = self.get_variables(sigma, sigma_down)
+            mult = [
+                append_dims(mult, x.ndim) for mult in self.get_mult(h, s, t, t_next)
+            ]
+
+            x2 = mult[0] * x - mult[1] * denoised
+            denoised2 = self.denoise(x2, denoiser, to_sigma(s), cond, uc)
+            x_dpmpp2s = mult[2] * x - mult[3] * denoised2
+
+            # apply correction if noise level is not 0
+            x = torch.where(append_dims(sigma_down, x.ndim) > 0.0, x_dpmpp2s, x_euler)
+
+        x = self.ancestral_step(x, sigma, next_sigma, sigma_up)
+        return x
+
+
+class DPMPP2MSampler(BaseDiffusionSampler):
+    def get_variables(self, sigma, next_sigma, previous_sigma=None):
+        t, t_next = [to_neg_log_sigma(s) for s in (sigma, next_sigma)]
+        h = t_next - t
+
+        if previous_sigma is not None:
+            h_last = t - to_neg_log_sigma(previous_sigma)
+            r = h_last / h
+            return h, r, t, t_next
+        else:
+            return h, None, t, t_next
+
+    def get_mult(self, h, r, t, t_next, previous_sigma):
+        mult1 = to_sigma(t_next) / to_sigma(t)
+        mult2 = (-h).expm1()
+
+        if previous_sigma is not None:
+            mult3 = 1 + 1 / (2 * r)
+            mult4 = 1 / (2 * r)
+            return mult1, mult2, mult3, mult4
+        else:
+            return mult1, mult2
+
+    def sampler_step(
+        self,
+        old_denoised,
+        previous_sigma,
+        sigma,
+        next_sigma,
+        denoiser,
+        x,
+        cond,
+        uc=None,
+    ):
+        denoised = self.denoise(x, denoiser, sigma, cond, uc)
+
+        h, r, t, t_next = self.get_variables(sigma, next_sigma, previous_sigma)
+        mult = [
+            append_dims(mult, x.ndim)
+            for mult in self.get_mult(h, r, t, t_next, previous_sigma)
+        ]
+
+        x_standard = mult[0] * x - mult[1] * denoised
+        if old_denoised is None or torch.sum(next_sigma) < 1e-14:
+            # Save a network evaluation if all noise levels are 0 or on the first step
+            return x_standard, denoised
+        else:
+            denoised_d = mult[2] * denoised - mult[3] * old_denoised
+            x_advanced = mult[0] * x - mult[1] * denoised_d
+
+            # apply correction if noise level is not 0 and not first step
+            x = torch.where(
+                append_dims(next_sigma, x.ndim) > 0.0, x_advanced, x_standard
+            )
+
+        return x, denoised
+
+    def __call__(self, denoiser, x, cond, uc=None, num_steps=None, **kwargs):
+        x, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
+            x, cond, uc, num_steps
+        )
+
+        old_denoised = None
+        for i in self.get_sigma_gen(num_sigmas):
+            x, old_denoised = self.sampler_step(
+                old_denoised,
+                None if i == 0 else s_in * sigmas[i - 1],
+                s_in * sigmas[i],
+                s_in * sigmas[i + 1],
+                denoiser,
+                x,
+                cond,
+                uc=uc,
+            )
+
+        return x
+
+    
+def relay_to_d(x, sigma, denoised, image, step, total_step):
+    blurring_d = (denoised - image) / total_step
+    blurring_denoised = image + blurring_d * step
+    d = (x - blurring_denoised) / append_dims(sigma, x.ndim)
+    return d, blurring_d
+    
+
+class LinearRelayEDMSampler(EulerEDMSampler):
+    def __init__(self, partial_num_steps=20, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.partial_num_steps = partial_num_steps
+
+    def __call__(self, denoiser, image, randn, cond, uc=None, num_steps=None):
+        randn_unit = randn.clone()
+        randn, s_in, sigmas, num_sigmas, cond, uc = self.prepare_sampling_loop(
+            randn, cond, uc, num_steps
+        )
+        x = None
+
+        for i in self.get_sigma_gen(num_sigmas):
+            if i < self.num_steps - self.partial_num_steps:
+                continue
+            if x is None:
+                x = image + randn_unit * append_dims(s_in * sigmas[i], len(randn_unit.shape))
+
+            gamma = (
+                min(self.s_churn / (num_sigmas - 1), 2**0.5 - 1)
+                if self.s_tmin <= sigmas[i] <= self.s_tmax
+                else 0.0
+            )
+            x = self.sampler_step(
+                s_in * sigmas[i],
+                s_in * sigmas[i + 1],
+                denoiser,
+                x,
+                cond,
+                uc,
+                gamma,
+                step=i - self.num_steps + self.partial_num_steps,
+                image=image,
+                index=self.num_steps - i,
+            )
+
+        return x
+
+    def euler_step(self, x, d, dt, blurring_d):
+        return x + dt * d + blurring_d
+
+    def sampler_step(self, sigma, next_sigma, denoiser, x, cond, uc=None, gamma=0.0, step=None, image=None, index=None):
+        sigma_hat = sigma * (gamma + 1.0)
+        if gamma > 0:
+            eps = torch.randn_like(x) * self.s_noise
+            x = x + eps * append_dims(sigma_hat**2 - sigma**2, x.ndim) ** 0.5
+
+        denoised = self.denoise(x, denoiser, sigma_hat, cond, uc)
+        beta_t = next_sigma / sigma_hat * index / self.partial_num_steps - (index - 1) / self.partial_num_steps
+        x = x * append_dims(next_sigma / sigma_hat, x.ndim) + denoised * append_dims(1 - next_sigma / sigma_hat + beta_t, x.ndim) - image * append_dims(beta_t, x.ndim)
+        return x
+    
+
+class ZeroSNRDDIMSampler(SingleStepDiffusionSampler):
+    def __init__(
+        self,
+        do_cfg=True,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.do_cfg = do_cfg
+
+    def prepare_sampling_loop(self, x, cond, uc=None, num_steps=None):
+        alpha_cumprod_sqrt, indices = self.discretization(
+            self.num_steps if num_steps is None else num_steps, device=self.device, return_idx=True
+        )
+        uc = default(uc, cond)
+
+        num_sigmas = len(alpha_cumprod_sqrt)
+
+        s_in = x.new_ones([x.shape[0]])
+
+        return x, s_in, alpha_cumprod_sqrt, num_sigmas, cond, uc, indices
+    
+    def denoise(self, x, denoiser, alpha_cumprod_sqrt, cond, uc, i=None, idx=None):
+        additional_model_inputs = {}
+        if self.do_cfg:
+            additional_model_inputs['idx'] = torch.cat([x.new_ones([x.shape[0]]) * idx] * 2)
+        else:
+            additional_model_inputs['idx'] = torch.cat([x.new_ones([x.shape[0]]) * idx])
+        denoised = denoiser(*self.guider.prepare_inputs(x, alpha_cumprod_sqrt, cond, uc), **additional_model_inputs)
+        denoised = self.guider(denoised, alpha_cumprod_sqrt, step=i, num_steps=self.num_steps)
+        return denoised
+
+    def sampler_step(self, alpha_cumprod_sqrt, next_alpha_cumprod_sqrt, denoiser, x, cond, uc=None, i=None, idx=None, return_denoised=False):
+        denoised = self.denoise(x, denoiser, alpha_cumprod_sqrt, cond, uc, i, idx).to(torch.float32)
+        if i == self.num_steps - 1:
+            if return_denoised:
+                return denoised, denoised
+            return denoised
+
+        a_t = ((1-next_alpha_cumprod_sqrt**2)/(1-alpha_cumprod_sqrt**2))**0.5
+        b_t = next_alpha_cumprod_sqrt - alpha_cumprod_sqrt * a_t
+
+        x = append_dims(a_t, x.ndim) * x + append_dims(b_t, x.ndim) * denoised
+        if return_denoised:
+            return x, denoised
+        return x
+
+    def __call__(self, denoiser, x, cond, uc=None, num_steps=None):
+        x, s_in, alpha_cumprod_sqrts, num_sigmas, cond, uc, indices = self.prepare_sampling_loop(
+            x, cond, uc, num_steps
+        )
+
+        for i in self.get_sigma_gen(num_sigmas):
+            x = self.sampler_step(
+                s_in * alpha_cumprod_sqrts[i],
+                s_in * alpha_cumprod_sqrts[i + 1],
+                denoiser,
+                x,
+                cond,
+                uc,
+                i=i,
+                idx=indices[self.num_steps-i-1],
+            )
+
+        return x
--- a/sat/sgm/modules/diffusionmodules/sampling_utils.py
+++ b/sat/sgm/modules/diffusionmodules/sampling_utils.py
+import math
+import torch
+from scipy import integrate
+
+from ...util import append_dims
+
+
+class NoDynamicThresholding:
+    def __call__(self, uncond, cond, scale, **kwargs):
+        return uncond + scale * (cond - uncond)
+
+class RescaleThresholding:
+    def __init__(self, phi=0.7):
+        self.phi = phi
+
+    def __call__(self, uncond, cond, scale, **kwargs):
+        denoised_cfg = uncond + scale * (cond - uncond)
+        sigma_pos, sigma_cfg = cond.std(), denoised_cfg.std()
+        factor = self.phi * sigma_pos / sigma_cfg + (1 - self.phi)
+        denoised_final = denoised_cfg * factor
+        return denoised_final
+    
+class DynamicThresholding:
+    Modes = ["Constant", "Linear Up", "Linear Down", "Half Cosine Up", "Half Cosine Down", "Power Up", "Power Down", "Cosine Down","Cosine Up"]
+    def __init__(self, interpret_mode, 
+                 scale_min = 3,
+                 mimic_interpret_mode = 'Constant',
+                 mimic_scale = 3, 
+                 mimic_scale_min = 3, 
+                 threshold_percentile = 1.0,
+                 phi = 1.0,
+                 separate_feature_channels = True,
+                 measure = 'AD',
+                 scaling_startpoint = 'ZERO',
+                 ):
+        assert interpret_mode in self.Modes
+        assert mimic_interpret_mode in self.Modes
+        assert measure in ['AD', 'STD']
+        assert scaling_startpoint in ['ZERO', 'MEAN']
+        self.mode = interpret_mode
+        self.mimic_mode = mimic_interpret_mode
+        self.scale_min = scale_min
+        self.mimic_scale = mimic_scale
+        self.mimic_scale_min = mimic_scale_min
+        self.threshold_percentile = threshold_percentile
+        self.phi = phi
+        self.separate_feature_channels = separate_feature_channels
+        self.measure = measure
+        self.scaling_startpoint = scaling_startpoint
+    
+    def interpret_scale(self, mode, scale, scale_min, step, num_steps):
+        """
+        num_steps = 50
+        step from 0 to 50.
+        """
+        scale -= scale_min
+        frac = step / num_steps
+        if mode == 'Constant':
+            pass
+        elif mode == "Linear Up":
+            scale *= frac
+        elif mode == "Linear Down":
+            scale *= 1.0 - frac
+        elif mode == "Half Cosine Up":
+            scale *= 1.0 - math.cos(frac)
+        elif mode == "Half Cosine Down":
+            scale *= math.cos(frac)
+        elif mode == "Cosine Down":
+            scale *= math.cos(frac * 1.5707)
+        elif mode == "Cosine Up":
+            scale *= 1.0 - math.cos(frac * 1.5707)
+        elif mode == "Power Up":
+            scale *= math.pow(frac, 2.0)
+        elif mode == "Power Down":
+            scale *= 1.0 - math.pow(frac, 2.0)
+        scale += scale_min
+        return scale
+    
+    def __call__(self, uncond, cond, scale, step, num_steps):
+        cfg_scale = self.interpret_scale(self.mode, scale, self.scale_min, step, num_steps)
+        mimic_cfg_scale = self.interpret_scale(self.mimic_mode, self.mimic_scale, self.mimic_scale_min, step, num_steps)
+
+        x = uncond + cfg_scale*(cond - uncond)
+        mimic_x = uncond + mimic_cfg_scale*(cond - uncond)  
+
+        x_flattened = x.flatten(2)
+        mimic_x_flattened = mimic_x.flatten(2)
+        
+        if self.scaling_startpoint == 'MEAN':
+            x_means = x_flattened.mean(dim=2, keepdim = True)
+            mimic_x_means = mimic_x_flattened.mean(dim=2, keepdim = True)
+            x_centered = x_flattened - x_means
+            mimic_x_centered = mimic_x_flattened - mimic_x_means
+        else:
+            x_centered = x_flattened
+            mimic_x_centered = mimic_x_flattened
+            
+        if self.separate_feature_channels:
+            if self.measure == 'AD':
+                x_scaleref = torch.quantile(x_centered.abs(), self.threshold_percentile, dim=2, keepdim = True)
+                mimic_x_scaleref = mimic_x_centered.abs().max(dim=2, keepdim = True).values
+            elif self.measure == 'STD':
+                x_scaleref = x_centered.std(dim=2, keepdim = True)
+                mimic_x_scaleref = mimic_x_centered.std(dim=2, keepdim = True)
+        else:
+            if self.measure == 'AD':
+                x_scaleref = torch.quantile(x_centered.abs(), self.threshold_percentile)
+                mimic_x_scaleref = mimic_x_centered.abs().max()
+            elif self.measure == 'STD':
+                x_scaleref = x_centered.std()
+                mimic_x_scaleref = mimic_x_centered.std()
+        
+        if self.measure == 'AD':
+            max_scaleref = torch.maximum(x_scaleref, mimic_x_scaleref)
+            x_clamped = x_centered.clamp(-max_scaleref, max_scaleref)
+            x_renormed = x_clamped * (mimic_x_scaleref / max_scaleref)
+        elif self.measure == 'STD':
+            x_renormed = x_centered * (mimic_x_scaleref / x_scaleref)
+        
+        if self.scaling_startpoint == 'MEAN':
+            x_dyn = x_means + x_renormed
+        else:
+            x_dyn = x_renormed
+        x_dyn = x_dyn.unflatten(2, x.shape[2:])
+        return self.phi*x_dyn + (1-self.phi)*x
+        
+
+def linear_multistep_coeff(order, t, i, j, epsrel=1e-4):
+    if order - 1 > i:
+        raise ValueError(f"Order {order} too high for step {i}")
+
+    def fn(tau):
+        prod = 1.0
+        for k in range(order):
+            if j == k:
+                continue
+            prod *= (tau - t[i - k]) / (t[i - j] - t[i - k])
+        return prod
+
+    return integrate.quad(fn, t[i], t[i + 1], epsrel=epsrel)[0]
+
+
+def get_ancestral_step(sigma_from, sigma_to, eta=1.0):
+    if not eta:
+        return sigma_to, 0.0
+    sigma_up = torch.minimum(
+        sigma_to,
+        eta
+        * (sigma_to**2 * (sigma_from**2 - sigma_to**2) / sigma_from**2) ** 0.5,
+    )
+    sigma_down = (sigma_to**2 - sigma_up**2) ** 0.5
+    return sigma_down, sigma_up
+
+
+def to_d(x, sigma, denoised):
+    return (x - denoised) / append_dims(sigma, x.ndim)
+
+
+def to_neg_log_sigma(sigma):
+    return sigma.log().neg()
+
+
+def to_sigma(neg_log_sigma):
+    return neg_log_sigma.neg().exp()
--- a/sat/sgm/modules/diffusionmodules/sigma_sampling.py
+++ b/sat/sgm/modules/diffusionmodules/sigma_sampling.py
+import torch
+
+from ...util import default, instantiate_from_config
+
+
+class EDMSampling:
+    def __init__(self, p_mean=-1.2, p_std=1.2):
+        self.p_mean = p_mean
+        self.p_std = p_std
+
+    def __call__(self, n_samples, rand=None):
+        log_sigma = self.p_mean + self.p_std * default(rand, torch.randn((n_samples,)))
+        return log_sigma.exp()
+
+
+class DiscreteSampling:
+    def __init__(self, discretization_config, num_idx, do_append_zero=False, flip=True, low_bound=0, up_bound=1):
+        self.num_idx = num_idx
+        self.sigmas = instantiate_from_config(discretization_config)(
+            num_idx, do_append_zero=do_append_zero, flip=flip
+        )
+        self.low_bound = int(low_bound * num_idx)
+        self.up_bound = int(up_bound * num_idx)
+        print(f'sigma sampling from {self.low_bound} to {self.up_bound}')
+
+    def idx_to_sigma(self, idx):
+        return self.sigmas[idx]
+
+    def __call__(self, n_samples, rand=None, return_idx=False):
+        idx = default(
+            rand,
+            torch.randint(self.low_bound, self.up_bound, (n_samples,)),
+        )
+        if return_idx:
+            return self.idx_to_sigma(idx), idx
+        else:
+            return self.idx_to_sigma(idx)
+
+class PartialDiscreteSampling:
+    def __init__(self, discretization_config, total_num_idx, partial_num_idx, do_append_zero=False, flip=True):
+        self.total_num_idx = total_num_idx
+        self.partial_num_idx = partial_num_idx
+        self.sigmas = instantiate_from_config(discretization_config)(
+            total_num_idx, do_append_zero=do_append_zero, flip=flip
+        )
+
+    def idx_to_sigma(self, idx):
+        return self.sigmas[idx]
+
+    def __call__(self, n_samples, rand=None):
+        idx = default(
+            rand,
+            # torch.randint(self.total_num_idx-self.partial_num_idx, self.total_num_idx, (n_samples,)),
+            torch.randint(0, self.partial_num_idx, (n_samples,)),
+        )
+        return self.idx_to_sigma(idx)
\ No newline at end of file
--- a/sat/sgm/modules/diffusionmodules/util.py
+++ b/sat/sgm/modules/diffusionmodules/util.py
+"""
+adopted from
+https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
+and
+https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
+and
+https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
+
+thanks!
+"""
+
+import math
+from typing import Optional
+
+import torch
+import torch.nn as nn
+from einops import rearrange, repeat
+
+
+def make_beta_schedule(
+    schedule,
+    n_timestep,
+    linear_start=1e-4,
+    linear_end=2e-2,
+):
+    if schedule == "linear":
+        betas = (
+            torch.linspace(
+                linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64
+            )
+            ** 2
+        )
+    return betas.numpy()
+
+
+def extract_into_tensor(a, t, x_shape):
+    b, *_ = t.shape
+    out = a.gather(-1, t)
+    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
+
+
+def mixed_checkpoint(func, inputs: dict, params, flag):
+    """
+    Evaluate a function without caching intermediate activations, allowing for
+    reduced memory at the expense of extra compute in the backward pass. This differs from the original checkpoint function
+    borrowed from https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py in that
+    it also works with non-tensor inputs
+    :param func: the function to evaluate.
+    :param inputs: the argument dictionary to pass to `func`.
+    :param params: a sequence of parameters `func` depends on but does not
+                   explicitly take as arguments.
+    :param flag: if False, disable gradient checkpointing.
+    """
+    if flag:
+        tensor_keys = [key for key in inputs if isinstance(inputs[key], torch.Tensor)]
+        tensor_inputs = [
+            inputs[key] for key in inputs if isinstance(inputs[key], torch.Tensor)
+        ]
+        non_tensor_keys = [
+            key for key in inputs if not isinstance(inputs[key], torch.Tensor)
+        ]
+        non_tensor_inputs = [
+            inputs[key] for key in inputs if not isinstance(inputs[key], torch.Tensor)
+        ]
+        args = tuple(tensor_inputs) + tuple(non_tensor_inputs) + tuple(params)
+        return MixedCheckpointFunction.apply(
+            func,
+            len(tensor_inputs),
+            len(non_tensor_inputs),
+            tensor_keys,
+            non_tensor_keys,
+            *args,
+        )
+    else:
+        return func(**inputs)
+
+
+class MixedCheckpointFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        run_function,
+        length_tensors,
+        length_non_tensors,
+        tensor_keys,
+        non_tensor_keys,
+        *args,
+    ):
+        ctx.end_tensors = length_tensors
+        ctx.end_non_tensors = length_tensors + length_non_tensors
+        ctx.gpu_autocast_kwargs = {
+            "enabled": torch.is_autocast_enabled(),
+            "dtype": torch.get_autocast_gpu_dtype(),
+            "cache_enabled": torch.is_autocast_cache_enabled(),
+        }
+        assert (
+            len(tensor_keys) == length_tensors
+            and len(non_tensor_keys) == length_non_tensors
+        )
+
+        ctx.input_tensors = {
+            key: val for (key, val) in zip(tensor_keys, list(args[: ctx.end_tensors]))
+        }
+        ctx.input_non_tensors = {
+            key: val
+            for (key, val) in zip(
+                non_tensor_keys, list(args[ctx.end_tensors : ctx.end_non_tensors])
+            )
+        }
+        ctx.run_function = run_function
+        ctx.input_params = list(args[ctx.end_non_tensors :])
+
+        with torch.no_grad():
+            output_tensors = ctx.run_function(
+                **ctx.input_tensors, **ctx.input_non_tensors
+            )
+        return output_tensors
+
+    @staticmethod
+    def backward(ctx, *output_grads):
+        # additional_args = {key: ctx.input_tensors[key] for key in ctx.input_tensors if not isinstance(ctx.input_tensors[key],torch.Tensor)}
+        ctx.input_tensors = {
+            key: ctx.input_tensors[key].detach().requires_grad_(True)
+            for key in ctx.input_tensors
+        }
+
+        with torch.enable_grad(), torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs):
+            # Fixes a bug where the first op in run_function modifies the
+            # Tensor storage in place, which is not allowed for detach()'d
+            # Tensors.
+            shallow_copies = {
+                key: ctx.input_tensors[key].view_as(ctx.input_tensors[key])
+                for key in ctx.input_tensors
+            }
+            # shallow_copies.update(additional_args)
+            output_tensors = ctx.run_function(**shallow_copies, **ctx.input_non_tensors)
+        input_grads = torch.autograd.grad(
+            output_tensors,
+            list(ctx.input_tensors.values()) + ctx.input_params,
+            output_grads,
+            allow_unused=True,
+        )
+        del ctx.input_tensors
+        del ctx.input_params
+        del output_tensors
+        return (
+            (None, None, None, None, None)
+            + input_grads[: ctx.end_tensors]
+            + (None,) * (ctx.end_non_tensors - ctx.end_tensors)
+            + input_grads[ctx.end_tensors :]
+        )
+
+
+def checkpoint(func, inputs, params, flag):
+    """
+    Evaluate a function without caching intermediate activations, allowing for
+    reduced memory at the expense of extra compute in the backward pass.
+    :param func: the function to evaluate.
+    :param inputs: the argument sequence to pass to `func`.
+    :param params: a sequence of parameters `func` depends on but does not
+                   explicitly take as arguments.
+    :param flag: if False, disable gradient checkpointing.
+    """
+    if flag:
+        args = tuple(inputs) + tuple(params)
+        return CheckpointFunction.apply(func, len(inputs), *args)
+    else:
+        return func(*inputs)
+
+
+class CheckpointFunction(torch.autograd.Function):
+    @staticmethod
+    def forward(ctx, run_function, length, *args):
+        ctx.run_function = run_function
+        ctx.input_tensors = list(args[:length])
+        ctx.input_params = list(args[length:])
+        ctx.gpu_autocast_kwargs = {
+            "enabled": torch.is_autocast_enabled(),
+            "dtype": torch.get_autocast_gpu_dtype(),
+            "cache_enabled": torch.is_autocast_cache_enabled(),
+        }
+        with torch.no_grad():
+            output_tensors = ctx.run_function(*ctx.input_tensors)
+        return output_tensors
+
+    @staticmethod
+    def backward(ctx, *output_grads):
+        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
+        with torch.enable_grad(), torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs):
+            # Fixes a bug where the first op in run_function modifies the
+            # Tensor storage in place, which is not allowed for detach()'d
+            # Tensors.
+            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
+            output_tensors = ctx.run_function(*shallow_copies)
+        input_grads = torch.autograd.grad(
+            output_tensors,
+            ctx.input_tensors + ctx.input_params,
+            output_grads,
+            allow_unused=True,
+        )
+        del ctx.input_tensors
+        del ctx.input_params
+        del output_tensors
+        return (None, None) + input_grads
+
+
+def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False, dtype=torch.float32):
+    """
+    Create sinusoidal timestep embeddings.
+    :param timesteps: a 1-D Tensor of N indices, one per batch element.
+                      These may be fractional.
+    :param dim: the dimension of the output.
+    :param max_period: controls the minimum frequency of the embeddings.
+    :return: an [N x dim] Tensor of positional embeddings.
+    """
+    if not repeat_only:
+        half = dim // 2
+        freqs = torch.exp(
+            -math.log(max_period)
+            * torch.arange(start=0, end=half, dtype=torch.float32)
+            / half
+        ).to(device=timesteps.device)
+        args = timesteps[:, None].float() * freqs[None]
+        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
+        if dim % 2:
+            embedding = torch.cat(
+                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
+            )
+    else:
+        embedding = repeat(timesteps, "b -> b d", d=dim)
+    return embedding.to(dtype)
+
+
+def zero_module(module):
+    """
+    Zero out the parameters of a module and return it.
+    """
+    for p in module.parameters():
+        p.detach().zero_()
+    return module
+
+
+def scale_module(module, scale):
+    """
+    Scale the parameters of a module and return it.
+    """
+    for p in module.parameters():
+        p.detach().mul_(scale)
+    return module
+
+
+def mean_flat(tensor):
+    """
+    Take the mean over all non-batch dimensions.
+    """
+    return tensor.mean(dim=list(range(1, len(tensor.shape))))
+
+
+def normalization(channels):
+    """
+    Make a standard normalization layer.
+    :param channels: number of input channels.
+    :return: an nn.Module for normalization.
+    """
+    return GroupNorm32(32, channels)
+
+
+# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
+class SiLU(nn.Module):
+    def forward(self, x):
+        return x * torch.sigmoid(x)
+
+
+class GroupNorm32(nn.GroupNorm):
+    def forward(self, x):
+        return super().forward(x).type(x.dtype)
+
+
+def conv_nd(dims, *args, **kwargs):
+    """
+    Create a 1D, 2D, or 3D convolution module.
+    """
+    if dims == 1:
+        return nn.Conv1d(*args, **kwargs)
+    elif dims == 2:
+        return nn.Conv2d(*args, **kwargs)
+    elif dims == 3:
+        return nn.Conv3d(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+
+
+def linear(*args, **kwargs):
+    """
+    Create a linear module.
+    """
+    return nn.Linear(*args, **kwargs)
+
+
+def avg_pool_nd(dims, *args, **kwargs):
+    """
+    Create a 1D, 2D, or 3D average pooling module.
+    """
+    if dims == 1:
+        return nn.AvgPool1d(*args, **kwargs)
+    elif dims == 2:
+        return nn.AvgPool2d(*args, **kwargs)
+    elif dims == 3:
+        return nn.AvgPool3d(*args, **kwargs)
+    raise ValueError(f"unsupported dimensions: {dims}")
+
+
+class AlphaBlender(nn.Module):
+    strategies = ["learned", "fixed", "learned_with_images"]
+
+    def __init__(
+        self,
+        alpha: float,
+        merge_strategy: str = "learned_with_images",
+        rearrange_pattern: str = "b t -> (b t) 1 1",
+    ):
+        super().__init__()
+        self.merge_strategy = merge_strategy
+        self.rearrange_pattern = rearrange_pattern
+
+        assert (
+            merge_strategy in self.strategies
+        ), f"merge_strategy needs to be in {self.strategies}"
+
+        if self.merge_strategy == "fixed":
+            self.register_buffer("mix_factor", torch.Tensor([alpha]))
+        elif (
+            self.merge_strategy == "learned"
+            or self.merge_strategy == "learned_with_images"
+        ):
+            self.register_parameter(
+                "mix_factor", torch.nn.Parameter(torch.Tensor([alpha]))
+            )
+        else:
+            raise ValueError(f"unknown merge strategy {self.merge_strategy}")
+
+    def get_alpha(self, image_only_indicator: torch.Tensor) -> torch.Tensor:
+        if self.merge_strategy == "fixed":
+            alpha = self.mix_factor
+        elif self.merge_strategy == "learned":
+            alpha = torch.sigmoid(self.mix_factor)
+        elif self.merge_strategy == "learned_with_images":
+            assert image_only_indicator is not None, "need image_only_indicator ..."
+            alpha = torch.where(
+                image_only_indicator.bool(),
+                torch.ones(1, 1, device=image_only_indicator.device),
+                rearrange(torch.sigmoid(self.mix_factor), "... -> ... 1"),
+            )
+            alpha = rearrange(alpha, self.rearrange_pattern)
+        else:
+            raise NotImplementedError
+        return alpha
+
+    def forward(
+        self,
+        x_spatial: torch.Tensor,
+        x_temporal: torch.Tensor,
+        image_only_indicator: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        alpha = self.get_alpha(image_only_indicator)
+        x = (
+            alpha.to(x_spatial.dtype) * x_spatial
+            + (1.0 - alpha).to(x_spatial.dtype) * x_temporal
+        )
+        return x
--- a/sat/sgm/modules/diffusionmodules/wrappers.py
+++ b/sat/sgm/modules/diffusionmodules/wrappers.py
+import torch
+import torch.nn as nn
+from packaging import version
+
+OPENAIUNETWRAPPER = "sgm.modules.diffusionmodules.wrappers.OpenAIWrapper"
+
+
+class IdentityWrapper(nn.Module):
+    def __init__(self, diffusion_model, compile_model: bool = False, dtype: torch.dtype = torch.float32):
+        super().__init__()
+        compile = (
+            torch.compile
+            if (version.parse(torch.__version__) >= version.parse("2.0.0"))
+            and compile_model
+            else lambda x: x
+        )
+        self.diffusion_model = compile(diffusion_model)
+        self.dtype = dtype
+
+    def forward(self, *args, **kwargs):
+        return self.diffusion_model(*args, **kwargs)
+
+
+class OpenAIWrapper(IdentityWrapper):
+    def forward(
+        self, x: torch.Tensor, t: torch.Tensor, c: dict, **kwargs
+    ) -> torch.Tensor:
+        for key in c:
+            c[key] = c[key].to(self.dtype)
+
+        if len(x.shape) == 3:
+            x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=-1)
+        else:
+            x = torch.cat((x, c.get("concat", torch.Tensor([]).type_as(x))), dim=1)
+
+        return self.diffusion_model(
+            x,
+            timesteps=t,
+            context=c.get("crossattn", None),
+            y=c.get("vector", None),
+            **kwargs,
+        )
--- a/sat/sgm/modules/distributions/__init__.py
+++ b/sat/sgm/modules/distributions/__init__.py
--- a/sat/sgm/modules/distributions/distributions.py
+++ b/sat/sgm/modules/distributions/distributions.py
+import numpy as np
+import torch
+
+
+class AbstractDistribution:
+    def sample(self):
+        raise NotImplementedError()
+
+    def mode(self):
+        raise NotImplementedError()
+
+
+class DiracDistribution(AbstractDistribution):
+    def __init__(self, value):
+        self.value = value
+
+    def sample(self):
+        return self.value
+
+    def mode(self):
+        return self.value
+
+
+class DiagonalGaussianDistribution(object):
+    def __init__(self, parameters, deterministic=False):
+        self.parameters = parameters
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+        self.deterministic = deterministic
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            self.var = self.std = torch.zeros_like(self.mean).to(
+                device=self.parameters.device
+            )
+
+    def sample(self):
+        # x = self.mean + self.std * torch.randn(self.mean.shape).to(
+        #     device=self.parameters.device
+        # )
+        x = self.mean + self.std * torch.randn_like(self.mean).to(
+            device=self.parameters.device
+        )
+        return x
+
+    def kl(self, other=None):
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        else:
+            if other is None:
+                return 0.5 * torch.sum(
+                    torch.pow(self.mean, 2) + self.var - 1.0 - self.logvar,
+                    dim=[1, 2, 3],
+                )
+            else:
+                return 0.5 * torch.sum(
+                    torch.pow(self.mean - other.mean, 2) / other.var
+                    + self.var / other.var
+                    - 1.0
+                    - self.logvar
+                    + other.logvar,
+                    dim=[1, 2, 3],
+                )
+
+    def nll(self, sample, dims=[1, 2, 3]):
+        if self.deterministic:
+            return torch.Tensor([0.0])
+        logtwopi = np.log(2.0 * np.pi)
+        return 0.5 * torch.sum(
+            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+            dim=dims,
+        )
+
+    def mode(self):
+        return self.mean
+
+
+def normal_kl(mean1, logvar1, mean2, logvar2):
+    """
+    source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12
+    Compute the KL divergence between two gaussians.
+    Shapes are automatically broadcasted, so batches can be compared to
+    scalars, among other use cases.
+    """
+    tensor = None
+    for obj in (mean1, logvar1, mean2, logvar2):
+        if isinstance(obj, torch.Tensor):
+            tensor = obj
+            break
+    assert tensor is not None, "at least one argument must be a Tensor"
+
+    # Force variances to be Tensors. Broadcasting helps convert scalars to
+    # Tensors, but it does not work for torch.exp().
+    logvar1, logvar2 = [
+        x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor)
+        for x in (logvar1, logvar2)
+    ]
+
+    return 0.5 * (
+        -1.0
+        + logvar2
+        - logvar1
+        + torch.exp(logvar1 - logvar2)
+        + ((mean1 - mean2) ** 2) * torch.exp(-logvar2)
+    )
--- a/sat/sgm/modules/ema.py
+++ b/sat/sgm/modules/ema.py
+import torch
+from torch import nn
+
+
+class LitEma(nn.Module):
+    def __init__(self, model, decay=0.9999, use_num_upates=True):
+        super().__init__()
+        if decay < 0.0 or decay > 1.0:
+            raise ValueError("Decay must be between 0 and 1")
+
+        self.m_name2s_name = {}
+        self.register_buffer("decay", torch.tensor(decay, dtype=torch.float32))
+        self.register_buffer(
+            "num_updates",
+            torch.tensor(0, dtype=torch.int)
+            if use_num_upates
+            else torch.tensor(-1, dtype=torch.int),
+        )
+
+        for name, p in model.named_parameters():
+            if p.requires_grad:
+                # remove as '.'-character is not allowed in buffers
+                s_name = name.replace(".", "")
+                self.m_name2s_name.update({name: s_name})
+                self.register_buffer(s_name, p.clone().detach().data)
+
+        self.collected_params = []
+
+    def reset_num_updates(self):
+        del self.num_updates
+        self.register_buffer("num_updates", torch.tensor(0, dtype=torch.int))
+
+    def forward(self, model):
+        decay = self.decay
+
+        if self.num_updates >= 0:
+            self.num_updates += 1
+            decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates))
+
+        one_minus_decay = 1.0 - decay
+
+        with torch.no_grad():
+            m_param = dict(model.named_parameters())
+            shadow_params = dict(self.named_buffers())
+
+            for key in m_param:
+                if m_param[key].requires_grad:
+                    sname = self.m_name2s_name[key]
+                    shadow_params[sname] = shadow_params[sname].type_as(m_param[key])
+                    shadow_params[sname].sub_(
+                        one_minus_decay * (shadow_params[sname] - m_param[key])
+                    )
+                else:
+                    assert not key in self.m_name2s_name
+
+    def copy_to(self, model):
+        m_param = dict(model.named_parameters())
+        shadow_params = dict(self.named_buffers())
+        for key in m_param:
+            if m_param[key].requires_grad:
+                m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data)
+            else:
+                assert not key in self.m_name2s_name
+
+    def store(self, parameters):
+        """
+        Save the current parameters for restoring later.
+        Args:
+          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+            temporarily stored.
+        """
+        self.collected_params = [param.clone() for param in parameters]
+
+    def restore(self, parameters):
+        """
+        Restore the parameters stored with the `store` method.
+        Useful to validate the model with EMA parameters without affecting the
+        original optimization process. Store the parameters before the
+        `copy_to` method. After validation (or model saving), use this to
+        restore the former parameters.
+        Args:
+          parameters: Iterable of `torch.nn.Parameter`; the parameters to be
+            updated with the stored parameters.
+        """
+        for c_param, param in zip(self.collected_params, parameters):
+            param.data.copy_(c_param.data)
--- a/sat/sgm/modules/encoders/__init__.py
+++ b/sat/sgm/modules/encoders/__init__.py
--- a/sat/sgm/modules/encoders/modules.py
+++ b/sat/sgm/modules/encoders/modules.py
+import math
+from contextlib import nullcontext
+from functools import partial
+from typing import Dict, List, Optional, Tuple, Union
+
+import kornia
+import numpy as np
+import open_clip
+import torch
+import torch.distributed
+import torch.nn as nn
+from einops import rearrange, repeat
+from omegaconf import ListConfig
+from torch.utils.checkpoint import checkpoint
+from transformers import (
+    ByT5Tokenizer,
+    CLIPTextModel,
+    CLIPTokenizer,
+    T5EncoderModel,
+    T5Tokenizer,
+    AutoModel,
+    AutoTokenizer
+)
+
+from ...modules.autoencoding.regularizers import DiagonalGaussianRegularizer
+from ...modules.diffusionmodules.model import Encoder
+from ...modules.diffusionmodules.openaimodel import Timestep
+from ...modules.diffusionmodules.util import extract_into_tensor, make_beta_schedule
+from ...modules.distributions.distributions import DiagonalGaussianDistribution
+from ...util import (
+    append_dims,
+    autocast,
+    count_params,
+    default,
+    disabled_train,
+    expand_dims_like,
+    instantiate_from_config,
+)
+
+
+class AbstractEmbModel(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self._is_trainable = None
+        self._ucg_rate = None
+        self._input_key = None
+
+    @property
+    def is_trainable(self) -> bool:
+        return self._is_trainable
+
+    @property
+    def ucg_rate(self) -> Union[float, torch.Tensor]:
+        return self._ucg_rate
+
+    @property
+    def input_key(self) -> str:
+        return self._input_key
+
+    @is_trainable.setter
+    def is_trainable(self, value: bool):
+        self._is_trainable = value
+
+    @ucg_rate.setter
+    def ucg_rate(self, value: Union[float, torch.Tensor]):
+        self._ucg_rate = value
+
+    @input_key.setter
+    def input_key(self, value: str):
+        self._input_key = value
+
+    @is_trainable.deleter
+    def is_trainable(self):
+        del self._is_trainable
+
+    @ucg_rate.deleter
+    def ucg_rate(self):
+        del self._ucg_rate
+
+    @input_key.deleter
+    def input_key(self):
+        del self._input_key
+
+
+class GeneralConditioner(nn.Module):
+    OUTPUT_DIM2KEYS = {2: "vector", 3: "crossattn", 4: "concat", 5: "concat"}
+    KEY2CATDIM = {"vector": 1, "crossattn": 2, "concat": 1}
+
+    def __init__(self, emb_models: Union[List, ListConfig], cor_embs=[], cor_p=[]):
+        super().__init__()
+        embedders = []
+        for n, embconfig in enumerate(emb_models):
+            embedder = instantiate_from_config(embconfig)
+            assert isinstance(
+                embedder, AbstractEmbModel
+            ), f"embedder model {embedder.__class__.__name__} has to inherit from AbstractEmbModel"
+            embedder.is_trainable = embconfig.get("is_trainable", False)
+            embedder.ucg_rate = embconfig.get("ucg_rate", 0.0)
+            if not embedder.is_trainable:
+                embedder.train = disabled_train
+                for param in embedder.parameters():
+                    param.requires_grad = False
+                embedder.eval()
+            # print(
+            #     f"Initialized embedder #{n}: {embedder.__class__.__name__} "
+            #     f"with {count_params(embedder, False)} params. Trainable: {embedder.is_trainable}"
+            # )
+
+            if "input_key" in embconfig:
+                embedder.input_key = embconfig["input_key"]
+            elif "input_keys" in embconfig:
+                embedder.input_keys = embconfig["input_keys"]
+            else:
+                raise KeyError(
+                    f"need either 'input_key' or 'input_keys' for embedder {embedder.__class__.__name__}"
+                )
+
+            embedder.legacy_ucg_val = embconfig.get("legacy_ucg_value", None)
+            if embedder.legacy_ucg_val is not None:
+                embedder.ucg_prng = np.random.RandomState()
+
+            embedders.append(embedder)
+        self.embedders = nn.ModuleList(embedders)
+
+        if len(cor_embs) > 0:
+            assert len(cor_p) == 2**len(cor_embs)
+        self.cor_embs = cor_embs
+        self.cor_p = cor_p
+
+    def possibly_get_ucg_val(self, embedder: AbstractEmbModel, batch: Dict) -> Dict:
+        assert embedder.legacy_ucg_val is not None
+        p = embedder.ucg_rate
+        val = embedder.legacy_ucg_val
+        for i in range(len(batch[embedder.input_key])):
+            if embedder.ucg_prng.choice(2, p=[1 - p, p]):
+                batch[embedder.input_key][i] = val
+        return batch
+    
+    def surely_get_ucg_val(self, embedder: AbstractEmbModel, batch: Dict, cond_or_not) -> Dict:
+        assert embedder.legacy_ucg_val is not None
+        val = embedder.legacy_ucg_val
+        for i in range(len(batch[embedder.input_key])):
+            if cond_or_not[i]:
+                batch[embedder.input_key][i] = val
+        return batch
+
+    def get_single_embedding(self, embedder, batch, output, cond_or_not: Optional[np.ndarray] = None, force_zero_embeddings: Optional[List] = None):
+        embedding_context = nullcontext if embedder.is_trainable else torch.no_grad
+        with embedding_context():
+            if hasattr(embedder, "input_key") and (embedder.input_key is not None):
+                if embedder.legacy_ucg_val is not None:
+                    if cond_or_not is None:
+                        batch = self.possibly_get_ucg_val(embedder, batch)
+                    else:
+                        batch = self.surely_get_ucg_val(embedder, batch, cond_or_not)
+                emb_out = embedder(batch[embedder.input_key])
+            elif hasattr(embedder, "input_keys"):
+                emb_out = embedder(*[batch[k] for k in embedder.input_keys])
+        assert isinstance(
+            emb_out, (torch.Tensor, list, tuple)
+        ), f"encoder outputs must be tensors or a sequence, but got {type(emb_out)}"
+        if not isinstance(emb_out, (list, tuple)):
+            emb_out = [emb_out]    
+        for emb in emb_out:
+            out_key = self.OUTPUT_DIM2KEYS[emb.dim()]
+            if embedder.ucg_rate > 0.0 and embedder.legacy_ucg_val is None:
+                if cond_or_not is None:
+                    emb = (
+                        expand_dims_like(
+                            torch.bernoulli(
+                                (1.0 - embedder.ucg_rate)
+                                * torch.ones(emb.shape[0], device=emb.device)
+                            ),
+                            emb,
+                        )
+                        * emb
+                    )
+                else:
+                    emb = (
+                        expand_dims_like(
+                            torch.tensor(1-cond_or_not, dtype=emb.dtype, device=emb.device),
+                            emb,
+                        )
+                        * emb
+                    )
+            if (
+                hasattr(embedder, "input_key")
+                and embedder.input_key in force_zero_embeddings
+            ):
+                emb = torch.zeros_like(emb)
+            if out_key in output:
+                output[out_key] = torch.cat(
+                    (output[out_key], emb), self.KEY2CATDIM[out_key]
+                )
+            else:
+                output[out_key] = emb
+        return output
+
+    def forward(
+        self, batch: Dict, force_zero_embeddings: Optional[List] = None
+    ) -> Dict:
+        output = dict()
+        if force_zero_embeddings is None:
+            force_zero_embeddings = []
+
+        if len(self.cor_embs) > 0:
+            batch_size = len(batch[list(batch.keys())[0]])
+            rand_idx = np.random.choice(len(self.cor_p), size=(batch_size,), p=self.cor_p)
+            for emb_idx in self.cor_embs:
+                cond_or_not = rand_idx % 2
+                rand_idx //= 2
+                embedder = self.embedders[emb_idx]
+                output = self.get_single_embedding(self.embedders[emb_idx], batch, output=output, cond_or_not=cond_or_not, force_zero_embeddings=force_zero_embeddings)
+
+        for i, embedder in enumerate(self.embedders):
+            if i in self.cor_embs:
+                continue
+            output = self.get_single_embedding(embedder, batch, output=output, force_zero_embeddings=force_zero_embeddings)
+        return output
+
+    def get_unconditional_conditioning(
+        self, batch_c, batch_uc=None, force_uc_zero_embeddings=None
+    ):
+        if force_uc_zero_embeddings is None:
+            force_uc_zero_embeddings = []
+        ucg_rates = list()
+        for embedder in self.embedders:
+            ucg_rates.append(embedder.ucg_rate)
+            embedder.ucg_rate = 0.0
+        cor_embs = self.cor_embs
+        cor_p = self.cor_p
+        self.cor_embs = []
+        self.cor_p = []
+
+        c = self(batch_c)
+        uc = self(batch_c if batch_uc is None else batch_uc, force_uc_zero_embeddings)
+
+        for embedder, rate in zip(self.embedders, ucg_rates):
+            embedder.ucg_rate = rate
+        self.cor_embs = cor_embs
+        self.cor_p = cor_p
+
+        return c, uc
+
+        
+class InceptionV3(nn.Module):
+    """Wrapper around the https://github.com/mseitzer/pytorch-fid inception
+    port with an additional squeeze at the end"""
+
+    def __init__(self, normalize_input=False, **kwargs):
+        super().__init__()
+        from pytorch_fid import inception
+
+        kwargs["resize_input"] = True
+        self.model = inception.InceptionV3(normalize_input=normalize_input, **kwargs)
+
+    def forward(self, inp):
+        # inp = kornia.geometry.resize(inp, (299, 299),
+        #                              interpolation='bicubic',
+        #                              align_corners=False,
+        #                              antialias=True)
+        # inp = inp.clamp(min=-1, max=1)
+
+        outp = self.model(inp)
+
+        if len(outp) == 1:
+            return outp[0].squeeze()
+
+        return outp
+
+
+class IdentityEncoder(AbstractEmbModel):
+    def encode(self, x):
+        return x
+
+    def forward(self, x):
+        return x
+
+
+class ClassEmbedder(AbstractEmbModel):
+    def __init__(self, embed_dim, n_classes=1000, add_sequence_dim=False):
+        super().__init__()
+        self.embedding = nn.Embedding(n_classes, embed_dim)
+        self.n_classes = n_classes
+        self.add_sequence_dim = add_sequence_dim
+
+    def forward(self, c):
+        c = self.embedding(c)
+        if self.add_sequence_dim:
+            c = c[:, None, :]
+        return c
+
+    def get_unconditional_conditioning(self, bs, device="cuda"):
+        uc_class = (
+            self.n_classes - 1
+        )  # 1000 classes --> 0 ... 999, one extra class for ucg (class 1000)
+        uc = torch.ones((bs,), device=device) * uc_class
+        uc = {self.key: uc.long()}
+        return uc
+
+
+class ClassEmbedderForMultiCond(ClassEmbedder):
+    def forward(self, batch, key=None, disable_dropout=False):
+        out = batch
+        key = default(key, self.key)
+        islist = isinstance(batch[key], list)
+        if islist:
+            batch[key] = batch[key][0]
+        c_out = super().forward(batch, key, disable_dropout)
+        out[key] = [c_out] if islist else c_out
+        return out
+
+
+class FrozenT5Embedder(AbstractEmbModel):
+    """Uses the T5 transformer encoder for text"""
+
+    def __init__(
+        self,
+        model_dir="google/t5-v1_1-xxl",
+        device="cuda",
+        max_length=77,
+        freeze=True,
+        cache_dir=None,
+    ):
+        super().__init__()
+        if model_dir is not "google/t5-v1_1-xxl":
+            self.tokenizer = T5Tokenizer.from_pretrained(model_dir)
+            self.transformer = T5EncoderModel.from_pretrained(model_dir)
+        else:
+            self.tokenizer = T5Tokenizer.from_pretrained(model_dir, cache_dir=cache_dir)
+            self.transformer = T5EncoderModel.from_pretrained(model_dir, cache_dir=cache_dir)
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+
+    def freeze(self):
+        self.transformer = self.transformer.eval()
+
+        for param in self.parameters():
+            param.requires_grad = False
+
+    # @autocast
+    def forward(self, text):
+        batch_encoding = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        tokens = batch_encoding["input_ids"].to(self.device)
+        with torch.autocast("cuda", enabled=False):
+            outputs = self.transformer(input_ids=tokens)
+        z = outputs.last_hidden_state
+        return z
+
+    def encode(self, text):
+        return self(text)
+
+
+class FrozenByT5Embedder(AbstractEmbModel):
+    """
+    Uses the ByT5 transformer encoder for text. Is character-aware.
+    """
+
+    def __init__(
+        self, version="google/byt5-base", device="cuda", max_length=77, freeze=True
+    ):  # others are google/t5-v1_1-xl and google/t5-v1_1-xxl
+        super().__init__()
+        self.tokenizer = ByT5Tokenizer.from_pretrained(version)
+        self.transformer = T5EncoderModel.from_pretrained(version)
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+
+    def freeze(self):
+        self.transformer = self.transformer.eval()
+
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def forward(self, text):
+        batch_encoding = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        tokens = batch_encoding["input_ids"].to(self.device)
+        with torch.autocast("cuda", enabled=False):
+            outputs = self.transformer(input_ids=tokens)
+        z = outputs.last_hidden_state
+        return z
+
+    def encode(self, text):
+        return self(text)
+
+
+class FrozenCLIPEmbedder(AbstractEmbModel):
+    """Uses the CLIP transformer encoder for text (from huggingface)"""
+
+    LAYERS = ["last", "pooled", "hidden"]
+
+    def __init__(
+        self,
+        version="openai/clip-vit-large-patch14",
+        device="cuda",
+        max_length=77,
+        freeze=True,
+        layer="last",
+        layer_idx=None,
+        always_return_pooled=False,
+    ):  # clip-vit-base-patch32
+        super().__init__()
+        assert layer in self.LAYERS
+        self.tokenizer = CLIPTokenizer.from_pretrained(version)
+        self.transformer = CLIPTextModel.from_pretrained(version)
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        self.layer_idx = layer_idx
+        self.return_pooled = always_return_pooled
+        if layer == "hidden":
+            assert layer_idx is not None
+            assert 0 <= abs(layer_idx) <= 12
+
+    def freeze(self):
+        self.transformer = self.transformer.eval()
+
+        for param in self.parameters():
+            param.requires_grad = False
+
+    @autocast
+    def forward(self, text):
+        batch_encoding = self.tokenizer(
+            text,
+            truncation=True,
+            max_length=self.max_length,
+            return_length=True,
+            return_overflowing_tokens=False,
+            padding="max_length",
+            return_tensors="pt",
+        )
+        tokens = batch_encoding["input_ids"].to(self.device)
+        outputs = self.transformer(
+            input_ids=tokens, output_hidden_states=self.layer == "hidden"
+        )
+        if self.layer == "last":
+            z = outputs.last_hidden_state
+        elif self.layer == "pooled":
+            z = outputs.pooler_output[:, None, :]
+        else:
+            z = outputs.hidden_states[self.layer_idx]
+        if self.return_pooled:
+            return z, outputs.pooler_output
+        return z
+
+    def encode(self, text):
+        return self(text)
+
+
+class FrozenOpenCLIPEmbedder2(AbstractEmbModel):
+    """
+    Uses the OpenCLIP transformer encoder for text
+    """
+
+    LAYERS = ["pooled", "last", "penultimate"]
+
+    def __init__(
+        self,
+        arch="ViT-H-14",
+        version="laion2b_s32b_b79k",
+        device="cuda",
+        max_length=77,
+        freeze=True,
+        layer="last",
+        always_return_pooled=False,
+        legacy=True,
+    ):
+        super().__init__()
+        assert layer in self.LAYERS
+        model, _, _ = open_clip.create_model_and_transforms(
+            arch,
+            device=torch.device("cpu"),
+        )
+        del model.visual
+        self.model = model
+
+        self.device = device
+        self.max_length = max_length
+        self.return_pooled = always_return_pooled
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        if self.layer == "last":
+            self.layer_idx = 0
+        elif self.layer == "penultimate":
+            self.layer_idx = 1
+        else:
+            raise NotImplementedError()
+        self.legacy = legacy
+
+    def freeze(self):
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    @autocast
+    def forward(self, text):
+        tokens = open_clip.tokenize(text)
+        z = self.encode_with_transformer(tokens.to(self.device))
+        if not self.return_pooled and self.legacy:
+            return z
+        if self.return_pooled:
+            assert not self.legacy
+            return z[self.layer], z["pooled"]
+        return z[self.layer]
+
+    def encode_with_transformer(self, text):
+        x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
+        x = x + self.model.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
+        if self.legacy:
+            x = x[self.layer]
+            x = self.model.ln_final(x)
+            return x
+        else:
+            # x is a dict and will stay a dict
+            o = x["last"]
+            o = self.model.ln_final(o)
+            pooled = self.pool(o, text)
+            x["pooled"] = pooled
+            return x
+
+    def pool(self, x, text):
+        # take features from the eot embedding (eot_token is the highest number in each sequence)
+        x = (
+            x[torch.arange(x.shape[0]), text.argmax(dim=-1)]
+            @ self.model.text_projection
+        )
+        return x
+
+    def text_transformer_forward(self, x: torch.Tensor, attn_mask=None):
+        outputs = {}
+        for i, r in enumerate(self.model.transformer.resblocks):
+            if i == len(self.model.transformer.resblocks) - 1:
+                outputs["penultimate"] = x.permute(1, 0, 2)  # LND -> NLD
+            if (
+                self.model.transformer.grad_checkpointing
+                and not torch.jit.is_scripting()
+            ):
+                x = checkpoint(r, x, attn_mask)
+            else:
+                x = r(x, attn_mask=attn_mask)
+        outputs["last"] = x.permute(1, 0, 2)  # LND -> NLD
+        return outputs
+
+    def encode(self, text):
+        return self(text)
+
+
+class FrozenOpenCLIPEmbedder(AbstractEmbModel):
+    LAYERS = [
+        # "pooled",
+        "last",
+        "penultimate",
+    ]
+
+    def __init__(
+        self,
+        arch="ViT-H-14",
+        version="laion2b_s32b_b79k",
+        device="cuda",
+        max_length=77,
+        freeze=True,
+        layer="last",
+    ):
+        super().__init__()
+        assert layer in self.LAYERS
+        model, _, _ = open_clip.create_model_and_transforms(
+            arch, device=torch.device("cpu"), pretrained=version
+        )
+        del model.visual
+        self.model = model
+
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        if self.layer == "last":
+            self.layer_idx = 0
+        elif self.layer == "penultimate":
+            self.layer_idx = 1
+        else:
+            raise NotImplementedError()
+
+    def freeze(self):
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    def forward(self, text):
+        tokens = open_clip.tokenize(text)
+        z = self.encode_with_transformer(tokens.to(self.device))
+        return z
+
+    def encode_with_transformer(self, text):
+        x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
+        x = x + self.model.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.model.ln_final(x)
+        return x
+
+    def text_transformer_forward(self, x: torch.Tensor, attn_mask=None):
+        for i, r in enumerate(self.model.transformer.resblocks):
+            if i == len(self.model.transformer.resblocks) - self.layer_idx:
+                break
+            if (
+                self.model.transformer.grad_checkpointing
+                and not torch.jit.is_scripting()
+            ):
+                x = checkpoint(r, x, attn_mask)
+            else:
+                x = r(x, attn_mask=attn_mask)
+        return x
+
+    def encode(self, text):
+        return self(text)
+
+
+class FrozenOpenCLIPImageEmbedder(AbstractEmbModel):
+    """
+    Uses the OpenCLIP vision transformer encoder for images
+    """
+
+    def __init__(
+        self,
+        arch="ViT-H-14",
+        version="laion2b_s32b_b79k",
+        device="cuda",
+        max_length=77,
+        freeze=True,
+        antialias=True,
+        ucg_rate=0.0,
+        unsqueeze_dim=False,
+        repeat_to_max_len=False,
+        num_image_crops=0,
+        output_tokens=False,
+    ):
+        super().__init__()
+        model, _, _ = open_clip.create_model_and_transforms(
+            arch,
+            device=torch.device("cpu"),
+            pretrained=version,
+        )
+        del model.transformer
+        self.model = model
+        self.max_crops = num_image_crops
+        self.pad_to_max_len = self.max_crops > 0
+        self.repeat_to_max_len = repeat_to_max_len and (not self.pad_to_max_len)
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+
+        self.antialias = antialias
+
+        self.register_buffer(
+            "mean", torch.Tensor([0.48145466, 0.4578275, 0.40821073]), persistent=False
+        )
+        self.register_buffer(
+            "std", torch.Tensor([0.26862954, 0.26130258, 0.27577711]), persistent=False
+        )
+        self.ucg_rate = ucg_rate
+        self.unsqueeze_dim = unsqueeze_dim
+        self.stored_batch = None
+        self.model.visual.output_tokens = output_tokens
+        self.output_tokens = output_tokens
+
+    def preprocess(self, x):
+        # normalize to [0,1]
+        x = kornia.geometry.resize(
+            x,
+            (224, 224),
+            interpolation="bicubic",
+            align_corners=True,
+            antialias=self.antialias,
+        )
+        x = (x + 1.0) / 2.0
+        # renormalize according to clip
+        x = kornia.enhance.normalize(x, self.mean, self.std)
+        return x
+
+    def freeze(self):
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+
+    @autocast
+    def forward(self, image, no_dropout=False):
+        z = self.encode_with_vision_transformer(image)
+        tokens = None
+        if self.output_tokens:
+            z, tokens = z[0], z[1]
+        z = z.to(image.dtype)
+        if self.ucg_rate > 0.0 and not no_dropout and not (self.max_crops > 0):
+            z = (
+                torch.bernoulli(
+                    (1.0 - self.ucg_rate) * torch.ones(z.shape[0], device=z.device)
+                )[:, None]
+                * z
+            )
+            if tokens is not None:
+                tokens = (
+                    expand_dims_like(
+                        torch.bernoulli(
+                            (1.0 - self.ucg_rate)
+                            * torch.ones(tokens.shape[0], device=tokens.device)
+                        ),
+                        tokens,
+                    )
+                    * tokens
+                )
+        if self.unsqueeze_dim:
+            z = z[:, None, :]
+        if self.output_tokens:
+            assert not self.repeat_to_max_len
+            assert not self.pad_to_max_len
+            return tokens, z
+        if self.repeat_to_max_len:
+            if z.dim() == 2:
+                z_ = z[:, None, :]
+            else:
+                z_ = z
+            return repeat(z_, "b 1 d -> b n d", n=self.max_length), z
+        elif self.pad_to_max_len:
+            assert z.dim() == 3
+            z_pad = torch.cat(
+                (
+                    z,
+                    torch.zeros(
+                        z.shape[0],
+                        self.max_length - z.shape[1],
+                        z.shape[2],
+                        device=z.device,
+                    ),
+                ),
+                1,
+            )
+            return z_pad, z_pad[:, 0, ...]
+        return z
+
+    def encode_with_vision_transformer(self, img):
+        # if self.max_crops > 0:
+        #    img = self.preprocess_by_cropping(img)
+        if img.dim() == 5:
+            assert self.max_crops == img.shape[1]
+            img = rearrange(img, "b n c h w -> (b n) c h w")
+        img = self.preprocess(img)
+        if not self.output_tokens:
+            assert not self.model.visual.output_tokens
+            x = self.model.visual(img)
+            tokens = None
+        else:
+            assert self.model.visual.output_tokens
+            x, tokens = self.model.visual(img)
+        if self.max_crops > 0:
+            x = rearrange(x, "(b n) d -> b n d", n=self.max_crops)
+            # drop out between 0 and all along the sequence axis
+            x = (
+                torch.bernoulli(
+                    (1.0 - self.ucg_rate)
+                    * torch.ones(x.shape[0], x.shape[1], 1, device=x.device)
+                )
+                * x
+            )
+            if tokens is not None:
+                tokens = rearrange(tokens, "(b n) t d -> b t (n d)", n=self.max_crops)
+                print(
+                    f"You are running very experimental token-concat in {self.__class__.__name__}. "
+                    f"Check what you are doing, and then remove this message."
+                )
+        if self.output_tokens:
+            return x, tokens
+        return x
+
+    def encode(self, text):
+        return self(text)
+
+
+class FrozenCLIPT5Encoder(AbstractEmbModel):
+    def __init__(
+        self,
+        clip_version="openai/clip-vit-large-patch14",
+        t5_version="google/t5-v1_1-xl",
+        device="cuda",
+        clip_max_length=77,
+        t5_max_length=77,
+    ):
+        super().__init__()
+        self.clip_encoder = FrozenCLIPEmbedder(
+            clip_version, device, max_length=clip_max_length
+        )
+        self.t5_encoder = FrozenT5Embedder(t5_version, device, max_length=t5_max_length)
+        print(
+            f"{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder) * 1.e-6:.2f} M parameters, "
+            f"{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder) * 1.e-6:.2f} M params."
+        )
+
+    def encode(self, text):
+        return self(text)
+
+    def forward(self, text):
+        clip_z = self.clip_encoder.encode(text)
+        t5_z = self.t5_encoder.encode(text)
+        return [clip_z, t5_z]
+
+
+class SpatialRescaler(nn.Module):
+    def __init__(
+        self,
+        n_stages=1,
+        method="bilinear",
+        multiplier=0.5,
+        in_channels=3,
+        out_channels=None,
+        bias=False,
+        wrap_video=False,
+        kernel_size=1,
+        remap_output=False,
+    ):
+        super().__init__()
+        self.n_stages = n_stages
+        assert self.n_stages >= 0
+        assert method in [
+            "nearest",
+            "linear",
+            "bilinear",
+            "trilinear",
+            "bicubic",
+            "area",
+        ]
+        self.multiplier = multiplier
+        self.interpolator = partial(torch.nn.functional.interpolate, mode=method)
+        self.remap_output = out_channels is not None or remap_output
+        if self.remap_output:
+            print(
+                f"Spatial Rescaler mapping from {in_channels} to {out_channels} channels after resizing."
+            )
+            self.channel_mapper = nn.Conv2d(
+                in_channels,
+                out_channels,
+                kernel_size=kernel_size,
+                bias=bias,
+                padding=kernel_size // 2,
+            )
+        self.wrap_video = wrap_video
+
+    def forward(self, x):
+        if self.wrap_video and x.ndim == 5:
+            B, C, T, H, W = x.shape
+            x = rearrange(x, "b c t h w -> b t c h w")
+            x = rearrange(x, "b t c h w -> (b t) c h w")
+
+        for stage in range(self.n_stages):
+            x = self.interpolator(x, scale_factor=self.multiplier)
+
+        if self.wrap_video:
+            x = rearrange(x, "(b t) c h w -> b t c h w", b=B, t=T, c=C)
+            x = rearrange(x, "b t c h w -> b c t h w")
+        if self.remap_output:
+            x = self.channel_mapper(x)
+        return x
+
+    def encode(self, x):
+        return self(x)
+
+
+class LowScaleEncoder(nn.Module):
+    def __init__(
+        self,
+        model_config,
+        linear_start,
+        linear_end,
+        timesteps=1000,
+        max_noise_level=250,
+        output_size=64,
+        scale_factor=1.0,
+    ):
+        super().__init__()
+        self.max_noise_level = max_noise_level
+        self.model = instantiate_from_config(model_config)
+        self.augmentation_schedule = self.register_schedule(
+            timesteps=timesteps, linear_start=linear_start, linear_end=linear_end
+        )
+        self.out_size = output_size
+        self.scale_factor = scale_factor
+
+    def register_schedule(
+        self,
+        beta_schedule="linear",
+        timesteps=1000,
+        linear_start=1e-4,
+        linear_end=2e-2,
+        cosine_s=8e-3,
+    ):
+        betas = make_beta_schedule(
+            beta_schedule,
+            timesteps,
+            linear_start=linear_start,
+            linear_end=linear_end,
+            cosine_s=cosine_s,
+        )
+        alphas = 1.0 - betas
+        alphas_cumprod = np.cumprod(alphas, axis=0)
+        alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1])
+
+        (timesteps,) = betas.shape
+        self.num_timesteps = int(timesteps)
+        self.linear_start = linear_start
+        self.linear_end = linear_end
+        assert (
+            alphas_cumprod.shape[0] == self.num_timesteps
+        ), "alphas have to be defined for each timestep"
+
+        to_torch = partial(torch.tensor, dtype=torch.float32)
+
+        self.register_buffer("betas", to_torch(betas))
+        self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod))
+        self.register_buffer("alphas_cumprod_prev", to_torch(alphas_cumprod_prev))
+
+        # calculations for diffusion q(x_t | x_{t-1}) and others
+        self.register_buffer("sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod)))
+        self.register_buffer(
+            "sqrt_one_minus_alphas_cumprod", to_torch(np.sqrt(1.0 - alphas_cumprod))
+        )
+        self.register_buffer(
+            "log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod))
+        )
+        self.register_buffer(
+            "sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod))
+        )
+        self.register_buffer(
+            "sqrt_recipm1_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod - 1))
+        )
+
+    def q_sample(self, x_start, t, noise=None):
+        noise = default(noise, lambda: torch.randn_like(x_start))
+        return (
+            extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start
+            + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape)
+            * noise
+        )
+
+    def forward(self, x):
+        z = self.model.encode(x)
+        if isinstance(z, DiagonalGaussianDistribution):
+            z = z.sample()
+        z = z * self.scale_factor
+        noise_level = torch.randint(
+            0, self.max_noise_level, (x.shape[0],), device=x.device
+        ).long()
+        z = self.q_sample(z, noise_level)
+        if self.out_size is not None:
+            z = torch.nn.functional.interpolate(z, size=self.out_size, mode="nearest")
+        # z = z.repeat_interleave(2, -2).repeat_interleave(2, -1)
+        return z, noise_level
+
+    def decode(self, z):
+        z = z / self.scale_factor
+        return self.model.decode(z)
+
+
+class ConcatTimestepEmbedderND(AbstractEmbModel):
+    """embeds each dimension independently and concatenates them"""
+
+    def __init__(self, outdim):
+        super().__init__()
+        self.timestep = Timestep(outdim)
+        self.outdim = outdim
+
+    def forward(self, x):
+        if x.ndim == 1:
+            x = x[:, None]
+        assert len(x.shape) == 2
+        b, dims = x.shape[0], x.shape[1]
+        x = rearrange(x, "b d -> (b d)")
+        emb = self.timestep(x)
+        emb = rearrange(emb, "(b d) d2 -> b (d d2)", b=b, d=dims, d2=self.outdim)
+        return emb
+
+
+class GaussianEncoder(Encoder, AbstractEmbModel):
+    def __init__(
+        self, weight: float = 1.0, flatten_output: bool = True, *args, **kwargs
+    ):
+        super().__init__(*args, **kwargs)
+        self.posterior = DiagonalGaussianRegularizer()
+        self.weight = weight
+        self.flatten_output = flatten_output
+
+    def forward(self, x) -> Tuple[Dict, torch.Tensor]:
+        z = super().forward(x)
+        z, log = self.posterior(z)
+        log["loss"] = log["kl_loss"]
+        log["weight"] = self.weight
+        if self.flatten_output:
+            z = rearrange(z, "b c h w -> b (h w ) c")
+        return log, z
+
+
+class FrozenOpenCLIPImagePredictionEmbedder(AbstractEmbModel):
+    def __init__(
+        self,
+        open_clip_embedding_config: Dict,
+        n_cond_frames: int,
+        n_copies: int,
+    ):
+        super().__init__()
+
+        self.n_cond_frames = n_cond_frames
+        self.n_copies = n_copies
+        self.open_clip = instantiate_from_config(open_clip_embedding_config)
+
+    def forward(self, vid):
+        vid = self.open_clip(vid)
+        vid = rearrange(vid, "(b t) d -> b t d", t=self.n_cond_frames)
+        vid = repeat(vid, "b t d -> (b s) t d", s=self.n_copies)
+
+        return vid
+
+
+class RawImageEmbedder(AbstractEmbModel):
+    """
+    original image as condition in Instructpix2pix
+    """
+    
+    def forward(self, image):
+        return image
\ No newline at end of file
--- a/sat/sgm/util.py
+++ b/sat/sgm/util.py
+import functools
+import importlib
+import os
+from functools import partial
+from inspect import isfunction
+
+import fsspec
+import numpy as np
+import torch
+from PIL import Image, ImageDraw, ImageFont
+from safetensors.torch import load_file as load_safetensors
+
+
+def disabled_train(self, mode=True):
+    """Overwrite model.train with this function to make sure train/eval mode
+    does not change anymore."""
+    return self
+
+
+def get_string_from_tuple(s):
+    try:
+        # Check if the string starts and ends with parentheses
+        if s[0] == "(" and s[-1] == ")":
+            # Convert the string to a tuple
+            t = eval(s)
+            # Check if the type of t is tuple
+            if type(t) == tuple:
+                return t[0]
+            else:
+                pass
+    except:
+        pass
+    return s
+
+
+def is_power_of_two(n):
+    """
+    chat.openai.com/chat
+    Return True if n is a power of 2, otherwise return False.
+
+    The function is_power_of_two takes an integer n as input and returns True if n is a power of 2, otherwise it returns False.
+    The function works by first checking if n is less than or equal to 0. If n is less than or equal to 0, it can't be a power of 2, so the function returns False.
+    If n is greater than 0, the function checks whether n is a power of 2 by using a bitwise AND operation between n and n-1. If n is a power of 2, then it will have only one bit set to 1 in its binary representation. When we subtract 1 from a power of 2, all the bits to the right of that bit become 1, and the bit itself becomes 0. So, when we perform a bitwise AND between n and n-1, we get 0 if n is a power of 2, and a non-zero value otherwise.
+    Thus, if the result of the bitwise AND operation is 0, then n is a power of 2 and the function returns True. Otherwise, the function returns False.
+
+    """
+    if n <= 0:
+        return False
+    return (n & (n - 1)) == 0
+
+
+def autocast(f, enabled=True):
+    def do_autocast(*args, **kwargs):
+        with torch.cuda.amp.autocast(
+            enabled=enabled,
+            dtype=torch.get_autocast_gpu_dtype(),
+            cache_enabled=torch.is_autocast_cache_enabled(),
+        ):
+            return f(*args, **kwargs)
+
+    return do_autocast
+
+
+def load_partial_from_config(config):
+    return partial(get_obj_from_str(config["target"]), **config.get("params", dict()))
+
+
+def log_txt_as_img(wh, xc, size=10):
+    # wh a tuple of (width, height)
+    # xc a list of captions to plot
+    b = len(xc)
+    txts = list()
+    for bi in range(b):
+        txt = Image.new("RGB", wh, color="white")
+        draw = ImageDraw.Draw(txt)
+        font = ImageFont.truetype("data/DejaVuSans.ttf", size=size)
+        nc = int(40 * (wh[0] / 256))
+        if isinstance(xc[bi], list):
+            text_seq = xc[bi][0]
+        else:
+            text_seq = xc[bi]
+        lines = "\n".join(
+            text_seq[start : start + nc] for start in range(0, len(text_seq), nc)
+        )
+
+        try:
+            draw.text((0, 0), lines, fill="black", font=font)
+        except UnicodeEncodeError:
+            print("Cant encode string for logging. Skipping.")
+
+        txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0
+        txts.append(txt)
+    txts = np.stack(txts)
+    txts = torch.tensor(txts)
+    return txts
+
+
+def partialclass(cls, *args, **kwargs):
+    class NewCls(cls):
+        __init__ = functools.partialmethod(cls.__init__, *args, **kwargs)
+
+    return NewCls
+
+
+def make_path_absolute(path):
+    fs, p = fsspec.core.url_to_fs(path)
+    if fs.protocol == "file":
+        return os.path.abspath(p)
+    return path
+
+
+def ismap(x):
+    if not isinstance(x, torch.Tensor):
+        return False
+    return (len(x.shape) == 4) and (x.shape[1] > 3)
+
+
+def isimage(x):
+    if not isinstance(x, torch.Tensor):
+        return False
+    return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1)
+
+
+def isheatmap(x):
+    if not isinstance(x, torch.Tensor):
+        return False
+
+    return x.ndim == 2
+
+
+def isneighbors(x):
+    if not isinstance(x, torch.Tensor):
+        return False
+    return x.ndim == 5 and (x.shape[2] == 3 or x.shape[2] == 1)
+
+
+def exists(x):
+    return x is not None
+
+
+def expand_dims_like(x, y):
+    while x.dim() != y.dim():
+        x = x.unsqueeze(-1)
+    return x
+
+
+def default(val, d):
+    if exists(val):
+        return val
+    return d() if isfunction(d) else d
+
+
+def mean_flat(tensor):
+    """
+    https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86
+    Take the mean over all non-batch dimensions.
+    """
+    return tensor.mean(dim=list(range(1, len(tensor.shape))))
+
+
+def count_params(model, verbose=False):
+    total_params = sum(p.numel() for p in model.parameters())
+    if verbose:
+        print(f"{model.__class__.__name__} has {total_params * 1.e-6:.2f} M params.")
+    return total_params
+
+
+def instantiate_from_config(config, **extra_kwargs):
+    if not "target" in config:
+        if config == "__is_first_stage__":
+            return None
+        elif config == "__is_unconditional__":
+            return None
+        raise KeyError("Expected key `target` to instantiate.")
+    return get_obj_from_str(config["target"])(**config.get("params", dict()), **extra_kwargs)
+
+
+def get_obj_from_str(string, reload=False, invalidate_cache=True):
+    module, cls = string.rsplit(".", 1)
+    if invalidate_cache:
+        importlib.invalidate_caches()
+    if reload:
+        module_imp = importlib.import_module(module)
+        importlib.reload(module_imp)
+    return getattr(importlib.import_module(module, package=None), cls)
+
+
+def append_zero(x):
+    return torch.cat([x, x.new_zeros([1])])
+
+
+def append_dims(x, target_dims):
+    """Appends dimensions to the end of a tensor until it has target_dims dimensions."""
+    dims_to_append = target_dims - x.ndim
+    if dims_to_append < 0:
+        raise ValueError(
+            f"input has {x.ndim} dims but target_dims is {target_dims}, which is less"
+        )
+    return x[(...,) + (None,) * dims_to_append]
+
+
+def load_model_from_config(config, ckpt, verbose=True, freeze=True):
+    print(f"Loading model from {ckpt}")
+    if ckpt.endswith("ckpt"):
+        pl_sd = torch.load(ckpt, map_location="cpu")
+        if "global_step" in pl_sd:
+            print(f"Global Step: {pl_sd['global_step']}")
+        sd = pl_sd["state_dict"]
+    elif ckpt.endswith("safetensors"):
+        sd = load_safetensors(ckpt)
+    else:
+        raise NotImplementedError
+
+    model = instantiate_from_config(config.model)
+
+    m, u = model.load_state_dict(sd, strict=False)
+
+    if len(m) > 0 and verbose:
+        print("missing keys:")
+        print(m)
+    if len(u) > 0 and verbose:
+        print("unexpected keys:")
+        print(u)
+
+    if freeze:
+        for param in model.parameters():
+            param.requires_grad = False
+
+    model.eval()
+    return model
+
+
+def get_configs_path() -> str:
+    """
+    Get the `configs` directory.
+    For a working copy, this is the one in the root of the repository,
+    but for an installed copy, it's in the `sgm` package (see pyproject.toml).
+    """
+    this_dir = os.path.dirname(__file__)
+    candidates = (
+        os.path.join(this_dir, "configs"),
+        os.path.join(this_dir, "..", "configs"),
+    )
+    for candidate in candidates:
+        candidate = os.path.abspath(candidate)
+        if os.path.isdir(candidate):
+            return candidate
+    raise FileNotFoundError(f"Could not find SGM configs in {candidates}")