i2vgen-xl

1ad55bb4 · mashun1 · 1ad55bb4 · 1ad55bb4 · 1ad55bb4 · 1ad55bb4
Commit 1ad55bb4 authored Mar 15, 2024 by mashun1
20 changed files
--- a/tools/inferences/inference_higen_entrance.py
+++ b/tools/inferences/inference_higen_entrance.py
+'''
+/* 
+*Copyright (c) 2021, Alibaba Group;
+*Licensed under the Apache License, Version 2.0 (the "License");
+*you may not use this file except in compliance with the License.
+*You may obtain a copy of the License at
+*   http://www.apache.org/licenses/LICENSE-2.0
+*Unless required by applicable law or agreed to in writing, software
+*distributed under the License is distributed on an "AS IS" BASIS,
+*WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*See the License for the specific language governing permissions and
+*limitations under the License.
+*/
+'''
+import os
+import re
+import os.path as osp
+import sys
+sys.path.insert(0, '/'.join(osp.realpath(__file__).split('/')[:-4]))
+import json
+import math
+import torch
+import pynvml
+import logging
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+import torch.cuda.amp as amp
+from importlib import reload
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from einops import rearrange
+import torchvision.transforms as T
+import torchvision.transforms.functional as TF
+from torch.nn.parallel import DistributedDataParallel
+import utils.transforms as data
+from ..modules.config import cfg
+from utils.seed import setup_seed
+from utils.multi_port import find_free_port
+from utils.assign_cfg import assign_signle_cfg
+from utils.distributed import generalized_all_gather, all_reduce
+from utils.video_op import save_i2vgen_video, save_t2vhigen_video_safe
+from utils.registry_class import INFER_ENGINE, MODEL, EMBEDDER, AUTO_ENCODER, DIFFUSION
+@INFER_ENGINE.register_function()
+def inference_higen_entrance(cfg_update, **kwargs):
+    '''
+    Entry point for HiGen inference.
+
+    Merges ``cfg_update`` into the global ``cfg``, fills in ``MASTER_ADDR`` /
+    ``MASTER_PORT`` when ``MASTER_ADDR`` is not set, derives the world size and
+    then either runs ``worker`` in-process (world size 1) or spawns one
+    ``worker`` process per local GPU.
+
+    Args:
+        cfg_update: mapping of config overrides. A dict value whose key already
+            exists in ``cfg`` is merged with ``update``; any other value
+            replaces the existing entry.
+        **kwargs: accepted but not used here.
+
+    Returns:
+        The global ``cfg`` object after the overrides have been applied.
+    '''
+    for k, v in cfg_update.items():
+        if isinstance(v, dict) and k in cfg:
+            cfg[k].update(v)
+        else:
+            cfg[k] = v
+    if 'MASTER_ADDR' not in os.environ:
+        os.environ['MASTER_ADDR'] = 'localhost'
+        # os.environ only accepts str values, so coerce the port explicitly.
+        os.environ['MASTER_PORT'] = str(find_free_port())
+    cfg.pmi_rank = int(os.getenv('RANK', 0))
+    cfg.pmi_world_size = int(os.getenv('WORLD_SIZE', 1))
+    if cfg.debug:
+        # Debug mode always runs a single in-process worker.
+        cfg.gpus_per_machine = 1
+        cfg.world_size = 1
+    else:
+        cfg.gpus_per_machine = torch.cuda.device_count()
+        cfg.world_size = cfg.pmi_world_size * cfg.gpus_per_machine
+    if cfg.world_size == 1:
+        worker(0, cfg, cfg_update)
+    else:
+        mp.spawn(worker, nprocs=cfg.gpus_per_machine, args=(cfg, cfg_update))
+    return cfg
+def worker(gpu, cfg, cfg_update):
+    '''
+    Inference worker for each gpu (HiGen text-to-video).
+
+    Builds the diffusion, text embedder, autoencoder and UNet from ``cfg`` and
+    then, for every caption in ``cfg.test_list_path``, calls
+    ``diffusion.ddim_sample_loop`` twice: first on single-frame noise, then on
+    ``cfg.max_frames`` frames of noise with the first result passed as
+    ``spat_prior``. The second result is decoded by the autoencoder and saved
+    as an mp4 under ``cfg.log_dir``.
+
+    Each line of the test list is ``caption`` or ``caption|seed``. Lines that
+    start with ``#`` and empty captions are skipped.
+
+    Args:
+        gpu: local device index of this worker.
+        cfg: base config object, refined via ``assign_signle_cfg``.
+        cfg_update: config overrides merged on top of ``cfg``.
+    '''
+    cfg = assign_signle_cfg(cfg, cfg_update, 'vldm_cfg')
+    for k, v in cfg_update.items():
+        if isinstance(v, dict) and k in cfg:
+            cfg[k].update(v)
+        else:
+            cfg[k] = v
+    cfg.gpu = gpu
+    cfg.seed = int(cfg.seed)
+    cfg.rank = cfg.pmi_rank * cfg.gpus_per_machine + gpu
+    setup_seed(cfg.seed + cfg.rank)
+    if not cfg.debug:
+        torch.cuda.set_device(gpu)
+        torch.backends.cudnn.benchmark = True
+        dist.init_process_group(backend='nccl', world_size=cfg.world_size, rank=cfg.rank)
+    # [Log] Save logging and make log dir
+    # NOTE(review): the gathered value is not used below; every rank keeps its
+    # own cfg.log_dir. The call is kept unchanged because it is presumably a
+    # collective operation -- confirm before removing.
+    log_dir = generalized_all_gather(cfg.log_dir)[0]
+    exp_name = osp.basename(cfg.test_list_path).split('.')[0]
+    cfg.log_dir = osp.join(cfg.log_dir, '%s' % (exp_name))
+    os.makedirs(cfg.log_dir, exist_ok=True)
+    log_file = osp.join(cfg.log_dir, 'log_%02d.txt' % (cfg.rank))
+    cfg.log_file = log_file
+    # Reload logging so basicConfig takes effect even if it was configured before.
+    reload(logging)
+    logging.basicConfig(
+        level=logging.INFO,
+        format='[%(asctime)s] %(levelname)s: %(message)s',
+        handlers=[
+            logging.FileHandler(filename=log_file),
+            logging.StreamHandler(stream=sys.stdout)])
+    logging.info(cfg)
+    logging.info(f"Going into inference_higen_entrance inference on {gpu} gpu")
+    # [Diffusion]
+    diffusion = DIFFUSION.build(cfg.Diffusion)
+    # [Model] embedder
+    clip_encoder = EMBEDDER.build(cfg.embedder)
+    clip_encoder.model.to(gpu)
+    # Embedding of the empty prompt, used as the second entry of model_kwargs.
+    _, _, zero_y = clip_encoder(text="")
+    zero_y = zero_y.detach()
+    # [Model] autoencoder
+    autoencoder = AUTO_ENCODER.build(cfg.auto_encoder)
+    autoencoder.eval() # freeze
+    for param in autoencoder.parameters():
+        param.requires_grad = False
+    autoencoder.cuda()
+    # [Model] UNet
+    model = MODEL.build(cfg.UNet)
+    state_dict = torch.load(cfg.test_model, map_location='cpu')
+    # Accept both a raw state dict and a checkpoint wrapping it in 'state_dict'.
+    if 'state_dict' in state_dict:
+        state_dict = state_dict['state_dict']
+    status = model.load_state_dict(state_dict, strict=True)
+    logging.info('Load model from {} with status {}'.format(cfg.test_model, status))
+    model = model.to(gpu)
+    model.eval()
+    model = DistributedDataParallel(model, device_ids=[gpu]) if not cfg.debug else model
+    torch.cuda.empty_cache()
+    # [Test List]
+    # Use a context manager so the file handle is closed right after reading.
+    with open(cfg.test_list_path) as test_list_file:
+        test_list = [item.strip() for item in test_list_file]
+    num_videos = len(test_list)
+    logging.info(f'There are {num_videos} videos. with {cfg.round} times')
+    # Repeat every entry cfg.round times, keeping repeats adjacent.
+    test_list = [item for item in test_list for _ in range(cfg.round)]
+    for idx, caption in enumerate(test_list):
+        if caption.startswith('#'):
+            logging.info(f'Skip {caption}')
+            continue
+        if '|' in caption:
+            # Split on the last '|' only, so a caption that itself contains
+            # '|' no longer fails to unpack.
+            caption, manual_seed = caption.rsplit('|', 1)
+            manual_seed = int(manual_seed)
+        else:
+            manual_seed = 0
+        logging.info(f"[{idx}]/[{num_videos}] Begin to sample {caption} ...")
+        if caption == "": 
+            logging.info(f'Caption is null of {caption}, skip..')
+            continue
+        captions = [caption]
+        with torch.no_grad():
+            _, y_text, y_words = clip_encoder(text=captions) # bs * 1 *1024 [B, 1, 1024]
+        with torch.no_grad():
+            # NOTE(review): memory is always queried for NVML device 0, not for
+            # this worker's gpu -- confirm whether that is intended.
+            pynvml.nvmlInit()
+            handle=pynvml.nvmlDeviceGetHandleByIndex(0)
+            meminfo=pynvml.nvmlDeviceGetMemoryInfo(handle)
+            logging.info(f'GPU Memory used {meminfo.used / (1024 ** 3):.2f} GB')
+            # sample images (DDIM)
+            with amp.autocast(enabled=cfg.use_fp16):
+                # Re-seed per sample: repeats of the same caption get different
+                # seeds through idx % cfg.round.
+                setup_seed(cfg.seed + cfg.rank + idx % cfg.round + manual_seed)
+                logging.info(f"Setup seed to {cfg.seed + cfg.rank + idx % cfg.round + manual_seed} ...")
+                cur_seed = torch.initial_seed()
+                logging.info(f"Current seed {cur_seed} ...")
+                # Stage 1: sample a single-frame latent with a zero spatial prior.
+                spat_noise = torch.randn([1, 4, 1, int(cfg.resolution[1]/cfg.scale), int(cfg.resolution[0]/cfg.scale)]).to(gpu)
+                spat_prior = torch.zeros_like(spat_noise).squeeze(2)
+                motion_cond = torch.tensor([0],dtype=torch.long, device=gpu)
+                appearance_cond = torch.Tensor([[[1.0]]]).repeat(1, 1, max(cfg.frame_lens)).to(gpu)
+                model_kwargs=[
+                    {'y': y_words, 'spat_prior': spat_prior, 'motion_cond': motion_cond, "appearance_cond": appearance_cond},
+                    {'y': zero_y, 'spat_prior': spat_prior, 'motion_cond': motion_cond, "appearance_cond": appearance_cond}]
+                spat_data = diffusion.ddim_sample_loop(
+                    noise=spat_noise,
+                    model=model.eval(),
+                    model_kwargs=model_kwargs,
+                    guide_scale=cfg.guide_scale,
+                    ddim_timesteps=cfg.ddim_timesteps,
+                    eta=0.0)
+                # Only referenced by the optional concatenation noted further below.
+                spat_key_frames = autoencoder.decode(1. / cfg.scale_factor * spat_data.squeeze(2))
+                spat_data = spat_data.squeeze(2)
+                # Stage 2: sample cfg.max_frames frames, using the stage-1 latent as spat_prior.
+                temp_noise = torch.randn([1, 4, cfg.max_frames, int(cfg.resolution[1]/cfg.scale), int(cfg.resolution[0]/cfg.scale)]).to(gpu)
+                b, c, f, h, w= temp_noise.shape
+                # Add a per-frame, per-channel offset shared across all spatial positions.
+                offset_noise = torch.randn(b, c, f, 1, 1, device=gpu)
+                temp_noise = temp_noise + cfg.noise_strength * offset_noise
+                motion_cond = torch.tensor([[cfg.motion_factor] * (cfg.max_frames-1)], dtype=torch.long, device=gpu)
+                # Values ramp from 1 - appearance_factor up to 1.0 and back down;
+                # each row of appearance_cond is a max_frames-long window of it.
+                sim_list = torch.cat([torch.linspace(1.0-cfg.appearance_factor, 1.0, cfg.max_frames)[:-1], torch.linspace(1.0, 1.0-cfg.appearance_factor, cfg.max_frames)])
+                # sim_list = (torch.cos(sim_list * 3.1415926 + 3.1415926) + 1) / 2 # cosine
+                appearance_cond = torch.stack([sim_list[i:i+cfg.max_frames] for i in range(len(sim_list)-cfg.max_frames, -1, -1)]).to(gpu)
+                model_kwargs=[
+                    {'y': y_words, 'spat_prior': spat_data, 'motion_cond': motion_cond, 'appearance_cond': appearance_cond[None, :]},
+                    {'y': zero_y, 'spat_prior': spat_data, 'motion_cond': motion_cond, 'appearance_cond': appearance_cond[None, :]}]
+                video_data = diffusion.ddim_sample_loop(
+                    noise=temp_noise,
+                    model=model.eval(),
+                    model_kwargs=model_kwargs,
+                    guide_scale=cfg.guide_scale,
+                    ddim_timesteps=cfg.ddim_timesteps,
+                    eta=0.0)
+        video_data = 1. / cfg.scale_factor * video_data # [1, 4, 32, 46]
+        # Decode frame by frame in chunks of roughly cfg.decoder_bs frames.
+        video_data = rearrange(video_data, 'b c f h w -> (b f) c h w')
+        chunk_size = min(cfg.decoder_bs, video_data.shape[0])
+        video_data_list = torch.chunk(video_data, video_data.shape[0]//chunk_size, dim=0)
+        decode_data = []
+        for vd_data in video_data_list:
+            gen_frames = autoencoder.decode(vd_data)
+            decode_data.append(gen_frames)
+        video_data = torch.cat(decode_data, dim=0)
+        video_data = rearrange(video_data, '(b f) c h w -> b c f h w', b = cfg.batch_size)
+        # video_data = torch.cat([spat_key_frames[:, :, None, :, :], video_data], dim=2)
+        text_size = cfg.resolution[-1]
+        # Build a filesystem-friendly name: drop punctuation, spaces -> underscores.
+        cap_name = re.sub(r'[^\w\s]', '', caption).replace(' ', '_')
+        file_name = f'rank_{cfg.world_size:02d}_{cfg.rank:02d}_{idx:04d}_{cap_name}.mp4'
+        local_path = os.path.join(cfg.log_dir, f'{file_name}')
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
+        try:
+            save_t2vhigen_video_safe(local_path, video_data.cpu(), captions, cfg.mean, cfg.std, text_size)
+            logging.info('Save video to dir %s:' % (local_path))
+        except Exception as e:
+            # Best effort: a failed save is logged and the loop continues.
+            logging.info(f'Step: save text or video error with {e}')
+    logging.info('Congratulations! The inference is completed!')
+    # synchronize to finish some processes
+    if not cfg.debug:
+        torch.cuda.synchronize()
+        dist.barrier()
--- a/tools/inferences/inference_i2vgen_entrance.py
+++ b/tools/inferences/inference_i2vgen_entrance.py
+'''
+/* 
+*Copyright (c) 2021, Alibaba Group;
+*Licensed under the Apache License, Version 2.0 (the "License");
+*you may not use this file except in compliance with the License.
+*You may obtain a copy of the License at
+*   http://www.apache.org/licenses/LICENSE-2.0
+*Unless required by applicable law or agreed to in writing, software
+*distributed under the License is distributed on an "AS IS" BASIS,
+*WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*See the License for the specific language governing permissions and
+*limitations under the License.
+*/
+'''
+import os
+import re
+import os.path as osp
+import sys
+sys.path.insert(0, '/'.join(osp.realpath(__file__).split('/')[:-4]))
+import json
+import math
+import torch
+import random
+# import pynvml
+import logging
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+import torch.cuda.amp as amp
+from importlib import reload
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from einops import rearrange
+import torchvision.transforms as T
+import torchvision.transforms.functional as TF
+from torch.nn.parallel import DistributedDataParallel
+import utils.transforms as data
+from ..modules.config import cfg
+from utils.seed import setup_seed
+from utils.multi_port import find_free_port
+from utils.assign_cfg import assign_signle_cfg
+from utils.distributed import generalized_all_gather, all_reduce
+from utils.video_op import save_i2vgen_video, save_i2vgen_video_safe
+from utils.registry_class import INFER_ENGINE, MODEL, EMBEDDER, AUTO_ENCODER, DIFFUSION
+@INFER_ENGINE.register_function()
+def inference_i2vgen_entrance(cfg_update, **kwargs):
+    '''
+    Entry point for I2VGen-XL image-to-video inference.
+
+    Merges ``cfg_update`` into the global ``cfg``, fills in ``MASTER_ADDR`` /
+    ``MASTER_PORT`` when ``MASTER_ADDR`` is not set, derives the world size and
+    then either runs ``worker`` in-process (world size 1) or spawns one
+    ``worker`` process per local GPU.
+
+    Args:
+        cfg_update: mapping of config overrides. A dict value whose key already
+            exists in ``cfg`` is merged with ``update``; any other value
+            replaces the existing entry.
+        **kwargs: accepted but not used here.
+
+    Returns:
+        The global ``cfg`` object after the overrides have been applied.
+    '''
+    for k, v in cfg_update.items():
+        if isinstance(v, dict) and k in cfg:
+            cfg[k].update(v)
+        else:
+            cfg[k] = v
+    if 'MASTER_ADDR' not in os.environ:
+        os.environ['MASTER_ADDR'] = 'localhost'
+        # os.environ only accepts str values, so coerce the port explicitly.
+        os.environ['MASTER_PORT'] = str(find_free_port())
+    cfg.pmi_rank = int(os.getenv('RANK', 0))
+    cfg.pmi_world_size = int(os.getenv('WORLD_SIZE', 1))
+    if cfg.debug:
+        # Debug mode always runs a single in-process worker.
+        cfg.gpus_per_machine = 1
+        cfg.world_size = 1
+    else:
+        cfg.gpus_per_machine = torch.cuda.device_count()
+        cfg.world_size = cfg.pmi_world_size * cfg.gpus_per_machine
+    if cfg.world_size == 1:
+        worker(0, cfg, cfg_update)
+    else:
+        mp.spawn(worker, nprocs=cfg.gpus_per_machine, args=(cfg, cfg_update))
+    return cfg
+def worker(gpu, cfg, cfg_update):
+    '''
+    Inference worker for each gpu (I2VGen-XL image-to-video).
+
+    Builds the diffusion, CLIP embedder, autoencoder and UNet from ``cfg`` and
+    then, for every entry in ``cfg.test_list_path``, embeds the image and the
+    caption, encodes the image with the autoencoder, runs
+    ``diffusion.ddim_sample_loop``, decodes the result and saves it as an mp4
+    under ``cfg.log_dir``.
+
+    Each line of the test list must look like ``image_path|||caption``. Lines
+    starting with ``#``, lines without the ``|||`` separator and entries with
+    an empty caption are skipped.
+
+    Args:
+        gpu: local device index of this worker.
+        cfg: base config object, refined via ``assign_signle_cfg``.
+        cfg_update: config overrides merged on top of ``cfg``.
+    '''
+    cfg = assign_signle_cfg(cfg, cfg_update, 'vldm_cfg')
+    for k, v in cfg_update.items():
+        if isinstance(v, dict) and k in cfg:
+            cfg[k].update(v)
+        else:
+            cfg[k] = v
+    cfg.gpu = gpu
+    cfg.seed = int(cfg.seed)
+    cfg.rank = cfg.pmi_rank * cfg.gpus_per_machine + gpu
+    setup_seed(cfg.seed + cfg.rank)
+    if not cfg.debug:
+        torch.cuda.set_device(gpu)
+        torch.backends.cudnn.benchmark = True
+        dist.init_process_group(backend='nccl', world_size=cfg.world_size, rank=cfg.rank)
+    # [Log] Save logging and make log dir
+    # NOTE(review): the gathered value is not used below; every rank keeps its
+    # own cfg.log_dir. The call is kept unchanged because it is presumably a
+    # collective operation -- confirm before removing.
+    log_dir = generalized_all_gather(cfg.log_dir)[0]
+    exp_name = osp.basename(cfg.test_list_path).split('.')[0]
+    cfg.log_dir = osp.join(cfg.log_dir, '%s' % (exp_name))
+    os.makedirs(cfg.log_dir, exist_ok=True)
+    log_file = osp.join(cfg.log_dir, 'log_%02d.txt' % (cfg.rank))
+    cfg.log_file = log_file
+    # Reload logging so basicConfig takes effect even if it was configured before.
+    reload(logging)
+    logging.basicConfig(
+        level=logging.INFO,
+        format='[%(asctime)s] %(levelname)s: %(message)s',
+        handlers=[
+            logging.FileHandler(filename=log_file),
+            logging.StreamHandler(stream=sys.stdout)])
+    logging.info(cfg)
+    logging.info(f"Going into it2v_fullid_img_text inference on {gpu} gpu")
+    # [Diffusion]
+    diffusion = DIFFUSION.build(cfg.Diffusion)
+    # [Data] Data Transform
+    # train_trans prepares the image for the autoencoder, vit_trans for the CLIP encoder.
+    train_trans = data.Compose([
+        data.CenterCropWide(size=cfg.resolution),
+        data.ToTensor(),
+        data.Normalize(mean=cfg.mean, std=cfg.std)])
+    vit_trans = data.Compose([
+        data.CenterCropWide(size=(cfg.resolution[0], cfg.resolution[0])),
+        data.Resize(cfg.vit_resolution),
+        data.ToTensor(),
+        data.Normalize(mean=cfg.vit_mean, std=cfg.vit_std)])
+    # [Model] embedder
+    clip_encoder = EMBEDDER.build(cfg.embedder)
+    clip_encoder.model.to(gpu)
+    _, _, zero_y = clip_encoder(text="")
+    _, _, zero_y_negative = clip_encoder(text=cfg.negative_prompt)
+    zero_y, zero_y_negative = zero_y.detach(), zero_y_negative.detach()
+    # All-zero image feature, used when cfg.use_zero_infer is set.
+    black_image_feature = torch.zeros([1, 1, cfg.UNet.y_dim]).cuda()
+    # [Model] autoencoder
+    autoencoder = AUTO_ENCODER.build(cfg.auto_encoder)
+    autoencoder.eval() # freeze
+    for param in autoencoder.parameters():
+        param.requires_grad = False
+    autoencoder.cuda()
+    # [Model] UNet
+    model = MODEL.build(cfg.UNet)
+    state_dict = torch.load(cfg.test_model, map_location='cpu')
+    # Accept both a raw state dict and a checkpoint wrapping it in
+    # 'state_dict'; the checkpoint's 'step' entry is not needed for inference.
+    if 'state_dict' in state_dict:
+        state_dict = state_dict['state_dict']
+    status = model.load_state_dict(state_dict, strict=True)
+    logging.info('Load model from {} with status {}'.format(cfg.test_model, status))
+    model = model.to(gpu)
+    model.eval()
+    model = DistributedDataParallel(model, device_ids=[gpu]) if not cfg.debug else model
+    torch.cuda.empty_cache()
+    # [Test List]
+    # Use a context manager so the file handle is closed right after reading.
+    with open(cfg.test_list_path) as test_list_file:
+        test_list = [item.strip() for item in test_list_file]
+    num_videos = len(test_list)
+    logging.info(f'There are {num_videos} videos. with {cfg.round} times')
+    # Repeat every entry cfg.round times, keeping repeats adjacent.
+    test_list = [item for item in test_list for _ in range(cfg.round)]
+    for idx, line in enumerate(test_list):
+        if line.startswith('#'):
+            logging.info(f'Skip {line}')
+            continue
+        if '|||' not in line:
+            # Blank or malformed lines would otherwise fail to unpack below.
+            logging.info(f'Skip malformed line (expected image_path|||caption): {line}')
+            continue
+        logging.info(f"[{idx}]/[{num_videos}] Begin to sample {line} ...")
+        # Split once so a caption containing '|||' stays intact.
+        img_key, caption = line.split('|||', 1)
+        img_name = os.path.basename(img_key).split('.')[0]
+        if caption == "":
+            logging.info(f'Caption is null of {img_key}, skip..')
+            continue
+        captions = [caption]
+        # convert() returns an independent copy, so the file can be closed here.
+        with Image.open(img_key) as image_file:
+            image = image_file.convert('RGB')
+        with torch.no_grad():
+            image_tensor = vit_trans(image)
+            image_tensor = image_tensor.unsqueeze(0)
+            y_visual, y_text, y_words = clip_encoder(image=image_tensor, text=captions)
+            y_visual = y_visual.unsqueeze(1)
+        fps_tensor =  torch.tensor([cfg.target_fps], dtype=torch.long, device=gpu)
+        image_id_tensor = train_trans([image]).to(gpu)
+        local_image = autoencoder.encode_firsr_stage(image_id_tensor, cfg.scale_factor).detach()
+        # Repeat the image latent along a new frame axis, once per output frame.
+        local_image = local_image.unsqueeze(2).repeat_interleave(repeats=cfg.max_frames, dim=2)
+        with torch.no_grad():
+            # Sample images
+            with amp.autocast(enabled=cfg.use_fp16):
+                noise = torch.randn([1, 4, cfg.max_frames, int(cfg.resolution[1]/cfg.scale), int(cfg.resolution[0]/cfg.scale)])
+                noise = noise.to(gpu)
+                infer_img = black_image_feature if cfg.use_zero_infer else None
+                # First entry: caption + image conditioning; second entry:
+                # negative prompt with the zero (or no) image feature.
+                model_kwargs=[
+                    {'y': y_words, 'image':y_visual, 'local_image':local_image, 'fps': fps_tensor}, 
+                    {'y': zero_y_negative, 'image':infer_img, 'local_image':local_image, 'fps': fps_tensor}]
+                video_data = diffusion.ddim_sample_loop(
+                    noise=noise,
+                    model=model.eval(),
+                    model_kwargs=model_kwargs,
+                    guide_scale=cfg.guide_scale,
+                    ddim_timesteps=cfg.ddim_timesteps,
+                    eta=0.0)
+        video_data = 1. / cfg.scale_factor * video_data # [1, 4, 32, 46]
+        # Decode frame by frame in chunks of roughly cfg.decoder_bs frames.
+        video_data = rearrange(video_data, 'b c f h w -> (b f) c h w')
+        chunk_size = min(cfg.decoder_bs, video_data.shape[0])
+        video_data_list = torch.chunk(video_data, video_data.shape[0]//chunk_size, dim=0)
+        decode_data = []
+        for vd_data in video_data_list:
+            gen_frames = autoencoder.decode(vd_data)
+            decode_data.append(gen_frames)
+        video_data = torch.cat(decode_data, dim=0)
+        video_data = rearrange(video_data, '(b f) c h w -> b c f h w', b = cfg.batch_size)
+        text_size = cfg.resolution[-1]
+        # Build a filesystem-friendly name: drop punctuation, spaces -> underscores.
+        cap_name = re.sub(r'[^\w\s]', '', caption).replace(' ', '_')
+        file_name = f'{img_name}_{cfg.world_size:02d}_{cfg.rank:02d}_{cap_name}_{idx:02d}.mp4'
+        local_path = os.path.join(cfg.log_dir, f'{file_name}')
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
+        try:
+            save_i2vgen_video_safe(local_path, video_data.cpu(), captions, cfg.mean, cfg.std, text_size)
+            # NOTE: If you want to visualize the comparison between input and output, you can use the following function.
+            # save_i2vgen_video(local_path, image_id_tensor.cpu(), video_data.cpu(), captions, cfg.mean, cfg.std, text_size)
+            logging.info('Save video to dir %s:' % (local_path))
+        except Exception as e:
+            # Best effort: a failed save is logged and the loop continues.
+            logging.info(f'Step: save text or video error with {e}')
+    logging.info('Congratulations! The inference is completed!')
+    # synchronize to finish some processes
+    if not cfg.debug:
+        torch.cuda.synchronize()
+        dist.barrier()
--- a/tools/inferences/inference_sr600_entrance.py
+++ b/tools/inferences/inference_sr600_entrance.py
+'''
+/* 
+*Copyright (c) 2021, Alibaba Group;
+*Licensed under the Apache License, Version 2.0 (the "License");
+*you may not use this file except in compliance with the License.
+*You may obtain a copy of the License at
+*   http://www.apache.org/licenses/LICENSE-2.0
+*Unless required by applicable law or agreed to in writing, software
+*distributed under the License is distributed on an "AS IS" BASIS,
+*WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*See the License for the specific language governing permissions and
+*limitations under the License.
+*/
+'''
+import os
+import re
+import os.path as osp
+import sys
+sys.path.insert(0, '/'.join(osp.realpath(__file__).split('/')[:-4]))
+import json
+import math
+import torch
+import pynvml
+import logging
+import cv2
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+import torch.cuda.amp as amp
+from importlib import reload
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from einops import rearrange
+import torchvision.transforms as T
+import torchvision.transforms.functional as TF
+from torch.nn.parallel import DistributedDataParallel
+import utils.transforms as data
+from ..modules.config import cfg
+from utils.seed import setup_seed
+from utils.multi_port import find_free_port
+from utils.assign_cfg import assign_signle_cfg
+from utils.distributed import generalized_all_gather, all_reduce
+from utils.video_op import save_i2vgen_video, save_t2vhigen_video_safe
+from utils.registry_class import INFER_ENGINE, MODEL, EMBEDDER, AUTO_ENCODER, DIFFUSION
+@INFER_ENGINE.register_function()
+def inference_sr600_entrance(cfg_update, **kwargs):
+    '''
+    Entry point for the SR600 inference stage.
+
+    Merges ``cfg_update`` into the global ``cfg``, fills in ``MASTER_ADDR`` /
+    ``MASTER_PORT`` when ``MASTER_ADDR`` is not set, derives the world size and
+    then either runs ``worker`` in-process (world size 1) or spawns one
+    ``worker`` process per local GPU.
+
+    Args:
+        cfg_update: mapping of config overrides. A dict value whose key already
+            exists in ``cfg`` is merged with ``update``; any other value
+            replaces the existing entry.
+        **kwargs: accepted but not used here.
+
+    Returns:
+        The global ``cfg`` object after the overrides have been applied.
+    '''
+    for k, v in cfg_update.items():
+        if isinstance(v, dict) and k in cfg:
+            cfg[k].update(v)
+        else:
+            cfg[k] = v
+    if 'MASTER_ADDR' not in os.environ:
+        os.environ['MASTER_ADDR'] = 'localhost'
+        # os.environ only accepts str values, so coerce the port explicitly.
+        os.environ['MASTER_PORT'] = str(find_free_port())
+    cfg.pmi_rank = int(os.getenv('RANK', 0))
+    cfg.pmi_world_size = int(os.getenv('WORLD_SIZE', 1))
+    if cfg.debug:
+        # Debug mode always runs a single in-process worker.
+        cfg.gpus_per_machine = 1
+        cfg.world_size = 1
+    else:
+        cfg.gpus_per_machine = torch.cuda.device_count()
+        cfg.world_size = cfg.pmi_world_size * cfg.gpus_per_machine
+    if cfg.world_size == 1:
+        worker(0, cfg, cfg_update)
+    else:
+        mp.spawn(worker, nprocs=cfg.gpus_per_machine, args=(cfg, cfg_update))
+    return cfg
+def load_video_frames(autoencoder, vid_path, train_trans, max_frames=32):
+    '''
+    Read up to ``max_frames`` frames from the start of a video and encode them.
+
+    Frames are converted from BGR to RGB PIL images, passed through
+    ``train_trans``, resized to 720x1280 with bilinear interpolation, moved to
+    CUDA and encoded in small chunks with ``autoencoder.encode_firsr_stage``
+    using the module-level ``cfg.scale_factor``.
+
+    Args:
+        autoencoder: object exposing ``encode_firsr_stage(x, scale_factor)``.
+        vid_path: path of the video file to read.
+        train_trans: callable applied to the list of PIL frames; its output is
+            presumably a ``[f, c, h, w]`` tensor -- TODO confirm against
+            ``utils.transforms``.
+        max_frames: maximum number of frames to read.
+
+    Returns:
+        The encoded features rearranged to ``b c f h w`` with ``b == 1``.
+
+    Raises:
+        ValueError: if no frame could be decoded from ``vid_path``.
+    '''
+    capture = cv2.VideoCapture(vid_path)
+    try:
+        total_frame_num = capture.get(cv2.CAP_PROP_FRAME_COUNT)
+        frame_list = []
+        # Every decoded frame is kept (sampling runs at the native frame rate),
+        # so no fps-based stride is needed; this also avoids dividing by an
+        # fps of 0 when the backend cannot report it.
+        while len(frame_list) < max_frames:
+            ret, frame = capture.read()
+            if (not ret) or (frame is None):
+                break
+            # Respect the reported frame count, but only when one is reported;
+            # a count of 0 would otherwise discard every frame.
+            if total_frame_num > 0 and len(frame_list) >= total_frame_num:
+                break
+            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            frame_list.append(Image.fromarray(frame))
+    finally:
+        # Release the capture even if reading or conversion raised.
+        capture.release()
+    if not frame_list:
+        raise ValueError(f'No frames could be decoded from {vid_path}')
+    video_data = train_trans(frame_list)
+    video_data = torch.nn.functional.interpolate(video_data, size=(720, 1280), mode='bilinear')
+    video_data = video_data.unsqueeze(0)
+    video_data = video_data.cuda()
+    batch_size = video_data.shape[0]
+    video_data = rearrange(video_data, 'b f c h w -> (b f) c h w')
+    # torch.chunk needs at least one chunk; a single frame would give 1 // 2 == 0.
+    num_chunks = max(1, video_data.shape[0] // 2)
+    video_data_list = torch.chunk(video_data, num_chunks, dim=0)
+    with torch.no_grad():
+        encoded_chunks = [
+            autoencoder.encode_firsr_stage(vd_data, cfg.scale_factor).detach()
+            for vd_data in video_data_list]
+        video_data_feature = torch.cat(encoded_chunks, dim=0)
+        video_data_feature = rearrange(video_data_feature, '(b f) c h w -> b c f h w', b = batch_size)
+    return video_data_feature
+def worker(gpu, cfg, cfg_update):
+    '''
+    Inference worker for each gpu
+    '''
+    cfg = assign_signle_cfg(cfg, cfg_update, 'vldm_cfg')
+    for k, v in cfg_update.items():
+        if isinstance(v, dict) and k in cfg:
+            cfg[k].update(v)
+        else:
+            cfg[k] = v
+    cfg.gpu = gpu
+    cfg.seed = int(cfg.seed)
+    cfg.rank = cfg.pmi_rank * cfg.gpus_per_machine + gpu
+    setup_seed(cfg.seed + cfg.rank)
+    if not cfg.debug:
+        torch.cuda.set_device(gpu)
+        torch.backends.cudnn.benchmark = True
+        dist.init_process_group(backend='nccl', world_size=cfg.world_size, rank=cfg.rank)
+    # [Log] Save logging and make log dir
+    log_dir = generalized_all_gather(cfg.log_dir)[0]
+    exp_name = osp.basename(cfg.test_list_path).split('.')[0]
+    inf_name = osp.basename(cfg.cfg_file).split('.')[0]
+    test_model = osp.basename(cfg.test_model).split('.')[0].split('_')[-1]
+    cfg.log_dir = osp.join(cfg.log_dir, '%s' % (exp_name))
+    os.makedirs(cfg.log_dir, exist_ok=True)
+    log_file = osp.join(cfg.log_dir, 'log_%02d.txt' % (cfg.rank))
+    cfg.log_file = log_file
+    reload(logging)
+    logging.basicConfig(
+        level=logging.INFO,
+        format='[%(asctime)s] %(levelname)s: %(message)s',
+        handlers=[
+            logging.FileHandler(filename=log_file),
+            logging.StreamHandler(stream=sys.stdout)])
+    logging.info(cfg)
+    logging.info(f"Going into inference_sr600_entrance inference on {gpu} gpu")
+    # [Diffusion]
+    diffusion = DIFFUSION.build(cfg.Diffusion)
+    # [Data] Data Transform    
+    train_trans = data.Compose([
+        data.ToTensor(),
+        data.Normalize(mean=cfg.mean, std=cfg.std)])
+    # [Model] embedder
+    clip_encoder = EMBEDDER.build(cfg.embedder)
+    clip_encoder.model.to(gpu)
+    _, _, zero_y = clip_encoder(text=cfg.embedder.negative_prompt)
+    zero_y = zero_y.detach()
+    # [Model] auotoencoder 
+    autoencoder = AUTO_ENCODER.build(cfg.auto_encoder)
+    autoencoder.eval() # freeze
+    for param in autoencoder.parameters():
+        param.requires_grad = False
+    autoencoder.cuda()
+    # [Model] UNet 
+    model = MODEL.build(cfg.UNet)
+    state_dict = torch.load(cfg.test_model, map_location='cpu')
+    if 'state_dict' in state_dict:
+        state_dict = state_dict['state_dict']
+    if 'step' in state_dict:
+        resume_step = state_dict['step']
+    else:
+        resume_step = 0
+    status = model.load_state_dict(state_dict, strict=True)
+    logging.info('Load model from {} with status {}'.format(cfg.test_model, status))
+    model = model.to(gpu)
+    model.eval()
+    model = DistributedDataParallel(model, device_ids=[gpu]) if not cfg.debug else model
+    torch.cuda.empty_cache()
+    # [Test List]
+    test_list = open(cfg.test_list_path).readlines()
+    test_list = [item.strip() for item in test_list]
+    num_videos = len(test_list)
+    logging.info(f'There are {num_videos} videos. with {cfg.round} times')
+    test_list = [item for item in test_list for _ in range(cfg.round)]
+    for idx, caption in enumerate(test_list):
+        if caption.startswith('#'):
+            logging.info(f'Skip {caption}')
+            continue
+        if '|' in caption:
+            caption, manual_seed = caption.split('|')
+            manual_seed = int(manual_seed)
+        else:
+            manual_seed = 0
+        logging.info(f"[{idx}]/[{num_videos}] Begin to sample {caption} ...")
+        if caption == "": 
+            logging.info(f'Caption is null of {caption}, skip..')
+            continue
+        captions = [caption + cfg.embedder.positive_prompt]
+        with torch.no_grad():
+            _, y_text, y_words = clip_encoder(text=captions) # bs * 1 *1024 [B, 1, 1024]
+        cap_name = re.sub(r'[^\w\s]', '', caption).replace(' ', '_')
+        file_name = f'rank_{cfg.world_size:02d}_{cfg.rank:02d}_{idx:04d}_{cap_name}.mp4'
+        low_video_local_path = os.path.join(cfg.log_dir, f'{file_name}')
+        video_data_feature = load_video_frames(autoencoder, low_video_local_path, train_trans)
+        with amp.autocast(enabled=True):
+            pynvml.nvmlInit()
+            handle=pynvml.nvmlDeviceGetHandleByIndex(0)
+            meminfo=pynvml.nvmlDeviceGetMemoryInfo(handle)
+            total_noise_levels = cfg.total_noise_levels
+            setup_seed(0)
+            t = torch.randint(total_noise_levels-1, total_noise_levels, (1, ), dtype=torch.long).cuda()
+            noised_vid_feat = diffusion.reverse_diffusion.ddim_reverse_sample_loop(
+                            x0=video_data_feature,
+                            model=model.eval(),
+                            model_kwargs={'y': zero_y},
+                            clamp=None,
+                            percentile=None,
+                            guide_scale=None,
+                            guide_rescale=None,
+                            ddim_timesteps=30,
+                            reverse_steps=total_noise_levels
+                            )
+            model_kwargs=[ {'y': y_words}, {'y': zero_y}]
+            video_data = diffusion.forward_diffusion.sample(
+                            noise=noised_vid_feat,
+                            model=model.eval(),#.requires_grad_(False),
+                            model_kwargs=model_kwargs,
+                            guide_scale=9.0,
+                            guide_rescale=0.3,
+                            solver='dpmpp_2m_sde',
+                            steps=30,
+                            t_max=total_noise_levels-1,
+                            t_min=0,
+                            discretization='trailing'
+                        )
+        video_data = 1. / cfg.scale_factor * video_data # [1, 4, 32, 46]
+        video_data = rearrange(video_data, 'b c f h w -> (b f) c h w')
+        chunk_size = min(cfg.decoder_bs, video_data.shape[0])
+        video_data_list = torch.chunk(video_data, video_data.shape[0]//chunk_size, dim=0)
+        decode_data = []
+        for vd_data in video_data_list:
+            gen_frames = autoencoder.decode(vd_data)
+            decode_data.append(gen_frames)
+        video_data = torch.cat(decode_data, dim=0)
+        video_data = rearrange(video_data, '(b f) c h w -> b c f h w', b = cfg.batch_size)
+        # video_data = torch.cat([spat_key_frames[:, :, None, :, :], video_data], dim=2)
+        text_size = cfg.resolution[-1]
+        cap_name = re.sub(r'[^\w\s]', '', caption).replace(' ', '_')
+        file_name = f'rank_{cfg.world_size:02d}_{cfg.rank:02d}_{idx:04d}_{cap_name}_sr.mp4'
+        local_path = os.path.join(cfg.log_dir, f'{file_name}')
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
+        try:
+            save_t2vhigen_video_safe(local_path, video_data.cpu(), captions, cfg.mean, cfg.std, text_size)
+            logging.info('Save video to dir %s:' % (local_path))
+        except Exception as e:
+            logging.info(f'Step: save text or video error with {e}')
+    logging.info('Congratulations! The inference is completed!')
+    # synchronize to finish some processes
+    if not cfg.debug:
+        torch.cuda.synchronize()
+        dist.barrier()
--- a/tools/inferences/inference_text2video_entrance.py
+++ b/tools/inferences/inference_text2video_entrance.py
+'''
+/* 
+*Copyright (c) 2021, Alibaba Group;
+*Licensed under the Apache License, Version 2.0 (the "License");
+*you may not use this file except in compliance with the License.
+*You may obtain a copy of the License at
+*   http://www.apache.org/licenses/LICENSE-2.0
+*Unless required by applicable law or agreed to in writing, software
+*distributed under the License is distributed on an "AS IS" BASIS,
+*WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+*See the License for the specific language governing permissions and
+*limitations under the License.
+*/
+'''
+import os
+import re
+import os.path as osp
+import sys
+sys.path.insert(0, '/'.join(osp.realpath(__file__).split('/')[:-4]))
+import json
+import math
+import torch
+import pynvml
+import logging
+import numpy as np
+from PIL import Image
+from tqdm import tqdm
+import torch.cuda.amp as amp
+from importlib import reload
+import torch.distributed as dist
+import torch.multiprocessing as mp
+from einops import rearrange
+import torchvision.transforms as T
+import torchvision.transforms.functional as TF
+from torch.nn.parallel import DistributedDataParallel
+import utils.transforms as data
+from ..modules.config import cfg
+from utils.seed import setup_seed
+from utils.multi_port import find_free_port
+from utils.assign_cfg import assign_signle_cfg
+from utils.distributed import generalized_all_gather, all_reduce
+from utils.video_op import save_i2vgen_video, save_i2vgen_video_safe
+from utils.registry_class import INFER_ENGINE, MODEL, EMBEDDER, AUTO_ENCODER, DIFFUSION
+@INFER_ENGINE.register_function()
+def inference_text2video_entrance(cfg_update,  **kwargs):
+    '''
+    Entry point for text-to-video inference.
+    Merges ``cfg_update`` into the shared ``cfg``, fills in the rendezvous
+    environment variables when they are absent, derives the world size and
+    then runs ``worker`` either in-process (world size 1) or via ``mp.spawn``.
+    Returns the updated ``cfg``.
+    '''
+    # Dict-valued overrides are merged into an existing entry; anything else replaces it.
+    for key, value in cfg_update.items():
+        mergeable = isinstance(value, dict) and key in cfg
+        if not mergeable:
+            cfg[key] = value
+            continue
+        cfg[key].update(value)
+    # Only set rendezvous defaults when the launcher did not provide them.
+    if 'MASTER_ADDR' not in os.environ:
+        os.environ['MASTER_ADDR'] = 'localhost'
+        os.environ['MASTER_PORT'] = find_free_port()
+    cfg.pmi_rank = int(os.getenv('RANK', 0))
+    cfg.pmi_world_size = int(os.getenv('WORLD_SIZE', 1))
+    if not cfg.debug:
+        cfg.gpus_per_machine = torch.cuda.device_count()
+        cfg.world_size = cfg.pmi_world_size * cfg.gpus_per_machine
+    else:
+        # Debug mode always runs as a single process on a single GPU.
+        cfg.gpus_per_machine = 1
+        cfg.world_size = 1
+    if cfg.world_size != 1:
+        mp.spawn(worker, nprocs=cfg.gpus_per_machine, args=(cfg, cfg_update))
+    else:
+        worker(0, cfg, cfg_update)
+    return cfg
+def worker(gpu, cfg, cfg_update):
+    '''
+    Inference worker for each gpu.
+    Builds the diffusion object, text embedder, autoencoder and UNet from
+    ``cfg``, loads the checkpoint at ``cfg.test_model``, then for every caption
+    in ``cfg.test_list_path`` (repeated ``cfg.round`` times) samples a latent
+    video with DDIM, decodes it in chunks of at most ``cfg.decoder_bs`` frames
+    and writes an ``.mp4`` under ``cfg.log_dir``.
+    Args:
+        gpu: local device index used for ``.to(gpu)`` and for the global rank.
+        cfg: shared configuration object; re-assigned from ``assign_signle_cfg``.
+        cfg_update: overrides merged into ``cfg`` (dict values are merged in place).
+    '''
+    cfg = assign_signle_cfg(cfg, cfg_update, 'vldm_cfg')
+    for k, v in cfg_update.items():
+        if isinstance(v, dict) and k in cfg:
+            cfg[k].update(v)
+        else:
+            cfg[k] = v
+    cfg.gpu = gpu
+    cfg.seed = int(cfg.seed)
+    cfg.rank = cfg.pmi_rank * cfg.gpus_per_machine + gpu
+    # Each rank gets its own seed so parallel workers do not draw identical noise.
+    setup_seed(cfg.seed + cfg.rank)
+    if not cfg.debug:
+        torch.cuda.set_device(gpu)
+        torch.backends.cudnn.benchmark = True
+        dist.init_process_group(backend='nccl', world_size=cfg.world_size, rank=cfg.rank)
+    # [Log] Save logging and make log dir
+    # NOTE(review): log_dir, inf_name and test_model are computed but not used below;
+    # the calls are kept in case generalized_all_gather is relied on for synchronisation.
+    log_dir = generalized_all_gather(cfg.log_dir)[0]
+    exp_name = osp.basename(cfg.test_list_path).split('.')[0]
+    inf_name = osp.basename(cfg.cfg_file).split('.')[0]
+    test_model = osp.basename(cfg.test_model).split('.')[0].split('_')[-1]
+    cfg.log_dir = osp.join(cfg.log_dir, '%s' % (exp_name))
+    os.makedirs(cfg.log_dir, exist_ok=True)
+    log_file = osp.join(cfg.log_dir, 'log_%02d.txt' % (cfg.rank))
+    cfg.log_file = log_file
+    # Reload so basicConfig takes effect even if logging was configured earlier.
+    reload(logging)
+    logging.basicConfig(
+        level=logging.INFO,
+        format='[%(asctime)s] %(levelname)s: %(message)s',
+        handlers=[
+            logging.FileHandler(filename=log_file),
+            logging.StreamHandler(stream=sys.stdout)])
+    logging.info(cfg)
+    logging.info(f"Going into inference_text2video_entrance inference on {gpu} gpu")
+    # [Diffusion]
+    diffusion = DIFFUSION.build(cfg.Diffusion)
+    # [Data] Data Transform
+    # NOTE(review): train_trans and vit_trans are built but not used in this function.
+    train_trans = data.Compose([
+        data.CenterCropWide(size=cfg.resolution),
+        data.ToTensor(),
+        data.Normalize(mean=cfg.mean, std=cfg.std)])
+    vit_trans = data.Compose([
+        data.CenterCropWide(size=(cfg.resolution[0], cfg.resolution[0])),
+        data.Resize(cfg.vit_resolution),
+        data.ToTensor(),
+        data.Normalize(mean=cfg.vit_mean, std=cfg.vit_std)])
+    # [Model] embedder
+    clip_encoder = EMBEDDER.build(cfg.embedder)
+    clip_encoder.model.to(gpu)
+    _, _, zero_y = clip_encoder(text="")
+    _, _, zero_y_negative = clip_encoder(text=cfg.negative_prompt)
+    zero_y, zero_y_negative = zero_y.detach(), zero_y_negative.detach()
+    # [Model] autoencoder
+    autoencoder = AUTO_ENCODER.build(cfg.auto_encoder)
+    autoencoder.eval() # freeze
+    for param in autoencoder.parameters():
+        param.requires_grad = False
+    autoencoder.cuda()
+    # [Model] UNet
+    model = MODEL.build(cfg.UNet)
+    # torch.load unpickles the file: only point cfg.test_model at trusted checkpoints.
+    state_dict = torch.load(cfg.test_model, map_location='cpu')
+    if 'state_dict' in state_dict:
+        resume_step = state_dict['step']
+        state_dict = state_dict['state_dict']
+    else:
+        resume_step = 0
+    status = model.load_state_dict(state_dict, strict=True)
+    logging.info('Load model from {} with status {}'.format(cfg.test_model, status))
+    model = model.to(gpu)
+    model.eval()
+    model = DistributedDataParallel(model, device_ids=[gpu]) if not cfg.debug else model
+    torch.cuda.empty_cache()
+    # [Test List]
+    # Read through a context manager so the file handle is closed deterministically.
+    with open(cfg.test_list_path) as list_file:
+        test_list = [item.strip() for item in list_file]
+    num_videos = len(test_list)
+    logging.info(f'There are {num_videos} videos. with {cfg.round} times')
+    # Repeat every caption cfg.round times, keeping repeats adjacent.
+    test_list = [item for item in test_list for _ in range(cfg.round)]
+    for idx, caption in enumerate(test_list):
+        # Lines starting with '#' are treated as comments in the caption list.
+        if caption.startswith('#'):
+            logging.info(f'Skip {caption}')
+            continue
+        logging.info(f"[{idx}]/[{num_videos}] Begin to sample {caption} ...")
+        if caption == "": 
+            logging.info(f'Caption is null of {caption}, skip..')
+            continue
+        captions = [caption]
+        with torch.no_grad():
+            _, y_text, y_words = clip_encoder(text=captions) # bs * 1 *1024 [B, 1, 1024]
+        fps_tensor =  torch.tensor([cfg.target_fps], dtype=torch.long, device=gpu)
+        with torch.no_grad():
+            # NOTE(review): memory is always queried for NVML device 0, not for `gpu`.
+            pynvml.nvmlInit()
+            handle=pynvml.nvmlDeviceGetHandleByIndex(0)
+            meminfo=pynvml.nvmlDeviceGetMemoryInfo(handle)
+            logging.info(f'GPU Memory used {meminfo.used / (1024 ** 3):.2f} GB')
+            # sample images (DDIM)
+            with amp.autocast(enabled=cfg.use_fp16):
+                cur_seed = torch.initial_seed()
+                logging.info(f"Current seed {cur_seed} ...")
+                noise = torch.randn([1, 4, cfg.max_frames, int(cfg.resolution[1]/cfg.scale), int(cfg.resolution[0]/cfg.scale)])
+                noise = noise.to(gpu)
+                # First entry is the conditional branch, second the negative-prompt branch.
+                model_kwargs=[
+                    {'y': y_words, 'fps': fps_tensor},
+                    {'y': zero_y_negative, 'fps': fps_tensor}]
+                video_data = diffusion.ddim_sample_loop(
+                    noise=noise,
+                    model=model.eval(),
+                    model_kwargs=model_kwargs,
+                    guide_scale=cfg.guide_scale,
+                    ddim_timesteps=cfg.ddim_timesteps,
+                    eta=0.0)
+        video_data = 1. / cfg.scale_factor * video_data # [1, 4, 32, 46]
+        # Fold frames into the batch axis so the image decoder can process them.
+        video_data = rearrange(video_data, 'b c f h w -> (b f) c h w')
+        chunk_size = min(cfg.decoder_bs, video_data.shape[0])
+        video_data_list = torch.chunk(video_data, video_data.shape[0]//chunk_size, dim=0)
+        decode_data = []
+        for vd_data in video_data_list:
+            gen_frames = autoencoder.decode(vd_data)
+            decode_data.append(gen_frames)
+        video_data = torch.cat(decode_data, dim=0)
+        video_data = rearrange(video_data, '(b f) c h w -> b c f h w', b = cfg.batch_size)
+        text_size = cfg.resolution[-1]
+        # Strip punctuation and replace spaces so the caption is usable as a file name.
+        cap_name = re.sub(r'[^\w\s]', '', caption).replace(' ', '_')
+        file_name = f'rank_{cfg.world_size:02d}_{cfg.rank:02d}_{idx:04d}_{cap_name}.mp4'
+        local_path = os.path.join(cfg.log_dir, f'{file_name}')
+        os.makedirs(os.path.dirname(local_path), exist_ok=True)
+        # Saving is best-effort: a failure is logged and the loop moves on.
+        try:
+            save_i2vgen_video_safe(local_path, video_data.cpu(), captions, cfg.mean, cfg.std, text_size)
+            logging.info('Save video to dir %s:' % (local_path))
+        except Exception as e:
+            logging.info(f'Step: save text or video error with {e}')
+    logging.info('Congratulations! The inference is completed!')
+    # synchronize to finish some processes
+    if not cfg.debug:
+        torch.cuda.synchronize()
+        dist.barrier()
--- a/tools/modules/__init__.py
+++ b/tools/modules/__init__.py
+from .clip_embedder import FrozenOpenCLIPEmbedder
+from .autoencoder import DiagonalGaussianDistribution, AutoencoderKL
+from .clip_embedder import *
+from .autoencoder import *
+from .unet import *
+from .diffusions import *
\ No newline at end of file
--- a/tools/modules/__pycache__/__init__.cpython-38.pyc
+++ b/tools/modules/__pycache__/__init__.cpython-38.pyc
--- a/tools/modules/__pycache__/autoencoder.cpython-38.pyc
+++ b/tools/modules/__pycache__/autoencoder.cpython-38.pyc
--- a/tools/modules/__pycache__/clip_embedder.cpython-38.pyc
+++ b/tools/modules/__pycache__/clip_embedder.cpython-38.pyc
--- a/tools/modules/__pycache__/config.cpython-38.pyc
+++ b/tools/modules/__pycache__/config.cpython-38.pyc
--- a/tools/modules/autoencoder.py
+++ b/tools/modules/autoencoder.py
+import torch
+import logging
+import collections
+import numpy as np
+import torch.nn as nn
+import torch.nn.functional as F
+from utils.registry_class import AUTO_ENCODER,DISTRIBUTION
+def nonlinearity(x):
+    """Apply the swish activation, ``x * sigmoid(x)``, element-wise."""
+    gate = torch.sigmoid(x)
+    return x * gate
+def Normalize(in_channels, num_groups=32):
+    """Build an affine ``GroupNorm`` over ``in_channels`` channels with ``eps=1e-6``."""
+    group_norm = torch.nn.GroupNorm(num_groups, in_channels, eps=1e-6, affine=True)
+    return group_norm
+@torch.no_grad()
+def get_first_stage_encoding(encoder_posterior, scale_factor=1.0):
+    """Turn an encoder output into a scaled latent.
+    A ``DiagonalGaussianDistribution`` is sampled, a tensor is used as is, and
+    any other type raises ``NotImplementedError``. The result is multiplied by
+    ``scale_factor``.
+    """
+    if not isinstance(encoder_posterior, (DiagonalGaussianDistribution, torch.Tensor)):
+        raise NotImplementedError(f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented")
+    if isinstance(encoder_posterior, DiagonalGaussianDistribution):
+        latent = encoder_posterior.sample()
+    else:
+        latent = encoder_posterior
+    return scale_factor * latent
+@AUTO_ENCODER.register_class()
+class AutoencoderKL(nn.Module):
+    """Autoencoder made of an ``Encoder``/``Decoder`` pair with 1x1 convolutions around the latent.
+    ``encode`` returns a ``DiagonalGaussianDistribution`` built from
+    ``quant_conv(encoder(x))``; ``decode`` applies ``post_quant_conv`` and then
+    the decoder.
+    NOTE(review): ``on_train_batch_end`` and the EMA branch of ``log_images``
+    use ``self.model_ema`` / ``self.ema_scope``, which this class never
+    defines; they will raise ``AttributeError`` if reached with EMA enabled.
+    """
+    def __init__(self,
+                 ddconfig,
+                 embed_dim,
+                 pretrained=None,
+                 ignore_keys=None,
+                 image_key="image",
+                 colorize_nlabels=None,
+                 monitor=None,
+                 ema_decay=None,
+                 learn_logvar=False,
+                 use_vid_decoder=False,
+                 **kwargs):
+        """Build the sub-modules and optionally restore weights from ``pretrained``.
+        ``ddconfig`` is expanded as keyword arguments into both ``Encoder`` and
+        ``Decoder`` and must have a truthy ``"double_z"`` entry.
+        ``use_vid_decoder`` and ``**kwargs`` are accepted but not used here.
+        """
+        super().__init__()
+        self.learn_logvar = learn_logvar
+        self.image_key = image_key
+        self.encoder = Encoder(**ddconfig)
+        self.decoder = Decoder(**ddconfig)
+        assert ddconfig["double_z"]
+        self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1)
+        self.embed_dim = embed_dim
+        if colorize_nlabels is not None:
+            assert isinstance(colorize_nlabels, int)
+            self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1))
+        if monitor is not None:
+            self.monitor = monitor
+        self.use_ema = ema_decay is not None
+        if pretrained is not None:
+            self.init_from_ckpt(pretrained, ignore_keys=ignore_keys)
+    def init_from_ckpt(self, path, ignore_keys=None):
+        """Load weights from the ``"state_dict"`` entry of the checkpoint at ``path``.
+        Only keys containing ``first_stage_model`` are kept, with everything up
+        to and including ``first_stage_model.`` stripped. ``ignore_keys`` is
+        accepted for interface compatibility but is not consulted.
+        """
+        # torch.load unpickles the file: only load checkpoints from trusted sources.
+        sd = torch.load(path, map_location="cpu")["state_dict"]
+        sd_new = collections.OrderedDict()
+        for k, v in sd.items():
+            if 'first_stage_model' in k:
+                sd_new[k.split('first_stage_model.')[-1]] = v
+        self.load_state_dict(sd_new, strict=True)
+        logging.info(f"Restored from {path}")
+    def on_train_batch_end(self, *args, **kwargs):
+        """Update the EMA copy after a training batch when EMA is enabled."""
+        # NOTE(review): self.model_ema is never created in this class.
+        if self.use_ema:
+            self.model_ema(self)
+    def encode(self, x):
+        """Encode ``x`` and return the posterior distribution over latents."""
+        h = self.encoder(x)
+        moments = self.quant_conv(h)
+        posterior = DiagonalGaussianDistribution(moments)
+        return posterior
+    def encode_firsr_stage(self, x, scale_factor=1.0):
+        """Encode ``x`` and return a sampled latent multiplied by ``scale_factor``.
+        The misspelled name is kept because callers may depend on it.
+        """
+        h = self.encoder(x)
+        moments = self.quant_conv(h)
+        posterior = DiagonalGaussianDistribution(moments)
+        z = get_first_stage_encoding(posterior, scale_factor)
+        return z
+    def encode_first_stage(self, x, scale_factor=1.0):
+        """Correctly spelled alias of ``encode_firsr_stage``."""
+        return self.encode_firsr_stage(x, scale_factor)
+    def encode_ms(self, x):
+        """Return the encoder's list of intermediate features for ``x``.
+        The encoder is called with its second argument set to ``True`` and its
+        result is returned unchanged.
+        """
+        return self.encoder(x, True)
+    def decode(self, z, **kwargs):
+        """Map latent ``z`` back through ``post_quant_conv`` and the decoder."""
+        z = self.post_quant_conv(z)
+        dec = self.decoder(z, **kwargs)
+        return dec
+    def forward(self, input, sample_posterior=True):
+        """Encode then decode ``input``; return ``(reconstruction, posterior)``.
+        The latent is a sample from the posterior when ``sample_posterior`` is
+        true, otherwise its mode.
+        """
+        posterior = self.encode(input)
+        if sample_posterior:
+            z = posterior.sample()
+        else:
+            z = posterior.mode()
+        dec = self.decode(z)
+        return dec, posterior
+    def get_input(self, batch, k):
+        """Fetch ``batch[k]`` and move its last axis to position 1 as a float tensor.
+        A 3-D entry first gets a trailing singleton axis.
+        """
+        x = batch[k]
+        if len(x.shape) == 3:
+            x = x[..., None]
+        x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float()
+        return x
+    def get_last_layer(self):
+        """Return the weight of the decoder's output convolution."""
+        return self.decoder.conv_out.weight
+    @torch.no_grad()
+    def log_images(self, batch, only_inputs=False, log_ema=False, **kwargs):
+        """Collect inputs, reconstructions and random samples for logging.
+        Returns a dict with ``"inputs"`` and, unless ``only_inputs`` is set,
+        ``"samples"`` and ``"reconstructions"`` (plus ``*_ema`` variants when
+        ``log_ema`` or ``self.use_ema`` is true).
+        """
+        log = dict()
+        x = self.get_input(batch, self.image_key)
+        # A plain nn.Module has no `device` attribute; fall back to the first parameter's device.
+        device = getattr(self, "device", None)
+        if device is None:
+            device = next(self.parameters()).device
+        x = x.to(device)
+        if not only_inputs:
+            xrec, posterior = self(x)
+            if x.shape[1] > 3:
+                # colorize with random projection
+                assert xrec.shape[1] > 3
+                x = self.to_rgb(x)
+                xrec = self.to_rgb(xrec)
+            log["samples"] = self.decode(torch.randn_like(posterior.sample()))
+            log["reconstructions"] = xrec
+            if log_ema or self.use_ema:
+                # NOTE(review): self.ema_scope is never defined in this class.
+                with self.ema_scope():
+                    xrec_ema, posterior_ema = self(x)
+                    if x.shape[1] > 3:
+                        # colorize with random projection
+                        assert xrec_ema.shape[1] > 3
+                        xrec_ema = self.to_rgb(xrec_ema)
+                    log["samples_ema"] = self.decode(torch.randn_like(posterior_ema.sample()))
+                    log["reconstructions_ema"] = xrec_ema
+        log["inputs"] = x
+        return log
+    def to_rgb(self, x):
+        """Project ``x`` to 3 channels with a random 1x1 kernel and rescale to [-1, 1].
+        Only valid when ``image_key`` is ``"segmentation"``. The projection
+        buffer is created on first use if it was not registered in ``__init__``.
+        """
+        assert self.image_key == "segmentation"
+        if not hasattr(self, "colorize"):
+            self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x))
+        x = F.conv2d(x, weight=self.colorize)
+        # NOTE: a constant projection makes max == min, so this divides by zero.
+        x = 2.*(x-x.min())/(x.max()-x.min()) - 1.
+        return x
+@AUTO_ENCODER.register_class()
+class AutoencoderVideo(AutoencoderKL):
+    """``AutoencoderKL`` variant that bypasses both 1x1 latent convolutions.
+    ``encode`` wraps the raw encoder output in a distribution and ``decode``
+    feeds ``z`` straight to the decoder.
+    """
+    def __init__(self,
+                 ddconfig,
+                 embed_dim,
+                 pretrained=None,
+                 ignore_keys=[],
+                 image_key="image",
+                 colorize_nlabels=None,
+                 monitor=None,
+                 ema_decay=None,
+                 use_vid_decoder=True,
+                 learn_logvar=False,
+                 **kwargs):
+        """Forward all arguments to ``AutoencoderKL``; ``use_vid_decoder`` is always sent as ``True``."""
+        super().__init__(
+            ddconfig,
+            embed_dim,
+            pretrained=pretrained,
+            ignore_keys=ignore_keys,
+            image_key=image_key,
+            colorize_nlabels=colorize_nlabels,
+            monitor=monitor,
+            ema_decay=ema_decay,
+            learn_logvar=learn_logvar,
+            use_vid_decoder=True,
+            **kwargs)
+    def decode(self, z, **kwargs):
+        """Decode ``z`` directly, without applying ``post_quant_conv``."""
+        return self.decoder(z, **kwargs)
+    def encode(self, x):
+        """Return the posterior built from the encoder output, without ``quant_conv``."""
+        return DiagonalGaussianDistribution(self.encoder(x))
+class IdentityFirstStage(torch.nn.Module):
+    """First-stage stand-in whose encode/decode/forward return their input unchanged."""
+    def __init__(self, *args, vq_interface=False, **kwargs):
+        """Store ``vq_interface``; positional and other keyword arguments are ignored."""
+        # nn.Module must be initialised before attributes are assigned on the instance.
+        super().__init__()
+        self.vq_interface = vq_interface
+    def encode(self, x, *args, **kwargs):
+        """Return ``x`` unchanged."""
+        return x
+    def decode(self, x, *args, **kwargs):
+        """Return ``x`` unchanged."""
+        return x
+    def quantize(self, x, *args, **kwargs):
+        """Return ``x``, or ``(x, None, [None, None, None])`` when ``vq_interface`` is set."""
+        if self.vq_interface:
+            return x, None, [None, None, None]
+        return x
+    def forward(self, x, *args, **kwargs):
+        """Return ``x`` unchanged."""
+        return x
+@DISTRIBUTION.register_class()
+class DiagonalGaussianDistribution(object):
+    """Gaussian with diagonal covariance, parameterised by mean and log-variance.
+    ``parameters`` is split in two equal halves along dim 1: the first is the
+    mean, the second the log-variance (clamped to [-30, 20]). With
+    ``deterministic=True`` the standard deviation and variance are zero.
+    """
+    def __init__(self, parameters, deterministic=False):
+        self.parameters = parameters
+        self.mean, self.logvar = torch.chunk(parameters, 2, dim=1)
+        self.logvar = torch.clamp(self.logvar, -30.0, 20.0)
+        self.deterministic = deterministic
+        self.std = torch.exp(0.5 * self.logvar)
+        self.var = torch.exp(self.logvar)
+        if self.deterministic:
+            # zeros_like already matches the device and dtype of the mean.
+            self.var = self.std = torch.zeros_like(self.mean)
+    def sample(self):
+        """Draw ``mean + std * eps`` with standard-normal ``eps``."""
+        # Noise is drawn with the default (CPU) generator and then moved, so
+        # seeded results do not depend on the device of the parameters.
+        x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device)
+        return x
+    def kl(self, other=None):
+        """KL divergence to ``other``, or to a standard normal when ``other`` is None.
+        Sums over dims 1-3. A deterministic distribution returns a
+        single-element zero tensor on the same device as the mean.
+        """
+        if self.deterministic:
+            return self.mean.new_zeros(1)
+        if other is None:
+            return 0.5 * torch.sum(torch.pow(self.mean, 2)
+                                   + self.var - 1.0 - self.logvar,
+                                   dim=(1, 2, 3))
+        return 0.5 * torch.sum(
+            torch.pow(self.mean - other.mean, 2) / other.var
+            + self.var / other.var - 1.0 - self.logvar + other.logvar,
+            dim=(1, 2, 3))
+    def nll(self, sample, dims=(1, 2, 3)):
+        """Negative log-likelihood of ``sample``, summed over ``dims``.
+        A deterministic distribution returns a single-element zero tensor on
+        the same device as the mean.
+        """
+        if self.deterministic:
+            return self.mean.new_zeros(1)
+        logtwopi = np.log(2.0 * np.pi)
+        return 0.5 * torch.sum(
+            logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var,
+            dim=dims)
+    def mode(self):
+        """Return the mean, which is the mode of a Gaussian."""
+        return self.mean
+# -------------------------------modules--------------------------------
+class Downsample(nn.Module):
+    """Halve the spatial size with a strided 3x3 conv or 2x2 average pooling.
+    NOTE: a class with this same name is defined again later in this module;
+    that later definition is the one bound to the name after import.
+    """
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if not self.with_conv:
+            return
+        # torch conv has no asymmetric padding, so forward() pads by hand and
+        # the convolution itself uses padding=0.
+        self.conv = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+    def forward(self, x):
+        if not self.with_conv:
+            return torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
+        # Pad one pixel on the right and bottom only.
+        padded = torch.nn.functional.pad(x, (0, 1, 0, 1), mode="constant", value=0)
+        return self.conv(padded)
+class ResnetBlock(nn.Module):
+    """Residual block: two norm -> swish -> 3x3 conv stages plus a skip connection.
+    When the channel count changes, the skip path goes through a 3x3 conv
+    (``conv_shortcut=True``) or a 1x1 conv. An optional embedding ``temb`` is
+    projected and added after the first convolution.
+    """
+    def __init__(self, *, in_channels, out_channels=None, conv_shortcut=False,
+                 dropout, temb_channels=512):
+        super().__init__()
+        if out_channels is None:
+            out_channels = in_channels
+        self.in_channels = in_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        conv3x3 = dict(kernel_size=3, stride=1, padding=1)
+        # Sub-modules are created in the same order as before so parameter
+        # registration and random initialisation are unchanged.
+        self.norm1 = Normalize(in_channels)
+        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, **conv3x3)
+        if temb_channels > 0:
+            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
+        self.norm2 = Normalize(out_channels)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, **conv3x3)
+        if self.in_channels == self.out_channels:
+            return
+        if self.use_conv_shortcut:
+            self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, **conv3x3)
+        else:
+            self.nin_shortcut = torch.nn.Conv2d(
+                in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, x, temb):
+        h = self.conv1(nonlinearity(self.norm1(x)))
+        if temb is not None:
+            # Broadcast the projected embedding over the two spatial axes.
+            h = h + self.temb_proj(nonlinearity(temb))[:,:,None,None]
+        h = self.conv2(self.dropout(nonlinearity(self.norm2(h))))
+        if self.in_channels == self.out_channels:
+            return x+h
+        shortcut = self.conv_shortcut if self.use_conv_shortcut else self.nin_shortcut
+        return shortcut(x)+h
+class AttnBlock(nn.Module):
+    """Single-head self-attention over spatial positions with a residual connection.
+    NOTE: a class with this same name is defined again directly below in this
+    module; that later definition is the one bound to the name after import.
+    """
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = Normalize(in_channels)
+        def pointwise():
+            # 1x1 convolution that keeps the channel count.
+            return torch.nn.Conv2d(in_channels, in_channels,
+                                   kernel_size=1, stride=1, padding=0)
+        self.q = pointwise()
+        self.k = pointwise()
+        self.v = pointwise()
+        self.proj_out = pointwise()
+    def forward(self, x):
+        normed = self.norm(x)
+        query, key, value = self.q(normed), self.k(normed), self.v(normed)
+        batch, chans, height, width = query.shape
+        tokens = height * width
+        query = query.reshape(batch, chans, tokens).permute(0, 2, 1)  # b,hw,c
+        key = key.reshape(batch, chans, tokens)                       # b,c,hw
+        # scores[b,i,j] = sum_c query[b,i,c] * key[b,c,j], scaled by c**-0.5
+        scores = torch.bmm(query, key) * (int(chans)**(-0.5))
+        weights = torch.nn.functional.softmax(scores, dim=2)
+        value = value.reshape(batch, chans, tokens)
+        # Transpose so the bmm sums over key positions for each query position.
+        attended = torch.bmm(value, weights.permute(0, 2, 1))
+        attended = attended.reshape(batch, chans, height, width)
+        return x+self.proj_out(attended)
+class AttnBlock(nn.Module):
+    """Single-head self-attention over spatial positions with a residual connection.
+    This re-defines the ``AttnBlock`` declared immediately above in this module.
+    """
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+        self.norm = Normalize(in_channels)
+        # q, k, v and the output projection are all channel-preserving 1x1 convs,
+        # registered in this order.
+        for attr in ("q", "k", "v", "proj_out"):
+            setattr(self, attr, torch.nn.Conv2d(in_channels,
+                                                in_channels,
+                                                kernel_size=1,
+                                                stride=1,
+                                                padding=0))
+    def forward(self, x):
+        feat = self.norm(x)
+        q = self.q(feat)
+        k = self.k(feat)
+        v = self.v(feat)
+        # compute attention
+        b, c, h, w = q.shape
+        n_pos = h*w
+        q_flat = q.reshape(b, c, n_pos).permute(0, 2, 1)   # b,hw,c
+        k_flat = k.reshape(b, c, n_pos)                    # b,c,hw
+        attn = torch.bmm(q_flat, k_flat)                   # b,hw,hw
+        attn = attn * (int(c)**(-0.5))
+        attn = torch.nn.functional.softmax(attn, dim=2)
+        # attend to values
+        v_flat = v.reshape(b, c, n_pos)
+        attn_t = attn.permute(0, 2, 1)                     # b,hw,hw (first hw of k, second of q)
+        mixed = torch.bmm(v_flat, attn_t)                  # b,c,hw (hw of q)
+        mixed = self.proj_out(mixed.reshape(b, c, h, w))
+        return x+mixed
+class Upsample(nn.Module):
+    """Double the spatial size by nearest-neighbour interpolation, optionally followed by a 3x3 conv."""
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if not self.with_conv:
+            return
+        self.conv = torch.nn.Conv2d(
+            in_channels, in_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x):
+        upscaled = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
+        if not self.with_conv:
+            return upscaled
+        return self.conv(upscaled)
+class Downsample(nn.Module):
+    """Halve the spatial size with a strided 3x3 conv or 2x2 average pooling.
+    This re-defines the ``Downsample`` class declared earlier in this module.
+    """
+    def __init__(self, in_channels, with_conv):
+        super().__init__()
+        self.with_conv = with_conv
+        if self.with_conv:
+            # The conv uses padding=0 because torch cannot pad asymmetrically;
+            # forward() adds the one-sided padding itself.
+            strided = torch.nn.Conv2d(in_channels, in_channels,
+                                      kernel_size=3, stride=2, padding=0)
+            self.conv = strided
+    def forward(self, x):
+        functional = torch.nn.functional
+        if self.with_conv:
+            # One extra column on the right and one extra row at the bottom.
+            right_bottom = (0,1,0,1)
+            return self.conv(functional.pad(x, right_bottom, mode="constant", value=0))
+        return functional.avg_pool2d(x, kernel_size=2, stride=2)
+class Encoder(nn.Module):
+    """Convolutional encoder: input conv, per-level residual stages, a middle block and an output conv.
+    Each level holds ``num_res_blocks`` ``ResnetBlock`` modules, with an
+    ``AttnBlock`` after each one when the current resolution is listed in
+    ``attn_resolutions``. Every level except the last ends with a
+    ``Downsample``. The output conv emits ``2*z_channels`` channels when
+    ``double_z`` is true, otherwise ``z_channels``.
+    """
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, z_channels, double_z=True, use_linear_attn=False, attn_type="vanilla",
+                 **ignore_kwargs):
+        super().__init__()
+        # attn_type is not read again below; only vanilla AttnBlock is built.
+        if use_linear_attn:
+            attn_type = "linear"
+        self.ch = ch
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        # downsampling
+        self.conv_in = torch.nn.Conv2d(
+            in_channels, self.ch, kernel_size=3, stride=1, padding=1)
+        curr_res = resolution
+        self.in_ch_mult = (1,)+tuple(ch_mult)
+        self.down = nn.ModuleList()
+        last_level = self.num_resolutions-1
+        # Pair each level's input multiplier with its output multiplier.
+        for level, (mult_in, mult_out) in enumerate(zip(self.in_ch_mult, ch_mult)):
+            res_blocks = nn.ModuleList()
+            attn_blocks = nn.ModuleList()
+            block_in = ch*mult_in
+            block_out = ch*mult_out
+            for _ in range(self.num_res_blocks):
+                res_blocks.append(ResnetBlock(in_channels=block_in,
+                                              out_channels=block_out,
+                                              temb_channels=self.temb_ch,
+                                              dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn_blocks.append(AttnBlock(block_in))
+            stage = nn.Module()
+            stage.block = res_blocks
+            stage.attn = attn_blocks
+            if level != last_level:
+                stage.downsample = Downsample(block_in, resamp_with_conv)
+                curr_res = curr_res // 2
+            self.down.append(stage)
+        # middle
+        mid = nn.Module()
+        mid.block_1 = ResnetBlock(in_channels=block_in,
+                                  out_channels=block_in,
+                                  temb_channels=self.temb_ch,
+                                  dropout=dropout)
+        mid.attn_1 = AttnBlock(block_in)
+        mid.block_2 = ResnetBlock(in_channels=block_in,
+                                  out_channels=block_in,
+                                  temb_channels=self.temb_ch,
+                                  dropout=dropout)
+        self.mid = mid
+        # end
+        latent_channels = 2*z_channels if double_z else z_channels
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(
+            block_in, latent_channels, kernel_size=3, stride=1, padding=1)
+    def forward(self, x, return_feat=False):
+        """Encode ``x``.
+        Returns the final feature map, or, when ``return_feat`` is true, the
+        list of all intermediate features with its last entry replaced by the
+        final feature map.
+        """
+        # No timestep embedding is used by the encoder.
+        temb = None
+        # downsampling
+        feats = [self.conv_in(x)]
+        for level, stage in enumerate(self.down):
+            has_attn = len(stage.attn) > 0
+            for blk in range(self.num_res_blocks):
+                h = stage.block[blk](feats[-1], temb)
+                if has_attn:
+                    h = stage.attn[blk](h)
+                feats.append(h)
+            if level != self.num_resolutions-1:
+                feats.append(stage.downsample(feats[-1]))
+        # middle
+        h = self.mid.block_1(feats[-1], temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+        # end
+        h = self.conv_out(nonlinearity(self.norm_out(h)))
+        if not return_feat:
+            return h
+        feats[-1] = h
+        return feats
+class Decoder(nn.Module):
+    def __init__(self, *, ch, out_ch, ch_mult=(1,2,4,8), num_res_blocks,
+                 attn_resolutions, dropout=0.0, resamp_with_conv=True, in_channels,
+                 resolution, z_channels, give_pre_end=False, tanh_out=False, use_linear_attn=False,
+                 attn_type="vanilla", **ignorekwargs):
+        super().__init__()
+        if use_linear_attn: attn_type = "linear"
+        self.ch = ch
+        self.temb_ch = 0
+        self.num_resolutions = len(ch_mult)
+        self.num_res_blocks = num_res_blocks
+        self.resolution = resolution
+        self.in_channels = in_channels
+        self.give_pre_end = give_pre_end
+        self.tanh_out = tanh_out
+        # compute in_ch_mult, block_in and curr_res at lowest res
+        in_ch_mult = (1,)+tuple(ch_mult)
+        block_in = ch*ch_mult[self.num_resolutions-1]
+        curr_res = resolution // 2**(self.num_resolutions-1)
+        self.z_shape = (1,z_channels, curr_res, curr_res)
+        # logging.info("Working with z of shape {} = {} dimensions.".format(self.z_shape, np.prod(self.z_shape)))
+        # z to block_in
+        self.conv_in = torch.nn.Conv2d(z_channels,
+                                       block_in,
+                                       kernel_size=3,
+                                       stride=1,
+                                       padding=1)
+        # middle
+        self.mid = nn.Module()
+        self.mid.block_1 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        self.mid.attn_1 = AttnBlock(block_in)
+        self.mid.block_2 = ResnetBlock(in_channels=block_in,
+                                       out_channels=block_in,
+                                       temb_channels=self.temb_ch,
+                                       dropout=dropout)
+        # upsampling
+        self.up = nn.ModuleList()
+        for i_level in reversed(range(self.num_resolutions)):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_out = ch*ch_mult[i_level]
+            for i_block in range(self.num_res_blocks+1):
+                block.append(ResnetBlock(in_channels=block_in,
+                                         out_channels=block_out,
+                                         temb_channels=self.temb_ch,
+                                         dropout=dropout))
+                block_in = block_out
+                if curr_res in attn_resolutions:
+                    attn.append(AttnBlock(block_in))
+            up = nn.Module()
+            up.block = block
+            up.attn = attn
+            if i_level != 0:
+                up.upsample = Upsample(block_in, resamp_with_conv)
+                curr_res = curr_res * 2
+            self.up.insert(0, up) # prepend to get consistent order
+        # end
+        self.norm_out = Normalize(block_in)
+        self.conv_out = torch.nn.Conv2d(block_in,
+                                        out_ch,
+                                        kernel_size=3,
+                                        stride=1,
+                                        padding=1)
+    def forward(self, z, **kwargs):
+        #assert z.shape[1:] == self.z_shape[1:]
+        self.last_z_shape = z.shape
+        # timestep embedding
+        temb = None
+        # z to block_in
+        h = self.conv_in(z)
+        # middle
+        h = self.mid.block_1(h, temb)
+        h = self.mid.attn_1(h)
+        h = self.mid.block_2(h, temb)
+        # upsampling
+        for i_level in reversed(range(self.num_resolutions)):
+            for i_block in range(self.num_res_blocks+1):
+                h = self.up[i_level].block[i_block](h, temb)
+                if len(self.up[i_level].attn) > 0:
+                    h = self.up[i_level].attn[i_block](h)
+            if i_level != 0:
+                h = self.up[i_level].upsample(h)
+        # end
+        if self.give_pre_end:
+            return h
+        h = self.norm_out(h)
+        h = nonlinearity(h)
+        h = self.conv_out(h)
+        if self.tanh_out:
+            h = torch.tanh(h)
+        return h
--- a/tools/modules/clip_embedder.py
+++ b/tools/modules/clip_embedder.py
+import os
+import torch
+import logging
+import open_clip
+import numpy as np
+import torch.nn as nn
+import torchvision.transforms as T
+from utils.registry_class import EMBEDDER
+@EMBEDDER.register_class()
+class FrozenOpenCLIPEmbedder(nn.Module):
+    """
+    Uses the OpenCLIP transformer encoder for text
+    """
+    LAYERS = [
+        #"pooled",
+        "last",
+        "penultimate"
+    ]
+    def __init__(self, pretrained, arch="ViT-H-14", device="cuda", max_length=77,
+                 freeze=True, layer="last"):
+        super().__init__()
+        assert layer in self.LAYERS
+        model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=pretrained)
+        del model.visual
+        self.model = model
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        if self.layer == "last":
+            self.layer_idx = 0
+        elif self.layer == "penultimate":
+            self.layer_idx = 1
+        else:
+            raise NotImplementedError()
+    def freeze(self):
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+    def forward(self, text):
+        tokens = open_clip.tokenize(text)
+        z = self.encode_with_transformer(tokens.to(self.device))
+        return z
+    def encode_with_transformer(self, text):
+        x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
+        x = x + self.model.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.model.ln_final(x)
+        return x
+    def text_transformer_forward(self, x: torch.Tensor, attn_mask = None):
+        for i, r in enumerate(self.model.transformer.resblocks):
+            if i == len(self.model.transformer.resblocks) - self.layer_idx:
+                break
+            if self.model.transformer.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(r, x, attn_mask)
+            else:
+                x = r(x, attn_mask=attn_mask)
+        return x
+    def encode(self, text):
+        return self(text)
+@EMBEDDER.register_class()
+class FrozenOpenCLIPVisualEmbedder(nn.Module):
+    """
+    Uses the OpenCLIP transformer encoder for text
+    """
+    LAYERS = [
+        #"pooled",
+        "last",
+        "penultimate"
+    ]
+    def __init__(self, pretrained, vit_resolution=(224, 224), arch="ViT-H-14", device="cuda", max_length=77,
+                 freeze=True, layer="last"):
+        super().__init__()
+        assert layer in self.LAYERS
+        model, _, preprocess = open_clip.create_model_and_transforms(
+                arch, device=torch.device('cpu'), pretrained=pretrained)
+        # Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711)
+        del model.transformer 
+        self.model = model
+        data_white = np.ones((vit_resolution[0], vit_resolution[1], 3), dtype=np.uint8)*255
+        self.white_image = preprocess(T.ToPILImage()(data_white)).unsqueeze(0)
+        self.device = device
+        self.max_length = max_length # 77
+        if freeze:
+            self.freeze()
+        self.layer = layer # 'penultimate'
+        if self.layer == "last":
+            self.layer_idx = 0
+        elif self.layer == "penultimate":
+            self.layer_idx = 1
+        else:
+            raise NotImplementedError()
+    def freeze(self): # model.encode_image(torch.randn(2,3,224,224))
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+    def forward(self, image):
+        # tokens = open_clip.tokenize(text)
+        z = self.model.encode_image(image.to(self.device))
+        return z
+    def encode_with_transformer(self, text):
+        x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
+        x = x + self.model.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.model.ln_final(x)
+        return x
+    def text_transformer_forward(self, x: torch.Tensor, attn_mask = None):
+        for i, r in enumerate(self.model.transformer.resblocks):
+            if i == len(self.model.transformer.resblocks) - self.layer_idx:
+                break
+            if self.model.transformer.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(r, x, attn_mask)
+            else:
+                x = r(x, attn_mask=attn_mask)
+        return x
+    def encode(self, text):
+        return self(text)
+@EMBEDDER.register_class()
+class FrozenOpenCLIPTextVisualEmbedder(nn.Module):
+    """
+    Uses the OpenCLIP transformer encoder for text
+    """
+    LAYERS = [
+        #"pooled",
+        "last",
+        "penultimate"
+    ]
+    def __init__(self, pretrained, arch="ViT-H-14", device="cuda", max_length=77,
+                 freeze=True, layer="last", **kwargs):
+        super().__init__()
+        assert layer in self.LAYERS
+        model, _, _ = open_clip.create_model_and_transforms(arch, device=torch.device('cpu'), pretrained=pretrained)
+        self.model = model
+        self.device = device
+        self.max_length = max_length
+        if freeze:
+            self.freeze()
+        self.layer = layer
+        if self.layer == "last":
+            self.layer_idx = 0
+        elif self.layer == "penultimate":
+            self.layer_idx = 1
+        else:
+            raise NotImplementedError()
+    def freeze(self):
+        self.model = self.model.eval()
+        for param in self.parameters():
+            param.requires_grad = False
+    # def forward(self, text):
+    #     tokens = open_clip.tokenize(text)
+    #     z = self.encode_with_transformer(tokens.to(self.device))
+    #     return z
+    def forward(self, image=None, text=None):
+        # xi = self.encode_image(image) if image is not None else None
+        xi = self.model.encode_image(image.to(self.device)) if image is not None else None
+        # tokens = open_clip.tokenize(text, truncate=True)
+        tokens = open_clip.tokenize(text)
+        xt, x = self.encode_with_transformer(tokens.to(self.device))
+        return xi, xt, x
+    def encode_with_transformer(self, text):
+        x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
+        x = x + self.model.positional_embedding
+        x = x.permute(1, 0, 2)  # NLD -> LND
+        x = self.text_transformer_forward(x, attn_mask=self.model.attn_mask)
+        x = x.permute(1, 0, 2)  # LND -> NLD
+        x = self.model.ln_final(x)
+        xt = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.model.text_projection
+        return xt, x
+    # def encode_with_transformer(self, text):
+    #     x = self.model.token_embedding(text)  # [batch_size, n_ctx, d_model]
+    #     x = x + self.model.positional_embedding
+    #     x = x.permute(1, 0, 2)  # NLD -> LND
+    #     x = self.model.transformer(x)
+    #     x = x.permute(1, 0, 2)  # LND -> NLD
+    #     x = self.model.ln_final(x)
+    #     xt = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.model.text_projection
+    #     # text embedding, token embedding
+    #     return xt, x
+    def encode_image(self, image):
+        return self.model.visual(image)
+    def text_transformer_forward(self, x: torch.Tensor, attn_mask = None):
+        for i, r in enumerate(self.model.transformer.resblocks):
+            if i == len(self.model.transformer.resblocks) - self.layer_idx:
+                break
+            if self.model.transformer.grad_checkpointing and not torch.jit.is_scripting():
+                x = checkpoint(r, x, attn_mask)
+            else:
+                x = r(x, attn_mask=attn_mask)
+        return x
+    def encode(self, text):
+        return self(text)
--- a/tools/modules/config.py
+++ b/tools/modules/config.py
+import torch
+import logging
+import os.path as osp
+from datetime import datetime
+from easydict import EasyDict
+import os
+# Global configuration object; every setting below is attached to it.
+cfg = EasyDict(__name__='Config: VideoLDM Decoder')
+# -------------------------------distributed training--------------------------
+# world_size = WORLD_SIZE environment variable (default 1) multiplied by the
+# number of CUDA devices reported by torch on this machine.
+pmi_world_size = int(os.getenv('WORLD_SIZE', 1))
+gpus_per_machine = torch.cuda.device_count()
+world_size = pmi_world_size * gpus_per_machine
+# -----------------------------------------------------------------------------
+# ---------------------------Dataset Parameter---------------------------------
+cfg.mean = [0.5, 0.5, 0.5]
+cfg.std = [0.5, 0.5, 0.5]
+cfg.max_words = 1000
+cfg.num_workers = 8
+cfg.prefetch_factor = 2
+# Placeholder values
+cfg.resolution = [448, 256]
+cfg.vit_out_dim = 1024
+cfg.vit_resolution = 336
+cfg.depth_clamp = 10.0
+cfg.misc_size = 384
+cfg.depth_std = 20.0
+cfg.frame_lens = [32, 32, 32, 1]
+cfg.sample_fps = [4, ]
+# The dataset dicts read cfg.max_words / cfg.resolution when this module is
+# imported; reassigning those attributes later does not update the dicts.
+cfg.vid_dataset = {
+    'type': 'VideoBaseDataset',
+    'data_list': [],
+    'max_words': cfg.max_words,
+    'resolution': cfg.resolution}
+cfg.img_dataset = {
+    'type': 'ImageBaseDataset',
+    'data_list': ['laion_400m',],
+    'max_words': cfg.max_words,
+    'resolution': cfg.resolution}
+# Keys are stringified integers.
+# NOTE(review): presumably frame count -> batch size; confirm against the consumer.
+cfg.batch_sizes = {
+    str(1):256,
+    str(4):4,
+    str(8):4,
+    str(16):4}
+# -----------------------------------------------------------------------------
+# ---------------------------Model Parameters----------------------------------
+# Diffusion
+cfg.Diffusion = {
+    'type': 'DiffusionDDIM',
+    'schedule': 'cosine', # cosine
+    'schedule_param': {
+        'num_timesteps': 1000,
+        'cosine_s': 0.008,
+        'zero_terminal_snr': True,
+    },
+    'mean_type': 'v',           # [v, eps]
+    'loss_type': 'mse',
+    'var_type': 'fixed_small',
+    'rescale_timesteps': False,
+    'noise_strength': 0.1,
+    'ddim_timesteps': 50
+}
+# Same value as cfg.Diffusion['ddim_timesteps'] above, stored separately.
+cfg.ddim_timesteps = 50  # official: 250
+cfg.use_div_loss = False
+# classifier-free guidance
+cfg.p_zero = 0.9
+cfg.guide_scale = 3.0
+# clip vision encoder
+cfg.vit_mean = [0.48145466, 0.4578275, 0.40821073]
+cfg.vit_std = [0.26862954, 0.26130258, 0.27577711]
+# Model
+cfg.scale_factor = 0.18215  
+cfg.use_checkpoint = True
+cfg.use_sharded_ddp = False
+cfg.use_fsdp = False 
+cfg.use_fp16 = True
+cfg.temporal_attention = True
+# 'y_dim', 'temporal_attention' and 'use_checkpoint' are read from cfg when
+# this module is imported; reassigning those cfg attributes later does not
+# update this dict.
+cfg.UNet = {
+    'type': 'UNetSD',
+    'in_dim': 4,
+    'dim': 320,
+    'y_dim': cfg.vit_out_dim,
+    'context_dim': 1024,
+    'out_dim': 8,
+    'dim_mult': [1, 2, 4, 4],
+    'num_heads': 8,
+    'head_dim': 64,
+    'num_res_blocks': 2,
+    'attn_scales': [1 / 1, 1 / 2, 1 / 4],
+    'dropout': 0.1,
+    'temporal_attention': cfg.temporal_attention,
+    'temporal_attn_times': 1,
+    'use_checkpoint': cfg.use_checkpoint,
+    'use_fps_condition': False,
+    'use_sim_mask': False
+}
+# autoencoder from Stable Diffusion
+cfg.guidances = []
+cfg.auto_encoder = {
+    'type': 'AutoencoderKL',
+    'ddconfig': {
+        'double_z': True, 
+        'z_channels': 4,
+        'resolution': 256, 
+        'in_channels': 3,
+        'out_ch': 3, 
+        'ch': 128, 
+        'ch_mult': [1, 2, 4, 4],
+        'num_res_blocks': 2, 
+        'attn_resolutions': [], 
+        'dropout': 0.0,
+        'video_kernel_size': [3, 1, 1]
+    },
+    'embed_dim': 4,
+    'pretrained': 'i2vgen-xl/v2-1_512-ema-pruned.ckpt'
+}
+# clip embedder
+cfg.embedder = {
+    'type': 'FrozenOpenCLIPEmbedder',
+    'layer': 'penultimate',
+    'pretrained': 'i2vgen-xl/open_clip_pytorch_model.bin'
+}
+# -----------------------------------------------------------------------------
+# ---------------------------Training Settings---------------------------------
+# training and optimizer
+cfg.ema_decay = 0.9999
+cfg.num_steps = 600000
+cfg.lr = 5e-5
+cfg.weight_decay = 0.0
+cfg.betas = (0.9, 0.999)
+cfg.eps = 1.0e-8
+cfg.chunk_size = 16
+cfg.decoder_bs = 8
+cfg.alpha = 0.7
+cfg.save_ckp_interval = 1000
+# scheduler
+cfg.warmup_steps = 10
+cfg.decay_mode = 'cosine'
+# acceleration
+# EMA is enabled by default and switched off when world_size (computed at the
+# top of this file) is below 2.
+cfg.use_ema = True  
+if world_size<2:
+    cfg.use_ema = False
+cfg.load_from = None
+# -----------------------------------------------------------------------------
+# ----------------------------Pretrain Settings---------------------------------
+cfg.Pretrain = {
+    'type': 'pretrain_specific_strategies',
+    'fix_weight': False,
+    'grad_scale': 0.2,
+    'resume_checkpoint': 'models/jiuniu_0267000.pth',
+    'sd_keys_path': 'i2vgen-xl/stable_diffusion_image_key_temporal_attention_x1.json',
+}
+# -----------------------------------------------------------------------------
+# -----------------------------Visual-------------------------------------------
+# Visualization of videos
+cfg.viz_interval = 1000
+cfg.visual_train = {
+    'type': 'VisualTrainTextImageToVideo',
+}
+cfg.visual_inference = {
+    'type': 'VisualGeneratedVideos',
+}
+cfg.inference_list_path = ''
+# logging
+cfg.log_interval = 100
+# Default log_dir
+cfg.log_dir = 'workspace/temp_dir'
+# -----------------------------------------------------------------------------
+# ---------------------------Others--------------------------------------------
+# random seed
+cfg.seed = 8888
+# default negative prompt text
+cfg.negative_prompt = 'Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms'
+# -----------------------------------------------------------------------------
--- a/tools/modules/diffusions/__init__.py
+++ b/tools/modules/diffusions/__init__.py
+from .diffusion_ddim import *
--- a/tools/modules/diffusions/__pycache__/__init__.cpython-38.pyc
+++ b/tools/modules/diffusions/__pycache__/__init__.cpython-38.pyc
--- a/tools/modules/diffusions/__pycache__/diffusion_ddim.cpython-38.pyc
+++ b/tools/modules/diffusions/__pycache__/diffusion_ddim.cpython-38.pyc
--- a/tools/modules/diffusions/__pycache__/losses.cpython-38.pyc
+++ b/tools/modules/diffusions/__pycache__/losses.cpython-38.pyc
--- a/tools/modules/diffusions/__pycache__/schedules.cpython-38.pyc
+++ b/tools/modules/diffusions/__pycache__/schedules.cpython-38.pyc
--- a/tools/modules/diffusions/diffusion_ddim.py
+++ b/tools/modules/diffusions/diffusion_ddim.py
+import torch
+import math
+from utils.registry_class import DIFFUSION
+from .schedules import beta_schedule, sigma_schedule
+from .losses import kl_divergence, discretized_gaussian_log_likelihood
+# from .dpm_solver import NoiseScheduleVP, model_wrapper_guided_diffusion, model_wrapper, DPM_Solver
+def _i(tensor, t, x):
+    r"""Index tensor using t and format the output according to x.
+    """
+    if tensor.device != x.device:
+        tensor = tensor.to(x.device)
+    shape = (x.size(0), ) + (1, ) * (x.ndim - 1)
+    return tensor[t].view(shape).to(x)
+@DIFFUSION.register_class()
+class DiffusionDDIMSR(object):
+    def __init__(self, reverse_diffusion, forward_diffusion, **kwargs):
+        from .diffusion_gauss import GaussianDiffusion
+        self.reverse_diffusion = GaussianDiffusion(sigmas=sigma_schedule(reverse_diffusion.schedule, **reverse_diffusion.schedule_param), 
+                                                   prediction_type=reverse_diffusion.mean_type)
+        self.forward_diffusion = GaussianDiffusion(sigmas=sigma_schedule(forward_diffusion.schedule, **forward_diffusion.schedule_param), 
+                                                   prediction_type=forward_diffusion.mean_type)
+@DIFFUSION.register_class()
+class DiffusionDDIM(object):
+    def __init__(self,
+                 schedule='linear_sd',
+                 schedule_param={},
+                 mean_type='eps',
+                 var_type='learned_range',
+                 loss_type='mse',
+                 epsilon = 1e-12,
+                 rescale_timesteps=False,
+                 noise_strength=0.0, 
+                 **kwargs):
+        # check input
+        # check input
+        assert mean_type in ['x0', 'x_{t-1}', 'eps', 'v']
+        assert var_type in ['learned', 'learned_range', 'fixed_large', 'fixed_small']
+        assert loss_type in ['mse', 'rescaled_mse', 'kl', 'rescaled_kl', 'l1', 'rescaled_l1','charbonnier']
+        betas = beta_schedule(schedule, **schedule_param)
+        assert min(betas) > 0 and max(betas) <= 1
+        if not isinstance(betas, torch.DoubleTensor):
+            betas = torch.tensor(betas, dtype=torch.float64)
+        self.betas = betas
+        self.num_timesteps = len(betas)
+        self.mean_type = mean_type # eps
+        self.var_type = var_type # 'fixed_small'
+        self.loss_type = loss_type # mse
+        self.epsilon = epsilon # 1e-12
+        self.rescale_timesteps = rescale_timesteps # False
+        self.noise_strength = noise_strength # 0.0
+        # alphas
+        alphas = 1 - self.betas
+        self.alphas_cumprod = torch.cumprod(alphas, dim=0)
+        self.alphas_cumprod_prev = torch.cat([alphas.new_ones([1]), self.alphas_cumprod[:-1]])
+        self.alphas_cumprod_next = torch.cat([self.alphas_cumprod[1:], alphas.new_zeros([1])])
+        # q(x_t | x_{t-1})
+        self.sqrt_alphas_cumprod = torch.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = torch.sqrt(1.0 - self.alphas_cumprod)
+        self.log_one_minus_alphas_cumprod = torch.log(1.0 - self.alphas_cumprod)
+        self.sqrt_recip_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod)
+        self.sqrt_recipm1_alphas_cumprod = torch.sqrt(1.0 / self.alphas_cumprod - 1)
+        # q(x_{t-1} | x_t, x_0)
+        self.posterior_variance = betas * (1.0 - self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        self.posterior_log_variance_clipped = torch.log(self.posterior_variance.clamp(1e-20))
+        self.posterior_mean_coef1 = betas * torch.sqrt(self.alphas_cumprod_prev) / (1.0 - self.alphas_cumprod)
+        self.posterior_mean_coef2 = (1.0 - self.alphas_cumprod_prev) * torch.sqrt(alphas) / (1.0 - self.alphas_cumprod)
+    def sample_loss(self, x0, noise=None):
+        if noise is None:
+            noise = torch.randn_like(x0)
+            if self.noise_strength > 0:
+                b, c, f, _, _= x0.shape
+                offset_noise = torch.randn(b, c, f, 1, 1, device=x0.device)
+                noise = noise + self.noise_strength * offset_noise
+        return noise
+    def q_sample(self, x0, t, noise=None):
+        r"""Sample from q(x_t | x_0).
+        """
+        # noise = torch.randn_like(x0) if noise is None else noise
+        noise = self.sample_loss(x0, noise)
+        return _i(self.sqrt_alphas_cumprod, t, x0) * x0 + \
+               _i(self.sqrt_one_minus_alphas_cumprod, t, x0) * noise
+    def q_mean_variance(self, x0, t):
+        r"""Distribution of q(x_t | x_0).
+        """
+        mu = _i(self.sqrt_alphas_cumprod, t, x0) * x0
+        var = _i(1.0 - self.alphas_cumprod, t, x0)
+        log_var = _i(self.log_one_minus_alphas_cumprod, t, x0)
+        return mu, var, log_var
+    def q_posterior_mean_variance(self, x0, xt, t):
+        r"""Distribution of q(x_{t-1} | x_t, x_0).
+        """
+        mu = _i(self.posterior_mean_coef1, t, xt) * x0 + _i(self.posterior_mean_coef2, t, xt) * xt
+        var = _i(self.posterior_variance, t, xt)
+        log_var = _i(self.posterior_log_variance_clipped, t, xt)
+        return mu, var, log_var
+    @torch.no_grad()
+    def p_sample(self, xt, t, model, model_kwargs={}, clamp=None, percentile=None, condition_fn=None, guide_scale=None):
+        r"""Sample from p(x_{t-1} | x_t).
+            - condition_fn: for classifier-based guidance (guided-diffusion).
+            - guide_scale: for classifier-free guidance (glide/dalle-2).
+        """
+        # predict distribution of p(x_{t-1} | x_t)
+        mu, var, log_var, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp, percentile, guide_scale)
+        # random sample (with optional conditional function)
+        noise = torch.randn_like(xt)
+        mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1)))  # no noise when t == 0
+        if condition_fn is not None:
+            grad = condition_fn(xt, self._scale_timesteps(t), **model_kwargs)
+            mu = mu.float() + var * grad.float()
+        xt_1 = mu + mask * torch.exp(0.5 * log_var) * noise
+        return xt_1, x0
+    @torch.no_grad()
+    def p_sample_loop(self, noise, model, model_kwargs={}, clamp=None, percentile=None, condition_fn=None, guide_scale=None):
+        r"""Sample from p(x_{t-1} | x_t) p(x_{t-2} | x_{t-1}) ... p(x_0 | x_1).
+        """
+        # prepare input
+        b = noise.size(0)
+        xt = noise
+        # diffusion process
+        for step in torch.arange(self.num_timesteps).flip(0):
+            t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
+            xt, _ = self.p_sample(xt, t, model, model_kwargs, clamp, percentile, condition_fn, guide_scale)
+        return xt
+    def p_mean_variance(self, xt, t, model, model_kwargs={}, clamp=None, percentile=None, guide_scale=None):
+        r"""Distribution of p(x_{t-1} | x_t).
+        """
+        # predict distribution
+        if guide_scale is None:
+            out = model(xt, self._scale_timesteps(t), **model_kwargs)
+        else:
+            # classifier-free guidance
+            # (model_kwargs[0]: conditional kwargs; model_kwargs[1]: non-conditional kwargs)
+            assert isinstance(model_kwargs, list) and len(model_kwargs) == 2
+            y_out = model(xt, self._scale_timesteps(t), **model_kwargs[0])
+            u_out = model(xt, self._scale_timesteps(t), **model_kwargs[1])
+            dim = y_out.size(1) if self.var_type.startswith('fixed') else y_out.size(1) // 2
+            out = torch.cat([
+                u_out[:, :dim] + guide_scale * (y_out[:, :dim] - u_out[:, :dim]),
+                y_out[:, dim:]], dim=1) # guide_scale=9.0
+        # compute variance
+        if self.var_type == 'learned':
+            out, log_var = out.chunk(2, dim=1)
+            var = torch.exp(log_var)
+        elif self.var_type == 'learned_range':
+            out, fraction = out.chunk(2, dim=1)
+            min_log_var = _i(self.posterior_log_variance_clipped, t, xt)
+            max_log_var = _i(torch.log(self.betas), t, xt)
+            fraction = (fraction + 1) / 2.0
+            log_var = fraction * max_log_var + (1 - fraction) * min_log_var
+            var = torch.exp(log_var)
+        elif self.var_type == 'fixed_large':
+            var = _i(torch.cat([self.posterior_variance[1:2], self.betas[1:]]), t, xt)
+            log_var = torch.log(var)
+        elif self.var_type == 'fixed_small':
+            var = _i(self.posterior_variance, t, xt)
+            log_var = _i(self.posterior_log_variance_clipped, t, xt)
+        # compute mean and x0
+        if self.mean_type == 'x_{t-1}':
+            mu = out  # x_{t-1}
+            x0 = _i(1.0 / self.posterior_mean_coef1, t, xt) * mu - \
+                 _i(self.posterior_mean_coef2 / self.posterior_mean_coef1, t, xt) * xt
+        elif self.mean_type == 'x0':
+            x0 = out
+            mu, _, _ = self.q_posterior_mean_variance(x0, xt, t)
+        elif self.mean_type == 'eps':
+            x0 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - \
+                 _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * out
+            mu, _, _ = self.q_posterior_mean_variance(x0, xt, t)
+        elif self.mean_type == 'v':
+            x0 = _i(self.sqrt_alphas_cumprod, t, xt) * xt - \
+                 _i(self.sqrt_one_minus_alphas_cumprod, t, xt) * out
+            mu, _, _ = self.q_posterior_mean_variance(x0, xt, t)
+        # restrict the range of x0
+        if percentile is not None:
+            assert percentile > 0 and percentile <= 1  # e.g., 0.995
+            s = torch.quantile(x0.flatten(1).abs(), percentile, dim=1).clamp_(1.0).view(-1, 1, 1, 1)
+            x0 = torch.min(s, torch.max(-s, x0)) / s
+        elif clamp is not None:
+            x0 = x0.clamp(-clamp, clamp)
+        return mu, var, log_var, x0
+    @torch.no_grad()
+    def ddim_sample(self, xt, t, model, model_kwargs={}, clamp=None, percentile=None, condition_fn=None, guide_scale=None, ddim_timesteps=20, eta=0.0):
+        r"""Sample from p(x_{t-1} | x_t) using DDIM.
+            - condition_fn: for classifier-based guidance (guided-diffusion).
+            - guide_scale: for classifier-free guidance (glide/dalle-2).
+        """
+        stride = self.num_timesteps // ddim_timesteps
+        # predict distribution of p(x_{t-1} | x_t)
+        _, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp, percentile, guide_scale)
+        if condition_fn is not None:
+            # x0 -> eps
+            alpha = _i(self.alphas_cumprod, t, xt)
+            eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / \
+                  _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+            eps = eps - (1 - alpha).sqrt() * condition_fn(xt, self._scale_timesteps(t), **model_kwargs)
+            # eps -> x0
+            x0 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - \
+                 _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * eps
+        # derive variables
+        eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / \
+              _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+        alphas = _i(self.alphas_cumprod, t, xt)
+        alphas_prev = _i(self.alphas_cumprod, (t - stride).clamp(0), xt)
+        sigmas = eta * torch.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
+        # random sample
+        noise = torch.randn_like(xt)
+        direction = torch.sqrt(1 - alphas_prev - sigmas ** 2) * eps
+        mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1)))
+        xt_1 = torch.sqrt(alphas_prev) * x0 + direction + mask * sigmas * noise
+        return xt_1, x0
+    @torch.no_grad()
+    def ddim_sample_loop(self, noise, model, model_kwargs={}, clamp=None, percentile=None, condition_fn=None, guide_scale=None, ddim_timesteps=20, eta=0.0):
+        # prepare input
+        b = noise.size(0)
+        xt = noise
+        # diffusion process (TODO: clamp is inaccurate! Consider replacing the stride by explicit prev/next steps)
+        steps = (1 + torch.arange(0, self.num_timesteps, self.num_timesteps // ddim_timesteps)).clamp(0, self.num_timesteps - 1).flip(0)
+        for step in steps:
+            t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
+            xt, _ = self.ddim_sample(xt, t, model, model_kwargs, clamp, percentile, condition_fn, guide_scale, ddim_timesteps, eta)
+        return xt
+    @torch.no_grad()
+    def ddim_reverse_sample(self, xt, t, model, model_kwargs={}, clamp=None, percentile=None, guide_scale=None, ddim_timesteps=20):
+        r"""Sample from p(x_{t+1} | x_t) using DDIM reverse ODE (deterministic).
+        """
+        stride = self.num_timesteps // ddim_timesteps
+        # predict distribution of p(x_{t-1} | x_t)
+        _, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp, percentile, guide_scale)
+        # derive variables
+        eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / \
+              _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+        alphas_next = _i(
+            torch.cat([self.alphas_cumprod, self.alphas_cumprod.new_zeros([1])]),
+            (t + stride).clamp(0, self.num_timesteps), xt)
+        # reverse sample
+        mu = torch.sqrt(alphas_next) * x0 + torch.sqrt(1 - alphas_next) * eps
+        return mu, x0
+    @torch.no_grad()
+    def ddim_reverse_sample_loop(self, x0, model, model_kwargs={}, clamp=None, percentile=None, guide_scale=None, ddim_timesteps=20):
+        # prepare input
+        b = x0.size(0)
+        xt = x0
+        # reconstruction steps
+        steps = torch.arange(0, self.num_timesteps, self.num_timesteps // ddim_timesteps)
+        for step in steps:
+            t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
+            xt, _ = self.ddim_reverse_sample(xt, t, model, model_kwargs, clamp, percentile, guide_scale, ddim_timesteps)
+        return xt
+    @torch.no_grad()
+    def plms_sample(self, xt, t, model, model_kwargs={}, clamp=None, percentile=None, condition_fn=None, guide_scale=None, plms_timesteps=20):
+        r"""Sample from p(x_{t-1} | x_t) using PLMS.
+            - condition_fn: for classifier-based guidance (guided-diffusion).
+            - guide_scale: for classifier-free guidance (glide/dalle-2).
+        """
+        stride = self.num_timesteps // plms_timesteps
+        # function for compute eps
+        def compute_eps(xt, t):
+            # predict distribution of p(x_{t-1} | x_t)
+            _, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp, percentile, guide_scale)
+            # condition
+            if condition_fn is not None:
+                # x0 -> eps
+                alpha = _i(self.alphas_cumprod, t, xt)
+                eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / \
+                      _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+                eps = eps - (1 - alpha).sqrt() * condition_fn(xt, self._scale_timesteps(t), **model_kwargs)
+                # eps -> x0
+                x0 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - \
+                     _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * eps
+            # derive eps
+            eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / \
+                  _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+            return eps
+        # function for compute x_0 and x_{t-1}
+        def compute_x0(eps, t):
+            # eps -> x0
+            x0 = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - \
+                 _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * eps
+            # deterministic sample
+            alphas_prev = _i(self.alphas_cumprod, (t - stride).clamp(0), xt)
+            direction = torch.sqrt(1 - alphas_prev) * eps
+            mask = t.ne(0).float().view(-1, *((1, ) * (xt.ndim - 1)))
+            xt_1 = torch.sqrt(alphas_prev) * x0 + direction
+            return xt_1, x0
+        # PLMS sample
+        eps = compute_eps(xt, t)
+        if len(eps_cache) == 0:
+            # 2nd order pseudo improved Euler
+            xt_1, x0 = compute_x0(eps, t)
+            eps_next = compute_eps(xt_1, (t - stride).clamp(0))
+            eps_prime = (eps + eps_next) / 2.0
+        elif len(eps_cache) == 1:
+            # 2nd order pseudo linear multistep (Adams-Bashforth)
+            eps_prime = (3 * eps - eps_cache[-1]) / 2.0
+        elif len(eps_cache) == 2:
+            # 3nd order pseudo linear multistep (Adams-Bashforth)
+            eps_prime = (23 * eps - 16 * eps_cache[-1] + 5 * eps_cache[-2]) / 12.0
+        elif len(eps_cache) >= 3:
+            # 4nd order pseudo linear multistep (Adams-Bashforth)
+            eps_prime = (55 * eps - 59 * eps_cache[-1] + 37 * eps_cache[-2] - 9 * eps_cache[-3]) / 24.0
+        xt_1, x0 = compute_x0(eps_prime, t)
+        return xt_1, x0, eps
+    @torch.no_grad()
+    def plms_sample_loop(self, noise, model, model_kwargs={}, clamp=None, percentile=None, condition_fn=None, guide_scale=None, plms_timesteps=20):
+        # prepare input
+        b = noise.size(0)
+        xt = noise
+        # diffusion process
+        steps = (1 + torch.arange(0, self.num_timesteps, self.num_timesteps // plms_timesteps)).clamp(0, self.num_timesteps - 1).flip(0)
+        eps_cache = []
+        for step in steps:
+            # PLMS sampling step
+            t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
+            xt, _, eps = self.plms_sample(xt, t, model, model_kwargs, clamp, percentile, condition_fn, guide_scale, plms_timesteps, eps_cache)
+            # update eps cache
+            eps_cache.append(eps)
+            if len(eps_cache) >= 4:
+                eps_cache.pop(0)
+        return xt
+    def loss(self, x0, t, model, model_kwargs={}, noise=None, weight = None, use_div_loss= False):
+        r"""Compute the training loss for clean samples `x0` noised to timesteps `t`.
+
+        The branch taken depends on `self.loss_type`:
+            - 'kl' / 'rescaled_kl': the result of `self.variational_lower_bound`.
+            - 'mse' / 'rescaled_mse' / 'l1' / 'rescaled_l1': regression towards the
+              target selected by `self.mean_type`, plus a VLB term when the variance
+              is learned and an optional diversity term (`use_div_loss`).
+            - 'charbonnier': sqrt((out - target)^2 + self.epsilon) regression.
+        `weight`, if given, is multiplied into the regression loss.
+        NOTE(review): any other `loss_type` leaves `loss` unassigned, so the final
+        `return` raises UnboundLocalError.
+        """
+        # noise = torch.randn_like(x0) if noise is None else noise # [80, 4, 8, 32, 32]
+        # `sample_loss` is defined elsewhere; presumably it draws noise when `noise` is None -- TODO confirm
+        noise = self.sample_loss(x0, noise)
+        xt = self.q_sample(x0, t, noise=noise)
+        # compute loss
+        if self.loss_type in ['kl', 'rescaled_kl']:
+            loss, _ = self.variational_lower_bound(x0, xt, t, model, model_kwargs)
+            if self.loss_type == 'rescaled_kl':
+                loss = loss * self.num_timesteps
+        elif self.loss_type in ['mse', 'rescaled_mse', 'l1', 'rescaled_l1']: # self.loss_type: mse
+            out = model(xt, self._scale_timesteps(t), **model_kwargs)
+            # VLB for variation
+            loss_vlb = 0.0
+            if self.var_type in ['learned', 'learned_range']: # self.var_type: 'fixed_small'
+                # the model output carries the mean prediction and the variance along dim 1
+                out, var = out.chunk(2, dim=1)
+                frozen = torch.cat([out.detach(), var], dim=1)  # learn var without affecting the prediction of mean
+                loss_vlb, _ = self.variational_lower_bound(x0, xt, t, model=lambda *args, **kwargs: frozen)
+                if self.loss_type.startswith('rescaled_'):
+                    loss_vlb = loss_vlb * self.num_timesteps / 1000.0
+            # MSE/L1 for x0/eps
+            # target = {'eps': noise, 'x0': x0, 'x_{t-1}': self.q_posterior_mean_variance(x0, xt, t)[0]}[self.mean_type]
+            # every dict value is evaluated eagerly, whichever `mean_type` is selected
+            target = {
+                'eps': noise, 
+                'x0': x0, 
+                'x_{t-1}': self.q_posterior_mean_variance(x0, xt, t)[0], 
+                'v':_i(self.sqrt_alphas_cumprod, t, xt) * noise - _i(self.sqrt_one_minus_alphas_cumprod, t, xt) * x0}[self.mean_type]
+            # |out - target| for the l1 variants, (out - target)^2 otherwise, averaged per sample
+            loss = (out - target).pow(1 if self.loss_type.endswith('l1') else 2).abs().flatten(1).mean(dim=1)
+            # here the weight is applied after the per-sample reduction
+            if weight is not None:
+                loss = loss*weight   
+            # div loss
+            if use_div_loss and self.mean_type == 'eps' and x0.shape[2]>1:
+                # derive  x0
+                x0_ = _i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - \
+                    _i(self.sqrt_recipm1_alphas_cumprod, t, xt) * out
+                # # derive xt_1, set eta=0 as ddim
+                # alphas_prev = _i(self.alphas_cumprod, (t - 1).clamp(0), xt)
+                # direction = torch.sqrt(1 - alphas_prev) * out
+                # xt_1 = torch.sqrt(alphas_prev) * x0_ + direction
+                # ncfhw, std on f
+                # the penalty grows as the std of the predicted x0 along dim 2 shrinks
+                div_loss = 0.001/(x0_.std(dim=2).flatten(1).mean(dim=1)+1e-4)
+                # print(div_loss,loss)
+                loss = loss+div_loss
+            # total loss
+            loss = loss + loss_vlb
+        elif self.loss_type in ['charbonnier']:
+            out = model(xt, self._scale_timesteps(t), **model_kwargs)
+            # VLB for variation
+            loss_vlb = 0.0
+            if self.var_type in ['learned', 'learned_range']:
+                out, var = out.chunk(2, dim=1)
+                frozen = torch.cat([out.detach(), var], dim=1)  # learn var without affecting the prediction of mean
+                loss_vlb, _ = self.variational_lower_bound(x0, xt, t, model=lambda *args, **kwargs: frozen)
+                # NOTE(review): loss_type is 'charbonnier' in this branch, so this test is never true
+                if self.loss_type.startswith('rescaled_'):
+                    loss_vlb = loss_vlb * self.num_timesteps / 1000.0
+            # MSE/L1 for x0/eps
+            # NOTE(review): there is no 'v' entry here, so mean_type == 'v' raises KeyError
+            target = {'eps': noise, 'x0': x0, 'x_{t-1}': self.q_posterior_mean_variance(x0, xt, t)[0]}[self.mean_type]
+            loss = torch.sqrt((out - target)**2 + self.epsilon)
+            # unlike the mse/l1 branch, the weight is applied element-wise before the reduction
+            if weight is not None:
+                loss = loss*weight
+            loss = loss.flatten(1).mean(dim=1)
+            # total loss
+            loss = loss + loss_vlb
+        return loss
+    def variational_lower_bound(self, x0, xt, t, model, model_kwargs={}, clamp=None, percentile=None):
+        # compute groundtruth and predicted distributions
+        mu1, _, log_var1 = self.q_posterior_mean_variance(x0, xt, t)
+        mu2, _, log_var2, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp, percentile)
+        # compute KL loss
+        kl = kl_divergence(mu1, log_var1, mu2, log_var2)
+        kl = kl.flatten(1).mean(dim=1) / math.log(2.0)
+        # compute discretized NLL loss (for p(x0 | x1) only)
+        nll = -discretized_gaussian_log_likelihood(x0, mean=mu2, log_scale=0.5 * log_var2)
+        nll = nll.flatten(1).mean(dim=1) / math.log(2.0)
+        # NLL for p(x0 | x1) and KL otherwise
+        vlb = torch.where(t == 0, nll, kl)
+        return vlb, x0
+    @torch.no_grad()
+    def variational_lower_bound_loop(self, x0, model, model_kwargs={}, clamp=None, percentile=None):
+        r"""Compute the entire variational lower bound, measured in bits-per-dim.
+        """
+        # prepare input and output
+        b = x0.size(0)
+        metrics = {'vlb': [], 'mse': [], 'x0_mse': []}
+        # loop
+        for step in torch.arange(self.num_timesteps).flip(0):
+            # compute VLB
+            t = torch.full((b, ), step, dtype=torch.long, device=x0.device)
+            # noise = torch.randn_like(x0)
+            noise = self.sample_loss(x0)
+            xt = self.q_sample(x0, t, noise)
+            vlb, pred_x0 = self.variational_lower_bound(x0, xt, t, model, model_kwargs, clamp, percentile)
+            # predict eps from x0
+            eps = (_i(self.sqrt_recip_alphas_cumprod, t, xt) * xt - x0) / \
+                  _i(self.sqrt_recipm1_alphas_cumprod, t, xt)
+            # collect metrics
+            metrics['vlb'].append(vlb)
+            metrics['x0_mse'].append((pred_x0 - x0).square().flatten(1).mean(dim=1))
+            metrics['mse'].append((eps - noise).square().flatten(1).mean(dim=1))
+        metrics = {k: torch.stack(v, dim=1) for k, v in metrics.items()}
+        # compute the prior KL term for VLB, measured in bits-per-dim
+        mu, _, log_var = self.q_mean_variance(x0, t)
+        kl_prior = kl_divergence(mu, log_var, torch.zeros_like(mu), torch.zeros_like(log_var))
+        kl_prior = kl_prior.flatten(1).mean(dim=1) / math.log(2.0)
+        # update metrics
+        metrics['prior_bits_per_dim'] = kl_prior
+        metrics['total_bits_per_dim'] = metrics['vlb'].sum(dim=1) + kl_prior
+        return metrics
+    def _scale_timesteps(self, t):
+        if self.rescale_timesteps:
+            return t.float() * 1000.0 / self.num_timesteps
+        return t
+        #return t.float()
--- a/tools/modules/diffusions/diffusion_gauss.py
+++ b/tools/modules/diffusions/diffusion_gauss.py
+"""
+GaussianDiffusion wraps operators for denoising diffusion models, including the
+diffusion and denoising processes, as well as the loss evaluation.
+"""
+import torch
+import torchsde
+import random
+from tqdm.auto import trange
+__all__ = ['GaussianDiffusion']
+def _i(tensor, t, x):
+    """
+    Index tensor using t and format the output according to x.
+    """
+    shape = (x.size(0), ) + (1, ) * (x.ndim - 1)
+    return tensor[t.to(tensor.device)].view(shape).to(x.device)
+class BatchedBrownianTree:
+    """
+    A wrapper around torchsde.BrownianTree that enables batches of entropy.
+
+    One tree is built per seed. With a single (non-sequence) seed the wrapper
+    behaves like one tree; with a sequence of seeds, calls return the stacked
+    outputs of all trees.
+    """
+    def __init__(self, x, t0, t1, seed=None, **kwargs):
+        # order the interval and remember whether it had to be swapped
+        t0, t1, self.sign = self.sort(t0, t1)
+        w0 = kwargs.get('w0', torch.zeros_like(x))
+        if seed is None:
+            seed = torch.randint(0, 2 ** 63 - 1, []).item()
+        self.batched = True
+        try:
+            # len() raises TypeError for a scalar seed, which selects the
+            # non-batched path below
+            assert len(seed) == x.shape[0]
+            w0 = w0[0]
+        except TypeError:
+            seed = [seed]
+            self.batched = False
+        # NOTE(review): a caller-supplied 'w0' is forwarded both positionally and
+        # through **kwargs; verify that torchsde.BrownianTree accepts that
+        self.trees = [torchsde.BrownianTree(
+            t0, w0, t1, entropy=s, **kwargs
+        ) for s in seed]
+    @staticmethod
+    def sort(a, b):
+        # return (low, high, sign); sign is -1 when the arguments were swapped
+        return (a, b, 1) if a < b else (b, a, -1)
+    def __call__(self, t0, t1):
+        # query every tree on the ordered interval; the product of the two signs
+        # accounts for swaps made here and in the constructor
+        t0, t1, sign = self.sort(t0, t1)
+        w = torch.stack([tree(t0, t1) for tree in self.trees]) * (self.sign * sign)
+        return w if self.batched else w[0]
+class BrownianTreeNoiseSampler:
+    """
+    A noise sampler backed by a torchsde.BrownianTree.
+    Args:
+        x (Tensor): The tensor whose shape, device and dtype to use to generate
+            random samples.
+        sigma_min (float): The low end of the valid interval.
+        sigma_max (float): The high end of the valid interval.
+        seed (int or List[int]): The random seed. If a list of seeds is
+            supplied instead of a single integer, then the noise sampler will
+            use one BrownianTree per batch item, each with its own seed.
+        transform (callable): A function that maps sigma to the sampler's
+            internal timestep.
+    """
+    def __init__(self, x, sigma_min, sigma_max, seed=None, transform=lambda x: x):
+        self.transform = transform
+        t0 = self.transform(torch.as_tensor(sigma_min))
+        t1 = self.transform(torch.as_tensor(sigma_max))
+        self.tree = BatchedBrownianTree(x, t0, t1, seed)
+    def __call__(self, sigma, sigma_next):
+        t0 = self.transform(torch.as_tensor(sigma))
+        t1 = self.transform(torch.as_tensor(sigma_next))
+        return self.tree(t0, t1) / (t1 - t0).abs().sqrt()
+def get_scalings(sigma):
+    c_out = -sigma
+    c_in = 1 / (sigma ** 2 + 1. ** 2) ** 0.5
+    return c_out, c_in
+@torch.no_grad()
+def sample_dpmpp_2m_sde(
+    noise,
+    model,
+    sigmas,
+    eta=1.,
+    s_noise=1.,
+    solver_type='midpoint',
+    show_progress=True
+):
+    """
+    DPM-Solver++ (2M) SDE.
+
+    Args:
+        noise: initial noise; multiplied by sigmas[0] to form the start state.
+        model: callable `model(x, sigma)` returning the denoised estimate.
+        sigmas: 1-D tensor of noise levels to step through; an entry of
+            inf or 0 selects the special branches below.
+        eta: multiplies the step size h in the stochastic terms.
+        s_noise: multiplier on the injected noise.
+        solver_type: 'heun' or 'midpoint', the form of the 2nd-order correction.
+        show_progress: show a progress bar when True.
+    Returns:
+        The state after the last step.
+    """
+    assert solver_type in {'heun', 'midpoint'}
+    x = noise * sigmas[0]
+    # the noise sampler covers the finite, positive part of the schedule
+    sigma_min, sigma_max = sigmas[sigmas > 0].min(), sigmas[sigmas < float('inf')].max()
+    noise_sampler = BrownianTreeNoiseSampler(x, sigma_min, sigma_max)
+    old_denoised = None
+    h_last = None
+    for i in trange(len(sigmas) - 1, disable=not show_progress):
+        if sigmas[i] == float('inf'):
+            # Euler method
+            # (this branch feeds the raw noise to the model and leaves
+            # old_denoised / h_last untouched)
+            denoised = model(noise, sigmas[i])
+            x = denoised + sigmas[i + 1] * noise
+        else:
+            _, c_in = get_scalings(sigmas[i])
+            denoised = model(x * c_in, sigmas[i])
+            if sigmas[i + 1] == 0:
+                # Denoising step
+                x = denoised
+            else:
+                # DPM-Solver++(2M) SDE
+                # t and s are negative log-sigmas; h is the step between them
+                t, s = -sigmas[i].log(), -sigmas[i + 1].log()
+                h = s - t
+                eta_h = eta * h
+                x = sigmas[i + 1] / sigmas[i] * (-eta_h).exp() * x + \
+                    (-h - eta_h).expm1().neg() * denoised
+                if old_denoised is not None:
+                    # second-order correction using the previous denoised estimate
+                    r = h_last / h
+                    if solver_type == 'heun':
+                        x = x + ((-h - eta_h).expm1().neg() / (-h - eta_h) + 1) * \
+                            (1 / r) * (denoised - old_denoised)
+                    elif solver_type == 'midpoint':
+                        x = x + 0.5 * (-h - eta_h).expm1().neg() * \
+                            (1 / r) * (denoised - old_denoised)
+                # stochastic term
+                x = x + noise_sampler(
+                    sigmas[i],
+                    sigmas[i + 1]
+                ) * sigmas[i + 1] * (-2 * eta_h).expm1().neg().sqrt() * s_noise
+            old_denoised = denoised
+            # NOTE(review): h is only assigned in the SDE branch above; if the first
+            # finite step already has sigmas[i + 1] == 0, this line reads an unbound h
+            h_last = h
+    return x
+class GaussianDiffusion(object):
+    def __init__(self, sigmas, prediction_type='eps'):
+        assert prediction_type in {'x0', 'eps', 'v'}
+        self.sigmas = sigmas.float()                        # noise coefficients
+        self.alphas = torch.sqrt(1 - sigmas ** 2).float()   # signal coefficients
+        self.num_timesteps = len(sigmas)
+        self.prediction_type = prediction_type
+    def diffuse(self, x0, t, noise=None):
+        """
+        Add Gaussian noise to signal x0 according to:
+        q(x_t | x_0) = N(x_t | alpha_t x_0, sigma_t^2 I).
+        """
+        noise = torch.randn_like(x0) if noise is None else noise
+        xt = _i(self.alphas, t, x0) * x0 + _i(self.sigmas, t, x0) * noise
+        return xt
+    def denoise(
+        self,
+        xt,
+        t,
+        s,
+        model,
+        model_kwargs={},
+        guide_scale=None,
+        guide_rescale=None,
+        clamp=None,
+        percentile=None
+    ):
+        r"""
+        Apply one step of denoising from the posterior distribution q(x_s | x_t, x0).
+        Since x0 is not available, estimate the denoising results using the learned
+        distribution p(x_s | x_t, \hat{x}_0 == f(x_t)).
+
+        Args:
+            xt: current noisy sample.
+            t: timestep indices of `xt`.
+            s: target timestep indices; defaults to t - 1 when None.
+            model: called as `model(xt, t=t, **kwargs)`.
+            model_kwargs: a dict without guidance; with `guide_scale` set, a list
+                [conditional kwargs, non-conditional kwargs].
+            guide_scale: classifier-free guidance scale, or None to disable it.
+            guide_rescale: value in [0, 1] used to rescale the guided output.
+            clamp: if set (and `percentile` is None), x0 is clamped to [-clamp, clamp].
+            percentile: if set, x0 is thresholded at this per-sample quantile.
+        Returns:
+            (mu, var, log_var, x0, eps).
+        """
+        s = t - 1 if s is None else s
+        # hyperparams
+        sigmas = _i(self.sigmas, t, xt)
+        alphas = _i(self.alphas, t, xt)
+        alphas_s = _i(self.alphas, s.clamp(0), xt)
+        # a negative target step stands for the clean signal (alpha = 1, sigma = 0)
+        alphas_s[s < 0] = 1.
+        sigmas_s = torch.sqrt(1 - alphas_s ** 2)
+        # precompute variables
+        betas = 1 - (alphas / alphas_s) ** 2
+        coef1 = betas * alphas_s / sigmas ** 2
+        coef2 = (alphas * sigmas_s ** 2) / (alphas_s * sigmas ** 2)
+        var = betas * (sigmas_s / sigmas) ** 2
+        log_var = torch.log(var).clamp_(-20, 20)
+        # prediction
+        if guide_scale is None:
+            assert isinstance(model_kwargs, dict)
+            out = model(xt, t=t, **model_kwargs)
+        else:
+            # classifier-free guidance (arXiv:2207.12598)
+            # model_kwargs[0]: conditional kwargs
+            # model_kwargs[1]: non-conditional kwargs
+            assert isinstance(model_kwargs, list) and len(model_kwargs) == 2
+            y_out = model(xt, t=t, **model_kwargs[0])
+            if guide_scale == 1.:
+                # a scale of 1 reduces to the conditional output; skip the second pass
+                out = y_out
+            else:
+                u_out = model(xt, t=t, **model_kwargs[1])
+                out = u_out + guide_scale * (y_out - u_out)
+                # rescale the output according to arXiv:2305.08891
+                if guide_rescale is not None:
+                    assert guide_rescale >= 0 and guide_rescale <= 1
+                    # per-sample std ratio between conditional and guided outputs
+                    ratio = (y_out.flatten(1).std(dim=1) / (
+                        out.flatten(1).std(dim=1) + 1e-12
+                    )).view((-1, ) + (1, ) * (y_out.ndim - 1))
+                    out *= guide_rescale * ratio + (1 - guide_rescale) * 1.0
+        # compute x0
+        if self.prediction_type == 'x0':
+            x0 = out
+        elif self.prediction_type == 'eps':
+            x0 = (xt - sigmas * out) / alphas
+        elif self.prediction_type == 'v':
+            x0 = alphas * xt - sigmas * out
+        else:
+            raise NotImplementedError(
+                f'prediction_type {self.prediction_type} not implemented'
+            )
+        # restrict the range of x0
+        if percentile is not None:
+            # NOTE: percentile should only be used when data is within range [-1, 1]
+            assert percentile > 0 and percentile <= 1
+            # `s` is reused here as the threshold; the target timestep is no longer needed
+            s = torch.quantile(x0.flatten(1).abs(), percentile, dim=1)
+            s = s.clamp_(1.0).view((-1, ) + (1, ) * (xt.ndim - 1))
+            x0 = torch.min(s, torch.max(-s, x0)) / s
+        elif clamp is not None:
+            x0 = x0.clamp(-clamp, clamp)
+        # recompute eps using the restricted x0
+        eps = (xt - alphas * x0) / sigmas
+        # compute mu (mean of posterior distribution) using the restricted x0
+        mu = coef1 * x0 + coef2 * xt
+        return mu, var, log_var, x0, eps
+    @torch.no_grad()
+    def sample(
+        self,
+        noise,
+        model,
+        model_kwargs={},
+        condition_fn=None,
+        guide_scale=None,
+        guide_rescale=None,
+        clamp=None,
+        percentile=None,
+        solver='euler_a',
+        steps=20,
+        t_max=None,
+        t_min=None,
+        discretization=None,
+        discard_penultimate_step=None,
+        return_intermediate=None,
+        show_progress=False,
+        seed=-1,
+        **kwargs
+    ):
+        r"""Generate samples from `noise` with the solver named by `solver`.
+
+        `steps` is either a step count (int), which is expanded into timesteps
+        between `t_max` and `t_min` using `discretization`, or a LongTensor of
+        explicit timesteps. Extra `kwargs` are forwarded to the solver function.
+
+        Returns x0, or (x0, intermediates) when `return_intermediate` is 'x0' or 'xt'.
+
+        NOTE(review): only 'dpmpp_2m_sde' is registered below, so every other
+        `solver` value, including the default 'euler_a', raises KeyError.
+        NOTE(review): `condition_fn` is not used in this method, and `seed` is
+        resolved but never passed on.
+        """
+        # sanity check
+        assert isinstance(steps, (int, torch.LongTensor))
+        assert t_max is None or (t_max > 0 and t_max <= self.num_timesteps - 1)
+        assert t_min is None or (t_min >= 0 and t_min < self.num_timesteps - 1)
+        assert discretization in (None, 'leading', 'linspace', 'trailing')
+        assert discard_penultimate_step in (None, True, False)
+        assert return_intermediate in (None, 'x0', 'xt')
+        # function of diffusion solver
+        solver_fn = {
+            # 'heun': sample_heun,
+            'dpmpp_2m_sde': sample_dpmpp_2m_sde
+        }[solver]
+        # options
+        schedule = 'karras' if 'karras' in solver else None
+        discretization = discretization or 'linspace'
+        seed = seed if seed >= 0 else random.randint(0, 2 ** 31)
+        if isinstance(steps, torch.LongTensor):
+            discard_penultimate_step = False
+        if discard_penultimate_step is None:
+            discard_penultimate_step = True if solver in (
+                'dpm2',
+                'dpm2_ancestral',
+                'dpmpp_2m_sde',
+                'dpm2_karras',
+                'dpm2_ancestral_karras',
+                'dpmpp_2m_sde_karras'
+            ) else False
+        # function for denoising xt to get x0
+        intermediates = []
+        def model_fn(xt, sigma):
+            # denoising
+            # map the solver's sigma to the nearest integer timestep, one per batch item
+            t = self._sigma_to_t(sigma).repeat(len(xt)).round().long()
+            # denoise returns (mu, var, log_var, x0, eps); [-2] selects x0
+            x0 = self.denoise(
+                xt, t, None, model, model_kwargs, guide_scale, guide_rescale, clamp,
+                percentile
+            )[-2]
+            # collect intermediate outputs
+            if return_intermediate == 'xt':
+                intermediates.append(xt)
+            elif return_intermediate == 'x0':
+                intermediates.append(x0)
+            return x0
+        # get timesteps
+        if isinstance(steps, int):
+            # one extra step is requested because the penultimate sigma is dropped below
+            steps += 1 if discard_penultimate_step else 0
+            t_max = self.num_timesteps - 1 if t_max is None else t_max
+            t_min = 0 if t_min is None else t_min
+            # discretize timesteps
+            if discretization == 'leading':
+                steps = torch.arange(
+                    t_min, t_max + 1, (t_max - t_min + 1) / steps
+                ).flip(0)
+            elif discretization == 'linspace':
+                steps = torch.linspace(t_max, t_min, steps)
+            elif discretization == 'trailing':
+                steps = torch.arange(t_max, t_min - 1, -((t_max - t_min + 1) / steps))
+            else:
+                raise NotImplementedError(
+                    f'{discretization} discretization not implemented'
+                )
+            steps = steps.clamp_(t_min, t_max)
+        steps = torch.as_tensor(steps, dtype=torch.float32, device=noise.device)
+        # get sigmas
+        sigmas = self._t_to_sigma(steps)
+        # terminate the schedule with sigma = 0
+        sigmas = torch.cat([sigmas, sigmas.new_zeros([1])])
+        # NOTE(review): `karras_schedule` is not defined or imported in the visible
+        # part of this module; the branch is unreachable while no registered solver
+        # name contains 'karras'
+        if schedule == 'karras':
+            if sigmas[0] == float('inf'):
+                sigmas = karras_schedule(
+                    n=len(steps) - 1,
+                    sigma_min=sigmas[sigmas > 0].min().item(),
+                    sigma_max=sigmas[sigmas < float('inf')].max().item(),
+                    rho=7.
+                ).to(sigmas)
+                sigmas = torch.cat([
+                    sigmas.new_tensor([float('inf')]), sigmas, sigmas.new_zeros([1])
+                ])
+            else:
+                sigmas = karras_schedule(
+                    n=len(steps),
+                    sigma_min=sigmas[sigmas > 0].min().item(),
+                    sigma_max=sigmas.max().item(),
+                    rho=7.
+                ).to(sigmas)
+                sigmas = torch.cat([sigmas, sigmas.new_zeros([1])])
+        if discard_penultimate_step:
+            # drop the second-to-last sigma and keep the final zero
+            sigmas = torch.cat([sigmas[:-2], sigmas[-1:]])
+        # sampling
+        x0 = solver_fn(
+            noise,
+            model_fn,
+            sigmas,
+            show_progress=show_progress,
+            **kwargs
+        )
+        return (x0, intermediates) if return_intermediate is not None else x0
+    @torch.no_grad()
+    def ddim_reverse_sample(
+        self,
+        xt,
+        t,
+        model,
+        model_kwargs={},
+        clamp=None,
+        percentile=None,
+        guide_scale=None,
+        guide_rescale=None,
+        ddim_timesteps=20,
+        reverse_steps=600
+        ):
+        r"""Sample from p(x_{t+1} | x_t) using DDIM reverse ODE (deterministic).
+
+        The next timestep is t + reverse_steps // ddim_timesteps, clamped to
+        [0, reverse_steps - 1]. Returns (next sample, predicted x0).
+        """
+        stride = reverse_steps // ddim_timesteps
+        # predict distribution of p(x_{t-1} | x_t)
+        # _, _, _, x0 = self.p_mean_variance(xt, t, model, model_kwargs, clamp, percentile, guide_scale)
+        _, _, _, x0, eps = self.denoise(
+                xt, t, None, model, model_kwargs, guide_scale, guide_rescale, clamp,
+                percentile
+            )
+        # derive variables
+        s = (t + stride).clamp(0, reverse_steps-1)
+        # hyperparams
+        # NOTE(review): `sigmas` and `alphas` are computed but not used below
+        sigmas = _i(self.sigmas, t, xt)
+        alphas = _i(self.alphas, t, xt)
+        alphas_s = _i(self.alphas, s.clamp(0), xt)
+        # s was clamped to >= 0 above, so this assignment never selects anything
+        alphas_s[s < 0] = 1.
+        sigmas_s = torch.sqrt(1 - alphas_s ** 2)
+        # reverse sample
+        mu = alphas_s * x0 + sigmas_s * eps
+        return mu, x0
+    @torch.no_grad()
+    def ddim_reverse_sample_loop(
+        self,
+        x0,
+        model,
+        model_kwargs={},
+        clamp=None,
+        percentile=None,
+        guide_scale=None,
+        guide_rescale=None,
+        ddim_timesteps=20,
+        reverse_steps=600
+        ):
+        # prepare input
+        b = x0.size(0)
+        xt = x0
+        # reconstruction steps
+        steps = torch.arange(0, reverse_steps, reverse_steps // ddim_timesteps)
+        for step in steps:
+            t = torch.full((b, ), step, dtype=torch.long, device=xt.device)
+            xt, _ = self.ddim_reverse_sample(xt, t, model, model_kwargs, clamp, percentile, guide_scale, guide_rescale, ddim_timesteps, reverse_steps)
+        return xt
+    def _sigma_to_t(self, sigma):
+        """Convert a solver noise level `sigma` into a (fractional) timestep.
+
+        The schedule is compared in log(sigma_t / alpha_t) space, and the result
+        is linearly interpolated between the two neighbouring timestep indices.
+        An infinite sigma maps to the last timestep. The result has at least one
+        dimension.
+        """
+        if sigma == float('inf'):
+            t = torch.full_like(sigma, len(self.sigmas) - 1)
+        else:
+            # log of sigma_t / alpha_t for every timestep
+            log_sigmas = torch.sqrt(
+                self.sigmas ** 2 / (1 - self.sigmas ** 2)
+            ).log().to(sigma)
+            log_sigma = sigma.log()
+            dists = log_sigma - log_sigmas[:, None]
+            # index of the interval's lower end, capped so that low_idx + 1 stays valid
+            low_idx = dists.ge(0).cumsum(dim=0).argmax(dim=0).clamp(
+                max=log_sigmas.shape[0] - 2
+            )
+            high_idx = low_idx + 1
+            low, high = log_sigmas[low_idx], log_sigmas[high_idx]
+            # interpolation weight inside [low, high], clamped for out-of-range sigmas
+            w = (low - log_sigma) / (low - high)
+            w = w.clamp(0, 1)
+            t = (1 - w) * low_idx + w * high_idx
+            t = t.view(sigma.shape)
+        if t.ndim == 0:
+            t = t.unsqueeze(0)
+        return t
+    def _t_to_sigma(self, t):
+        """Convert (possibly fractional) timesteps `t` into solver noise levels.
+
+        Interpolates log(sigma_t / alpha_t) linearly between the floor and ceil
+        indices of `t` and exponentiates the result.
+        """
+        t = t.float()
+        low_idx, high_idx, w = t.floor().long(), t.ceil().long(), t.frac()
+        log_sigmas = torch.sqrt(self.sigmas ** 2 / (1 - self.sigmas ** 2)).log().to(t)
+        log_sigma = (1 - w) * log_sigmas[low_idx] + w * log_sigmas[high_idx]
+        # nan / inf entries (e.g. from a schedule value of sigma = 1) become sigma = inf
+        log_sigma[torch.isnan(log_sigma) | torch.isinf(log_sigma)] = float('inf')
+        return log_sigma.exp()
+    def prev_step(self, model_out, t, xt, inference_steps=50):
+        """Move `xt` from timestep t to t - num_timesteps // inference_steps.
+
+        x0 is recovered with the same formula `denoise` uses for 'v' prediction,
+        regardless of `self.prediction_type`; the sample is then re-noised
+        deterministically at the previous timestep.
+        """
+        prev_t = t - self.num_timesteps // inference_steps
+        sigmas = _i(self.sigmas, t, xt)
+        alphas = _i(self.alphas, t, xt)
+        alphas_prev = _i(self.alphas, prev_t.clamp(0), xt)
+        # a negative previous step stands for the clean signal (alpha = 1, sigma = 0)
+        alphas_prev[prev_t < 0] = 1.
+        sigmas_prev = torch.sqrt(1 - alphas_prev ** 2)
+        x0 = alphas * xt - sigmas * model_out
+        eps = (xt - alphas * x0) / sigmas
+        prev_sample = alphas_prev * x0 + sigmas_prev * eps
+        return prev_sample
+    def next_step(self, model_out, t, xt, inference_steps=50):
+        """Move `xt` from timestep t - num_timesteps // inference_steps up to t.
+
+        x0 and eps are recovered at the lower timestep with the same formula
+        `denoise` uses for 'v' prediction, then re-combined with the coefficients
+        of the incoming `t`.
+        """
+        # after this line `t` is the lower (current) step and `next_t` the incoming t
+        # NOTE(review): 999 is hard-coded rather than derived from num_timesteps, and
+        # a negative lower step is not clamped before being used as an index -- confirm intent
+        t, next_t = min(t - self.num_timesteps // inference_steps, 999), t
+        sigmas = _i(self.sigmas, t, xt)
+        alphas = _i(self.alphas, t, xt)
+        alphas_next = _i(self.alphas, next_t.clamp(0), xt)
+        alphas_next[next_t < 0] = 1.
+        sigmas_next = torch.sqrt(1 - alphas_next ** 2)
+        x0 = alphas * xt - sigmas * model_out
+        eps = (xt - alphas * x0) / sigmas
+        next_sample = alphas_next * x0 + sigmas_next * eps
+        return next_sample
+    def get_noise_pred_single(self, xt, t, model, model_kwargs):
+        assert isinstance(model_kwargs, dict)
+        out = model(xt, t=t, **model_kwargs)
+        return out
--- a/tools/modules/diffusions/losses.py
+++ b/tools/modules/diffusions/losses.py
+import torch
+import math
+__all__ = ['kl_divergence', 'discretized_gaussian_log_likelihood']
+def kl_divergence(mu1, logvar1, mu2, logvar2):
+    return 0.5 * (-1.0 + logvar2 - logvar1 + torch.exp(logvar1 - logvar2) + ((mu1 - mu2) ** 2) * torch.exp(-logvar2))
+def standard_normal_cdf(x):
+    r"""A fast approximation of the cumulative distribution function of the standard normal.
+    """
+    return 0.5 * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+def discretized_gaussian_log_likelihood(x0, mean, log_scale):
+    assert x0.shape == mean.shape == log_scale.shape
+    cx = x0 - mean
+    inv_stdv = torch.exp(-log_scale)
+    cdf_plus = standard_normal_cdf(inv_stdv * (cx + 1.0 / 255.0))
+    cdf_min = standard_normal_cdf(inv_stdv * (cx - 1.0 / 255.0))
+    log_cdf_plus = torch.log(cdf_plus.clamp(min=1e-12))
+    log_one_minus_cdf_min = torch.log((1.0 - cdf_min).clamp(min=1e-12))
+    cdf_delta = cdf_plus - cdf_min
+    log_probs = torch.where(
+        x0 < -0.999,
+        log_cdf_plus,
+        torch.where(x0 > 0.999, log_one_minus_cdf_min, torch.log(cdf_delta.clamp(min=1e-12))))
+    assert log_probs.shape == x0.shape
+    return log_probs