Commit ce34ef01 authored by mashun1's avatar mashun1
Browse files

dynamicrafter

parents
Pipeline #812 canceled with stages
import os, sys, glob
import numpy as np
from collections import OrderedDict
from decord import VideoReader, cpu
import cv2
import torch
import torchvision
sys.path.insert(1, os.path.join(sys.path[0], '..', '..'))
from lvdm.models.samplers.ddim import DDIMSampler
from einops import rearrange
def batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=50, ddim_eta=1.0,\
                        cfg_scale=1.0, temporal_cfg_scale=None, **kwargs):
    """Run DDIM sampling ``n_samples`` times and decode every result.

    Args:
        model: diffusion model. This function reads ``model.uncond_type`` and
            ``model.device`` and calls ``get_learned_conditioning`` /
            ``decode_first_stage`` (plus ``embedder`` and ``image_proj_model``
            when the model has an ``embedder`` attribute).
        cond: conditioning mapping. Must contain the key ``"fs"``; the key
            ``"c_crossattn"`` is read when ``uncond_type == "zero_embed"``.
            The mapping passed by the caller is NOT modified.
        noise_shape: sequence whose first entry is the batch size; the rest is
            forwarded to the sampler as ``shape`` and ``noise_shape[2]`` as
            ``temporal_length``.
        n_samples: number of independent sampling runs.
        ddim_steps: forwarded to the sampler as ``S``.
        ddim_eta: forwarded to the sampler as ``eta``.
        cfg_scale: classifier-free guidance scale; ``1.0`` disables the
            unconditional branch.
        temporal_cfg_scale: forwarded as ``conditional_guidance_scale_temporal``.
        **kwargs: extra keyword arguments forwarded to ``DDIMSampler.sample``.

    Returns:
        The decoded outputs of all runs stacked along ``dim=1``.

    Raises:
        ValueError: if guidance is enabled and ``model.uncond_type`` is neither
            ``"empty_seq"`` nor ``"zero_embed"``.
    """
    ddim_sampler = DDIMSampler(model)
    uncond_type = model.uncond_type
    batch_size = noise_shape[0]

    # Work on a shallow copy: the original code deleted "fs" from the caller's
    # dict, which made a second call with the same `cond` fail with KeyError.
    cond = dict(cond)
    fs = cond.pop("fs")

    # Sampler settings are selected from the last dimension of the noise shape.
    if noise_shape[-1] == 32:
        timestep_spacing = "uniform"
        guidance_rescale = 0.0
    else:
        timestep_spacing = "uniform_trailing"
        guidance_rescale = 0.7

    ## construct unconditional guidance
    uc = None
    if cfg_scale != 1.0:
        if uncond_type == "empty_seq":
            uc_emb = model.get_learned_conditioning(batch_size * [""])
        elif uncond_type == "zero_embed":
            uc_emb = torch.zeros_like(cond["c_crossattn"][0])
        else:
            # Previously this fell through and crashed later with a NameError.
            raise ValueError(f"Unsupported uncond_type: {uncond_type!r}")

        ## process image embedding token
        if hasattr(model, 'embedder'):
            uc_img = torch.zeros(noise_shape[0],3,224,224).to(model.device)
            ## img: b c h w >> b l c
            uc_img = model.embedder(uc_img)
            uc_img = model.image_proj_model(uc_img)
            uc_emb = torch.cat([uc_emb, uc_img], dim=1)

        # Same conditioning as `cond`, with only the cross-attention entry replaced.
        uc = dict(cond)
        uc['c_crossattn'] = [uc_emb]

    kwargs["clean_cond"] = True
    batch_variants = []
    for _ in range(n_samples):
        samples, _ = ddim_sampler.sample(S=ddim_steps,
                                         conditioning=cond,
                                         batch_size=noise_shape[0],
                                         shape=noise_shape[1:],
                                         verbose=False,
                                         unconditional_guidance_scale=cfg_scale,
                                         unconditional_conditioning=uc,
                                         eta=ddim_eta,
                                         temporal_length=noise_shape[2],
                                         conditional_guidance_scale_temporal=temporal_cfg_scale,
                                         x_T=None,
                                         fs=fs,
                                         timestep_spacing=timestep_spacing,
                                         guidance_rescale=guidance_rescale,
                                         precision=16,
                                         **kwargs
                                         )
        ## reconstruct from latent to pixel space
        batch_variants.append(model.decode_first_stage(samples))
    ## batch, <samples>, c, t, h, w
    return torch.stack(batch_variants, dim=1)
def get_filelist(data_dir, ext='*'):
    """Return the sorted paths in ``data_dir`` matching the glob ``*.<ext>``."""
    pattern = os.path.join(data_dir, '*.%s'%ext)
    return sorted(glob.glob(pattern))
def get_dirlist(path):
    """Return the sorted paths of the immediate sub-directories of ``path``.

    An empty list is returned when ``path`` does not exist or is not a
    directory (the latter used to raise from ``os.listdir``).
    """
    if not os.path.isdir(path):
        return []
    # Avoid the builtin name `list` that the previous implementation shadowed.
    candidates = (os.path.join(path, entry) for entry in os.listdir(path))
    return sorted(candidate for candidate in candidates if os.path.isdir(candidate))
def load_model_checkpoint(model, ckpt):
    """Load the weights stored at ``ckpt`` into ``model`` (strictly) and return it.

    Two checkpoint layouts are handled:

    * a dict with a ``"state_dict"`` entry: loaded as is; if strict loading
      fails, keys containing ``framestride_embed`` are renamed to
      ``fps_embedding`` and loading is retried (256x256 model naming);
    * otherwise a DeepSpeed-style dict with a ``"module"`` entry, whose keys
      have their first 16 characters stripped before loading.

    Raises:
        RuntimeError: if the weights still do not match after the rename.
    """
    # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
    checkpoint = torch.load(ckpt, map_location="cpu")
    if "state_dict" in checkpoint:
        state_dict = checkpoint["state_dict"]
        try:
            model.load_state_dict(state_dict, strict=True)
        except RuntimeError:
            # load_state_dict reports key/shape mismatches as RuntimeError; a bare
            # `except:` here also swallowed KeyboardInterrupt and SystemExit.
            ## rename the keys for 256x256 model
            renamed = OrderedDict(
                (key.replace("framestride_embed", "fps_embedding"), value)
                for key, value in state_dict.items()
            )
            model.load_state_dict(renamed, strict=True)
    else:
        ## deepspeed
        stripped = OrderedDict(
            (key[16:], value) for key, value in checkpoint['module'].items()
        )
        model.load_state_dict(stripped, strict=True)
    print('>>> model checkpoint loaded.')
    return model
def load_prompts(prompt_file):
    """Read ``prompt_file`` and return its non-blank lines, whitespace-stripped."""
    # `with` guarantees the handle is closed even if reading fails part-way.
    with open(prompt_file, 'r') as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line]
def load_video_batch(filepath_list, frame_stride, video_size=(256,256), video_frames=16):
    '''
    Load every video in `filepath_list` as a float tensor scaled to [-1, 1] and stack them.

    Each video is decoded at height `video_size[0]` and width `video_size[1]`, sampled every
    `frame_stride` frames starting from frame 0, and laid out as [c,t,h,w].

    Notice about some special cases:
    1. video_frames=-1 means to take all the frames (with fs=1)
    2. when the total video frames is less than required, padding strategy will be used (repeated last frame)

    Returns the per-video tensors stacked along a new leading batch dimension.
    '''
    assert frame_stride > 0, "valid frame stride should be a positive interge!"
    take_all_frames = video_frames < 0
    if take_all_frames:
        ## all frames are collected: fs=1 is a must
        # The stride must be reset BEFORE the valid-frame count is derived; the old code
        # computed that count with the caller's stride and then padded the difference.
        frame_stride = 1

    batch_tensor = []
    for filepath in filepath_list:
        vidreader = VideoReader(filepath, ctx=cpu(0), width=video_size[1], height=video_size[0])
        total_frames = len(vidreader)
        max_valid_frames = (total_frames-1) // frame_stride + 1
        required_frames = total_frames if take_all_frames else video_frames
        query_frames = min(required_frames, max_valid_frames)
        frame_indices = [frame_stride*i for i in range(query_frames)]

        ## [t,h,w,c] -> [c,t,h,w]
        frames = vidreader.get_batch(frame_indices)
        frame_tensor = torch.tensor(frames.asnumpy()).permute(3, 0, 1, 2).float()
        frame_tensor = (frame_tensor / 255. - 0.5) * 2
        if max_valid_frames < required_frames:
            # Too short: repeat the last frame until the requested length is reached.
            padding_num = required_frames - max_valid_frames
            last_frame = frame_tensor[:,-1:,:,:]
            frame_tensor = torch.cat([frame_tensor] + [last_frame]*padding_num, dim=1)
            print(f'{os.path.split(filepath)[1]} is not long enough: {padding_num} frames padded.')
        batch_tensor.append(frame_tensor)

    return torch.stack(batch_tensor, dim=0)
from PIL import Image
def load_image_batch(filepath_list, image_size=(256,256)):
    """Load one image per path as a [c,h,w] float tensor in [-1, 1] and stack them.

    ``.mp4`` inputs contribute their first frame (decoded at the target size);
    ``.png`` / ``.jpg`` / ``.jpeg`` inputs are opened as RGB and resized to
    ``image_size`` (height, width). Extensions are matched case-insensitively.

    Raises:
        NotImplementedError: for any other file extension.
    """
    batch_tensor = []
    for filepath in filepath_list:
        ext = os.path.splitext(os.path.basename(filepath))[1]
        # Case-insensitive match so that e.g. "photo.PNG" or "photo.jpeg" load too.
        kind = ext.lower()
        if kind == '.mp4':
            vidreader = VideoReader(filepath, ctx=cpu(0), width=image_size[1], height=image_size[0])
            frame = vidreader.get_batch([0])
            img_tensor = torch.tensor(frame.asnumpy()).squeeze(0).permute(2, 0, 1).float()
        elif kind in ('.png', '.jpg', '.jpeg'):
            img = Image.open(filepath).convert("RGB")
            rgb_img = np.array(img, np.float32)
            # cv2.resize takes the target size as (width, height).
            rgb_img = cv2.resize(rgb_img, (image_size[1],image_size[0]), interpolation=cv2.INTER_LINEAR)
            img_tensor = torch.from_numpy(rgb_img).permute(2, 0, 1).float()
        else:
            print(f'ERROR: <{ext}> image loading only support format: [mp4], [png], [jpg]')
            raise NotImplementedError
        img_tensor = (img_tensor / 255. - 0.5) * 2
        batch_tensor.append(img_tensor)
    return torch.stack(batch_tensor, dim=0)
def save_videos(batch_tensors, savedir, filenames, fps=10):
    """Write one mp4 per batch entry, tiling that entry's samples into a grid per frame.

    ``batch_tensors`` is indexed as b,samples,c,t,h,w with values clamped to
    [-1, 1]; entry ``i`` is saved as ``<savedir>/<filenames[i]>.mp4``.
    """
    # b,samples,c,t,h,w
    grid_columns = int(batch_tensors.shape[1])
    for entry_idx, entry in enumerate(batch_tensors):
        clip = torch.clamp(entry.detach().cpu().float(), -1., 1.)
        per_frame = clip.permute(2, 0, 1, 3, 4) # t,n,c,h,w
        sheets = []
        for framesheet in per_frame:
            sheets.append(torchvision.utils.make_grid(framesheet, nrow=grid_columns))
        frames = torch.stack(sheets, dim=0) # stack in temporal dim
        # [-1, 1] -> [0, 1] -> uint8, then channels last for the video writer.
        frames = (frames + 1.0) / 2.0
        frames = (frames * 255).to(torch.uint8).permute(0, 2, 3, 1)
        target = os.path.join(savedir, f"{filenames[entry_idx]}.mp4")
        torchvision.io.write_video(target, frames, fps=fps, video_codec='h264', options={'crf': '10'})
def get_latent_z(model, videos):
    """Encode a ``b c t h w`` batch with ``model.encode_first_stage`` frame by frame.

    Frames are folded into the batch axis for encoding and unfolded again, so
    the result is laid out as ``b c t h w`` as well.
    """
    batch, _, frames, _, _ = videos.shape
    flat_frames = rearrange(videos, 'b c t h w -> (b t) c h w')
    flat_latents = model.encode_first_stage(flat_frames)
    return rearrange(flat_latents, '(b t) c h w -> b c t h w', b=batch, t=frames)
\ No newline at end of file
import argparse, os, sys, glob
import datetime, time
from omegaconf import OmegaConf
from tqdm import tqdm
from einops import rearrange, repeat
from collections import OrderedDict
import torch
import torchvision
import torchvision.transforms as transforms
from pytorch_lightning import seed_everything
from PIL import Image
sys.path.insert(1, os.path.join(sys.path[0], '..', '..'))
from lvdm.models.samplers.ddim import DDIMSampler
from lvdm.models.samplers.ddim_multiplecond import DDIMSampler as DDIMSampler_multicond
from utils.utils import instantiate_from_config
def get_filelist(data_dir, postfixes):
    """Return the sorted, de-duplicated paths in ``data_dir`` with any of the given extensions.

    Args:
        data_dir: directory to search (non-recursive).
        postfixes: iterable of extensions without the leading dot, e.g. ``['jpg', 'png']``.
    """
    # A set prevents the same file from being listed twice when several patterns match
    # it (repeated postfixes, or 'png' and 'PNG' on a case-insensitive filesystem).
    matches = set()
    for postfix in postfixes:
        matches.update(glob.glob(os.path.join(data_dir, f"*.{postfix}")))
    return sorted(matches)
def load_model_checkpoint(model, ckpt):
    """Load the checkpoint file ``ckpt`` into ``model`` and return the model.

    If the checkpoint has a ``"state_dict"`` entry it is loaded strictly; when
    that fails, keys containing ``framestride_embed`` are renamed to
    ``fps_embedding`` (256x256 model naming) and loading is retried. Otherwise
    the file is treated as a DeepSpeed checkpoint: the ``"module"`` entry is
    loaded after dropping the first 16 characters of every key.

    Raises:
        RuntimeError: if the weights do not match the model after the rename.
    """
    # NOTE(security): torch.load unpickles the file; only load trusted checkpoints.
    state_dict = torch.load(ckpt, map_location="cpu")
    if "state_dict" in state_dict:
        state_dict = state_dict["state_dict"]
        try:
            model.load_state_dict(state_dict, strict=True)
        except RuntimeError:
            # Catch only the mismatch error raised by load_state_dict; the former bare
            # `except:` also intercepted KeyboardInterrupt and SystemExit.
            ## rename the keys for 256x256 model
            new_pl_sd = OrderedDict()
            for key, value in state_dict.items():
                new_pl_sd[key.replace("framestride_embed", "fps_embedding")] = value
            model.load_state_dict(new_pl_sd, strict=True)
    else:
        # deepspeed
        new_pl_sd = OrderedDict()
        for key, value in state_dict['module'].items():
            new_pl_sd[key[16:]] = value
        model.load_state_dict(new_pl_sd)
    print('>>> model checkpoint loaded.')
    return model
def load_prompts(prompt_file):
    """Return the stripped, non-empty lines of the text file ``prompt_file``."""
    prompt_list = []
    # Context manager closes the file even when reading raises.
    with open(prompt_file, 'r') as f:
        for line in f:
            text = line.strip()
            if text:
                prompt_list.append(text)
    return prompt_list
def load_data_prompts(data_dir, video_size=(256,256), video_frames=16, gfi=False):
    """Pair the prompts and images found in ``data_dir``.

    Prompts come from the first ``*.txt`` file (sorted by name); images are the
    sorted ``jpg/png/jpeg/JPEG/PNG`` files. The i-th prompt is paired with the
    i-th image; images beyond the number of prompts are ignored.

    Args:
        data_dir: directory holding one prompt file and the conditioning images.
        video_size: (height, width) the images are resized / center-cropped to.
        video_frames: number of times each image is repeated along the time axis.
        gfi: accepted for interface compatibility; not used by this function.

    Returns:
        ``(filename_list, data_list, prompt_list)`` where each ``data_list``
        entry is a ``[c, video_frames, h, w]`` tensor normalised to [-1, 1].

    Raises:
        AssertionError: if no prompt file is found.
        ValueError: if there are fewer images than prompts.
    """
    transform = transforms.Compose([
        transforms.Resize(min(video_size)),
        transforms.CenterCrop(video_size),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))])

    ## load prompts
    prompt_files = get_filelist(data_dir, ['txt'])
    assert len(prompt_files) > 0, "Error: found NO prompt file!"
    ## only use the first one (sorted by name) if multiple exist
    prompt_path = prompt_files[0]
    if len(prompt_files) > 1:
        print(f"Warning: multiple prompt files exist. The one {os.path.split(prompt_path)[1]} is used.")
    prompt_list = load_prompts(prompt_path)
    n_samples = len(prompt_list)

    ## load images
    file_list = get_filelist(data_dir, ['jpg', 'png', 'jpeg', 'JPEG', 'PNG'])
    if len(file_list) < n_samples:
        # Fail early with an actionable message instead of an IndexError mid-loop.
        raise ValueError(
            f"data and prompts are NOT paired: {n_samples} prompts but only "
            f"{len(file_list)} images found in {data_dir}")

    data_list = []
    filename_list = []
    for image_path in file_list[:n_samples]:
        # Close the file handle deterministically once the pixels are decoded.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        image_tensor = transform(image).unsqueeze(1) # [c,1,h,w]
        frame_tensor = repeat(image_tensor, 'c t h w -> c (repeat t) h w', repeat=video_frames)
        data_list.append(frame_tensor)
        filename_list.append(os.path.split(image_path)[1])

    return filename_list, data_list, prompt_list
def save_results(prompt, samples, filename, fakedir, fps=8, loop=False):
    """Save all samples of one example side by side as ``<fakedir>/<stem>.mp4``.

    Args:
        prompt: kept for interface compatibility; not used when saving.
        samples: tensor indexed as b,c,t,h,w with values expected in [-1, 1],
            or ``None`` (nothing is written).
        filename: source file name; its extension is replaced by ``.mp4``.
        fakedir: output directory.
        fps: frame rate of the written video.
        loop: if true, the last frame is dropped before saving.
    """
    # splitext keeps dots inside the stem ("shot.v2.png" -> "shot.v2.mp4"); the old
    # split('.')[0] truncated such names and made different inputs overwrite each other.
    filename = os.path.splitext(filename)[0] + '.mp4'
    if samples is None:
        return

    # b,c,t,h,w
    video = samples.detach().cpu()
    video = torch.clamp(video.float(), -1., 1.)
    n = video.shape[0]
    video = video.permute(2, 0, 1, 3, 4) # t,n,c,h,w
    if loop:
        video = video[:-1,...]

    frame_grids = [torchvision.utils.make_grid(framesheet, nrow=int(n), padding=0) for framesheet in video] #[3, 1*h, n*w]
    grid = torch.stack(frame_grids, dim=0) # stack in temporal dim [t, 3, h, n*w]
    grid = (grid + 1.0) / 2.0
    grid = (grid * 255).to(torch.uint8).permute(0, 2, 3, 1)
    path = os.path.join(fakedir, filename)
    torchvision.io.write_video(path, grid, fps=fps, video_codec='h264', options={'crf': '10'}) ## crf indicates the quality
def save_results_seperate(prompt, samples, filename, fakedir, fps=10, loop=False):
    """Save every sample of one example as its own ``<stem>_sample<i>.mp4`` file.

    Files are written next to ``fakedir``: the last path component of
    ``fakedir`` has ``samples`` replaced by ``samples_separate``.

    Args:
        prompt: kept for interface compatibility; not used when saving.
        samples: tensor indexed as b,c,t,h,w with values expected in [-1, 1],
            or ``None`` (nothing is written).
        filename: source file name; its extension is dropped to form the stem.
        fakedir: directory whose sibling ``samples_separate`` receives the files.
        fps: frame rate of the written videos.
        loop: if true, the last frame is dropped before saving.
    """
    if samples is None:
        return

    # Only rewrite the final component; a plain str.replace on the whole path also
    # rewrote parent directories that merely contain the word "samples".
    parent, leaf = os.path.split(fakedir)
    if not leaf:  # fakedir ended with a path separator
        parent, leaf = os.path.split(parent)
    outdir = os.path.join(parent, leaf.replace('samples', 'samples_separate'))
    # splitext keeps dots that belong to the stem ("shot.v2.png" -> "shot.v2").
    stem = os.path.splitext(filename)[0]

    # b,c,t,h,w
    video = samples.detach().cpu()
    if loop: # remove the last frame
        video = video[:,:,:-1,...]
    video = torch.clamp(video.float(), -1., 1.)
    for i in range(video.shape[0]):
        grid = (video[i,...] + 1.0) / 2.0
        grid = (grid * 255).to(torch.uint8).permute(1, 2, 3, 0) #thwc
        path = os.path.join(outdir, f'{stem}_sample{i}.mp4')
        torchvision.io.write_video(path, grid, fps=fps, video_codec='h264', options={'crf': '10'})
def get_latent_z(model, videos):
    """Run ``model.encode_first_stage`` on every frame of a ``b c t h w`` batch.

    The time axis is merged into the batch axis before encoding and split off
    again afterwards; the output keeps the ``b c t h w`` layout.
    """
    n_batch, _, n_frames, _, _ = videos.shape
    frames_as_batch = rearrange(videos, 'b c t h w -> (b t) c h w')
    encoded = model.encode_first_stage(frames_as_batch)
    return rearrange(encoded, '(b t) c h w -> b c t h w', b=n_batch, t=n_frames)
def image_guided_synthesis(model, prompts, videos, noise_shape, n_samples=1, ddim_steps=50, ddim_eta=1., \
                        unconditional_guidance_scale=1.0, cfg_img=None, fs=None, text_input=False, multiple_cond_cfg=False, loop=False, gfi=False, timestep_spacing='uniform', guidance_rescale=0.0, **kwargs):
    """Sample videos conditioned on the first frame of ``videos`` (and optionally on text).

    Args:
        model: diffusion model providing ``embedder``, ``image_proj_model``,
            ``get_learned_conditioning``, ``encode_first_stage`` (through
            ``get_latent_z``), ``decode_first_stage``, ``uncond_type``,
            ``device`` and ``model.conditioning_key``.
        prompts: list of text prompts; replaced by empty strings unless ``text_input``.
        videos: tensor indexed as b,c,t,h,w; ``videos[:, :, 0]`` is the conditioning image.
        noise_shape: sequence whose first entry is the batch size; the remainder is
            passed to the sampler as ``shape``.
        n_samples: number of independent sampling runs.
        ddim_steps / ddim_eta: forwarded to the sampler as ``S`` / ``eta``.
        unconditional_guidance_scale: guidance scale; ``1.0`` disables the
            unconditional branch.
        cfg_img: image guidance scale, used with ``multiple_cond_cfg``.
        fs: value repeated ``batch_size`` times into a long tensor and passed as ``fs``.
        text_input: whether the given ``prompts`` are used.
        multiple_cond_cfg: selects ``DDIMSampler_multicond`` and the extra
            "image yes, text empty" unconditional branch.
        loop / gfi: when either is set (hybrid conditioning only), the concat
            condition holds the first and last latent frames and zeros elsewhere.
        timestep_spacing / guidance_rescale: forwarded to the sampler.
        **kwargs: extra keyword arguments forwarded to the sampler.

    Returns:
        Decoded samples indexed as batch, variants, c, t, h, w.

    Raises:
        ValueError: if an unconditional text embedding is needed and
            ``model.uncond_type`` is neither ``"empty_seq"`` nor ``"zero_embed"``.
    """
    ddim_sampler = DDIMSampler(model) if not multiple_cond_cfg else DDIMSampler_multicond(model)
    batch_size = noise_shape[0]
    fs = torch.tensor([fs] * batch_size, dtype=torch.long, device=model.device)
    is_hybrid = model.model.conditioning_key == 'hybrid'

    if not text_input:
        prompts = [""]*batch_size

    img = videos[:,:,0] #bchw
    img_emb = model.embedder(img) ## blc
    img_emb = model.image_proj_model(img_emb)

    cond_emb = model.get_learned_conditioning(prompts)
    cond = {"c_crossattn": [torch.cat([cond_emb,img_emb], dim=1)]}
    if is_hybrid:
        z = get_latent_z(model, videos) # b c t h w
        if loop or gfi:
            # Keep only the first and last latent frames; everything between is zero.
            img_cat_cond = torch.zeros_like(z)
            img_cat_cond[:,:,0,:,:] = z[:,:,0,:,:]
            img_cat_cond[:,:,-1,:,:] = z[:,:,-1,:,:]
        else:
            # Repeat the first latent frame over the whole clip length.
            img_cat_cond = z[:,:,:1,:,:]
            img_cat_cond = repeat(img_cat_cond, 'b c t h w -> b c (repeat t) h w', repeat=z.shape[2])
        cond["c_concat"] = [img_cat_cond]

    use_text_cfg = unconditional_guidance_scale != 1.0
    use_img_cfg = multiple_cond_cfg and cfg_img != 1.0

    # The empty-text embedding feeds both unconditional branches, so build it whenever
    # either is active. It used to be created only under `use_text_cfg`, which raised a
    # NameError for multi-condition CFG with unconditional_guidance_scale == 1.0.
    uc_emb = None
    if use_text_cfg or use_img_cfg:
        if model.uncond_type == "empty_seq":
            uc_emb = model.get_learned_conditioning(batch_size * [""])
        elif model.uncond_type == "zero_embed":
            uc_emb = torch.zeros_like(cond_emb)
        else:
            raise ValueError(f"Unsupported uncond_type: {model.uncond_type!r}")

    if use_text_cfg:
        uc_img_emb = model.embedder(torch.zeros_like(img)) ## b l c
        uc_img_emb = model.image_proj_model(uc_img_emb)
        uc = {"c_crossattn": [torch.cat([uc_emb,uc_img_emb],dim=1)]}
        if is_hybrid:
            uc["c_concat"] = [img_cat_cond]
    else:
        uc = None

    ## we need one more unconditioning image=yes, text=""
    if use_img_cfg:
        uc_2 = {"c_crossattn": [torch.cat([uc_emb,img_emb],dim=1)]}
        if is_hybrid:
            uc_2["c_concat"] = [img_cat_cond]
        kwargs.update({"unconditional_conditioning_img_nonetext": uc_2})
    else:
        kwargs.update({"unconditional_conditioning_img_nonetext": None})

    batch_variants = []
    for _ in range(n_samples):
        # No inpainting-style conditioning is used here, hence mask=None and x0=None.
        samples, _ = ddim_sampler.sample(S=ddim_steps,
                                         conditioning=cond,
                                         batch_size=batch_size,
                                         shape=noise_shape[1:],
                                         verbose=False,
                                         unconditional_guidance_scale=unconditional_guidance_scale,
                                         unconditional_conditioning=uc,
                                         eta=ddim_eta,
                                         cfg_img=cfg_img,
                                         mask=None,
                                         x0=None,
                                         fs=fs,
                                         timestep_spacing=timestep_spacing,
                                         guidance_rescale=guidance_rescale,
                                         **kwargs
                                         )

        ## reconstruct from latent to pixel space
        batch_variants.append(model.decode_first_stage(samples))
    ## variants, batch, c, t, h, w
    batch_variants = torch.stack(batch_variants)
    return batch_variants.permute(1, 0, 2, 3, 4, 5)
def run_inference(args, gpu_num, gpu_no):
    """Build the model, generate a video for every prompt/image pair of this rank and save it.

    Args:
        args: parsed command-line namespace (see ``get_parser``).
        gpu_num: total number of workers the samples are split across.
        gpu_no: index of this worker; also the CUDA device the model is moved to.

    The samples are split into ``gpu_num`` contiguous chunks; when the count is not
    divisible, the first ``num_samples % gpu_num`` workers take one extra sample so
    that no prompt is skipped. Results are written under
    ``<args.savedir>/samples_separate``.
    """
    ## model config
    config = OmegaConf.load(args.config)
    model_config = config.pop("model", OmegaConf.create())

    ## set use_checkpoint as False as when using deepspeed, it encounters an error "deepspeed backend not set"
    model_config['params']['unet_config']['params']['use_checkpoint'] = False
    model = instantiate_from_config(model_config)
    model = model.cuda(gpu_no)
    model.perframe_ae = args.perframe_ae
    assert os.path.exists(args.ckpt_path), "Error: checkpoint Not Found!"
    model = load_model_checkpoint(model, args.ckpt_path)
    model.eval()

    ## run over data
    assert (args.height % 16 == 0) and (args.width % 16 == 0), "Error: image size [h,w] should be multiples of 16!"
    assert args.bs == 1, "Current implementation only support [batch size = 1]!"
    ## latent noise shape
    h, w = args.height // 8, args.width // 8
    channels = model.model.diffusion_model.out_channels
    n_frames = args.video_length
    print(f'Inference with {n_frames} frames')
    noise_shape = [args.bs, channels, n_frames, h, w]

    # save_results_seperate derives the "samples_separate" directory from this path,
    # so only that directory has to exist.
    fakedir = os.path.join(args.savedir, "samples")
    fakedir_separate = os.path.join(args.savedir, "samples_separate")
    os.makedirs(fakedir_separate, exist_ok=True)

    ## prompt file setting
    assert os.path.exists(args.prompt_dir), "Error: prompt file Not Found!"
    filename_list, data_list, prompt_list = load_data_prompts(args.prompt_dir, video_size=(args.height, args.width), video_frames=n_frames, gfi=args.gfi)
    num_samples = len(prompt_list)

    # Contiguous split that also hands out the remainder; plain floor division dropped
    # the last `num_samples % gpu_num` samples without any notice.
    per_rank, remainder = divmod(num_samples, gpu_num)
    first = per_rank * gpu_no + min(gpu_no, remainder)
    last = first + per_rank + (1 if gpu_no < remainder else 0)
    indices = list(range(first, last))
    print('Prompts testing [rank:%d] %d/%d samples loaded.'%(gpu_no, len(indices), num_samples))

    prompt_list_rank = [prompt_list[i] for i in indices]
    data_list_rank = [data_list[i] for i in indices]
    filename_list_rank = [filename_list[i] for i in indices]

    start = time.time()
    with torch.no_grad(), torch.cuda.amp.autocast():
        for idx, indice in tqdm(enumerate(range(0, len(prompt_list_rank), args.bs)), desc='Sample Batch'):
            prompts = prompt_list_rank[indice:indice+args.bs]
            filenames = filename_list_rank[indice:indice+args.bs]
            # Put the inputs on the model's own device rather than the default "cuda"
            # device, which differs from it whenever gpu_no != 0.
            videos = torch.stack(data_list_rank[indice:indice+args.bs], dim=0).to(model.device)

            batch_samples = image_guided_synthesis(model, prompts, videos, noise_shape, args.n_samples, args.ddim_steps, args.ddim_eta, \
                                args.unconditional_guidance_scale, args.cfg_img, args.frame_stride, args.text_input, args.multiple_cond_cfg, args.loop, args.gfi, args.timestep_spacing, args.guidance_rescale)

            ## save each example individually
            for nn, samples in enumerate(batch_samples):
                ## samples : [n_samples,c,t,h,w]
                save_results_seperate(prompts[nn], samples, filenames[nn], fakedir, fps=8, loop=args.loop)

    print(f"Saved in {args.savedir}. Time used: {(time.time() - start):.2f} seconds")
def get_parser():
    """Build the command-line argument parser for the inference script."""
    cli = argparse.ArgumentParser()
    add = cli.add_argument

    # paths
    add("--savedir", type=str, help="results saving path")
    add("--ckpt_path", type=str, help="checkpoint path")
    add("--config", type=str, help="config (yaml) path")
    add("--prompt_dir", type=str, help="a data dir containing videos and prompts")

    # sampling
    add("--n_samples", type=int, default=1, help="num of samples per prompt")
    add("--ddim_steps", type=int, default=50, help="steps of ddim if positive, otherwise use DDPM")
    add("--ddim_eta", type=float, default=1.0, help="eta for ddim sampling (0.0 yields deterministic sampling)")
    add("--bs", type=int, default=1, help="batch size for inference, should be one")
    add("--height", type=int, default=512, help="image height, in pixel space")
    add("--width", type=int, default=512, help="image width, in pixel space")
    add("--frame_stride", type=int, default=3, help="frame stride control for 256 model (larger->larger motion), FPS control for 512 or 1024 model (smaller->larger motion)")
    add("--unconditional_guidance_scale", type=float, default=1.0, help="prompt classifier-free guidance")
    add("--seed", type=int, default=123, help="seed for seed_everything")
    add("--video_length", type=int, default=16, help="inference video length")

    # conditioning switches
    add("--negative_prompt", action='store_true', help="negative prompt")
    add("--text_input", action='store_true', help="input text to I2V model or not")
    add("--multiple_cond_cfg", action='store_true', help="use multi-condition cfg or not")
    add("--cfg_img", type=float, help="guidance scale for image conditioning")
    add("--timestep_spacing", type=str, default="uniform", help="The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.")
    add("--guidance_rescale", type=float, default=0.0, help="guidance rescale in [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891)")
    add("--perframe_ae", action='store_true', help="if we use per-frame AE decoding, set it to True to save GPU memory, especially for the model of 576x1024")

    ## currently not support looping video and generative frame interpolation
    add("--loop", action='store_true', help="generate looping videos or not")
    add("--gfi", action='store_true', help="generate generative frame interpolation (gfi) or not")
    return cli
if __name__ == '__main__':
    # Script entry point: announce the start time, parse the CLI, seed all RNGs and
    # run inference as the only worker (rank 0 of 1).
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    print("@DynamiCrafter cond-Inference: %s"%timestamp)
    cli_args = get_parser().parse_args()
    seed_everything(cli_args.seed)
    run_inference(cli_args, 1, 0)
\ No newline at end of file
import os
import time
from omegaconf import OmegaConf
import torch
from scripts.evaluation.funcs import load_model_checkpoint, save_videos, batch_ddim_sampling, get_latent_z
from utils.utils import instantiate_from_config
from huggingface_hub import hf_hub_download
from einops import repeat
import torchvision.transforms as transforms
from pytorch_lightning import seed_everything
class Image2Video():
    """Image-to-video demo wrapper: fetches a checkpoint, builds the model, renders a clip."""

    def __init__(self,result_dir='./tmp/',gpu_num=1,resolution='256_256', **kwargs) -> None:
        """Download the checkpoint if missing and instantiate ``gpu_num`` model copies.

        Args:
            result_dir: directory the generated videos are written to (created if needed).
            gpu_num: number of model instances to build; they are kept on the CPU until used.
            resolution: ``"<height>_<width>"``; the width part also selects the
                checkpoint directory and the config file.
            **kwargs: stored on ``self.kwargs``.
        """
        res_parts = resolution.split('_')
        self.resolution = (int(res_parts[0]), int(res_parts[1])) #hw
        self.download_model()

        self.result_dir = result_dir
        # makedirs(exist_ok=True) also creates missing parents and tolerates an already
        # existing directory; the previous exists()+mkdir() pair did neither reliably.
        os.makedirs(self.result_dir, exist_ok=True)
        ckpt_path='checkpoints/dynamicrafter_'+res_parts[1]+'_v1/model.ckpt'
        config_file='configs/inference_'+res_parts[1]+'_v1.0.yaml'
        config = OmegaConf.load(config_file)
        model_config = config.pop("model", OmegaConf.create())
        model_config['params']['unet_config']['params']['use_checkpoint']=False
        model_list = []
        for _ in range(gpu_num):
            model = instantiate_from_config(model_config)
            assert os.path.exists(ckpt_path), "Error: checkpoint Not Found!"
            model = load_model_checkpoint(model, ckpt_path)
            model.eval()
            model_list.append(model)
        self.model_list = model_list
        self.save_fps = 8
        self.kwargs = kwargs

    def get_image(self, image, prompt, steps=50, cfg_scale=7.5, eta=1.0, fs=3, seed=123):
        """Generate a video from ``image`` and ``prompt`` and return the path of the mp4.

        Args:
            image: numpy array indexed as h,w,c with values on a 0-255 scale.
            prompt: text prompt; also used (sanitised, first 40 chars) as the file name.
            steps: DDIM steps, capped at 60.
            cfg_scale: classifier-free guidance scale.
            eta: DDIM eta.
            fs: value passed to the model as the ``fs`` condition.
            seed: seed given to ``seed_everything``.
        """
        seed_everything(seed)
        transform = transforms.Compose([
            transforms.Resize(min(self.resolution)),
            transforms.CenterCrop(self.resolution),
            ])
        torch.cuda.empty_cache()
        print('start:', prompt, time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())))
        start = time.time()
        gpu_id=0
        steps = min(steps, 60)
        model = self.model_list[gpu_id]
        model = model.cuda()
        batch_size=1
        channels = model.model.diffusion_model.out_channels
        frames = model.temporal_length
        h, w = self.resolution[0] // 8, self.resolution[1] // 8
        noise_shape = [batch_size, channels, frames, h, w]

        # The conditioning is inference-only: compute it without autograd so no graph
        # (and its activation memory) is kept alive during sampling.
        with torch.no_grad():
            # text cond
            text_emb = model.get_learned_conditioning([prompt])

            # img cond
            img_tensor = torch.from_numpy(image).permute(2, 0, 1).float().to(model.device)
            img_tensor = (img_tensor / 255. - 0.5) * 2

            image_tensor_resized = transform(img_tensor) #3,h,w
            videos = image_tensor_resized.unsqueeze(0) # bchw

            z = get_latent_z(model, videos.unsqueeze(2)) #bc,1,hw
            img_tensor_repeat = repeat(z, 'b c t h w -> b c (repeat t) h w', repeat=frames)

            # The image embedder receives the un-resized image tensor.
            cond_images = model.embedder(img_tensor.unsqueeze(0)) ## blc
            img_emb = model.image_proj_model(cond_images)

            imtext_cond = torch.cat([text_emb, img_emb], dim=1)

            fs = torch.tensor([fs], dtype=torch.long, device=model.device)
            cond = {"c_crossattn": [imtext_cond], "fs": fs, "c_concat": [img_tensor_repeat]}

        ## inference
        with torch.no_grad(), torch.cuda.amp.autocast():
            batch_samples = batch_ddim_sampling(model, cond, noise_shape, n_samples=1, ddim_steps=steps, ddim_eta=eta, cfg_scale=cfg_scale)
        ## b,samples,c,t,h,w

        # Build a file-system friendly name from the prompt.
        prompt_str = prompt.replace("/", "_slash_").replace(" ", "_")[:40]
        if len(prompt_str) == 0:
            prompt_str = 'empty_prompt'

        save_videos(batch_samples, self.result_dir, filenames=[prompt_str], fps=self.save_fps)
        print(f"Saved in {prompt_str}. Time used: {(time.time() - start):.2f} seconds")
        # Move the weights back to the CPU after the request.
        model = model.cpu()
        return os.path.join(self.result_dir, f"{prompt_str}.mp4")

    def download_model(self):
        """Fetch ``model.ckpt`` for the configured width from the Hugging Face Hub if absent."""
        width = self.resolution[1]
        REPO_ID = 'Doubiiu/DynamiCrafter_'+str(width) if width!=256 else 'Doubiiu/DynamiCrafter'
        ckpt_dir = './checkpoints/dynamicrafter_'+str(width)+'_v1/'
        os.makedirs(ckpt_dir, exist_ok=True)
        for filename in ['model.ckpt']:
            local_file = os.path.join(ckpt_dir, filename)
            if not os.path.exists(local_file):
                hf_hub_download(repo_id=REPO_ID, filename=filename, local_dir=ckpt_dir, local_dir_use_symlinks=False)
if __name__ == '__main__':
    # Demo entry point. get_image() converts its input with torch.from_numpy(...) and
    # permutes it from h,w,c, so it needs the decoded pixels; passing the file path
    # (as this block used to) fails inside get_image.
    import numpy as np
    from PIL import Image

    i2v = Image2Video()
    with Image.open('prompts/art.png') as demo_image:
        demo_array = np.array(demo_image.convert('RGB'))
    video_path = i2v.get_image(demo_array,'man fishing in a boat at sunset')
    print('done', video_path)
\ No newline at end of file
## Single-GPU inference launcher.
## Usage: <this script> <version>   with <version> one of 1024, 512, 256
version=$1
seed=123
name=dynamicrafter_${version}_seed${seed}

ckpt=checkpoints/dynamicrafter_${version}_v1/model.ckpt
config=configs/inference_${version}_v1.0.yaml

prompt_dir=prompts/${version}/
res_dir="results"

## `case` keeps the script POSIX-compliant: `[ "$1" == ... ]` is a bashism that fails
## under shells such as dash when the script is started with `sh`.
## extra_args holds the flags used only by the 512 / 1024 models.
extra_args="--timestep_spacing uniform_trailing --guidance_rescale 0.7 --perframe_ae"
case "${version}" in
    256)
        H=256
        FS=3  ## This model adopts frame stride=3, range recommended: 1-6 (larger value -> larger motion)
        extra_args=""
        ;;
    512)
        H=320
        FS=24 ## This model adopts FPS=24, range recommended: 15-30 (smaller value -> larger motion)
        ;;
    1024)
        H=576
        FS=10 ## This model adopts FPS=10, range recommended: 15-5 (smaller value -> larger motion)
        ;;
    *)
        echo "Invalid input. Please enter 256, 512, or 1024."
        exit 1
        ;;
esac

## ${extra_args} is intentionally unquoted so that it splits into separate flags
## (and vanishes entirely for the 256 model).
CUDA_VISIBLE_DEVICES=0 python3 scripts/evaluation/inference.py \
--seed "${seed}" \
--ckpt_path "${ckpt}" \
--config "${config}" \
--savedir "${res_dir}/${name}" \
--n_samples 1 \
--bs 1 --height "${H}" --width "${version}" \
--unconditional_guidance_scale 7.5 \
--ddim_steps 50 \
--ddim_eta 1.0 \
--prompt_dir "${prompt_dir}" \
--text_input \
--video_length 16 \
--frame_stride "${FS}" \
${extra_args}

## multi-cond CFG: the <unconditional_guidance_scale> is s_txt, <cfg_img> is s_img
#--multiple_cond_cfg --cfg_img 7.5
#--loop
\ No newline at end of file
## Multi-GPU (single node, 8 processes) inference launcher.
## Usage: <this script> <version>   with <version> one of 1024, 512, 256
version=$1
seed=123
name=dynamicrafter_${version}_mp_seed${seed}

ckpt=checkpoints/dynamicrafter_${version}_v1/model.ckpt
config=configs/inference_${version}_v1.0.yaml

prompt_dir=prompts/${version}/
res_dir="results"

## `case` keeps the script POSIX-compliant: `[ "$1" == ... ]` is a bashism that fails
## under shells such as dash when the script is started with `sh`.
## extra_args holds the flags used only by the 512 / 1024 models.
extra_args="--timestep_spacing uniform_trailing --guidance_rescale 0.7 --perframe_ae"
case "${version}" in
    256)
        H=256
        FS=3  ## This model adopts frame stride=3
        extra_args=""
        ;;
    512)
        H=320
        FS=24 ## This model adopts FPS=24
        ;;
    1024)
        H=576
        FS=10 ## This model adopts FPS=10
        ;;
    *)
        echo "Invalid input. Please enter 256, 512, or 1024."
        exit 1
        ;;
esac

## Single-GPU alternative: call `python3 scripts/evaluation/inference.py` directly with
## the same arguments below, i.e. without the torch.distributed.launch options,
## the ddp_wrapper.py script and `--module`.

## multi-cond CFG: the <unconditional_guidance_scale> is s_txt, <cfg_img> is s_img
#--multiple_cond_cfg --cfg_img 7.5
#--loop

## inference using single node with multi-GPUs:
## ${extra_args} is intentionally unquoted so that it splits into separate flags
## (and vanishes entirely for the 256 model).
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python3 -m torch.distributed.launch \
--nproc_per_node=8 --nnodes=1 --master_addr=127.0.0.1 --master_port=23456 --node_rank=0 \
scripts/evaluation/ddp_wrapper.py \
--module 'inference' \
--seed "${seed}" \
--ckpt_path "${ckpt}" \
--config "${config}" \
--savedir "${res_dir}/${name}" \
--n_samples 1 \
--bs 1 --height "${H}" --width "${version}" \
--unconditional_guidance_scale 7.5 \
--ddim_steps 50 \
--ddim_eta 1.0 \
--prompt_dir "${prompt_dir}" \
--text_input \
--video_length 16 \
--frame_stride "${FS}" \
${extra_args}
\ No newline at end of file
import importlib
import numpy as np
import cv2
import torch
import torch.distributed as dist
def count_params(model, verbose=False):
    """Return the total number of elements over ``model.parameters()``.

    With ``verbose`` set, the count (in millions) is also printed together
    with the model's class name.
    """
    total_params = 0
    for param in model.parameters():
        total_params += param.numel()
    if verbose:
        print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.")
    return total_params
def check_istarget(name, para_list):
    """
    Tell whether `name` contains at least one entry of `para_list`.

    name: full name of source para
    para_list: partial name of target para
    """
    return any(para in name for para in para_list)
def instantiate_from_config(config):
    """Instantiate the object described by ``config``.

    ``config["target"]`` is a dotted path resolved by ``get_obj_from_str``; the
    resolved callable is invoked with ``config["params"]`` (if any) as keyword
    arguments. The markers ``'__is_first_stage__'`` and
    ``"__is_unconditional__"`` yield ``None``.

    Raises:
        KeyError: if ``config`` has no ``target`` and is not one of the markers.
    """
    if "target" in config:
        factory = get_obj_from_str(config["target"])
        return factory(**config.get("params", dict()))
    if config == '__is_first_stage__' or config == "__is_unconditional__":
        return None
    raise KeyError("Expected key `target` to instantiate.")
def get_obj_from_str(string, reload=False):
    """Resolve a dotted path such as ``"package.module.Name"`` to the named attribute.

    With ``reload`` set, the module is imported and reloaded before the lookup.
    """
    module_path, attr_name = string.rsplit(".", 1)
    if reload:
        importlib.reload(importlib.import_module(module_path))
    resolved_module = importlib.import_module(module_path, package=None)
    return getattr(resolved_module, attr_name)
def load_npz_from_dir(data_dir):
    """Concatenate (axis 0) the ``arr_0`` arrays of all files in ``data_dir``.

    Files are visited in sorted name order so that the result is deterministic.
    """
    # `os` is not among this module's top-level imports, so import it here; without
    # it the function failed with a NameError on its first call.
    import os

    arrays = []
    for data_name in sorted(os.listdir(data_dir)):
        # The context manager closes the archive once the array has been read.
        with np.load(os.path.join(data_dir, data_name)) as archive:
            arrays.append(archive['arr_0'])
    return np.concatenate(arrays, axis=0)
def load_npz_from_paths(data_paths):
    """Concatenate (axis 0) the ``arr_0`` arrays of the given files, in the given order."""
    arrays = []
    for data_path in data_paths:
        # Close each archive right after reading instead of leaving it to the GC.
        with np.load(data_path) as archive:
            arrays.append(archive['arr_0'])
    return np.concatenate(arrays, axis=0)
def resize_numpy_image(image, max_resolution=512 * 512, resize_short_edge=None):
    """Resize ``image`` so that both sides become multiples of 64.

    Args:
        image: array whose first two dimensions are height and width.
        max_resolution: target pixel count (h*w) used when ``resize_short_edge`` is None.
        resize_short_edge: if given, the scale is chosen so that the shorter side
            gets this length (before rounding to a multiple of 64).

    Returns:
        The image resized with Lanczos interpolation; each side is at least 64.
    """
    h, w = image.shape[:2]
    if resize_short_edge is not None:
        k = resize_short_edge / min(h, w)
    else:
        # Scale both sides by sqrt(area ratio) so that the area approaches max_resolution.
        k = (max_resolution / (h * w)) ** 0.5
    # Round to the nearest multiple of 64, but never down to 0: a zero-sized target
    # (possible for very elongated images) makes cv2.resize fail.
    h = max(1, int(np.round(h * k / 64))) * 64
    w = max(1, int(np.round(w * k / 64))) * 64
    # cv2.resize takes the target size as (width, height).
    image = cv2.resize(image, (w, h), interpolation=cv2.INTER_LANCZOS4)
    return image
def setup_dist(args):
    """Initialise the NCCL process group once, binding this process to ``args.local_rank``.

    Does nothing when ``torch.distributed`` is already initialised. The
    rendezvous uses the ``env://`` init method.
    """
    if not dist.is_initialized():
        torch.cuda.set_device(args.local_rank)
        torch.distributed.init_process_group('nccl', init_method='env://')
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment