# *************************************************************************
# Copyright (2023) Bytedance Inc.
#
# Copyright (2023) DragDiffusion Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# *************************************************************************

# run results_0 of DragDiffusion

import os
import datetime
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import pickle
import PIL
from PIL import Image

from copy import deepcopy
from einops import rearrange
from types import SimpleNamespace

from diffusers import DDIMScheduler, AutoencoderKL
from torchvision.utils import save_image
from pytorch_lightning import seed_everything

import sys
sys.path.insert(0, '../')
from drag_pipeline import DragPipeline

from utils.drag_utils import drag_diffusion_update
from utils.attn_utils import register_attention_editor_diffusers, MutualSelfAttentionControl


def preprocess_image(image, device):
    # normalize uint8 HWC image to [-1, 1] and convert to a 1xCxHxW tensor
    image = torch.from_numpy(image).float() / 127.5 - 1  # [-1, 1]
    image = rearrange(image, "h w c -> 1 c h w")
    image = image.to(device)
    return image


# copy of the run_drag function, adapted here for batch evaluation
def run_drag(source_image,
             # image_with_clicks,
             mask,
             prompt,
             points,
             inversion_strength,
             end_step,
             lam,
             latent_lr,
             n_pix_step,
             model_path,
             vae_path,
             lora_path,
             start_step,
             start_layer,
             # save_dir="./results"
             ):
    # initialize model
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    scheduler = DDIMScheduler(beta_start=0.00085,
                              beta_end=0.012,
                              beta_schedule="scaled_linear",
                              clip_sample=False,
                              set_alpha_to_one=False,
                              steps_offset=1)
    model = DragPipeline.from_pretrained(model_path, scheduler=scheduler).to(device)
    # call this function to override unet forward function,
    # so that intermediate features are returned after forward
    model.modify_unet_forward()

    # set vae
    if vae_path != "default":
        model.vae = AutoencoderKL.from_pretrained(
            vae_path
        ).to(model.vae.device, model.vae.dtype)

    # initialize parameters
    seed = 42  # fixed seed so results are reproducible across runs
    seed_everything(seed)

    args = SimpleNamespace()
    args.prompt = prompt
    args.points = points
    args.n_inference_step = 50
    args.n_actual_inference_step = round(inversion_strength * args.n_inference_step)
    args.guidance_scale = 1.0

    args.unet_feature_idx = [3, 4]

    args.r_m = 1
    args.r_p = 3
    args.end_step = end_step
    args.lam = lam

    args.lr = latent_lr
    args.n_pix_step = n_pix_step

    full_h, full_w = source_image.shape[:2]
    # point supervision is done at half the input resolution
    args.sup_res_h = int(0.5 * full_h)
    args.sup_res_w = int(0.5 * full_w)

    print(args)

    source_image = preprocess_image(source_image, device)
    # image_with_clicks = preprocess_image(image_with_clicks, device)

    # set lora
    if lora_path == "":
        print("no LoRA provided, applying default attention processors")
        model.unet.set_default_attn_processor()
    else:
        print("applying lora: " + lora_path)
        model.unet.load_attn_procs(lora_path)

    # invert the source image
    # the latent code resolution is too small, only 64*64
    invert_code = model.invert(source_image,
                               prompt,
                               guidance_scale=args.guidance_scale,
                               num_inference_steps=args.n_inference_step,
                               num_actual_inference_steps=args.n_actual_inference_step)

    # binarize the editing-region mask and downsample it to the supervision resolution
    mask = torch.from_numpy(mask).float() / 255.
    mask[mask > 0.0] = 1.0
    mask = rearrange(mask, "h w -> 1 1 h w").to(device)
    mask = F.interpolate(mask, (args.sup_res_h, args.sup_res_w), mode="nearest")

    handle_points = []
    target_points = []
    # grads_means = []
    # points come in (x, y) image coordinates; rescale to (row, col) at supervision resolution
    for idx, point in enumerate(points):
        cur_point = torch.tensor([point[1] / full_h * args.sup_res_h,
                                  point[0] / full_w * args.sup_res_w])
        cur_point = torch.round(cur_point)
        # points alternate: even indices are handle points, odd indices are target points
        if idx % 2 == 0:
            handle_points.append(cur_point)
        else:
            target_points.append(cur_point)
    print('handle points:', handle_points)
    print('target points:', target_points)

    init_code = invert_code
    init_code_orig = deepcopy(init_code)
    model.scheduler.set_timesteps(args.n_inference_step)
    t = model.scheduler.timesteps[args.n_inference_step - args.n_actual_inference_step]

    # update according to the given supervision
    updated_init_code, h_feature, h_features = drag_diffusion_update(
        model, init_code, t, handle_points, target_points, mask, args)

    # run inference to synthesize the edited image
    gen_image = model(
        prompt=args.prompt,
        h_feature=h_feature,
        end_step=args.end_step,
        batch_size=2,
        latents=torch.cat([init_code_orig, updated_init_code], dim=0),
        guidance_scale=args.guidance_scale,
        num_inference_steps=args.n_inference_step,
        num_actual_inference_steps=args.n_actual_inference_step
    )[1].unsqueeze(dim=0)

    # resize gen_image to the size of source_image
    # we do this because the shape of gen_image is rounded to multiples of 8
    gen_image = F.interpolate(gen_image, (full_h, full_w), mode='bilinear')

    # save the original image, user editing instructions, synthesized image
    # save_result = torch.cat([
    #     source_image * 0.5 + 0.5,
    #     torch.ones((1, 3, full_h, 25)).cuda(),
    #     image_with_clicks * 0.5 + 0.5,
    #     torch.ones((1, 3, full_h, 25)).cuda(),
    #     gen_image[0:1]
    # ], dim=-1)

    # if not os.path.isdir(save_dir):
    #     os.mkdir(save_dir)
    # save_prefix = datetime.datetime.now().strftime("%Y-%m-%d-%H%M-%S")
    # save_image(save_result, os.path.join(save_dir, save_prefix + '.png'))

    out_image = gen_image.cpu().permute(0, 2, 3, 1).numpy()[0]
    out_image = (out_image * 255).astype(np.uint8)
    return out_image


if __name__ == '__main__':
    all_category = [
        'art_work',
        'land_scape',
        'building_city_view',
        'building_countryside_view',
        'animals',
        'human_head',
        'human_upper_body',
        'human_full_body',
        'interior_design',
        'other_objects',
    ]

    # assume root_dir and lora_dir are valid directories
    root_dir = '/home/bailuo/code/DragNoise/../DragDiffusion/drag_bench_evaluation/drag_bench_data/DragBench'
    lora_dir = '/home/bailuo/code/DragNoise/drag_bench_evaluation/drag_bench_lora_lora_rank_list*2'
    result_dir = '/home/bailuo/code/DragNoise/drag_bench_evaluation/drag_diffusion_res_lora_rank_list*2_[3,4]'

    # mkdir if necessary
    if not os.path.isdir(result_dir):
        os.mkdir(result_dir)
        for cat in all_category:
            os.mkdir(os.path.join(result_dir, cat))

    grads_means = []
    for cat in all_category:
        file_dir = os.path.join(root_dir, cat)
        for sample_name in os.listdir(file_dir):
            if sample_name == '.DS_Store':
                continue
            sample_path = os.path.join(file_dir, sample_name)

            # read image file
            source_image = Image.open(os.path.join(sample_path, 'original_image.png'))
            source_image = np.array(source_image)

            # load meta data
            with open(os.path.join(sample_path, 'meta_data.pkl'), 'rb') as f:
                meta_data = pickle.load(f)
            prompt = meta_data['prompt']
            mask = meta_data['mask']
            print(mask.shape)
            points = meta_data['points']

            # load lora
            # using the LoRA fine-tuned for 200 steps
            lora_path = os.path.join(lora_dir, cat, sample_name,
                                     str(200))
            print("applying lora: " + lora_path)

            out_image = run_drag(
                source_image,
                mask,
                prompt,
                points,
                inversion_strength=0.7,
                end_step=0,
                lam=0.2,
                latent_lr=0.02,
                n_pix_step=80,
                model_path="botp/stable-diffusion-v1-5",
                # model_path="/home/bailuo/models/models--botp--stable-diffusion-v1-5",
                vae_path="default",
                lora_path=lora_path,
                start_step=0,
                start_layer=10,
            )

            save_dir = os.path.join(result_dir, cat, sample_name)
            if not os.path.isdir(save_dir):
                os.mkdir(save_dir)
            Image.fromarray(out_image).save(os.path.join(save_dir, 'dragged_image.png'))