#!/usr/bin/env python from __future__ import annotations import argparse import os import random import sys import uuid from datetime import datetime from pathlib import Path from typing import List, Tuple, Union current_file_path = Path(__file__).resolve() sys.path.insert(0, str(current_file_path.parent.parent)) import gradio as gr import numpy as np import torch from PIL import Image as PILImage import torchvision.transforms as T import torchvision.transforms.functional as TF from torchvision.utils import _log_api_usage_once, make_grid, save_image from diffusers import PixArtAlphaPipeline from diffusion import DPMS, SASolverSampler from diffusion.data.datasets import * from diffusion.model.hed import HEDdetector from diffusion.model.nets import PixArt_XL_2, PixArtMS_XL_2, ControlPixArtHalf, ControlPixArtMSHalf from diffusion.model.utils import resize_and_crop_tensor from diffusion.utils.misc import read_config from tools.download import find_model DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png) # PixArt-Delta (ControlNet) #### [PixArt-Alpha 1024px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. #### This demo uses the [PixArt-alpha/PixArt-XL-2-1024-ControlNet](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) checkpoint. #### This demo uses the [PixArt-alpha/PixArt-XL-2-512-ControlNet](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) checkpoint. #### English prompts ONLY; 提示词仅限英文 ### Please use the image size corresponding to the model as input to get the best performance. (eg. 1024px for PixArt-XL-2-1024-ControlNet.pth) """ if not torch.cuda.is_available(): DESCRIPTION += "\n

Running on CPU �� This demo does not work on CPU.

" MAX_SEED = np.iinfo(np.int32).max CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1" MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048")) USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1" ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1" PORT = int(os.getenv("DEMO_PORT", "15432")) device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") @torch.no_grad() def ndarr_image(tensor: Union[torch.Tensor, List[torch.Tensor]], **kwargs, ) -> None: if not torch.jit.is_scripting() and not torch.jit.is_tracing(): _log_api_usage_once(save_image) grid = make_grid(tensor, **kwargs) ndarr = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy() return ndarr style_list = [ { "name": "(No style)", "prompt": "{prompt}", "negative_prompt": "", }, { "name": "Cinematic", "prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy", "negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured", }, { "name": "Photographic", "prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed", "negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly", }, { "name": "Anime", "prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed", "negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast", }, { "name": "Manga", "prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style", "negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style", }, { "name": "Digital Art", "prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed", "negative_prompt": "photo, photorealistic, realism, ugly", }, { "name": "Pixel art", "prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics", "negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic", }, { "name": "Fantasy art", "prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy", "negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white", }, { "name": "Neonpunk", "prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional", "negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured", }, { "name": "3D Model", "prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting", "negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting", }, ] styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list} STYLE_NAMES = list(styles.keys()) DEFAULT_STYLE_NAME = "(No style)" SCHEDULE_NAME = ["DPM-Solver", "SA-Solver"] DEFAULT_SCHEDULE_NAME = "DPM-Solver" def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]: p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME]) if not negative: negative = "" return p.replace("{prompt}", positive), n + negative def save_image(img): unique_name = str(uuid.uuid4()) + '.png' save_path = os.path.join(f'output/online_demo_img/{datetime.now().date()}') os.makedirs(save_path, exist_ok=True) unique_name = os.path.join(save_path, unique_name) img.save(unique_name) return unique_name def randomize_seed_fn(seed: int, randomize_seed: bool) -> int: if randomize_seed: seed = random.randint(0, MAX_SEED) return seed @torch.inference_mode() def generate( prompt: str, given_image = None, negative_prompt: str = "", style: str = DEFAULT_STYLE_NAME, use_negative_prompt: bool = False, seed: int = 0, width: int = 1024, height: int = 1024, schedule: str = 'DPM-Solver', dpms_guidance_scale: float = 4.5, sas_guidance_scale: float = 3, dpms_inference_steps: int = 14, sas_inference_steps: int = 25, randomize_seed: bool = False, ): seed = int(randomize_seed_fn(seed, randomize_seed)) torch.manual_seed(seed) torch.cuda.empty_cache() strength = 1.0 c_vis = given_image if not use_negative_prompt: negative_prompt = None # type: ignore prompt, negative_prompt = apply_style(style, prompt, negative_prompt) prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask\ = pipe.encode_prompt(prompt=prompt, negative_prompt=negative_prompt) prompt_embeds, negative_prompt_embeds = prompt_embeds[:, None], negative_prompt_embeds[:, None] torch.cuda.empty_cache() # condition process if given_image is not None: ar = torch.tensor([given_image.size[1] / given_image.size[0]], device=device)[None] custom_hw = torch.tensor([given_image.size[1], given_image.size[0]], device=device)[None] closest_hw = base_ratios[min(base_ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))] hw = torch.tensor(closest_hw, device=device)[None] condition_transform = T.Compose([ T.Lambda(lambda img: img.convert('RGB')), T.Resize(int(min(closest_hw))), T.CenterCrop([int(closest_hw[0]), int(closest_hw[1])]), T.ToTensor(), ]) given_image = condition_transform(given_image).unsqueeze(0).to(device) hed_edge = hed(given_image) * strength hed_edge = TF.normalize(hed_edge, [.5], [.5]) hed_edge = hed_edge.repeat(1, 3, 1, 1).to(weight_dtype) posterior = vae.encode(hed_edge).latent_dist condition = posterior.sample() c = condition * config.scale_factor c_vis = vae.decode(condition)['sample'] c_vis = torch.clamp(127.5 * c_vis + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()[0] else: c = None ar = torch.tensor([int(height) / int(width)], device=device)[None] custom_hw = torch.tensor([int(height), int(width)], device=device)[None] closest_hw = base_ratios[min(base_ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))] hw = torch.tensor(closest_hw, device=device)[None] latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8) # Sample images: if schedule == 'DPM-Solver': # Create sampling noise: n = prompt_embeds.shape[0] z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device) model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=prompt_attention_mask, c=c) dpm_solver = DPMS(model.forward_with_dpmsolver, condition=prompt_embeds, uncondition=negative_prompt_embeds, cfg_scale=dpms_guidance_scale, model_kwargs=model_kwargs) samples = dpm_solver.sample( z, steps=dpms_inference_steps, order=2, skip_type="time_uniform", method="multistep", ).to(weight_dtype) elif schedule == "SA-Solver": # Create sampling noise: n = prompt_embeds.shape[0] model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=prompt_attention_mask, c=c) sas_solver = SASolverSampler(model.forward_with_dpmsolver, device=device) samples = sas_solver.sample( S=sas_inference_steps, batch_size=n, shape=(4, latent_size_h, latent_size_w), eta=1, conditioning=prompt_embeds, unconditional_conditioning=negative_prompt_embeds, unconditional_guidance_scale=sas_guidance_scale, model_kwargs=model_kwargs, )[0].to(weight_dtype) samples = vae.decode(samples / config.scale_factor).sample torch.cuda.empty_cache() samples = resize_and_crop_tensor(samples, custom_hw[0, 1], custom_hw[0, 0]) samples = PILImage.fromarray(ndarr_image(samples, normalize=True, value_range=(-1, 1))) image_paths = [save_image(samples)] c_vis = PILImage.fromarray(c_vis) if c_vis is not None else samples c_paths = [save_image(c_vis)] print(image_paths) return image_paths, c_paths, seed def get_args(): parser = argparse.ArgumentParser() parser.add_argument("config", type=str, help="config") parser.add_argument('--image_size', default=1024, type=int) parser.add_argument('--model_path', type=str) return parser.parse_args() args = get_args() config = read_config(args.config) device = "cuda" if torch.cuda.is_available() else "cpu" assert args.image_size in [512, 1024], "We only provide pre-trained models for 512x512 and 1024x1024 resolutions." lewei_scale = {512: 1, 1024: 2} latent_size = args.image_size // 8 weight_dtype = torch.float16 print(f"Inference with {weight_dtype}") if torch.cuda.is_available(): hed = HEDdetector(False).to(device) pipe = PixArtAlphaPipeline.from_pretrained( "PixArt-alpha/PixArt-XL-2-1024-MS", transformer=None, torch_dtype=weight_dtype, use_safetensors=True, ) pipe.to(device) print("Loaded on Device!") vae = pipe.vae text_encoder = pipe.text_encoder tokenizer = pipe.tokenizer assert args.image_size == config.image_size if config.image_size == 512: model = PixArt_XL_2(input_size=latent_size, lewei_scale=lewei_scale[config.image_size]) print('model architecture ControlPixArtHalf and image size is 512') model = ControlPixArtHalf(model).to(device) elif config.image_size == 1024: model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[config.image_size]) print('model architecture ControlPixArtMSHalf and image size is 1024') model = ControlPixArtMSHalf(model).to(device) state_dict = find_model(args.model_path)['state_dict'] if 'pos_embed' in state_dict: del state_dict['pos_embed'] elif 'base_model.pos_embed' in state_dict: del state_dict['base_model.pos_embed'] missing, unexpected = model.load_state_dict(state_dict, strict=False) print('Missing keys (missing pos_embed is normal): ', missing) print('Unexpected keys', unexpected) model.eval() model.to(weight_dtype) base_ratios = eval(f'ASPECT_RATIO_{args.image_size}_TEST') with gr.Blocks(css="app/style_controlnet.css") as demo: gr.Markdown(DESCRIPTION) gr.DuplicateButton( value="Duplicate Space for private use", elem_id="duplicate-button", visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1", ) image_input = gr.Image( label="Image", height=360, width=360, show_label=False, sources="upload", type="pil", ) with gr.Group(): with gr.Row(): prompt = gr.Text( label="Prompt", show_label=False, max_lines=1, placeholder="Enter your prompt", container=False, ) run_button = gr.Button("Run", scale=0) with gr.Group(): with gr.Row(): hed_result = gr.Gallery(label="Hed Result", show_label=False) result = gr.Gallery(label="Result", show_label=False) with gr.Accordion("Advanced options", open=False): with gr.Row(): use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True) schedule = gr.Radio( show_label=True, container=True, interactive=True, choices=SCHEDULE_NAME, value=DEFAULT_SCHEDULE_NAME, label="Sampler Schedule", visible=True, ) style_selection = gr.Radio( show_label=True, container=True, interactive=True, choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME, label="Image Style", ) negative_prompt = gr.Text( label="Negative prompt", max_lines=1, placeholder="Enter a negative prompt", visible=True, ) seed = gr.Slider( label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0, ) randomize_seed = gr.Checkbox(label="Randomize seed", value=True) with gr.Row(visible=True): width = gr.Slider( label="Width", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=config.image_size, ) height = gr.Slider( label="Height", minimum=256, maximum=MAX_IMAGE_SIZE, step=32, value=config.image_size, ) with gr.Row(): dpms_guidance_scale = gr.Slider( label="DPM-Solver Guidance scale", minimum=1, maximum=10, step=0.1, value=4.5, ) dpms_inference_steps = gr.Slider( label="DPM-Solver inference steps", minimum=5, maximum=40, step=1, value=14, ) with gr.Row(): sas_guidance_scale = gr.Slider( label="SA-Solver Guidance scale", minimum=1, maximum=10, step=0.1, value=3, ) sas_inference_steps = gr.Slider( label="SA-Solver inference steps", minimum=10, maximum=40, step=1, value=25, ) gr.Examples( examples=[ [ "anime superman in action", "asset/images/controlnet/0_0.png", ], [ "illustration of A loving couple standing in the open kitchen of the living room, cooking ,Couples have a full body, with characters accounting for a quarter of the screen, and the composition of the living room has a large perspective, resulting in a larger space.", "asset/images/controlnet/0_3.png", ], [ "A Electric 4 seats mini VAN,simple design stylel,led headlight,front 45 angle view,sunlight,clear sky.", "asset/images/controlnet/0_2.png", ], ], inputs=[prompt, image_input], outputs=[result, hed_result, seed], fn=generate, cache_examples=CACHE_EXAMPLES, ) use_negative_prompt.change( fn=lambda x: gr.update(visible=x), inputs=use_negative_prompt, outputs=negative_prompt, api_name=False, ) gr.on( triggers=[ prompt.submit, negative_prompt.submit, run_button.click, ], fn=generate, inputs=[ prompt, image_input, negative_prompt, style_selection, use_negative_prompt, seed, width, height, schedule, dpms_guidance_scale, sas_guidance_scale, dpms_inference_steps, sas_inference_steps, randomize_seed, ], outputs=[result, hed_result, seed], api_name="run", ) if __name__ == "__main__": demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=PORT, debug=True)