"vscode:/vscode.git/clone" did not exist on "32f3d7befb906ddc2563e43dda1ad04703272f49"
Commit e2364931 authored by mashun1's avatar mashun1
Browse files

pixart-alpha

parents
Pipeline #861 canceled with stages
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
.idea/
cloud_tools/
output/
output_cv/
# added by ylw
*.pt
*.pth
*mj*
s3helper/
TODO.md
pretrained_models
work_dir
#demo.py
develop/
tmp.py
data/
output_all/
output_demo/
output_debug/
# cache for docker
docker/cache/gradio
docker/cache/huggingface
FROM image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38
# PixArt-alpha
## Paper
**PixArt-α: Fast Training of Diffusion Transformer for Photorealistic Text-to-Image Synthesis**
* https://arxiv.org/abs/2310.00426
## Model Architecture
The model is based on the `DiT (Diffusion Transformer)` architecture, with `Multi-Head Cross-Attention` added to align text and images.
![Model architecture](readme_imgs/image-1.png)
## Algorithm
The model mainly involves `Multi-Head Self-Attention` and `Multi-Head Cross-Attention`: the former models the image tokens themselves, while the latter aligns the image with the text.
![Attention mechanism](readme_imgs/image-2.png)
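To make the two attention roles concrete, here is a minimal, illustrative sketch of such a block in PyTorch (module names and dimensions are assumptions for illustration, not code from this repository):
```python
import torch
import torch.nn as nn

class PixArtBlockSketch(nn.Module):
    """Illustrative transformer block: self-attention models the image tokens,
    cross-attention injects the T5 text embeddings (dims are assumptions)."""
    def __init__(self, dim: int = 1152, heads: int = 16, text_dim: int = 4096):
        super().__init__()
        self.norm1 = nn.LayerNorm(dim)
        self.self_attn = nn.MultiheadAttention(dim, heads, batch_first=True)
        self.norm2 = nn.LayerNorm(dim)
        self.cross_attn = nn.MultiheadAttention(dim, heads, kdim=text_dim,
                                                vdim=text_dim, batch_first=True)
        self.norm3 = nn.LayerNorm(dim)
        self.mlp = nn.Sequential(nn.Linear(dim, 4 * dim), nn.GELU(), nn.Linear(4 * dim, dim))

    def forward(self, x: torch.Tensor, text: torch.Tensor) -> torch.Tensor:
        # x: (B, N_image_tokens, dim) latent patches; text: (B, N_text_tokens, text_dim)
        h = self.norm1(x)
        x = x + self.self_attn(h, h, h, need_weights=False)[0]          # image models itself
        h = self.norm2(x)
        x = x + self.cross_attn(h, text, text, need_weights=False)[0]   # align image with text
        return x + self.mlp(self.norm3(x))
```
A real PixArt block additionally conditions on the diffusion timestep (adaLN), which is omitted here for brevity.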
## Environment Setup
### Docker (Option 1)
```bash
docker pull image.sourcefind.cn:5000/dcu/admin/base/pytorch:2.1.0-centos7.6-dtk23.10.1-py38
docker run --shm-size 10g --network=host --name=opensora --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <project path (absolute)>:/home/ -v /opt/hyhal:/opt/hyhal:ro -it <your IMAGE ID> bash
pip install flash_attn-2.0.4_torch2.1_dtk2310-cp38-cp38-linux_x86_64.whl   # from whl.zip
pip install triton-2.1.0%2Bgit34f8189.abi0.dtk2310-cp38-cp38-manylinux2014_x86_64.whl   # download from the developer community
cd xformers && pip install xformers==0.0.23 --no-deps && bash patch_xformers.rocm.sh   # from whl.zip
pip install -r requirements.txt
pip install timm --no-deps
pip uninstall apex
# Install diffusers
# Manual install
git clone https://github.com/huggingface/diffusers.git
cd diffusers && python setup.py install
# Automatic install
pip install git+https://github.com/huggingface/diffusers
```
### Dockerfile (Option 2)
```bash
# Build from the directory containing the Dockerfile
docker build -t <IMAGE_NAME>:<TAG> .
docker run --shm-size 10g --network=host --name=opensora --privileged --device=/dev/kfd --device=/dev/dri --group-add video --cap-add=SYS_PTRACE --security-opt seccomp=unconfined -v <project path (absolute)>:/home/ -v /opt/hyhal:/opt/hyhal:ro -it <your IMAGE ID> bash
pip install flash_attn-2.0.4_torch2.1_dtk2310-cp38-cp38-linux_x86_64.whl   # from whl.zip
pip install triton-2.1.0%2Bgit34f8189.abi0.dtk2310-cp38-cp38-manylinux2014_x86_64.whl   # download from the developer community
cd xformers && pip install xformers==0.0.23 --no-deps && bash patch_xformers.rocm.sh   # from whl.zip
pip install -r requirements.txt
pip install timm --no-deps
cd diffusion && python setup.py install
pip uninstall apex
# Install diffusers
# Manual install
git clone https://github.com/huggingface/diffusers.git
cd diffusers && python setup.py install
# Automatic install
pip install git+https://github.com/huggingface/diffusers
```
### Anaconda (Option 3)
1. The DCU-specific deep learning libraries required by this project can be downloaded from the developer community:
https://developer.hpccube.com/tool/
- DTK driver: dtk23.10.1
- python: 3.8
- torch: 2.1.0
- torchvision: 0.16.0
- triton: 2.1.0
Tip: the DTK driver, python, torch, and the other DCU-related tools above must use exactly these matching versions.
2. Install the remaining standard libraries according to requirements.txt:
```bash
pip install flash_attn-2.0.4_torch2.1_dtk2310-cp38-cp38-linux_x86_64.whl   # from whl.zip
cd xformers && pip install xformers==0.0.23 --no-deps && bash patch_xformers.rocm.sh   # from whl.zip
pip install -r requirements.txt
pip install timm --no-deps
# Install diffusers
# Manual install
git clone https://github.com/huggingface/diffusers.git
cd diffusers && python setup.py install
# Automatic install
pip install git+https://github.com/huggingface/diffusers
```
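After any of the three options, a quick sanity check along these lines (a minimal sketch; the module names assume the wheels listed above) confirms the pinned versions landed correctly:
```python
import importlib
import torch

# torch must match the DTK-specific build pinned above.
assert torch.__version__.startswith("2.1.0"), torch.__version__
for mod in ("torchvision", "triton", "xformers", "flash_attn", "diffusers", "timm"):
    try:
        m = importlib.import_module(mod)
        print(mod, getattr(m, "__version__", "unknown"))
    except ImportError as exc:
        print(f"missing: {mod} ({exc})")
```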
## Dataset
Full dataset: https://ai.meta.com/datasets/segment-anything/
Toy dataset: https://huggingface.co/datasets/PixArt-alpha/data_toy
After downloading, the data needs to be preprocessed by running the following scripts:
```bash
# Use LLaVA to produce more detailed image captions
python tools/VLM_caption_lightning.py --output output/dir/ --data-root data/root/path --index path/to/data.json
# Pre-extract the features needed for training
python tools/extract_features.py --img_size=256 \
    --json_path "data/data_toy/data_info.json" \
    --t5_save_root "data/data_toy/caption_feature_wmask" \
    --vae_save_root "data/data_toy/img_vae_features" \
    --pretrained_models_dir "pretrained_models/hub/pixart_alpha" \
    --dataset_root "data/data_toy/images/"
```
After processing, the data is organized as follows:
```
data/
└── data_toy
    ├── caption_feature_wmask
    │   ├── 0_1.npz
    │   └── 0_3.npz
    ├── captions
    │   ├── 0_1.txt
    │   └── 0_3.txt
    ├── data_info.json
    ├── images
    │   ├── 0_1.png
    │   └── 0_3.png
    ├── img_vae_features
    │   └── 256resolution
    │       └── noflip
    │           ├── 0_1.npy
    │           └── 0_3.npy
    └── partition
        └── part0.txt
```
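To verify the extraction step produced usable features, a minimal sketch like the following can be used (the array names inside the `.npz` files are not documented here; inspect `t5_feat.files` on your own data):
```python
import numpy as np

# One caption-feature / VAE-latent pair from the toy dataset.
t5_feat = np.load("data/data_toy/caption_feature_wmask/0_1.npz")
print("caption feature arrays:", t5_feat.files)
vae_latent = np.load("data/data_toy/img_vae_features/256resolution/noflip/0_1.npy")
print("vae latent shape:", vae_latent.shape)  # the VAE downsamples 8x, so roughly (4, 32, 32) at 256px
```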
## Training
Coming soon!
## Inference
### Model Download
|Model (URL)|Location|
|:---:|:---:|
|[T5](https://hf-mirror.com/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl)|/path/to/save/models/pixart_alpha/t5_ckpts|
|[sd-vae-ft-ema](https://hf-mirror.com/PixArt-alpha/PixArt-alpha/tree/main/sd-vae-ft-ema)|/path/to/save/models/pixart_alpha/sd-vae-ft-ema|
```
pixart_alpha/
├── sd-vae-ft-ema
│   ├── config.json
│   └── diffusion_pytorch_model.bin
└── t5_ckpts
    └── t5-v1_1-xxl
        ├── config.json
        ├── pytorch_model-00001-of-00002.bin
        ├── pytorch_model-00002-of-00002.bin
        ├── pytorch_model.bin.index.json
        ├── special_tokens_map.json
        ├── spiece.model
        └── tokenizer_config.json
```
Note: the models above must be downloaded manually; the remaining models are downloaded automatically at runtime.
```bash
export HF_ENDPOINT=https://hf-mirror.com
export HUB_HOME=/path/to/save/models
```
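To script the manual step, something along the lines of the following sketch (using `huggingface_hub.snapshot_download`; the target paths mirror the layout above and are otherwise assumptions) should work:
```python
import os
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"  # set before importing huggingface_hub
from huggingface_hub import snapshot_download

save_root = "/path/to/save/models/pixart_alpha"
# T5 checkpoints -> pixart_alpha/t5_ckpts/t5-v1_1-xxl/
snapshot_download("PixArt-alpha/PixArt-alpha",
                  allow_patterns=["t5-v1_1-xxl/*"],
                  local_dir=os.path.join(save_root, "t5_ckpts"))
# VAE -> pixart_alpha/sd-vae-ft-ema/
snapshot_download("PixArt-alpha/PixArt-alpha",
                  allow_patterns=["sd-vae-ft-ema/*"],
                  local_dir=save_root)
```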
### Commands
```bash
# Quick test
HIP_VISIBLE_DEVICES=0 python quick_inference_with_code.py <prompt>
```
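For orientation, the diffusers-based equivalent of the quick test is roughly the sketch below (a hedged approximation, not the script itself; it uses the same checkpoint as the WebUI):
```python
import torch
from diffusers import PixArtAlphaPipeline

# Minimal text-to-image call; prompts are English-only.
pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16
).to("cuda")
image = pipe("a dog is playing a basketball", num_inference_steps=20).images[0]
image.save("output.png")
```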
### WebUI
```bash
# diffusers version
DEMO_PORT=12345 python app/app.py
```
## Result
||prompt|output|
|:---|:---:|:---:|
||a dog is playing a basketball|![output](readme_imgs/image-3.png)|
### Accuracy
## Application Scenarios
### Algorithm Category
`AIGC`
### Key Application Industries
`Retail, Broadcast Media, Education`
## Source Repository & Issue Reporting
* https://developer.hpccube.com/codes/modelzoo/pixart-alpha_pytorch
## References
* https://github.com/PixArt-alpha/PixArt-alpha
#!/usr/bin/env python
from __future__ import annotations
import os
import sys
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import random
import gradio as gr
import numpy as np
import uuid
from diffusers import ConsistencyDecoderVAE, PixArtAlphaPipeline, DPMSolverMultistepScheduler
import torch
from typing import Tuple
from datetime import datetime
from diffusion.sa_solver_diffusers import SASolverScheduler
DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png)
# PixArt-Alpha 1024px
#### [PixArt-Alpha 1024px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. This demo uses the [PixArt-alpha/PixArt-XL-2-1024-MS](https://huggingface.co/PixArt-alpha/PixArt-XL-2-1024-MS) checkpoint.
#### English prompts ONLY; 提示词仅限英文
Don't want to queue? Try [OpenXLab](https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha) or [Google Colab Demo](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing).
### <span style='color: red;'>If you are not satisfied with the results, try increasing the DPM-Solver inference steps from 14 to 20.</span>
"""
if not torch.cuda.is_available():
DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
MAX_SEED = np.iinfo(np.int32).max
CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
PORT = int(os.getenv("DEMO_PORT", "15432"))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
style_list = [
{
"name": "(No style)",
"prompt": "{prompt}",
"negative_prompt": "",
},
{
"name": "Cinematic",
"prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
"negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
},
{
"name": "Photographic",
"prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
"negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
},
{
"name": "Anime",
"prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
"negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
},
{
"name": "Manga",
"prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
"negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
},
{
"name": "Digital Art",
"prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
"negative_prompt": "photo, photorealistic, realism, ugly",
},
{
"name": "Pixel art",
"prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
"negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
},
{
"name": "Fantasy art",
"prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
"negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
},
{
"name": "Neonpunk",
"prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
"negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
},
{
"name": "3D Model",
"prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
"negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
},
]
styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "(No style)"
SCHEDULE_NAME = ["DPM-Solver", "SA-Solver"]
DEFAULT_SCHEDULE_NAME = "DPM-Solver"
NUM_IMAGES_PER_PROMPT = 1
def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
if not negative:
negative = ""
return p.replace("{prompt}", positive), n + negative
if torch.cuda.is_available():
pipe = PixArtAlphaPipeline.from_pretrained(
"PixArt-alpha/PixArt-XL-2-1024-MS",
torch_dtype=torch.float16,
use_safetensors=True,
)
if os.getenv('CONSISTENCY_DECODER', "0") == "1":
print("Using DALL-E 3 Consistency Decoder")
pipe.vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
if ENABLE_CPU_OFFLOAD:
pipe.enable_model_cpu_offload()
else:
pipe.to(device)
print("Loaded on Device!")
# speed-up T5
pipe.text_encoder.to_bettertransformer()
if USE_TORCH_COMPILE:
pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True)
print("Model Compiled!")
def save_image(img):
unique_name = f'{str(uuid.uuid4())}.png'
save_path = os.path.join(f'output/online_demo_img/{datetime.now().date()}')
os.makedirs(save_path, exist_ok=True)
unique_name = os.path.join(save_path, unique_name)
img.save(unique_name)
return unique_name
def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
if randomize_seed:
seed = random.randint(0, MAX_SEED)
return seed
def generate(
prompt: str,
negative_prompt: str = "",
style: str = DEFAULT_STYLE_NAME,
use_negative_prompt: bool = False,
seed: int = 0,
width: int = 1024,
height: int = 1024,
schedule: str = 'DPM-Solver',
dpms_guidance_scale: float = 4.5,
sas_guidance_scale: float = 3,
dpms_inference_steps: int = 20,
sas_inference_steps: int = 25,
randomize_seed: bool = False,
use_resolution_binning: bool = True,
progress=gr.Progress(track_tqdm=True),
):
seed = int(randomize_seed_fn(seed, randomize_seed))
generator = torch.Generator().manual_seed(seed)
if schedule == 'DPM-Solver':
if not isinstance(pipe.scheduler, DPMSolverMultistepScheduler):
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)  # keep the pipeline's scheduler config rather than library defaults
num_inference_steps = dpms_inference_steps
guidance_scale = dpms_guidance_scale
elif schedule == "SA-Solver":
if not isinstance(pipe.scheduler, SASolverScheduler):
pipe.scheduler = SASolverScheduler.from_config(pipe.scheduler.config, algorithm_type='data_prediction', tau_func=lambda t: 1 if 200 <= t <= 800 else 0, predictor_order=2, corrector_order=2)
num_inference_steps = sas_inference_steps
guidance_scale = sas_guidance_scale
else:
raise ValueError(f"Unknown schedule: {schedule}")
if not use_negative_prompt:
negative_prompt = None # type: ignore
prompt, negative_prompt = apply_style(style, prompt, negative_prompt)
images = pipe(
prompt=prompt,
width=width,
height=height,
guidance_scale=guidance_scale,
num_inference_steps=num_inference_steps,
generator=generator,
num_images_per_prompt=NUM_IMAGES_PER_PROMPT,
use_resolution_binning=use_resolution_binning,
output_type="pil",
).images
image_paths = [save_image(img) for img in images]
print(image_paths)
return image_paths, seed
examples = [
"A small cactus with a happy face in the Sahara desert.",
"an astronaut sitting in a diner, eating fries, cinematic, analog film",
"Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
"stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background.",
"professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.",
"beautiful lady, freckles, big smile, blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light, dark grey background",
"Spectacular Tiny World in the Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism",
"anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur",
"The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8",
]
with gr.Blocks(css="app/style.css") as demo:
gr.Markdown(DESCRIPTION)
gr.DuplicateButton(
value="Duplicate Space for private use",
elem_id="duplicate-button",
visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
)
with gr.Group():
with gr.Row():
prompt = gr.Text(
label="Prompt",
show_label=False,
max_lines=1,
placeholder="Enter your prompt",
container=False,
)
run_button = gr.Button("Run", scale=0)
result = gr.Gallery(label="Result", columns=NUM_IMAGES_PER_PROMPT, show_label=False)
with gr.Accordion("Advanced options", open=False):
with gr.Row():
use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True)
schedule = gr.Radio(
show_label=True,
container=True,
interactive=True,
choices=SCHEDULE_NAME,
value=DEFAULT_SCHEDULE_NAME,
label="Sampler Schedule",
visible=True,
)
style_selection = gr.Radio(
show_label=True,
container=True,
interactive=True,
choices=STYLE_NAMES,
value=DEFAULT_STYLE_NAME,
label="Image Style",
)
negative_prompt = gr.Text(
label="Negative prompt",
max_lines=1,
placeholder="Enter a negative prompt",
visible=True,
)
seed = gr.Slider(
label="Seed",
minimum=0,
maximum=MAX_SEED,
step=1,
value=0,
)
randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
with gr.Row(visible=True):
width = gr.Slider(
label="Width",
minimum=256,
maximum=MAX_IMAGE_SIZE,
step=32,
value=1024,
)
height = gr.Slider(
label="Height",
minimum=256,
maximum=MAX_IMAGE_SIZE,
step=32,
value=1024,
)
with gr.Row():
dpms_guidance_scale = gr.Slider(
label="DPM-Solver Guidance scale",
minimum=1,
maximum=10,
step=0.1,
value=4.5,
)
dpms_inference_steps = gr.Slider(
label="DPM-Solver inference steps",
minimum=5,
maximum=40,
step=1,
value=14,
)
with gr.Row():
sas_guidance_scale = gr.Slider(
label="SA-Solver Guidance scale",
minimum=1,
maximum=10,
step=0.1,
value=3,
)
sas_inference_steps = gr.Slider(
label="SA-Solver inference steps",
minimum=10,
maximum=40,
step=1,
value=25,
)
gr.Examples(
examples=examples,
inputs=prompt,
outputs=[result, seed],
fn=generate,
cache_examples=CACHE_EXAMPLES,
)
use_negative_prompt.change(
fn=lambda x: gr.update(visible=x),
inputs=use_negative_prompt,
outputs=negative_prompt,
api_name=False,
)
gr.on(
triggers=[
prompt.submit,
negative_prompt.submit,
run_button.click,
],
fn=generate,
inputs=[
prompt,
negative_prompt,
style_selection,
use_negative_prompt,
seed,
width,
height,
schedule,
dpms_guidance_scale,
sas_guidance_scale,
dpms_inference_steps,
sas_inference_steps,
randomize_seed,
],
outputs=[result, seed],
api_name="run",
)
if __name__ == "__main__":
demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=PORT)
#!/usr/bin/env python
from __future__ import annotations
import os
import sys
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import random
import gradio as gr
import numpy as np
import uuid
from diffusers import PixArtAlphaPipeline, ConsistencyDecoderVAE, DPMSolverMultistepScheduler
import torch
from typing import Tuple
from datetime import datetime
from diffusion.data.datasets import ASPECT_RATIO_512_TEST
from diffusion.model.utils import resize_and_crop_img
from diffusion.sa_solver_diffusers import SASolverScheduler
DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png)
# PixArt-Alpha 512px
#### [PixArt-Alpha 512px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. This demo uses the [PixArt-alpha/PixArt-XL-2-512x512](https://huggingface.co/PixArt-alpha/PixArt-XL-2-512x512) checkpoint.
#### English prompts ONLY; 提示词仅限英文
Don't want to queue? Try [OpenXLab](https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha) or [Google Colab Demo](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing).
"""
if not torch.cuda.is_available():
DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
MAX_SEED = np.iinfo(np.int32).max
CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "1024"))
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
PORT = int(os.getenv("DEMO_PORT", "15432"))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
style_list = [
{
"name": "(No style)",
"prompt": "{prompt}",
"negative_prompt": "",
},
{
"name": "Cinematic",
"prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
"negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
},
{
"name": "Photographic",
"prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
"negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
},
{
"name": "Anime",
"prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
"negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
},
{
"name": "Manga",
"prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
"negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
},
{
"name": "Digital Art",
"prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
"negative_prompt": "photo, photorealistic, realism, ugly",
},
{
"name": "Pixel art",
"prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
"negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
},
{
"name": "Fantasy art",
"prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
"negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
},
{
"name": "Neonpunk",
"prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
"negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
},
{
"name": "3D Model",
"prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
"negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
},
]
styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "(No style)"
SCHEDULE_NAME = ["DPM-Solver", "SA-Solver"]
DEFAULT_SCHEDULE_NAME = "DPM-Solver"
NUM_IMAGES_PER_PROMPT = 2
def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
if not negative:
negative = ""
return p.replace("{prompt}", positive), n + negative
if torch.cuda.is_available():
pipe = PixArtAlphaPipeline.from_pretrained(
"PixArt-alpha/PixArt-XL-2-512x512",
torch_dtype=torch.float16,
variant="fp16",
use_safetensors=True,
)
if os.getenv('CONSISTENCY_DECODER', "0") == "1":
print("Using DALL-E 3 Consistency Decoder")
pipe.vae = ConsistencyDecoderVAE.from_pretrained("openai/consistency-decoder", torch_dtype=torch.float16)
if ENABLE_CPU_OFFLOAD:
pipe.enable_model_cpu_offload()
else:
pipe.to(device)
print("Loaded on Device!")
# speed-up T5
pipe.text_encoder.to_bettertransformer()
if USE_TORCH_COMPILE:
pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True)
print("Model Compiled!")
def prepare_prompt_hw(height, width, ratios):
ar = float(height/width)
closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
default_hw = ratios[closest_ratio]
return int(default_hw[0]), int(default_hw[1])
def save_image(img):
unique_name = f'{str(uuid.uuid4())}.png'
save_path = os.path.join(f'output/online_demo_img512/{datetime.now().date()}')
os.makedirs(save_path, exist_ok=True)
unique_name = os.path.join(save_path, unique_name)
img.save(unique_name)
return unique_name
def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
if randomize_seed:
seed = random.randint(0, MAX_SEED)
return seed
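# Snap an arbitrary (height, width) to the closest aspect-ratio bucket the model
# was trained on; generation happens at the bucketed resolution.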
def classify_height_width_bin(height: int, width: int, ratios: dict):
ar = float(height / width)
closest_ratio = min(ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))
default_hw = ratios[closest_ratio]
return int(default_hw[0]), int(default_hw[1])
def generate(
prompt: str,
negative_prompt: str = "",
style: str = DEFAULT_STYLE_NAME,
use_negative_prompt: bool = False,
seed: int = 0,
width: int = 512,
height: int = 512,
schedule: str = 'DPM-Solver',
dpms_guidance_scale: float = 4.5,
sas_guidance_scale: float = 3,
dpms_inference_steps: int = 20,
sas_inference_steps: int = 25,
randomize_seed: bool = False,
use_resolution_binning: bool = True,
progress=gr.Progress(track_tqdm=True),
):
seed = int(randomize_seed_fn(seed, randomize_seed))
generator = torch.Generator().manual_seed(seed)
if schedule == 'DPM-Solver':
if not isinstance(pipe.scheduler, DPMSolverMultistepScheduler):
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)  # keep the pipeline's scheduler config rather than library defaults
num_inference_steps = dpms_inference_steps
guidance_scale = dpms_guidance_scale
elif schedule == "SA-Solver":
if not isinstance(pipe.scheduler, SASolverScheduler):
pipe.scheduler = SASolverScheduler.from_config(pipe.scheduler.config, algorithm_type='data_prediction', tau_func=lambda t: 1 if 200 <= t <= 800 else 0, predictor_order=2, corrector_order=2)
num_inference_steps = sas_inference_steps
guidance_scale = sas_guidance_scale
else:
raise ValueError(f"Unknown schedule: {schedule}")
if not use_negative_prompt:
negative_prompt = None # type: ignore
prompt, negative_prompt = apply_style(style, prompt, negative_prompt)
if use_resolution_binning:
orig_height, orig_width = height, width
height, width = classify_height_width_bin(height, width, ratios=ASPECT_RATIO_512_TEST)
images = pipe(
prompt=prompt,
width=width,
height=height,
guidance_scale=guidance_scale,
num_inference_steps=num_inference_steps,
generator=generator,
use_resolution_binning=False,
num_images_per_prompt=NUM_IMAGES_PER_PROMPT,
output_type="pil",
).images
if use_resolution_binning:
images = [resize_and_crop_img(img, orig_width, orig_height) for img in images]
image_paths = [save_image(img) for img in images]
print(image_paths)
return image_paths, seed
examples = [
"A small cactus with a happy face in the Sahara desert.",
"an astronaut sitting in a diner, eating fries, cinematic, analog film",
"Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
"stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background.",
"professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.",
"beautiful lady, freckles, big smile, blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light, dark grey background",
"Spectacular Tiny World in the Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism",
"anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur",
"The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8",
]
with gr.Blocks(css="scripts/style.css") as demo:
gr.Markdown(DESCRIPTION)
gr.DuplicateButton(
value="Duplicate Space for private use",
elem_id="duplicate-button",
visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
)
with gr.Group():
with gr.Row():
prompt = gr.Text(
label="Prompt",
show_label=False,
max_lines=1,
placeholder="Enter your prompt",
container=False,
)
run_button = gr.Button("Run", scale=0)
result = gr.Gallery(label="Result", columns=NUM_IMAGES_PER_PROMPT, show_label=False)
with gr.Accordion("Advanced options", open=False):
with gr.Row():
use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=False)
schedule = gr.Radio(
show_label=True,
container=True,
interactive=True,
choices=SCHEDULE_NAME,
value=DEFAULT_SCHEDULE_NAME,
label="Sampler Schedule",
visible=True,
)
style_selection = gr.Radio(
show_label=True,
container=True,
interactive=True,
choices=STYLE_NAMES,
value=DEFAULT_STYLE_NAME,
label="Image Style",
)
negative_prompt = gr.Text(
label="Negative prompt (no use now)",
max_lines=1,
placeholder="Enter a negative prompt",
visible=False,
)
seed = gr.Slider(
label="Seed",
minimum=0,
maximum=MAX_SEED,
step=1,
value=0,
)
randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
with gr.Row(visible=True):
width = gr.Slider(
label="Width",
minimum=256,
maximum=MAX_IMAGE_SIZE,
step=32,
value=512,
)
height = gr.Slider(
label="Height",
minimum=256,
maximum=MAX_IMAGE_SIZE,
step=32,
value=512,
)
with gr.Row():
dpms_guidance_scale = gr.Slider(
label="DPM-Solver Guidance scale",
minimum=1,
maximum=10,
step=0.1,
value=4.5,
)
dpms_inference_steps = gr.Slider(
label="DPM-Solver inference steps",
minimum=5,
maximum=40,
step=1,
value=20,
)
with gr.Row():
sas_guidance_scale = gr.Slider(
label="SA-Solver Guidance scale",
minimum=1,
maximum=10,
step=0.1,
value=3,
)
sas_inference_steps = gr.Slider(
label="SA-Solver inference steps",
minimum=10,
maximum=40,
step=1,
value=25,
)
gr.Examples(
examples=examples,
inputs=prompt,
outputs=[result, seed],
fn=generate,
cache_examples=CACHE_EXAMPLES,
)
use_negative_prompt.change(
fn=lambda x: gr.update(visible=x),
inputs=use_negative_prompt,
outputs=negative_prompt,
api_name=False,
)
gr.on(
triggers=[
prompt.submit,
negative_prompt.submit,
run_button.click,
],
fn=generate,
inputs=[
prompt,
negative_prompt,
style_selection,
use_negative_prompt,
seed,
width,
height,
schedule,
dpms_guidance_scale,
sas_guidance_scale,
dpms_inference_steps,
sas_inference_steps,
randomize_seed,
],
outputs=[result, seed],
api_name="run",
)
if __name__ == "__main__":
demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=PORT, debug=True)
#!/usr/bin/env python
from __future__ import annotations
import argparse
import os
import random
import sys
import uuid
from datetime import datetime
from pathlib import Path
from typing import List, Tuple, Union
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import gradio as gr
import numpy as np
import torch
from PIL import Image as PILImage
import torchvision.transforms as T
import torchvision.transforms.functional as TF
from torchvision.utils import _log_api_usage_once, make_grid, save_image
from diffusers import PixArtAlphaPipeline
from diffusion import DPMS, SASolverSampler
from diffusion.data.datasets import *
from diffusion.model.hed import HEDdetector
from diffusion.model.nets import PixArt_XL_2, PixArtMS_XL_2, ControlPixArtHalf, ControlPixArtMSHalf
from diffusion.model.utils import resize_and_crop_tensor
from diffusion.utils.misc import read_config
from tools.download import find_model
DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/logo.png)
# PixArt-Delta (ControlNet)
#### [PixArt-Alpha 1024px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5.
#### This demo uses the [PixArt-alpha/PixArt-XL-2-1024-ControlNet](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) checkpoint.
#### This demo uses the [PixArt-alpha/PixArt-XL-2-512-ControlNet](https://huggingface.co/PixArt-alpha/PixArt-ControlNet/tree/main) checkpoint.
#### English prompts ONLY; 提示词仅限英文
### <span style='color: red;'>Please use the image size corresponding to the model as input to get the best results (e.g. 1024px for PixArt-XL-2-1024-ControlNet.pth).</span>
"""
if not torch.cuda.is_available():
DESCRIPTION += "\n<p>Running on CPU �� This demo does not work on CPU.</p>"
MAX_SEED = np.iinfo(np.int32).max
CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
PORT = int(os.getenv("DEMO_PORT", "15432"))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@torch.no_grad()
def ndarr_image(tensor: Union[torch.Tensor, List[torch.Tensor]], **kwargs) -> np.ndarray:
if not torch.jit.is_scripting() and not torch.jit.is_tracing():
_log_api_usage_once(save_image)
grid = make_grid(tensor, **kwargs)
ndarr = grid.mul(255).add_(0.5).clamp_(0, 255).permute(1, 2, 0).to("cpu", torch.uint8).numpy()
return ndarr
style_list = [
{
"name": "(No style)",
"prompt": "{prompt}",
"negative_prompt": "",
},
{
"name": "Cinematic",
"prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
"negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
},
{
"name": "Photographic",
"prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
"negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
},
{
"name": "Anime",
"prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
"negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
},
{
"name": "Manga",
"prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
"negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
},
{
"name": "Digital Art",
"prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
"negative_prompt": "photo, photorealistic, realism, ugly",
},
{
"name": "Pixel art",
"prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
"negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
},
{
"name": "Fantasy art",
"prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
"negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
},
{
"name": "Neonpunk",
"prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
"negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
},
{
"name": "3D Model",
"prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
"negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
},
]
styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "(No style)"
SCHEDULE_NAME = ["DPM-Solver", "SA-Solver"]
DEFAULT_SCHEDULE_NAME = "DPM-Solver"
def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
if not negative:
negative = ""
return p.replace("{prompt}", positive), n + negative
def save_image(img):
unique_name = str(uuid.uuid4()) + '.png'
save_path = os.path.join(f'output/online_demo_img/{datetime.now().date()}')
os.makedirs(save_path, exist_ok=True)
unique_name = os.path.join(save_path, unique_name)
img.save(unique_name)
return unique_name
def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
if randomize_seed:
seed = random.randint(0, MAX_SEED)
return seed
@torch.inference_mode()
def generate(
prompt: str,
given_image = None,
negative_prompt: str = "",
style: str = DEFAULT_STYLE_NAME,
use_negative_prompt: bool = False,
seed: int = 0,
width: int = 1024,
height: int = 1024,
schedule: str = 'DPM-Solver',
dpms_guidance_scale: float = 4.5,
sas_guidance_scale: float = 3,
dpms_inference_steps: int = 14,
sas_inference_steps: int = 25,
randomize_seed: bool = False,
):
seed = int(randomize_seed_fn(seed, randomize_seed))
torch.manual_seed(seed)
torch.cuda.empty_cache()
strength = 1.0
c_vis = given_image
if not use_negative_prompt:
negative_prompt = None # type: ignore
prompt, negative_prompt = apply_style(style, prompt, negative_prompt)
prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask\
= pipe.encode_prompt(prompt=prompt, negative_prompt=negative_prompt)
prompt_embeds, negative_prompt_embeds = prompt_embeds[:, None], negative_prompt_embeds[:, None]
torch.cuda.empty_cache()
# condition process
if given_image is not None:
ar = torch.tensor([given_image.size[1] / given_image.size[0]], device=device)[None]
custom_hw = torch.tensor([given_image.size[1], given_image.size[0]], device=device)[None]
closest_hw = base_ratios[min(base_ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))]
hw = torch.tensor(closest_hw, device=device)[None]
condition_transform = T.Compose([
T.Lambda(lambda img: img.convert('RGB')),
T.Resize(int(min(closest_hw))),
T.CenterCrop([int(closest_hw[0]), int(closest_hw[1])]),
T.ToTensor(),
])
given_image = condition_transform(given_image).unsqueeze(0).to(device)
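# Run HED edge detection on the reference image, normalize to [-1, 1], and
# encode with the VAE so the ControlNet condition lives in latent space.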
hed_edge = hed(given_image) * strength
hed_edge = TF.normalize(hed_edge, [.5], [.5])
hed_edge = hed_edge.repeat(1, 3, 1, 1).to(weight_dtype)
posterior = vae.encode(hed_edge).latent_dist
condition = posterior.sample()
c = condition * config.scale_factor
c_vis = vae.decode(condition)['sample']
c_vis = torch.clamp(127.5 * c_vis + 128.0, 0, 255).permute(0, 2, 3, 1).to("cpu", dtype=torch.uint8).numpy()[0]
else:
c = None
ar = torch.tensor([int(height) / int(width)], device=device)[None]
custom_hw = torch.tensor([int(height), int(width)], device=device)[None]
closest_hw = base_ratios[min(base_ratios.keys(), key=lambda ratio: abs(float(ratio) - ar))]
hw = torch.tensor(closest_hw, device=device)[None]
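# The VAE downsamples by 8x, so latent spatial dims are the binned pixel dims // 8.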
latent_size_h, latent_size_w = int(hw[0, 0] // 8), int(hw[0, 1] // 8)
# Sample images:
if schedule == 'DPM-Solver':
# Create sampling noise:
n = prompt_embeds.shape[0]
z = torch.randn(n, 4, latent_size_h, latent_size_w, device=device)
model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=prompt_attention_mask, c=c)
dpm_solver = DPMS(model.forward_with_dpmsolver,
condition=prompt_embeds,
uncondition=negative_prompt_embeds,
cfg_scale=dpms_guidance_scale,
model_kwargs=model_kwargs)
samples = dpm_solver.sample(
z,
steps=dpms_inference_steps,
order=2,
skip_type="time_uniform",
method="multistep",
).to(weight_dtype)
elif schedule == "SA-Solver":
# Create sampling noise:
n = prompt_embeds.shape[0]
model_kwargs = dict(data_info={'img_hw': hw, 'aspect_ratio': ar}, mask=prompt_attention_mask, c=c)
sas_solver = SASolverSampler(model.forward_with_dpmsolver, device=device)
samples = sas_solver.sample(
S=sas_inference_steps,
batch_size=n,
shape=(4, latent_size_h, latent_size_w),
eta=1,
conditioning=prompt_embeds,
unconditional_conditioning=negative_prompt_embeds,
unconditional_guidance_scale=sas_guidance_scale,
model_kwargs=model_kwargs,
)[0].to(weight_dtype)
samples = vae.decode(samples / config.scale_factor).sample
torch.cuda.empty_cache()
samples = resize_and_crop_tensor(samples, custom_hw[0, 1], custom_hw[0, 0])
samples = PILImage.fromarray(ndarr_image(samples, normalize=True, value_range=(-1, 1)))
image_paths = [save_image(samples)]
c_vis = PILImage.fromarray(c_vis) if c_vis is not None else samples
c_paths = [save_image(c_vis)]
print(image_paths)
return image_paths, c_paths, seed
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("config", type=str, help="config")
parser.add_argument('--image_size', default=1024, type=int)
parser.add_argument('--model_path', type=str)
return parser.parse_args()
args = get_args()
config = read_config(args.config)
device = "cuda" if torch.cuda.is_available() else "cpu"
assert args.image_size in [512, 1024], "We only provide pre-trained models for 512x512 and 1024x1024 resolutions."
lewei_scale = {512: 1, 1024: 2}
latent_size = args.image_size // 8
weight_dtype = torch.float16
print(f"Inference with {weight_dtype}")
if torch.cuda.is_available():
hed = HEDdetector(False).to(device)
pipe = PixArtAlphaPipeline.from_pretrained(
"PixArt-alpha/PixArt-XL-2-1024-MS",
transformer=None,
torch_dtype=weight_dtype,
use_safetensors=True,
)
pipe.to(device)
print("Loaded on Device!")
vae = pipe.vae
text_encoder = pipe.text_encoder
tokenizer = pipe.tokenizer
assert args.image_size == config.image_size
if config.image_size == 512:
model = PixArt_XL_2(input_size=latent_size, lewei_scale=lewei_scale[config.image_size])
print('model architecture ControlPixArtHalf and image size is 512')
model = ControlPixArtHalf(model).to(device)
elif config.image_size == 1024:
model = PixArtMS_XL_2(input_size=latent_size, lewei_scale=lewei_scale[config.image_size])
print('model architecture ControlPixArtMSHalf and image size is 1024')
model = ControlPixArtMSHalf(model).to(device)
state_dict = find_model(args.model_path)['state_dict']
if 'pos_embed' in state_dict:
del state_dict['pos_embed']
elif 'base_model.pos_embed' in state_dict:
del state_dict['base_model.pos_embed']
missing, unexpected = model.load_state_dict(state_dict, strict=False)
print('Missing keys (missing pos_embed is normal): ', missing)
print('Unexpected keys', unexpected)
model.eval()
model.to(weight_dtype)
base_ratios = globals()[f'ASPECT_RATIO_{args.image_size}_TEST']  # names come from diffusion.data.datasets; avoids eval
with gr.Blocks(css="app/style_controlnet.css") as demo:
gr.Markdown(DESCRIPTION)
gr.DuplicateButton(
value="Duplicate Space for private use",
elem_id="duplicate-button",
visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
)
image_input = gr.Image(
label="Image",
height=360,
width=360,
show_label=False,
sources="upload",
type="pil",
)
with gr.Group():
with gr.Row():
prompt = gr.Text(
label="Prompt",
show_label=False,
max_lines=1,
placeholder="Enter your prompt",
container=False,
)
run_button = gr.Button("Run", scale=0)
with gr.Group():
with gr.Row():
hed_result = gr.Gallery(label="Hed Result", show_label=False)
result = gr.Gallery(label="Result", show_label=False)
with gr.Accordion("Advanced options", open=False):
with gr.Row():
use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True)
schedule = gr.Radio(
show_label=True,
container=True,
interactive=True,
choices=SCHEDULE_NAME,
value=DEFAULT_SCHEDULE_NAME,
label="Sampler Schedule",
visible=True,
)
style_selection = gr.Radio(
show_label=True,
container=True,
interactive=True,
choices=STYLE_NAMES,
value=DEFAULT_STYLE_NAME,
label="Image Style",
)
negative_prompt = gr.Text(
label="Negative prompt",
max_lines=1,
placeholder="Enter a negative prompt",
visible=True,
)
seed = gr.Slider(
label="Seed",
minimum=0,
maximum=MAX_SEED,
step=1,
value=0,
)
randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
with gr.Row(visible=True):
width = gr.Slider(
label="Width",
minimum=256,
maximum=MAX_IMAGE_SIZE,
step=32,
value=config.image_size,
)
height = gr.Slider(
label="Height",
minimum=256,
maximum=MAX_IMAGE_SIZE,
step=32,
value=config.image_size,
)
with gr.Row():
dpms_guidance_scale = gr.Slider(
label="DPM-Solver Guidance scale",
minimum=1,
maximum=10,
step=0.1,
value=4.5,
)
dpms_inference_steps = gr.Slider(
label="DPM-Solver inference steps",
minimum=5,
maximum=40,
step=1,
value=14,
)
with gr.Row():
sas_guidance_scale = gr.Slider(
label="SA-Solver Guidance scale",
minimum=1,
maximum=10,
step=0.1,
value=3,
)
sas_inference_steps = gr.Slider(
label="SA-Solver inference steps",
minimum=10,
maximum=40,
step=1,
value=25,
)
gr.Examples(
examples=[
[
"anime superman in action",
"asset/images/controlnet/0_0.png",
],
[
"illustration of A loving couple standing in the open kitchen of the living room, cooking ,Couples have a full body, with characters accounting for a quarter of the screen, and the composition of the living room has a large perspective, resulting in a larger space.",
"asset/images/controlnet/0_3.png",
],
[
"A Electric 4 seats mini VAN,simple design stylel,led headlight,front 45 angle view,sunlight,clear sky.",
"asset/images/controlnet/0_2.png",
],
],
inputs=[prompt, image_input],
outputs=[result, hed_result, seed],
fn=generate,
cache_examples=CACHE_EXAMPLES,
)
use_negative_prompt.change(
fn=lambda x: gr.update(visible=x),
inputs=use_negative_prompt,
outputs=negative_prompt,
api_name=False,
)
gr.on(
triggers=[
prompt.submit,
negative_prompt.submit,
run_button.click,
],
fn=generate,
inputs=[
prompt,
image_input,
negative_prompt,
style_selection,
use_negative_prompt,
seed,
width,
height,
schedule,
dpms_guidance_scale,
sas_guidance_scale,
dpms_inference_steps,
sas_inference_steps,
randomize_seed,
],
outputs=[result, hed_result, seed],
api_name="run",
)
if __name__ == "__main__":
demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=PORT, debug=True)
#!/usr/bin/env python
from __future__ import annotations
import os
import sys
from pathlib import Path
current_file_path = Path(__file__).resolve()
sys.path.insert(0, str(current_file_path.parent.parent))
import random
import gradio as gr
import numpy as np
import uuid
from diffusers import PixArtAlphaPipeline, Transformer2DModel
from peft import PeftModel
import torch
from typing import Tuple
from datetime import datetime
import argparse
DESCRIPTION = """![Logo](https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/pixart-lcm.png)
# PixArt-LCM 1024px
#### [PixArt-Alpha 1024px](https://github.com/PixArt-alpha/PixArt-alpha) is a transformer-based text-to-image diffusion system trained on text embeddings from T5. This demo uses the [PixArt-alpha/PixArt-LCM-XL-2-1024-MS](https://huggingface.co/PixArt-alpha/PixArt-LCM-XL-2-1024-MS) checkpoint.
#### [LCMs](https://github.com/luosiallen/latent-consistency-model) is a diffusion distillation method that predicts the PF-ODE's solution directly in latent space, achieving very fast inference in only a few steps.
#### English prompts ONLY; 提示词仅限英文
Don't want to queue? Try [OpenXLab](https://openxlab.org.cn/apps/detail/PixArt-alpha/PixArt-alpha) or [Google Colab Demo](https://colab.research.google.com/drive/1jZ5UZXk7tcpTfVwnX33dDuefNMcnW9ME?usp=sharing).
"""
if not torch.cuda.is_available():
DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"
MAX_SEED = np.iinfo(np.int32).max
CACHE_EXAMPLES = torch.cuda.is_available() and os.getenv("CACHE_EXAMPLES", "1") == "1"
MAX_IMAGE_SIZE = int(os.getenv("MAX_IMAGE_SIZE", "2048"))
USE_TORCH_COMPILE = os.getenv("USE_TORCH_COMPILE", "0") == "1"
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD", "0") == "1"
PORT = int(os.getenv("DEMO_PORT", "15432"))
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
style_list = [
{
"name": "(No style)",
"prompt": "{prompt}",
"negative_prompt": "",
},
{
"name": "Cinematic",
"prompt": "cinematic still {prompt} . emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy",
"negative_prompt": "anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured",
},
{
"name": "Photographic",
"prompt": "cinematic photo {prompt} . 35mm photograph, film, bokeh, professional, 4k, highly detailed",
"negative_prompt": "drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly",
},
{
"name": "Anime",
"prompt": "anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed",
"negative_prompt": "photo, deformed, black and white, realism, disfigured, low contrast",
},
{
"name": "Manga",
"prompt": "manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style",
"negative_prompt": "ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style",
},
{
"name": "Digital Art",
"prompt": "concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed",
"negative_prompt": "photo, photorealistic, realism, ugly",
},
{
"name": "Pixel art",
"prompt": "pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics",
"negative_prompt": "sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic",
},
{
"name": "Fantasy art",
"prompt": "ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy",
"negative_prompt": "photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white",
},
{
"name": "Neonpunk",
"prompt": "neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional",
"negative_prompt": "painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured",
},
{
"name": "3D Model",
"prompt": "professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting",
"negative_prompt": "ugly, deformed, noisy, low poly, blurry, painting",
},
]
styles = {k["name"]: (k["prompt"], k["negative_prompt"]) for k in style_list}
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "(No style)"
NUM_IMAGES_PER_PROMPT = 1
def apply_style(style_name: str, positive: str, negative: str = "") -> Tuple[str, str]:
p, n = styles.get(style_name, styles[DEFAULT_STYLE_NAME])
if not negative:
negative = ""
return p.replace("{prompt}", positive), n + negative
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument('--is_lora', action='store_true', help='enable lora ckpt loading')
parser.add_argument('--repo_id', default="PixArt-alpha/PixArt-LCM-XL-2-1024-MS", type=str)
parser.add_argument('--lora_repo_id', default="PixArt-alpha/PixArt-LCM-LoRA-XL-2-1024-MS", type=str)
return parser.parse_args()
args = get_args()
if torch.cuda.is_available():
if not args.is_lora:
pipe = PixArtAlphaPipeline.from_pretrained(
args.repo_id,
torch_dtype=torch.float16,
use_safetensors=True,
)
else:
assert args.lora_repo_id is not None
transformer = Transformer2DModel.from_pretrained(args.repo_id, subfolder="transformer", torch_dtype=torch.float16)
transformer = PeftModel.from_pretrained(transformer, args.lora_repo_id)
pipe = PixArtAlphaPipeline.from_pretrained(
args.repo_id,
transformer=transformer,
torch_dtype=torch.float16,
use_safetensors=True,
)
del transformer
if ENABLE_CPU_OFFLOAD:
pipe.enable_model_cpu_offload()
else:
pipe.to(device)
print("Loaded on Device!")
# speed-up T5
pipe.text_encoder.to_bettertransformer()
if USE_TORCH_COMPILE:
pipe.transformer = torch.compile(pipe.transformer, mode="reduce-overhead", fullgraph=True)
print("Model Compiled!")
def save_image(img):
unique_name = f'{str(uuid.uuid4())}.png'
save_path = os.path.join(f'output/online_demo_img/{datetime.now().date()}')
os.makedirs(save_path, exist_ok=True)
unique_name = os.path.join(save_path, unique_name)
img.save(unique_name)
return unique_name
def randomize_seed_fn(seed: int, randomize_seed: bool) -> int:
if randomize_seed:
seed = random.randint(0, MAX_SEED)
return seed
def generate(
prompt: str,
negative_prompt: str = "",
style: str = DEFAULT_STYLE_NAME,
use_negative_prompt: bool = False,
seed: int = 0,
width: int = 1024,
height: int = 1024,
inference_steps: int = 4,
randomize_seed: bool = False,
use_resolution_binning: bool = True,
progress=gr.Progress(track_tqdm=True),
):
seed = int(randomize_seed_fn(seed, randomize_seed))
generator = torch.Generator().manual_seed(seed)
if not use_negative_prompt:
negative_prompt = None # type: ignore
prompt, negative_prompt = apply_style(style, prompt, negative_prompt)
images = pipe(
prompt=prompt,
width=width,
height=height,
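# LCM-distilled checkpoints bake guidance into the model, so classifier-free guidance is disabled here.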
guidance_scale=0.,
num_inference_steps=inference_steps,
generator=generator,
num_images_per_prompt=NUM_IMAGES_PER_PROMPT,
use_resolution_binning=use_resolution_binning,
output_type="pil",
).images
image_paths = [save_image(img) for img in images]
print(image_paths)
return image_paths, seed
examples = [
"A small cactus with a happy face in the Sahara desert.",
"an astronaut sitting in a diner, eating fries, cinematic, analog film",
"Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, intricate detail.",
"stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background.",
"professional portrait photo of an anthropomorphic cat wearing fancy gentleman hat and jacket walking in autumn forest.",
"beautiful lady, freckles, big smile, blue eyes, short ginger hair, dark makeup, wearing a floral blue vest top, soft light, dark grey background",
"Spectacular Tiny World in the Transparent Jar On the Table, interior of the Great Hall, Elaborate, Carved Architecture, Anatomy, Symetrical, Geometric and Parameteric Details, Precision Flat line Details, Pattern, Dark fantasy, Dark errie mood and ineffably mysterious mood, Technical design, Intricate Ultra Detail, Ornate Detail, Stylized and Futuristic and Biomorphic Details, Architectural Concept, Low contrast Details, Cinematic Lighting, 8k, by moebius, Fullshot, Epic, Fullshot, Octane render, Unreal ,Photorealistic, Hyperrealism",
"anthropomorphic profile of the white snow owl Crystal priestess , art deco painting, pretty and expressive eyes, ornate costume, mythical, ethereal, intricate, elaborate, hyperrealism, hyper detailed, 3D, 8K, Ultra Realistic, high octane, ultra resolution, amazing detail, perfection, In frame, photorealistic, cinematic lighting, visual clarity, shading , Lumen Reflections, Super-Resolution, gigapixel, color grading, retouch, enhanced, PBR, Blender, V-ray, Procreate, zBrush, Unreal Engine 5, cinematic, volumetric, dramatic, neon lighting, wide angle lens ,no digital painting blur",
"The parametric hotel lobby is a sleek and modern space with plenty of natural light. The lobby is spacious and open with a variety of seating options. The front desk is a sleek white counter with a parametric design. The walls are a light blue color with parametric patterns. The floor is a light wood color with a parametric design. There are plenty of plants and flowers throughout the space. The overall effect is a calm and relaxing space. occlusion, moody, sunset, concept art, octane rendering, 8k, highly detailed, concept art, highly detailed, beautiful scenery, cinematic, beautiful light, hyperreal, octane render, hdr, long exposure, 8K, realistic, fog, moody, fire and explosions, smoke, 50mm f2.8",
]
with gr.Blocks(css="scripts/style.css") as demo:
gr.Markdown(DESCRIPTION)
gr.DuplicateButton(
value="Duplicate Space for private use",
elem_id="duplicate-button",
visible=os.getenv("SHOW_DUPLICATE_BUTTON") == "1",
)
with gr.Group():
with gr.Row():
prompt = gr.Text(
label="Prompt",
show_label=False,
max_lines=1,
placeholder="Enter your prompt",
container=False,
)
run_button = gr.Button("Run", scale=0)
result = gr.Gallery(label="Result", columns=NUM_IMAGES_PER_PROMPT, show_label=False)
with gr.Accordion("Advanced options", open=False):
with gr.Row():
use_negative_prompt = gr.Checkbox(label="Use negative prompt", value=False, visible=True)
negative_prompt = gr.Text(
label="Negative prompt",
max_lines=1,
placeholder="Enter a negative prompt",
visible=True,
)
style_selection = gr.Radio(
show_label=True,
container=True,
interactive=True,
choices=STYLE_NAMES,
value=DEFAULT_STYLE_NAME,
label="Image Style",
)
seed = gr.Slider(
label="Seed",
minimum=0,
maximum=MAX_SEED,
step=1,
value=0,
)
randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
with gr.Row(visible=True):
width = gr.Slider(
label="Width",
minimum=256,
maximum=MAX_IMAGE_SIZE,
step=32,
value=1024,
)
height = gr.Slider(
label="Height",
minimum=256,
maximum=MAX_IMAGE_SIZE,
step=32,
value=1024,
)
with gr.Row():
inference_steps = gr.Slider(
label="LCM inference steps",
minimum=1,
maximum=30,
step=1,
value=4,
)
gr.Examples(
examples=examples,
inputs=prompt,
outputs=[result, seed],
fn=generate,
cache_examples=CACHE_EXAMPLES,
)
use_negative_prompt.change(
fn=lambda x: gr.update(visible=x),
inputs=use_negative_prompt,
outputs=negative_prompt,
api_name=False,
)
gr.on(
triggers=[
prompt.submit,
negative_prompt.submit,
run_button.click,
],
fn=generate,
inputs=[
prompt,
negative_prompt,
style_selection,
use_negative_prompt,
seed,
width,
height,
inference_steps,
randomize_seed,
],
outputs=[result, seed],
api_name="run",
)
if __name__ == "__main__":
demo.queue(max_size=20).launch(server_name="0.0.0.0", server_port=PORT, debug=True)
.gradio-container{width:680px!important}
\ No newline at end of file
.gradio-container{width:768px!important}
\ No newline at end of file
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Created with Inkscape (http://www.inkscape.org/) -->
<svg
version="1.1"
id="svg2"
width="773.8667"
height="560.20001"
viewBox="0 0 773.8667 560.20001"
sodipodi:docname="PixArt.eps"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg">
<defs
id="defs6" />
<sodipodi:namedview
id="namedview4"
pagecolor="#ffffff"
bordercolor="#000000"
borderopacity="0.25"
inkscape:showpageshadow="2"
inkscape:pageopacity="0.0"
inkscape:pagecheckerboard="0"
inkscape:deskcolor="#d1d1d1" />
<g
id="g8"
inkscape:groupmode="layer"
inkscape:label="ink_ext_XXXXXX"
transform="matrix(1.3333333,0,0,-1.3333333,0,755.2)">
<g
id="g10"
transform="scale(0.1)">
<path
d="m 5224.08,3900.61 94.67,-331.37 -12.08,-27.22 -663.91,359.06 13.25,23.2 307.71,94.68 94.67,189.35 260.36,-122.37 -94.67,-185.33"
style="fill:#434845;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path24" />
<path
d="m 4982.17,2802.84 -89.46,-204.04 -94.68,-165.68 -118.35,6.87 -639.06,-54.2 47.33,544.39 145.57,254.67 748.65,-382.01"
style="fill:#434845;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path26" />
<path
d="m 5306.67,3542.02 -324.5,-739.18 -748.65,382.01 409.24,716.23 663.91,-359.06"
style="fill:#7acb32;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path28" />
<path
d="m 4115.42,4101.8 v -426.05 h -110.31 l -662.73,6.63 h -115.03 v 419.42 h 888.07"
style="fill:#434845;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path30" />
<path
d="M 4265.47,3353.78 V 2368.27 H 3034.68 v 998.18 c 0,60.42 29.19,117.12 78.36,152.22 l 229.34,163.71 662.73,-6.63 200.1,-184.45 c 38.41,-35.41 60.26,-85.27 60.26,-137.52"
style="fill:#cd176b;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path32" />
<path
d="m 3271.37,4811.87 -142.02,-118.35 -189.35,23.67 -118.34,149.12 105.08,236.22 286.64,232.19 57.99,-522.85"
style="fill:#e7c590;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path34" />
<path
d="m 3153.02,4338.49 -331.36,-1869.86 -11.36,-82.84 h -130.66 v 130.17 l -24.85,76.46 v 0.23 l 166.87,1771.4 118.34,92.79 165.69,-47.34 47.33,-71.01"
style="fill:#8a4535;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path36" />
<path
d="M 3129.35,4693.52 3105.69,4409.5 2940,4456.84 v 260.35 l 189.35,-23.67"
style="fill:#434845;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path38" />
<path
d="m 3213.38,5334.72 -286.64,-232.19 249.95,561.43 z"
style="fill:#cd176b;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path40" />
<path
d="m 1472.51,3865.11 142.02,284.03 -47.34,213.02 133.97,31 173.73,40 51.84,-224.85 255.86,-153.85 -710.08,-189.35"
style="fill:#434845;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path42" />
<path
d="m 2452.42,3219.17 -831.02,-241.66 -148.89,887.6 710.08,189.35 269.83,-835.29"
style="fill:#f57626;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path44" />
<path
d="m 2679.64,2385.79 v -62.26 l -959.07,62.26 -99.17,591.72 831.02,241.66 202.37,-626.52 v -0.23 l -19.41,-206.63 h 44.26"
style="fill:#3f3f48;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path46" />
<path
d="m 1693.18,4312.71 c -83.7,128.92 -100.66,247.93 -50.4,353.77 38.33,76.7 146.8,105.91 251.69,134.13 16.44,4.44 32.73,8.81 48.4,13.32 64.08,18.35 103.5,82.79 87.87,143.61 -5.66,22.03 -17.29,37.65 -36.6,49.14 -21.71,12.89 -50.52,16 -78.99,8.44 l -680.91,-180.25 c -79.96,-21.17 -163.22,18.01 -197.88,93.15 -22,47.69 -28.58,96.64 -19.54,145.53 11.3,61.11 50.68,113.05 108.07,142.52 75.06,38.6 158.91,61.84 249.18,69.05 l 7.54,-94.4 c -77.74,-6.2 -149.55,-26.01 -213.45,-58.85 -31.1,-15.98 -52.33,-43.51 -58.24,-75.52 -5.49,-29.67 -1.43,-58.68 12.4,-88.66 15.37,-33.29 52.31,-50.69 87.69,-41.29 l 680.91,180.25 c 52.91,14 108.2,7.23 151.63,-18.61 41.06,-24.43 67.93,-60.42 79.89,-106.93 28.26,-110.02 -40.58,-225.84 -153.48,-258.2 -16.15,-4.63 -32.92,-9.16 -49.87,-13.71 -72.72,-19.58 -172.31,-46.39 -191.2,-84.19 -35.28,-74.31 -20.8,-159.88 44.68,-260.75 l -79.39,-51.55"
style="fill:#f57626;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path48" />
<path
d="m 459.43,1708.52 c 35.945,12.11 72.113,23.62 107.816,36.41 483.314,173.06 980.014,290.91 1487.914,363.69 422.09,60.47 846.28,84.56 1272.22,72.45 445.54,-12.66 887.01,-60.64 1318.44,-177.88 61.77,-16.8 97.38,-3.8 138.76,44.51 171.46,200.2 348.01,396.02 522.55,593.56 12.39,14.01 23.6,29.05 43.64,53.86 -659.91,147.84 -1314.12,165.56 -1972.11,85.46 C 2322.93,2652.06 1340.97,2318.31 453.254,1726.26 c 2.059,-5.9 4.113,-11.82 6.176,-17.74"
style="fill:#d97f2e;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path50" />
<path
d="m 1105.38,1733.16 c 110.53,16.84 220.87,35.09 331.63,50.28 335.25,45.97 671.81,51.34 1009.23,31.38 461.86,-27.32 916.38,-101.09 1363.97,-216.92 125.77,-32.54 249.57,-72.96 373.44,-112.38 39.88,-12.69 69.23,-9.27 98.56,23.16 71.9,79.42 146.64,156.26 219.87,234.48 8.06,8.6 13.72,19.47 25.47,36.51 -24.65,8.94 -44.27,18.19 -64.92,23.22 -353.69,86.16 -710.65,154.18 -1073.27,189.65 -387.13,37.88 -774.11,42.52 -1161.85,3.28 -328.28,-33.24 -649.07,-99.51 -963.77,-197 -54.38,-16.84 -107.3,-38.39 -160.9,-57.75 0.85,-2.64 1.7,-5.27 2.54,-7.91"
style="fill:#d97f2e;fill-opacity:1;fill-rule:nonzero;stroke:none"
id="path52" />
</g>
</g>
</svg>
# 🔥 How to Train PixArt + Dreambooth
- PixArt + [Dreambooth](https://dreambooth.github.io/)
<div id="dreambooth" style="display: flex; justify-content: center;">
<img src="../images/dreambooth/dreambooth_dog.svg" width="46%" style="margin: 5px;">
<img src="../images/dreambooth/dreambooth_m5.svg" width="46%" style="margin: 5px;">
</div>
You **ONLY** need to change the **config** file in [config](../../configs/pixart_app_config/PixArt_xl2_img1024_dreambooth.py) and the **dataloader** in [dataset](../../diffusion/data/datasets/Dreambooth.py); a sketch of the relevant config fields follows the directory listing below.
The directory structure for the Dreambooth dataset, under `./data/dreambooth`, is:
```
dataset
├──dog6/
│ ├──00.jpg
│ ├──01.jpg
│ ├──......
├──cat/
│ ├──00.jpg
│ ├──01.jpg
│ ├──......
```
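For orientation, here is a hedged sketch of the kind of fields you would point at this dataset in the config. Every name below is an illustrative assumption, not the repo's exact schema; check `PixArt_xl2_img1024_dreambooth.py` for the real keys:

```python
# Hypothetical excerpt from configs/pixart_app_config/PixArt_xl2_img1024_dreambooth.py.
# All field names here are assumptions for illustration only.
data_root = 'data/dreambooth/dataset/dog6'   # which subject folder to train on
data = dict(
    type='DreamBooth',                       # dataset class from diffusion/data/datasets/Dreambooth.py
    root=data_root,
    prompt='a photo of sks dog',             # the rare-token instance prompt used by Dreambooth
)
image_size = 1024
train_batch_size = 1
```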
To get started, first install the required dependencies, then run on your local machine:
```bash
cd data/
git clone https://github.com/google/dreambooth.git
python -m torch.distributed.launch --nproc_per_node=1 --master_port=26666 train_scripts/train_dreambooth.py configs/pixart_app_config/PixArt_xl2_img1024_dreambooth.py --work-dir output/path
```
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
[//]: # (&#40;reference from [hugging Face]&#40;https://github.com/huggingface/diffusers/blob/docs/8bit-inference-pixart/docs/source/en/api/pipelines/pixart.md&#41;&#41;)
## Running the `PixArtAlphaPipeline` in under 8GB GPU VRAM
It is possible to run the [`PixArtAlphaPipeline`] under 8GB GPU VRAM by loading the text encoder in 8-bit numerical precision. Let's walk through a full-fledged example.
First, install the `bitsandbytes` library:
```bash
pip install -U bitsandbytes
```
Then load the text encoder in 8-bit:
```python
from transformers import T5EncoderModel
from diffusers import PixArtAlphaPipeline
text_encoder = T5EncoderModel.from_pretrained(
"PixArt-alpha/PixArt-XL-2-1024-MS",
subfolder="text_encoder",
load_in_8bit=True,
device_map="auto",
)
pipe = PixArtAlphaPipeline.from_pretrained(
"PixArt-alpha/PixArt-XL-2-1024-MS",
text_encoder=text_encoder,
transformer=None,
device_map="auto"
)
```
Now, use the `pipe` to encode a prompt:
```python
import torch

with torch.no_grad():
    prompt = "cute cat"
    prompt_embeds, prompt_attention_mask, negative_embeds, negative_prompt_attention_mask = pipe.encode_prompt(prompt)

# Free the 8-bit text encoder before loading the transformer.
del text_encoder
del pipe
flush()
```
`flush()` is just a utility function to clear the GPU VRAM and is implemented like so:
```python
import gc

import torch

def flush():
    # Release Python-side references, then return cached blocks to the CUDA driver.
    gc.collect()
    torch.cuda.empty_cache()
```
Then compute the latents, providing the prompt embeddings as inputs:
```python
pipe = PixArtAlphaPipeline.from_pretrained(
"PixArt-alpha/PixArt-XL-2-1024-MS",
text_encoder=None,
torch_dtype=torch.float16,
).to("cuda")
latents = pipe(
negative_prompt=None,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_embeds,
prompt_attention_mask=prompt_attention_mask,
negative_prompt_attention_mask=negative_prompt_attention_mask,
num_images_per_prompt=1,
output_type="latent",
).images
del pipe.transformer
flush()
```
Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded.
Once the latents are computed, pass them to the VAE to decode into a real image:
```python
with torch.no_grad():
    image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]

# postprocess() returns a list of PIL images; take the first one.
image = pipe.image_processor.postprocess(image, output_type="pil")[0]
image.save("cat.png")
```
All of this, put together, should allow you to run [`PixArtAlphaPipeline`] under 8GB GPU VRAM.
![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pixart/8bits_cat.png)
Find the script [here](https://gist.github.com/sayakpaul/3ae0f847001d342af27018a96f467e4e) that can be run end-to-end to report the memory being used.
<Tip warning={true}>
Text embeddings computed in 8-bit can have an impact on the quality of the generated images because of the information loss in the representation space induced by the reduced precision. It's recommended to compare the outputs with and without 8-bit.
</Tip>
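One way to run that comparison, sketched here, is to reload just the text encoder in fp16 (this needs more VRAM than the 8-bit path) and repeat the walkthrough above with it; everything else stays the same:

```python
import torch
from transformers import T5EncoderModel

# fp16 baseline for the quality comparison suggested in the tip above.
text_encoder_fp16 = T5EncoderModel.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    subfolder="text_encoder",
    torch_dtype=torch.float16,
    device_map="auto",
)
```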
\ No newline at end of file
<!--Copyright 2023 The Huawei Noah’s Ark Lab Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
## 🔥 How to use PixArt in ComfyUI
### 1. Prepare the PixArt running environment
```bash
cd /workspace
conda create -n pixart python==3.9.0
conda activate pixart
pip install torch==2.0.0+cu117 torchvision==0.15.1+cu117 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu117
git clone https://github.com/PixArt-alpha/PixArt-alpha.git
cd PixArt-alpha
pip install -r requirements.txt
```
### 2. Install ComfyUI related dependencies
```bash
cd /workspace
git clone https://github.com/comfyanonymous/ComfyUI.git
cd ComfyUI
git clone https://github.com/city96/ComfyUI_ExtraModels custom_nodes/ComfyUI_ExtraModels
```
### 3. Download all the checkpoints (PixArt, VAE, T5) with the script
```bash
cd /workspace/PixArt-alpha
python tools/download.py --model_names "PixArt-XL-2-1024-MS.pth"
```
Or download them from the URLs: [PixArt ckpt](https://huggingface.co/PixArt-alpha/PixArt-alpha/resolve/main/PixArt-XL-2-1024-MS.pth), [VAE ckpt](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/sd-vae-ft-ema), [T5 ckpt](https://huggingface.co/PixArt-alpha/PixArt-alpha/tree/main/t5-v1_1-xxl).
### 4. Put Checkpoints into corresponding folders
```bash
cd /workspace/ComfyUI
mv /path/to/PixArt-XL-2-1024-MS.pth ./models/checkpoints/
mv /path/to/sd-vae-ft-ema ./models/VAE/
mv /path/to/t5-v1_1-xxl ./models/t5/
```
### 5. Run the ComfyUI web UI
```bash
cd /workspace/ComfyUI
python main.py --port 11111 --listen 0.0.0.0
```
Open http://your-server-ip:11111 to play with PixArt.
### 6. Create your own custom nodes
Here we prepare two examples for better understanding:
1) [PixArt Text-to-Image workflow](https://huggingface.co/PixArt-alpha/PixArt-alpha/blob/main/PixArt-image-to-image-workflow.json)
2) [PixArt Image-to-Image workflow](https://huggingface.co/PixArt-alpha/PixArt-alpha/blob/main/PixArt-image-to-image-workflow.json)
Once you have downloaded these JSON files, open your server page at `http://your-server-ip:11111` and drop a JSON file into the browser window to start the PixArt-ComfyUI playground.
\ No newline at end of file
<!--Copyright 2023 The Huawei Noah’s Ark Lab Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
## 🔥 ControlNet
We incorporate a [ControlNet](https://github.com/lllyasviel/ControlNet)-like module that enables fine-grained control over text-to-image diffusion models. We introduce a novel ControlNet-Transformer architecture, tailored specifically for Transformers, which achieves explicit controllability alongside high-quality image generation.
For more details about PixArt-ControlNet, please check the technical report [PixArt-δ](https://arxiv.org/abs/2401.05252).
<p align="center">
<img src="../images/controlnet.PNG" height=480>
</p>
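For intuition, here is a heavily simplified sketch of that design. It assumes each transformer block simply maps token features to token features (the real PixArt blocks also take timestep and text conditioning), so treat it as an illustration of the zero-initialized copy-block idea rather than the repo's actual implementation:

```python
import copy
import torch
from torch import nn

class ControlNetTransformerSketch(nn.Module):
    """The first `n_copies` pretrained blocks are duplicated into a trainable
    control branch whose output re-enters the frozen trunk through
    zero-initialized linear layers, so training starts as a no-op."""

    def __init__(self, base_blocks: nn.ModuleList, hidden_dim: int, n_copies: int = 13):
        super().__init__()
        self.base_blocks = base_blocks  # frozen, pretrained PixArt blocks
        for p in self.base_blocks.parameters():
            p.requires_grad_(False)
        self.copied_blocks = nn.ModuleList(copy.deepcopy(b) for b in base_blocks[:n_copies])
        self.zero_linears = nn.ModuleList(nn.Linear(hidden_dim, hidden_dim) for _ in range(n_copies))
        for lin in self.zero_linears:  # zero init => the control branch has no initial effect
            nn.init.zeros_(lin.weight)
            nn.init.zeros_(lin.bias)

    def forward(self, x: torch.Tensor, control: torch.Tensor) -> torch.Tensor:
        # x: image tokens [B, N, D]; control: condition tokens of the same shape,
        # e.g. a patchified HED edge map projected to hidden_dim.
        c = control
        for i, block in enumerate(self.base_blocks):
            if i < len(self.copied_blocks):
                c = self.copied_blocks[i](x + c if i == 0 else c)
                x = block(x) + self.zero_linears[i](c)
            else:
                x = block(x)
        return x
```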
## Training the `PixArt + ControlNet` on your machine
```bash
# Train on 1024px
python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train_controlnet.py configs/pixart_app_config/PixArt_xl2_img1024_controlHed.py --work-dir output/pixartcontrolnet-xl2-img1024
# Train on 512px
python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train_controlnet.py configs/pixart_app_config/PixArt_xl2_img512_controlHed.py --work-dir output/pixartcontrolnet-xl2-img512
```
## Testing the `PixArt + ControlNet`
```bash
# Test on 1024px
DEMO_PORT=12345 python app/app_controlnet.py configs/pixart_app_config/PixArt_xl2_img1024_controlHed.py --model_path path/to/1024px/PixArt-XL-2-1024-ControlNet.pth
# Test on 512px
DEMO_PORT=12345 python app/app_controlnet.py configs/pixart_app_config/PixArt_xl2_img512_controlHed.py --model_path path/to/512px/pixart_controlnet_ckpt
```
Then open http://your-server-ip:12345 to try a simple example.
<!--Copyright 2023 The Huawei Noah’s Ark Lab Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
<p align="center">
<img src="https://raw.githubusercontent.com/PixArt-alpha/PixArt-alpha.github.io/master/static/images/pixart-lcm2.png" height=120>
</p>
## 🔥 Why PixArt-LCM
Following [LCM LoRA](https://huggingface.co/blog/lcm_lora), we illustrate the generation speed we achieve on various hardware. Let us stress again how liberating it is to explore image generation so easily with PixArt-LCM.
| Hardware | PixArt-LCM (4 steps) | SDXL LoRA LCM (4 steps) | PixArt standard (14 steps) | SDXL standard (25 steps) |
|-----------------------------|----------------------|-------------------------|----------------------------|---------------------------|
| T4 (Google Colab Free Tier) | 3.3s | 8.4s | 16.0s | 26.5s |
| A100 (80 GB) | 0.51s | 1.2s | 2.2s | 3.8s |
| V100 (32 GB) | 0.8s | 1.2s | 5.5s | 7.7s |
These tests were run with a batch size of 1 in all cases.
For cards with a lot of capacity, such as A100, performance increases significantly when generating multiple images at once, which is usually the case for production workloads.
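To take advantage of that, you can ask the pipeline for several images in a single call via the standard diffusers `num_images_per_prompt` argument; the 4-step, zero-guidance settings below mirror the LCM configuration used elsewhere in this document:

```python
import torch
from diffusers import PixArtAlphaPipeline

pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-LCM-XL-2-1024-MS", torch_dtype=torch.float16
).to("cuda")

# One denoising run yields four images; on large-memory cards this is much
# faster than four separate single-image calls.
images = pipe(
    "A small cactus with a happy face in the Sahara desert.",
    num_inference_steps=4,
    guidance_scale=0.0,
    num_images_per_prompt=4,
).images
```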
## Training the `PixArt + LCM` on your machine
```bash
python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train_pixart_lcm.py configs/pixart_config/PixArt_xl2_img1024_lcm.py --work-dir output/pixartlcm-xl2-img1024_ft
```
## Training the `PixArt + LCM-LoRA`
```bash
python -m torch.distributed.launch --nproc_per_node=2 --master_port=12345 train_scripts/train_pixart_lcm_lora.py configs/pixart_config/PixArt_xl2_img1024_lcm.py --work-dir output/pixartlcm-lora-xl2-img1024_ft
```
## Testing the `PixArt + LCM` on your machine
```bash
DEMO_PORT=12345 python app/app_lcm.py
```
Then open http://your-server-ip:12345 to try a simple example.
## Testing the `PixArt + LCM-LoRA`
```bash
DEMO_PORT=12345 python app/app_lcm.py --is_lora --lora_repo_id output/pixartlcm-lora-xl2-img1024_ft/checkpoint-xxx
```
Then open http://your-server-ip:12345 to try a simple example.
## Integration in diffusers
### Using in 🧨 diffusers
Make sure you have up-to-date versions of the following libraries:
```bash
pip install -U transformers accelerate diffusers
```
And then:
```python
import torch
from diffusers import PixArtAlphaPipeline, AutoencoderKL
# for PixArt-LCM
pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", torch_dtype=torch.float16, use_safetensors=True)
# for PixArt-LCM-LoRA (also requires: from diffusers import Transformer2DModel; from peft import PeftModel)
# transformer = Transformer2DModel.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", subfolder="transformer", torch_dtype=torch.float16)
# transformer = PeftModel.from_pretrained(transformer, "PixArt-alpha/PixArt-LCM-LoRA-XL-2-1024-MS")
# pipe = PixArtAlphaPipeline.from_pretrained("PixArt-alpha/PixArt-LCM-XL-2-1024-MS", transformer=transformer, torch_dtype=torch.float16, use_safetensors=True)
# del transformer
# Enable memory optimizations.
pipe.enable_model_cpu_offload()
prompt = "A small cactus with a happy face in the Sahara desert."
image = pipe(prompt, guidance_scale=0., num_inference_steps=4).images[0]
```
This integration allows running the pipeline with a batch size of 4 under 11 GB of GPU VRAM.
Check out the [documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/pixart) to learn more.
# Continuously updating
\ No newline at end of file
## SA-Solver: Stochastic Adams Solver for Fast Sampling of Diffusion Models (NeurIPS 2023)
<div align="center">
<a href="https://arxiv.org/pdf/2309.05019.pdf"><img src="https://img.shields.io/static/v1?label=Paper&message=Arxiv&color=red&logo=arxiv"></a> &ensp;
<a href="https://github.com/PixArt-alpha/PixArt-alpha/blob/master/diffusion/sa_solver_diffusers.py"><img src="https://img.shields.io/static/v1?label=Project%20Page&message=Github&color=blue&logo=github-pages"></a> &ensp;
</div>
> [**SA-Solver: Stochastic Adams Solver for Fast Sampling of Diffusion Models (NeurIPS 2023)**](https://arxiv.org/pdf/2309.05019.pdf)<br>
> [Shuchen Xue*](https://github.com/scxue), [Mingyang Yi]()&#8224;,
> [Weijian Luo](), [Shifeng Zhang](), [Jiacheng Sun](),
> [Zhenguo Li](https://scholar.google.com/citations?user=XboZC1AAAAAJ),
> [Zhi-Ming Ma]()
> <br>University of Chinese Academy of Sciences, Huawei Noah’s Ark Lab, Peking University<br>
---
## 🐱 Abstract
SA-Solver is a stochastic diffusion sampler based on the Stochastic Adams Method. It is training-free and can be applied to pretrained diffusion models. It is a multistep SDE solver designed for fast stochastic sampling.
1. The parameter 'tau function' controls the stochasticity of the sampling process. Inspired by EDM, we choose the 'tau function' to be a piecewise constant function that is greater than 0 in the middle stage of the sampling process and equal to zero in the start and end stages. Specifically, we choose the default value of this parameter to be
```python
tau_func = lambda t: 1 if t >= 200 and t <= 800 else 0
```
in the diffusers library and
```python
tau_t = lambda t: eta if 0.2 <= t <= 0.8 else 0
```
in the ldm library. (The two differ because the diffusers timesteps are scaled by 1000.)
The value '1' represents the magnitude of the stochasticity; higher values are recommended when more NFEs are used.
If you want to employ deterministic sampling (solving diffusion ODE) in SA-Solver, please set
```python
tau_func = lambda t: 0
```
If you want to employ original stochastic sampling (solving original diffusion SDE) in SA-Solver, please set
```python
tau_func = lambda t: 1
```
2. The parameters 'predictor_order' and 'corrector_order' control the specific orders of the 'SA-Predictor' and 'SA-Corrector'. For unconditional generation, and for conditional generation with a small classifier-free guidance scale, the recommended orders are 'predictor_order = 3' and 'corrector_order = 4'; for conditional generation with a large classifier-free guidance scale (e.g. text-to-image), the recommended orders are 'predictor_order = 2' and 'corrector_order = 2'.
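For reference, recent diffusers releases ship an SA-Solver scheduler, so a minimal sketch of applying the t2i-recommended orders to PixArt looks like the following (the `SASolverScheduler` name and its `predictor_order`/`corrector_order` arguments should be verified against your diffusers version):

```python
import torch
from diffusers import PixArtAlphaPipeline, SASolverScheduler

pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS", torch_dtype=torch.float16
).to("cuda")

# Large classifier-free guidance scale (text-to-image): orders (2, 2) are recommended.
pipe.scheduler = SASolverScheduler.from_config(
    pipe.scheduler.config, predictor_order=2, corrector_order=2
)
image = pipe(
    "A small cactus with a happy face in the Sahara desert.",
    num_inference_steps=25,
    guidance_scale=4.5,
).images[0]
```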
examples = [
[
"A small cactus with a happy face in the Sahara desert.",
"dpm-solver", 20, 4.5,
"https://github.com/PixArt-alpha/PixArt-alpha.github.io/blob/master/static/images/carousel/carousel1.png",
"Prompt: A small cactus with a happy face in the Sahara desert. \nSize: --ar 1:1.",
"Model path: PixArt-XL-2-1024x1024.pt.\nBase image size: 1024, \nSampling Algo: dpm-solver"],
[
"Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, "
"spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, "
"intricate detail. --ar 6144:4096.",
"dpm-solver", 20, 4.5,
"https://github.com/PixArt-alpha/PixArt-alpha.github.io/blob/master/static/images/samples/15.png",
"Prompt: Pirate ship trapped in a cosmic maelstrom nebula, rendered in cosmic beach whirlpool engine, volumetric lighting, "
"spectacular, ambient lights, light pollution, cinematic atmosphere, art nouveau style, illustration art artwork by SenseiJaye, "
"intricate detail.\nSize: --ar 6144:4096.",
"Model path: PixArt-XL-2-1024x1024.pt.\nBase image size: 1024, \nSampling Algo: dpm-solver"],
[
"stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, "
"blue and pink, brilliantly illuminated in the background.",
"dpm-solver", 20, 4.5,
"https://github.com/PixArt-alpha/PixArt-alpha.github.io/blob/master/static/images/samples/13.png",
"stars, water, brilliantly, gorgeous large scale scene, a little girl, in the style of dreamy realism, light gold and amber, blue and pink, brilliantly illuminated in the background.",
"Model path: PixArt-XL-2-1024x1024.pt.\nBase image size: 1024, \nSampling Algo: dpm-solver"],
[
"nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph.",
"dpm-solver", 20, 4.5,
"https://github.com/PixArt-alpha/PixArt-alpha.github.io/blob/master/static/images/samples/14.png",
"nature vs human nature, surreal, UHD, 8k, hyper details, rich colors, photograph.",
"Model path: PixArt-XL-2-1024x1024.pt.\nBase image size: 1024, \nSampling Algo: dpm-solver"],
]
\ No newline at end of file