Unverified Commit ccd1f7ce authored by Markku Post-Uttula, committed by GitHub

Allow user to select output image aspect ratio (#120)



* Fix typo in comment

* Enable choosing output aspect-ratio

* Explain the effect of aspect ratio in the README

---------
Co-authored-by: Markku Post <markku.post@cleverest.eu>
parent 93044f89
......@@ -75,6 +75,16 @@ Note: only change the base model and add the LoRA modules for better stylization
<img src="https://cdn-uploads.huggingface.co/production/uploads/6285a9133ab6642179158944/-AC7Hr5YL4yW1zXGe_Izl.jpeg" height=450>
</p>
### Note about the output aspect ratio
The underlying generative model has billions of parameters and, believe it or not, **all** of them affect the output. The width and height of the output image (i.e. its "aspect ratio") are just two more inputs with the same kind of influence.
For example, using the exact same positive/negative prompts and seed value and changing only the aspect ratio (i.e. the width/height) of the output image, it is possible to generate results such as the following:
<p align="center">
<img src="https://cdn.cleverest.eu/attachment/github.com/TencentARC/PhotoMaker/pull/120/aspect-ratio-matters.png" height=725>
</p>
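To try this yourself, here is a minimal sketch (assuming the pipeline is already loaded as `pipe` as in the demo app and `input_id_images` is prepared; the prompt and seed are illustrative):

```python
import torch

# Same prompt, same seed -- only the output dimensions differ.
prompt = "a photo of a man img"

generator = torch.Generator(device="cuda").manual_seed(42)
square = pipe(prompt=prompt, width=1024, height=1024,
              input_id_images=input_id_images, generator=generator).images[0]

generator = torch.Generator(device="cuda").manual_seed(42)
wide = pipe(prompt=prompt, width=1024, height=576,
            input_id_images=input_id_images, generator=generator).images[0]

# The two results can differ in composition and content,
# not merely in cropping or stretching.
```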
# 🔧 Dependencies and Installation
- Python >= 3.8 (Recommend to use [Anaconda](https://www.anaconda.com/download/#linux) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html))
......
......@@ -12,7 +12,9 @@ import spaces
import gradio as gr
from photomaker import PhotoMakerStableDiffusionXLPipeline
from style_template import styles
from aspect_ratio_template import aspect_ratios
# global variable
base_model_path = 'SG161222/RealVisXL_V3.0'
......@@ -29,6 +31,8 @@ except:
MAX_SEED = np.iinfo(np.int32).max
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "Photographic (Default)"
ASPECT_RATIO_LABELS = list(aspect_ratios)
DEFAULT_ASPECT_RATIO = ASPECT_RATIO_LABELS[0]
# download PhotoMaker checkpoint to cache
photomaker_ckpt = hf_hub_download(repo_id="TencentARC/PhotoMaker", filename="photomaker-v1.bin", repo_type="model")
......@@ -37,6 +41,7 @@ if device == "mps":
torch_dtype = torch.float16
else:
torch_dtype = torch.bfloat16
pipe = PhotoMakerStableDiffusionXLPipeline.from_pretrained(
base_model_path,
torch_dtype=torch_dtype,
......@@ -58,7 +63,7 @@ pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.fuse_lora()
@spaces.GPU(enable_queue=True)
def generate_image(upload_images, prompt, negative_prompt, style_name, num_steps, style_strength_ratio, num_outputs, guidance_scale, seed, progress=gr.Progress(track_tqdm=True)):
def generate_image(upload_images, prompt, negative_prompt, aspect_ratio_name, style_name, num_steps, style_strength_ratio, num_outputs, guidance_scale, seed, progress=gr.Progress(track_tqdm=True)):
# check the trigger word
image_token_id = pipe.tokenizer.convert_tokens_to_ids(pipe.trigger_word)
input_ids = pipe.tokenizer.encode(prompt)
......@@ -68,6 +73,10 @@ def generate_image(upload_images, prompt, negative_prompt, style_name, num_steps
if input_ids.count(image_token_id) > 1:
raise gr.Error(f"Cannot use multiple trigger words '{pipe.trigger_word}' in text prompt!")
# determine output dimensions by the aspect ratio
output_w, output_h = aspect_ratios[aspect_ratio_name]
print(f"[Debug] Generate image using aspect ratio [{aspect_ratio_name}] => {output_w} x {output_h}")
# apply the style template
prompt, negative_prompt = apply_style(style_name, prompt, negative_prompt)
......@@ -88,6 +97,8 @@ def generate_image(upload_images, prompt, negative_prompt, style_name, num_steps
print(start_merge_step)
images = pipe(
prompt=prompt,
width=output_w,
height=output_h,
input_id_images=input_id_images,
negative_prompt=negative_prompt,
num_images_per_prompt=num_outputs,
......@@ -222,6 +233,7 @@ with gr.Blocks(css=css) as demo:
info="Try something like 'a photo of a man/woman img', 'img' is the trigger word.",
placeholder="A photo of a [man/woman img]...")
style = gr.Dropdown(label="Style template", choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME)
aspect_ratio = gr.Dropdown(label="Output aspect ratio", choices=ASPECT_RATIO_LABELS, value=DEFAULT_ASPECT_RATIO)
submit = gr.Button("Submit")
with gr.Accordion(open=False, label="Advanced Options"):
......@@ -284,7 +296,7 @@ with gr.Blocks(css=css) as demo:
api_name=False,
).then(
fn=generate_image,
inputs=[files, prompt, negative_prompt, style, num_steps, style_strength_ratio, num_outputs, guidance_scale, seed],
inputs=[files, prompt, negative_prompt, aspect_ratio, style, num_steps, style_strength_ratio, num_outputs, guidance_scale, seed],
outputs=[gallery, usage_tips]
)
......
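For reference, here is the dropdown-to-handler wiring pattern above in isolation (a hypothetical, stripped-down sketch; only the aspect-ratio plumbing is shown, with two entries standing in for the full mapping):

```python
import gradio as gr

# Stand-in for the full aspect_ratios mapping defined below.
aspect_ratios = {
    "Instagram (1:1)": (1024, 1024),
    "Widescreen TV / Landscape (16:9)": (1024, 576),
}

def generate_image(aspect_ratio_name):
    # The dropdown passes its selected label, which is resolved to pixel dimensions.
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    return f"{output_w} x {output_h}"

with gr.Blocks() as demo:
    aspect_ratio = gr.Dropdown(label="Output aspect ratio",
                               choices=list(aspect_ratios),
                               value=list(aspect_ratios)[0])
    result = gr.Textbox(label="Resolved dimensions")
    aspect_ratio.change(fn=generate_image, inputs=[aspect_ratio], outputs=[result])

demo.launch()
```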
# Note: since the output width & height need to be divisible by 8, the w/h values
# below do not always exactly match the stated aspect ratios, but they are close enough. :)
aspect_ratio_list = [
{
"name": "Instagram (1:1)",
"w": 1024,
"h": 1024,
},
{
"name": "35mm film / Landscape (3:2)",
"w": 1024,
"h": 680,
},
{
"name": "35mm film / Portrait (2:3)",
"w": 680,
"h": 1024,
},
{
"name": "CRT Monitor / Landscape (4:3)",
"w": 1024,
"h": 768,
},
{
"name": "CRT Monitor / Portrait (3:4)",
"w": 768,
"h": 1024,
},
{
"name": "Widescreen TV / Landscape (16:9)",
"w": 1024,
"h": 576,
},
{
"name": "Widescreen TV / Portrait (9:16)",
"w": 576,
"h": 1024,
},
{
"name": "Widescreen Monitor / Landscape (16:10)",
"w": 1024,
"h": 640,
},
{
"name": "Widescreen Monitor / Portrait (10:16)",
"w": 640,
"h": 1024,
},
{
"name": "Cinemascope (2.39:1)",
"w": 1024,
"h": 424,
},
{
"name": "Widescreen Movie (1.85:1)",
"w": 1024,
"h": 552,
},
{
"name": "Academy Movie (1.37:1)",
"w": 1024,
"h": 744,
},
{
"name": "Sheet-print (A-series) / Landscape (297:210)",
"w": 1024,
"h": 720,
},
{
"name": "Sheet-print (A-series) / Portrait (210:297)",
"w": 720,
"h": 1024,
},
]
aspect_ratios = {k["name"]: (k["w"], k["h"]) for k in aspect_ratio_list}
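A quick usage sketch (assuming the file above is saved as `aspect_ratio_template.py`, matching the import in the demo app):

```python
from aspect_ratio_template import aspect_ratios

# Look up the pixel dimensions for a label shown in the dropdown.
w, h = aspect_ratios["Widescreen TV / Landscape (16:9)"]
print(w, h)  # 1024 576

# Every entry keeps both dimensions divisible by 8, as the note above requires.
assert all(w % 8 == 0 and h % 8 == 0 for w, h in aspect_ratios.values())
```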
......@@ -164,7 +164,7 @@ class PhotoMakerStableDiffusionXLPipeline(StableDiffusionXLPipeline):
clean_index = 0
clean_input_ids = []
class_token_index = []
# Find out the corrresponding class word token based on the newly added trigger word token
# Find out the corresponding class word token based on the newly added trigger word token
for i, token_id in enumerate(input_ids):
if token_id == image_token_id:
class_token_index.append(clean_index - 1)
......