Unverified Commit ccd1f7ce authored by Markku Post-Uttula, committed by GitHub

Allow user to select output image aspect ratio (#120)



* Fix typo in comment

* Enable choosing output aspect-ratio

* Explain the effect of aspect ratio in the README

---------
Co-authored-by: Markku Post <markku.post@cleverest.eu>
parent 93044f89
......@@ -75,6 +75,16 @@ Note: only change the base model and add the LoRA modules for better stylization
<img src="https://cdn-uploads.huggingface.co/production/uploads/6285a9133ab6642179158944/-AC7Hr5YL4yW1zXGe_Izl.jpeg" height=450>
</p>
### Note about the output aspect ratio
The underlying generative model has billions of parameters and, believe it or not, **all** of them affect the output. The width and height of the output image (i.e. its "aspect ratio") are just two more inputs with the same kind of influence.
For example, using the exact same positive/negative prompts and seed value and changing only the aspect ratio (i.e. the width/height) of the output image, it is possible to generate results such as the following:
<p align="center">
<img src="https://cdn.cleverest.eu/attachment/github.com/TencentARC/PhotoMaker/pull/120/aspect-ratio-matters.png" height=725>
</p>
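To try this yourself, here is a minimal sketch (assuming the pipeline is already loaded as `pipe` as in the demo app and `input_id_images` is prepared; the prompt and seed are illustrative):

```python
import torch

# Same prompt, same seed -- only the output dimensions differ.
prompt = "a photo of a man img"

generator = torch.Generator(device="cuda").manual_seed(42)
square = pipe(prompt=prompt, width=1024, height=1024,
              input_id_images=input_id_images, generator=generator).images[0]

generator = torch.Generator(device="cuda").manual_seed(42)
wide = pipe(prompt=prompt, width=1024, height=576,
            input_id_images=input_id_images, generator=generator).images[0]

# The two results can differ in composition and content,
# not merely in cropping or stretching.
```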
# 🔧 Dependencies and Installation
- Python >= 3.8 (Recommend to use [Anaconda](https://www.anaconda.com/download/#linux) or [Miniconda](https://docs.conda.io/en/latest/miniconda.html))
......
......@@ -12,7 +12,9 @@ import spaces
import gradio as gr
from photomaker import PhotoMakerStableDiffusionXLPipeline
from style_template import styles
from aspect_ratio_template import aspect_ratios
# global variable
base_model_path = 'SG161222/RealVisXL_V3.0'
......@@ -29,6 +31,8 @@ except:
MAX_SEED = np.iinfo(np.int32).max
STYLE_NAMES = list(styles.keys())
DEFAULT_STYLE_NAME = "Photographic (Default)"
ASPECT_RATIO_LABELS = list(aspect_ratios)
DEFAULT_ASPECT_RATIO = ASPECT_RATIO_LABELS[0]
# download PhotoMaker checkpoint to cache
photomaker_ckpt = hf_hub_download(repo_id="TencentARC/PhotoMaker", filename="photomaker-v1.bin", repo_type="model")
......@@ -37,6 +41,7 @@ if device == "mps":
torch_dtype = torch.float16
else:
torch_dtype = torch.bfloat16
pipe = PhotoMakerStableDiffusionXLPipeline.from_pretrained(
base_model_path,
torch_dtype=torch_dtype,
......@@ -58,7 +63,7 @@ pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
pipe.fuse_lora()
@spaces.GPU(enable_queue=True)
def generate_image(upload_images, prompt, negative_prompt, style_name, num_steps, style_strength_ratio, num_outputs, guidance_scale, seed, progress=gr.Progress(track_tqdm=True)):
def generate_image(upload_images, prompt, negative_prompt, aspect_ratio_name, style_name, num_steps, style_strength_ratio, num_outputs, guidance_scale, seed, progress=gr.Progress(track_tqdm=True)):
# check the trigger word
image_token_id = pipe.tokenizer.convert_tokens_to_ids(pipe.trigger_word)
input_ids = pipe.tokenizer.encode(prompt)
......@@ -68,6 +73,10 @@ def generate_image(upload_images, prompt, negative_prompt, style_name, num_steps
if input_ids.count(image_token_id) > 1:
raise gr.Error(f"Cannot use multiple trigger words '{pipe.trigger_word}' in text prompt!")
# determine output dimensions by the aspect ratio
output_w, output_h = aspect_ratios[aspect_ratio_name]
print(f"[Debug] Generate image using aspect ratio [{aspect_ratio_name}] => {output_w} x {output_h}")
# apply the style template
prompt, negative_prompt = apply_style(style_name, prompt, negative_prompt)
......@@ -88,6 +97,8 @@ def generate_image(upload_images, prompt, negative_prompt, style_name, num_steps
print(start_merge_step)
images = pipe(
prompt=prompt,
width=output_w,
height=output_h,
input_id_images=input_id_images,
negative_prompt=negative_prompt,
num_images_per_prompt=num_outputs,
......@@ -222,6 +233,7 @@ with gr.Blocks(css=css) as demo:
info="Try something like 'a photo of a man/woman img', 'img' is the trigger word.",
placeholder="A photo of a [man/woman img]...")
style = gr.Dropdown(label="Style template", choices=STYLE_NAMES, value=DEFAULT_STYLE_NAME)
aspect_ratio = gr.Dropdown(label="Output aspect ratio", choices=ASPECT_RATIO_LABELS, value=DEFAULT_ASPECT_RATIO)
submit = gr.Button("Submit")
with gr.Accordion(open=False, label="Advanced Options"):
......@@ -284,7 +296,7 @@ with gr.Blocks(css=css) as demo:
api_name=False,
).then(
fn=generate_image,
inputs=[files, prompt, negative_prompt, style, num_steps, style_strength_ratio, num_outputs, guidance_scale, seed],
inputs=[files, prompt, negative_prompt, aspect_ratio, style, num_steps, style_strength_ratio, num_outputs, guidance_scale, seed],
outputs=[gallery, usage_tips]
)
......
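For reference, here is the dropdown-to-handler wiring pattern above in isolation (a hypothetical, stripped-down sketch; only the aspect-ratio plumbing is shown, with two entries standing in for the full mapping):

```python
import gradio as gr

# Stand-in for the full aspect_ratios mapping defined below.
aspect_ratios = {
    "Instagram (1:1)": (1024, 1024),
    "Widescreen TV / Landscape (16:9)": (1024, 576),
}

def generate_image(aspect_ratio_name):
    # The dropdown passes its selected label, which is resolved to pixel dimensions.
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    return f"{output_w} x {output_h}"

with gr.Blocks() as demo:
    aspect_ratio = gr.Dropdown(label="Output aspect ratio",
                               choices=list(aspect_ratios),
                               value=list(aspect_ratios)[0])
    result = gr.Textbox(label="Resolved dimensions")
    aspect_ratio.change(fn=generate_image, inputs=[aspect_ratio], outputs=[result])

demo.launch()
```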
# Note: since the output width & height need to be divisible by 8, the w/h values
# below do not always exactly match the stated aspect ratios, but they are close enough. :)
aspect_ratio_list = [
{
"name": "Instagram (1:1)",
"w": 1024,
"h": 1024,
},
{
"name": "35mm film / Landscape (3:2)",
"w": 1024,
"h": 680,
},
{
"name": "35mm film / Portrait (2:3)",
"w": 680,
"h": 1024,
},
{
"name": "CRT Monitor / Landscape (4:3)",
"w": 1024,
"h": 768,
},
{
"name": "CRT Monitor / Portrait (3:4)",
"w": 768,
"h": 1024,
},
{
"name": "Widescreen TV / Landscape (16:9)",
"w": 1024,
"h": 576,
},
{
"name": "Widescreen TV / Portrait (9:16)",
"w": 576,
"h": 1024,
},
{
"name": "Widescreen Monitor / Landscape (16:10)",
"w": 1024,
"h": 640,
},
{
"name": "Widescreen Monitor / Portrait (10:16)",
"w": 640,
"h": 1024,
},
{
"name": "Cinemascope (2.39:1)",
"w": 1024,
"h": 424,
},
{
"name": "Widescreen Movie (1.85:1)",
"w": 1024,
"h": 552,
},
{
"name": "Academy Movie (1.37:1)",
"w": 1024,
"h": 744,
},
{
"name": "Sheet-print (A-series) / Landscape (297:210)",
"w": 1024,
"h": 720,
},
{
"name": "Sheet-print (A-series) / Portrait (210:297)",
"w": 720,
"h": 1024,
},
]
aspect_ratios = {k["name"]: (k["w"], k["h"]) for k in aspect_ratio_list}
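A quick usage sketch (assuming the file above is saved as `aspect_ratio_template.py`, matching the import in the demo app):

```python
from aspect_ratio_template import aspect_ratios

# Look up the pixel dimensions for a label shown in the dropdown.
w, h = aspect_ratios["Widescreen TV / Landscape (16:9)"]
print(w, h)  # 1024 576

# Every entry keeps both dimensions divisible by 8, as the note above requires.
assert all(w % 8 == 0 and h % 8 == 0 for w, h in aspect_ratios.values())
```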
......@@ -164,7 +164,7 @@ class PhotoMakerStableDiffusionXLPipeline(StableDiffusionXLPipeline):
clean_index = 0
clean_input_ids = []
class_token_index = []
# Find out the corrresponding class word token based on the newly added trigger word token
# Find out the corresponding class word token based on the newly added trigger word token
for i, token_id in enumerate(input_ids):
if token_id == image_token_id:
class_token_index.append(clean_index - 1)
......