unCLIP image variation (#1781)

* unCLIP image variation * remove prior comment re: @pcuenca * stable diffusion -> unCLIP re: @pcuenca * add copy froms re: @patil-suraj

unCLIP image variation (#1781)
* unCLIP image variation * remove prior comment re: @pcuenca * stable diffusion -> unCLIP re: @pcuenca * add copy froms re: @patil-suraj
53c8147a · Will Berman · GitHub · cf5265ad · 53c8147a · 53c8147a
Unverified Commit 53c8147a authored Dec 28, 2022 by Will Berman Committed by GitHub Dec 28, 2022
10 changed files
--- a/docs/source/api/pipelines/unclip.mdx
+++ b/docs/source/api/pipelines/unclip.mdx
@@ -28,4 +28,6 @@ The unCLIP model in diffusers comes from kakaobrain's karlo and the original cod
 ## UnCLIPPipeline
 [[autodoc]] pipelines.unclip.pipeline_unclip.UnCLIPPipeline
+    - __call__
+[[autodoc]] pipelines.unclip.pipeline_unclip_image_variation.UnCLIPImageVariationPipeline
    - __call__
\ No newline at end of file
--- a/scripts/convert_unclip_txt2img_to_image_variation.py
+++ b/scripts/convert_unclip_txt2img_to_image_variation.py
+import argparse
+from diffusers import UnCLIPImageVariationPipeline, UnCLIPPipeline
+from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
+    parser.add_argument(
+        "--txt2img_unclip",
+        default="kakaobrain/karlo-v1-alpha",
+        type=str,
+        required=False,
+        help="The pretrained txt2img unclip.",
+    )
+    args = parser.parse_args()
+    txt2img = UnCLIPPipeline.from_pretrained(args.txt2img_unclip)
+    feature_extractor = CLIPImageProcessor()
+    image_encoder = CLIPVisionModelWithProjection.from_pretrained("openai/clip-vit-large-patch14")
+    img2img = UnCLIPImageVariationPipeline(
+        decoder=txt2img.decoder,
+        text_encoder=txt2img.text_encoder,
+        tokenizer=txt2img.tokenizer,
+        text_proj=txt2img.text_proj,
+        feature_extractor=feature_extractor,
+        image_encoder=image_encoder,
+        super_res_first=txt2img.super_res_first,
+        super_res_last=txt2img.super_res_last,
+        decoder_scheduler=txt2img.decoder_scheduler,
+        super_res_scheduler=txt2img.super_res_scheduler,
+    )
+    img2img.save_pretrained(args.dump_path)
--- a/src/diffusers/__init__.py
+++ b/src/diffusers/__init__.py
@@ -105,6 +105,7 @@ else:
        StableDiffusionPipeline,
        StableDiffusionPipelineSafe,
        StableDiffusionUpscalePipeline,
+        UnCLIPImageVariationPipeline,
        UnCLIPPipeline,
        VersatileDiffusionDualGuidedPipeline,
        VersatileDiffusionImageVariationPipeline,

--- a/src/diffusers/pipelines/__init__.py
+++ b/src/diffusers/pipelines/__init__.py
@@ -53,7 +53,7 @@ else:
        StableDiffusionUpscalePipeline,
    )
    from .stable_diffusion_safe import StableDiffusionPipelineSafe
-    from .unclip import UnCLIPPipeline
+    from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline
    from .versatile_diffusion import (
        VersatileDiffusionDualGuidedPipeline,
        VersatileDiffusionImageVariationPipeline,

--- a/src/diffusers/pipelines/unclip/__init__.py
+++ b/src/diffusers/pipelines/unclip/__init__.py
@@ -13,4 +13,5 @@ except OptionalDependencyNotAvailable:
    from ...utils.dummy_torch_and_transformers_objects import UnCLIPPipeline
 else:
    from .pipeline_unclip import UnCLIPPipeline
+    from .pipeline_unclip_image_variation import UnCLIPImageVariationPipeline
    from .text_proj import UnCLIPTextProjModel
--- a/src/diffusers/pipelines/unclip/pipeline_unclip.py
+++ b/src/diffusers/pipelines/unclip/pipeline_unclip.py
@@ -45,6 +45,8 @@ class UnCLIPPipeline(DiffusionPipeline):
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior ([`PriorTransformer`]):
            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
+        text_proj ([`UnCLIPTextProjModel`]):
+            Utility class to prepare and combine the embeddings before they are passed to the decoder.
        decoder ([`UNet2DConditionModel`]):
            The decoder to invert the image embedding into an image.
        super_res_first ([`UNet2DModel`]):

--- a/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py
+++ b/src/diffusers/pipelines/unclip/pipeline_unclip_image_variation.py
--- a/src/diffusers/utils/dummy_torch_and_transformers_objects.py
+++ b/src/diffusers/utils/dummy_torch_and_transformers_objects.py
@@ -199,6 +199,21 @@ class StableDiffusionUpscalePipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])
+class UnCLIPImageVariationPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
 class UnCLIPPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

--- a/tests/pipelines/unclip/test_unclip.py
+++ b/tests/pipelines/unclip/test_unclip.py
@@ -281,7 +281,7 @@ class UnCLIPPipelineIntegrationTests(unittest.TestCase):
        assert image.shape == (256, 256, 3)
        assert np.abs(expected_image - image).max() < 1e-2
-    def test_stable_diffusion_pipeline_with_sequential_cpu_offloading(self):
+    def test_unclip_pipeline_with_sequential_cpu_offloading(self):
        torch.cuda.empty_cache()
        torch.cuda.reset_max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()

--- a/tests/pipelines/unclip/test_unclip_image_variation.py
+++ b/tests/pipelines/unclip/test_unclip_image_variation.py
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import gc
+import random
+import unittest
+import numpy as np
+import torch
+from diffusers import UnCLIPImageVariationPipeline, UnCLIPScheduler, UNet2DConditionModel, UNet2DModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.unclip.text_proj import UnCLIPTextProjModel
+from diffusers.utils import floats_tensor, load_numpy, slow, torch_device
+from diffusers.utils.testing_utils import load_image, require_torch_gpu
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)
+torch.backends.cuda.matmul.allow_tf32 = False
+class UnCLIPImageVariationPipelineFastTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+    @property
+    def text_embedder_hidden_size(self):
+        return 32
+    @property
+    def time_input_dim(self):
+        return 32
+    @property
+    def block_out_channels_0(self):
+        return self.time_input_dim
+    @property
+    def time_embed_dim(self):
+        return self.time_input_dim * 4
+    @property
+    def cross_attention_dim(self):
+        return 100
+    @property
+    def dummy_tokenizer(self):
+        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
+        return tokenizer
+    @property
+    def dummy_text_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPTextConfig(
+            bos_token_id=0,
+            eos_token_id=2,
+            hidden_size=self.text_embedder_hidden_size,
+            projection_dim=self.text_embedder_hidden_size,
+            intermediate_size=37,
+            layer_norm_eps=1e-05,
+            num_attention_heads=4,
+            num_hidden_layers=5,
+            pad_token_id=1,
+            vocab_size=1000,
+        )
+        return CLIPTextModelWithProjection(config)
+    @property
+    def dummy_image_encoder(self):
+        torch.manual_seed(0)
+        config = CLIPVisionConfig(
+            hidden_size=self.text_embedder_hidden_size,
+            projection_dim=self.text_embedder_hidden_size,
+            num_hidden_layers=5,
+            num_attention_heads=4,
+            image_size=32,
+            intermediate_size=37,
+            patch_size=1,
+        )
+        return CLIPVisionModelWithProjection(config)
+    @property
+    def dummy_text_proj(self):
+        torch.manual_seed(0)
+        model_kwargs = {
+            "clip_embeddings_dim": self.text_embedder_hidden_size,
+            "time_embed_dim": self.time_embed_dim,
+            "cross_attention_dim": self.cross_attention_dim,
+        }
+        model = UnCLIPTextProjModel(**model_kwargs)
+        return model
+    @property
+    def dummy_decoder(self):
+        torch.manual_seed(0)
+        model_kwargs = {
+            "sample_size": 64,
+            # RGB in channels
+            "in_channels": 3,
+            # Out channels is double in channels because predicts mean and variance
+            "out_channels": 6,
+            "down_block_types": ("ResnetDownsampleBlock2D", "SimpleCrossAttnDownBlock2D"),
+            "up_block_types": ("SimpleCrossAttnUpBlock2D", "ResnetUpsampleBlock2D"),
+            "mid_block_type": "UNetMidBlock2DSimpleCrossAttn",
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "layers_per_block": 1,
+            "cross_attention_dim": self.cross_attention_dim,
+            "attention_head_dim": 4,
+            "resnet_time_scale_shift": "scale_shift",
+            "class_embed_type": "identity",
+        }
+        model = UNet2DConditionModel(**model_kwargs)
+        return model
+    @property
+    def dummy_super_res_kwargs(self):
+        return {
+            "sample_size": 128,
+            "layers_per_block": 1,
+            "down_block_types": ("ResnetDownsampleBlock2D", "ResnetDownsampleBlock2D"),
+            "up_block_types": ("ResnetUpsampleBlock2D", "ResnetUpsampleBlock2D"),
+            "block_out_channels": (self.block_out_channels_0, self.block_out_channels_0 * 2),
+            "in_channels": 6,
+            "out_channels": 3,
+        }
+    @property
+    def dummy_super_res_first(self):
+        torch.manual_seed(0)
+        model = UNet2DModel(**self.dummy_super_res_kwargs)
+        return model
+    @property
+    def dummy_super_res_last(self):
+        # seeded differently to get different unet than `self.dummy_super_res_first`
+        torch.manual_seed(1)
+        model = UNet2DModel(**self.dummy_super_res_kwargs)
+        return model
+    def get_pipeline(self, device):
+        decoder = self.dummy_decoder
+        text_proj = self.dummy_text_proj
+        text_encoder = self.dummy_text_encoder
+        tokenizer = self.dummy_tokenizer
+        super_res_first = self.dummy_super_res_first
+        super_res_last = self.dummy_super_res_last
+        decoder_scheduler = UnCLIPScheduler(
+            variance_type="learned_range",
+            prediction_type="epsilon",
+            num_train_timesteps=1000,
+        )
+        super_res_scheduler = UnCLIPScheduler(
+            variance_type="fixed_small_log",
+            prediction_type="epsilon",
+            num_train_timesteps=1000,
+        )
+        feature_extractor = CLIPImageProcessor(crop_size=32, size=32)
+        image_encoder = self.dummy_image_encoder
+        pipe = UnCLIPImageVariationPipeline(
+            decoder=decoder,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            text_proj=text_proj,
+            feature_extractor=feature_extractor,
+            image_encoder=image_encoder,
+            super_res_first=super_res_first,
+            super_res_last=super_res_last,
+            decoder_scheduler=decoder_scheduler,
+            super_res_scheduler=super_res_scheduler,
+        )
+        pipe = pipe.to(device)
+        pipe.set_progress_bar_config(disable=None)
+        return pipe
+    def get_pipeline_inputs(self, device, seed, pil_image=False):
+        input_image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
+        generator = torch.Generator(device=device).manual_seed(seed)
+        if pil_image:
+            input_image = input_image * 0.5 + 0.5
+            input_image = input_image.clamp(0, 1)
+            input_image = input_image.cpu().permute(0, 2, 3, 1).float().numpy()
+            input_image = DiffusionPipeline.numpy_to_pil(input_image)[0]
+        return {
+            "image": input_image,
+            "generator": generator,
+            "decoder_num_inference_steps": 2,
+            "super_res_num_inference_steps": 2,
+            "output_type": "np",
+        }
+    def test_unclip_image_variation_input_tensor(self):
+        device = "cpu"
+        seed = 0
+        pipe = self.get_pipeline(device)
+        pipeline_inputs = self.get_pipeline_inputs(device, seed)
+        output = pipe(**pipeline_inputs)
+        image = output.images
+        tuple_pipeline_inputs = self.get_pipeline_inputs(device, seed)
+        image_from_tuple = pipe(
+            **tuple_pipeline_inputs,
+            return_dict=False,
+        )[0]
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array(
+            [
+                0.9988,
+                0.9997,
+                0.9944,
+                0.0003,
+                0.0003,
+                0.9974,
+                0.0003,
+                0.0004,
+                0.9931,
+            ]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+    def test_unclip_image_variation_input_image(self):
+        device = "cpu"
+        seed = 0
+        pipe = self.get_pipeline(device)
+        pipeline_inputs = self.get_pipeline_inputs(device, seed, pil_image=True)
+        output = pipe(**pipeline_inputs)
+        image = output.images
+        tuple_pipeline_inputs = self.get_pipeline_inputs(device, seed, pil_image=True)
+        image_from_tuple = pipe(
+            **tuple_pipeline_inputs,
+            return_dict=False,
+        )[0]
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+        assert image.shape == (1, 128, 128, 3)
+        expected_slice = np.array(
+            [
+                0.9988,
+                0.9997,
+                0.9944,
+                0.0003,
+                0.0003,
+                0.9974,
+                0.0003,
+                0.0004,
+                0.9931,
+            ]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+    def test_unclip_image_variation_input_list_images(self):
+        device = "cpu"
+        seed = 0
+        pipe = self.get_pipeline(device)
+        pipeline_inputs = self.get_pipeline_inputs(device, seed, pil_image=True)
+        pipeline_inputs["image"] = [
+            pipeline_inputs["image"],
+            pipeline_inputs["image"],
+        ]
+        output = pipe(**pipeline_inputs)
+        image = output.images
+        tuple_pipeline_inputs = self.get_pipeline_inputs(device, seed, pil_image=True)
+        tuple_pipeline_inputs["image"] = [
+            tuple_pipeline_inputs["image"],
+            tuple_pipeline_inputs["image"],
+        ]
+        image_from_tuple = pipe(
+            **tuple_pipeline_inputs,
+            return_dict=False,
+        )[0]
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+        assert image.shape == (2, 128, 128, 3)
+        expected_slice = np.array(
+            [
+                0.9997,
+                0.9997,
+                0.0003,
+                0.0003,
+                0.9950,
+                0.0003,
+                0.9993,
+                0.9957,
+                0.0004,
+            ]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+    def test_unclip_image_variation_input_num_images_per_prompt(self):
+        device = "cpu"
+        seed = 0
+        pipe = self.get_pipeline(device)
+        pipeline_inputs = self.get_pipeline_inputs(device, seed, pil_image=True)
+        pipeline_inputs["image"] = [
+            pipeline_inputs["image"],
+            pipeline_inputs["image"],
+        ]
+        output = pipe(**pipeline_inputs, num_images_per_prompt=2)
+        image = output.images
+        tuple_pipeline_inputs = self.get_pipeline_inputs(device, seed, pil_image=True)
+        tuple_pipeline_inputs["image"] = [
+            tuple_pipeline_inputs["image"],
+            tuple_pipeline_inputs["image"],
+        ]
+        image_from_tuple = pipe(
+            **tuple_pipeline_inputs,
+            num_images_per_prompt=2,
+            return_dict=False,
+        )[0]
+        image_slice = image[0, -3:, -3:, -1]
+        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]
+        assert image.shape == (4, 128, 128, 3)
+        expected_slice = np.array(
+            [
+                0.9997,
+                0.9997,
+                0.0008,
+                0.9952,
+                0.9980,
+                0.9997,
+                0.9961,
+                0.9997,
+                0.9995,
+            ]
+        )
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
+@slow
+@require_torch_gpu
+class UnCLIPImageVariationPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+    def test_unclip_image_variation_karlo(self):
+        input_image = load_image(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/unclip/cat.png"
+        )
+        expected_image = load_numpy(
+            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
+            "/unclip/karlo_v1_alpha_cat_variation_fp16.npy"
+        )
+        pipeline = UnCLIPImageVariationPipeline.from_pretrained(
+            "fusing/karlo-image-variations-diffusers", torch_dtype=torch.float16
+        )
+        pipeline = pipeline.to(torch_device)
+        pipeline.set_progress_bar_config(disable=None)
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        output = pipeline(
+            input_image,
+            num_images_per_prompt=1,
+            generator=generator,
+            output_type="np",
+        )
+        image = output.images[0]
+        np.save("./karlo_v1_alpha_cat_variation_fp16.npy", image)
+        assert image.shape == (256, 256, 3)
+        assert np.abs(expected_image - image).max() < 1e-2