[Versatile Diffusion] Add versatile diffusion model (#1283)

* up * convert dual unet * revert dual attn * adapt for vd-official * test the full pipeline * mixed inference * mixed inference for text2img * add image prompting * fix clip norm * split text2img and img2img * fix format * refactor text2img * mega pipeline * add optimus * refactor image var * wip text_unet * text unet end to end * update tests * reshape * fix image to text * add some first docs * dual guided pipeline * fix token ratio * propose change * dual transformer as a native module * DualTransformer(nn.Module) * DualTransformer(nn.Module) * correct unconditional image * save-load with mega pipeline * remove image to text * up * uP * fix * up * final fix * remove_unused_weights * test updates * save progress * uP * fix dual prompts * some fixes * finish * style * finish renaming * up * fix * fix * fix * finish Co-authored-by: anton-l <anton@huggingface.co>

[Versatile Diffusion] Add versatile diffusion model (#1283)
* up * convert dual unet * revert dual attn * adapt for vd-official * test the full pipeline * mixed inference * mixed inference for text2img * add image prompting * fix clip norm * split text2img and img2img * fix format * refactor text2img * mega pipeline * add optimus * refactor image var * wip text_unet * text unet end to end * update tests * reshape * fix image to text * add some first docs * dual guided pipeline * fix token ratio * propose change * dual transformer as a native module * DualTransformer(nn.Module) * DualTransformer(nn.Module) * correct unconditional image * save-load with mega pipeline * remove image to text * up * uP * fix * up * final fix * remove_unused_weights * test updates * save progress * uP * fix dual prompts * some fixes * finish * style * finish renaming * up * fix * fix * fix * finish Co-authored-by: anton-l <anton@huggingface.co>
2625fb59 · Patrick von Platen · GitHub · 0eb507f2 · 2625fb59 · 2625fb59
Unverified Commit 2625fb59 authored Nov 23, 2022 by Patrick von Platen Committed by GitHub Nov 23, 2022
3 changed files
--- a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
+++ b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_image_variation.py
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import VersatileDiffusionImageVariationPipeline
+from diffusers.utils.testing_utils import load_image, require_torch_gpu, slow, torch_device
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False  # disable TF32 matmuls; presumably for reproducible reference values
+
+
+class VersatileDiffusionImageVariationPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pass
+
+
+@slow
+@require_torch_gpu
+class VersatileDiffusionImageVariationPipelineIntegrationTests(unittest.TestCase):
+    def test_inference_image_variations(self):
+        pipe = VersatileDiffusionImageVariationPipeline.from_pretrained("shi-labs/versatile-diffusion")
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        image_prompt = load_image(
+            "https://raw.githubusercontent.com/SHI-Labs/Versatile-Diffusion/master/assets/benz.jpg"
+        )
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe(
+            image=image_prompt,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=50,
+            output_type="numpy",
+        ).images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0113, 0.2241, 0.4024, 0.0839, 0.0871, 0.2725, 0.2581, 0.0, 0.1096])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
--- a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
+++ b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_mega.py
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import VersatileDiffusionPipeline
+from diffusers.utils.testing_utils import load_image, require_torch_gpu, slow, torch_device
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False  # disable TF32 matmuls; presumably for reproducible reference values
+
+
+class VersatileDiffusionMegaPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pass
+
+
+@slow
+@require_torch_gpu
+class VersatileDiffusionMegaPipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_from_pretrained_save_pretrained(self):
+        pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt_image = load_image(
+            "https://raw.githubusercontent.com/SHI-Labs/Versatile-Diffusion/master/assets/benz.jpg"
+        )
+
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe.dual_guided(
+            prompt="first prompt",
+            image=prompt_image,
+            text_to_image_strength=0.75,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=2,
+            output_type="numpy",
+        ).images
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = VersatileDiffusionPipeline.from_pretrained(tmpdirname, torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = generator.manual_seed(0)
+        new_image = pipe.dual_guided(
+            prompt="first prompt",
+            image=prompt_image,
+            text_to_image_strength=0.75,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=2,
+            output_type="numpy",
+        ).images
+
+        assert np.abs(image - new_image).sum() < 1e-5, "Models don't have the same forward pass"
+
+    def test_inference_dual_guided_then_text_to_image(self):
+        pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "cyberpunk 2077"
+        init_image = load_image(
+            "https://raw.githubusercontent.com/SHI-Labs/Versatile-Diffusion/master/assets/benz.jpg"
+        )
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe.dual_guided(
+            prompt=prompt,
+            image=init_image,
+            text_to_image_strength=0.75,
+            generator=generator,
+            guidance_scale=7.5,
+            num_inference_steps=50,
+            output_type="numpy",
+        ).images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.014, 0.0112, 0.0136, 0.0145, 0.0107, 0.0113, 0.0272, 0.0215, 0.0216])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+        prompt = "A painting of a squirrel eating a burger "
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe.text_to_image(
+            prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy"
+        ).images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0408, 0.0181, 0.0, 0.0388, 0.0046, 0.0461, 0.0411, 0.0, 0.0222])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
+
+        pipe = VersatileDiffusionPipeline.from_pretrained("shi-labs/versatile-diffusion", torch_dtype=torch.float16)
+        image = pipe.image_variation(init_image, generator=generator, output_type="numpy").images[0]
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0657, 0.0529, 0.0455, 0.0802, 0.0570, 0.0179, 0.0267, 0.0483, 0.0769])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
--- a/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py
+++ b/tests/pipelines/versatile_diffusion/test_versatile_diffusion_text_to_image.py
+# coding=utf-8
+# Copyright 2022 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import gc
+import tempfile
+import unittest
+
+import numpy as np
+import torch
+
+from diffusers import VersatileDiffusionTextToImagePipeline
+from diffusers.utils.testing_utils import require_torch_gpu, slow, torch_device
+
+from ...test_pipelines_common import PipelineTesterMixin
+
+
+torch.backends.cuda.matmul.allow_tf32 = False  # disable TF32 matmuls; presumably for reproducible reference values
+
+
+class VersatileDiffusionTextToImagePipelineFastTests(PipelineTesterMixin, unittest.TestCase):
+    pass
+
+
+@slow
+@require_torch_gpu
+class VersatileDiffusionTextToImagePipelineIntegrationTests(unittest.TestCase):
+    def tearDown(self):
+        # clean up the VRAM after each test
+        super().tearDown()
+        gc.collect()
+        torch.cuda.empty_cache()
+
+    def test_remove_unused_weights_save_load(self):
+        pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion")
+        # remove text_unet
+        pipe.remove_unused_weights()
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger "
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe(
+            prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=2, output_type="numpy"
+        ).images
+
+        with tempfile.TemporaryDirectory() as tmpdirname:
+            pipe.save_pretrained(tmpdirname)
+            pipe = VersatileDiffusionTextToImagePipeline.from_pretrained(tmpdirname)
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        generator = generator.manual_seed(0)
+        new_image = pipe(
+            prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=2, output_type="numpy"
+        ).images
+
+        assert np.abs(image - new_image).sum() < 1e-5, "Models don't have the same forward pass"
+
+    def test_inference_text2img(self):
+        pipe = VersatileDiffusionTextToImagePipeline.from_pretrained("shi-labs/versatile-diffusion")
+        pipe.to(torch_device)
+        pipe.set_progress_bar_config(disable=None)
+
+        prompt = "A painting of a squirrel eating a burger "
+        generator = torch.Generator(device=torch_device).manual_seed(0)
+        image = pipe(
+            prompt=prompt, generator=generator, guidance_scale=7.5, num_inference_steps=50, output_type="numpy"
+        ).images
+
+        image_slice = image[0, 253:256, 253:256, -1]
+
+        assert image.shape == (1, 512, 512, 3)
+        expected_slice = np.array([0.0408, 0.0181, 0.0, 0.0388, 0.0046, 0.0461, 0.0411, 0.0, 0.0222])
+        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2