Unverified commit 072e0089, authored by Patrick von Platen and committed by GitHub

[LCM] Make sure img2img works (#5632)

* [LCM] Clean up implementations

* Add all

* correct more

* correct more

* finish

* up
parent b91d5ddd
@@ -10,6 +10,8 @@ A demo for the [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/LCM_Dreamshaper_v7)
This pipeline was contributed by [luosiallen](https://luosiallen.github.io/) and [dg845](https://github.com/dg845).
## text-to-image
```python
import torch
from diffusers import DiffusionPipeline
@@ -27,6 +29,27 @@ num_inference_steps = 4
images = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0).images
```
## image-to-image
```python
import torch
from diffusers import AutoPipelineForImage2Image
import PIL.Image
pipe = AutoPipelineForImage2Image.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", torch_dtype=torch.float32)
# To save GPU memory, torch.float16 can be used, but it may compromise image quality.
pipe.to(torch_device="cuda", torch_dtype=torch.float32)
prompt = "High altitude snowy mountains"
image = PIL.Image.open("./snowy_mountains.png")
# Can be set to 1~50 steps. LCM supports fast inference even with <= 4 steps. Recommended: 1~8 steps.
num_inference_steps = 4
images = pipe(prompt=prompt, image=image, num_inference_steps=num_inference_steps, guidance_scale=8.0).images
```
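The img2img call also takes a `strength` argument, which this PR threads through to `LCMScheduler.set_timesteps`; lower values keep more of the input image. A minimal sketch reusing the variables above (the 0.5 is illustrative, not a recommended default):
```python
# Lower strength keeps more of the input image; higher strength lets the
# model deviate further from it. 0.5 here is illustrative only.
images = pipe(
    prompt=prompt,
    image=image,
    num_inference_steps=num_inference_steps,
    guidance_scale=8.0,
    strength=0.5,
).images
```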
## LatentConsistencyModelPipeline
[[autodoc]] LatentConsistencyModelPipeline
@@ -39,6 +62,16 @@ images = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0).images
- enable_vae_tiling
- disable_vae_tiling
## LatentConsistencyModelImg2ImgPipeline
[[autodoc]] LatentConsistencyModelImg2ImgPipeline
- all
- __call__
- enable_freeu
- disable_freeu
- enable_vae_slicing
- disable_vae_slicing
- enable_vae_tiling
- disable_vae_tiling
## StableDiffusionPipelineOutput
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
@@ -230,6 +230,7 @@ else:
"KandinskyV22Pipeline",
"KandinskyV22PriorEmb2EmbPipeline",
"KandinskyV22PriorPipeline",
"LatentConsistencyModelImg2ImgPipeline",
"LatentConsistencyModelPipeline",
"LDMTextToImagePipeline",
"MusicLDMPipeline",
@@ -573,6 +574,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
KandinskyV22Pipeline,
KandinskyV22PriorEmb2EmbPipeline,
KandinskyV22PriorPipeline,
LatentConsistencyModelImg2ImgPipeline,
LatentConsistencyModelPipeline,
LDMTextToImagePipeline,
MusicLDMPipeline,
......
@@ -110,7 +110,10 @@ else:
"KandinskyV22PriorEmb2EmbPipeline",
"KandinskyV22PriorPipeline",
]
_import_structure["latent_consistency_models"] = ["LatentConsistencyModelPipeline"]
_import_structure["latent_consistency_models"] = [
"LatentConsistencyModelImg2ImgPipeline",
"LatentConsistencyModelPipeline",
]
_import_structure["latent_diffusion"].extend(["LDMTextToImagePipeline"])
_import_structure["musicldm"] = ["MusicLDMPipeline"]
_import_structure["paint_by_example"] = ["PaintByExamplePipeline"]
@@ -334,7 +337,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
KandinskyV22PriorEmb2EmbPipeline,
KandinskyV22PriorPipeline,
)
from .latent_consistency_models import LatentConsistencyModelPipeline
from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
from .latent_diffusion import LDMTextToImagePipeline
from .musicldm import MusicLDMPipeline
from .paint_by_example import PaintByExamplePipeline
......
@@ -42,6 +42,7 @@ from .kandinsky2_2 import (
KandinskyV22InpaintPipeline,
KandinskyV22Pipeline,
)
from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
from .stable_diffusion import (
StableDiffusionImg2ImgPipeline,
StableDiffusionInpaintPipeline,
@@ -65,6 +66,7 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
("stable-diffusion-controlnet", StableDiffusionControlNetPipeline),
("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetPipeline),
("wuerstchen", WuerstchenCombinedPipeline),
("lcm", LatentConsistencyModelPipeline),
]
)
@@ -77,6 +79,7 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
("kandinsky22", KandinskyV22Img2ImgCombinedPipeline),
("stable-diffusion-controlnet", StableDiffusionControlNetImg2ImgPipeline),
("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetImg2ImgPipeline),
("lcm", LatentConsistencyModelImg2ImgPipeline),
]
)
......
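With the `"lcm"` entries registered in these mappings, `AutoPipelineForImage2Image` can route LCM checkpoints to the new img2img pipeline. A minimal sketch of the expected behavior, using the checkpoint from the docs above:
```python
import torch
from diffusers import AutoPipelineForImage2Image

# The "lcm" mapping added above routes LCM checkpoints to
# LatentConsistencyModelImg2ImgPipeline.
pipe = AutoPipelineForImage2Image.from_pretrained(
    "SimianLuo/LCM_Dreamshaper_v7", torch_dtype=torch.float32
)
print(type(pipe).__name__)  # expected: LatentConsistencyModelImg2ImgPipeline
```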
@@ -5,11 +5,15 @@ from ...utils import (
)
_import_structure = {"pipeline_latent_consistency_models": ["LatentConsistencyModelPipeline"]}
_import_structure = {
"pipeline_latent_consistency_img2img": ["LatentConsistencyModelImg2ImgPipeline"],
"pipeline_latent_consistency_text2img": ["LatentConsistencyModelPipeline"],
}
if TYPE_CHECKING:
from .pipeline_latent_consistency_models import LatentConsistencyModelPipeline
from .pipeline_latent_consistency_img2img import LatentConsistencyModelImg2ImgPipeline
from .pipeline_latent_consistency_text2img import LatentConsistencyModelPipeline
else:
import sys
......
@@ -324,6 +324,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
num_inference_steps: int,
device: Union[str, torch.device] = None,
original_inference_steps: Optional[int] = None,
strength: float = 1.0,
):
"""
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
@@ -349,7 +350,7 @@
self.num_inference_steps = num_inference_steps
original_steps = (
original_inference_steps if original_inference_steps is not None else self.original_inference_steps
original_inference_steps if original_inference_steps is not None else self.config.original_inference_steps
)
if original_steps > self.config.num_train_timesteps:
@@ -370,7 +371,7 @@
# Currently, only linear spacing is supported.
c = self.config.num_train_timesteps // original_steps
# LCM Training Steps Schedule
lcm_origin_timesteps = np.asarray(list(range(1, original_steps + 1))) * c - 1
lcm_origin_timesteps = np.asarray(list(range(1, int(original_steps * strength) + 1))) * c - 1
skipping_step = len(lcm_origin_timesteps) // num_inference_steps
# LCM Inference Steps Schedule
timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]
......
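The new `strength` argument truncates the LCM origin-timestep schedule before it is subsampled, so denoising starts no higher than roughly `strength * num_train_timesteps`, matching the amount of noise img2img adds to the input image. A worked sketch of the arithmetic above, with illustrative values:
```python
import numpy as np

# Illustrative values, not defaults: 1000 training timesteps, 50 original
# LCM steps, strength 0.5, 4 inference steps.
num_train_timesteps, original_steps = 1000, 50
strength, num_inference_steps = 0.5, 4

c = num_train_timesteps // original_steps  # 20
# strength truncates the origin schedule: only timesteps 19, 39, ..., 499
# survive, so denoising starts at t=499 rather than t=999.
lcm_origin_timesteps = np.asarray(list(range(1, int(original_steps * strength) + 1))) * c - 1
skipping_step = len(lcm_origin_timesteps) // num_inference_steps  # 25 // 4 == 6
timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]
print(timesteps)  # [499 379 259 139]
```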
@@ -497,6 +497,21 @@
requires_backends(cls, ["torch", "transformers"])
class LatentConsistencyModelImg2ImgPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class LatentConsistencyModelPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
......
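These dummy classes let `import diffusers` succeed even when torch or transformers is missing, deferring the failure to first use. A minimal, self-contained sketch of the pattern; this stand-in `requires_backends` is not the diffusers implementation:
```python
# Minimal sketch of the dummy-object pattern (not the actual diffusers
# implementation; the real requires_backends lives in diffusers.utils).
def requires_backends(obj, backends):
    name = obj.__name__ if hasattr(obj, "__name__") else type(obj).__name__
    raise ImportError(f"{name} requires the following backends: {', '.join(backends)}")


class DummyObject(type):
    # Any attribute access on the placeholder class fails loudly, so
    # e.g. Pipeline.from_pretrained raises instead of half-working.
    def __getattr__(cls, name):
        requires_backends(cls, cls._backends)


class LatentConsistencyModelImg2ImgPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, self._backends)


# Importing the module succeeds; only using the class raises.
try:
    LatentConsistencyModelImg2ImgPipeline()
except ImportError as err:
    print(err)
```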
import gc
import random
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
AutoencoderKL,
LatentConsistencyModelImg2ImgPipeline,
LCMScheduler,
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
enable_full_determinism,
floats_tensor,
load_image,
require_torch_gpu,
slow,
torch_device,
)
from ..pipeline_params import (
IMAGE_TO_IMAGE_IMAGE_PARAMS,
TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
)
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
enable_full_determinism()
class LatentConsistencyModelImg2ImgPipelineFastTests(
PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
):
pipeline_class = LatentConsistencyModelImg2ImgPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "negative_prompt", "negative_prompt_embeds"}
required_optional_params = PipelineTesterMixin.required_optional_params - {"latents", "negative_prompt"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(4, 8),
layers_per_block=1,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
norm_num_groups=2,
time_cond_proj_dim=32,
)
scheduler = LCMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[4, 8],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
norm_num_groups=2,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=64,
layer_norm_eps=1e-05,
num_attention_heads=8,
num_hidden_layers=3,
pad_token_id=1,
vocab_size=1000,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"requires_safety_checker": False,
}
return components
def get_dummy_inputs(self, device, seed=0):
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
image = image / 2 + 0.5
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "A painting of a squirrel eating a burger",
"image": image,
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"output_type": "np",
}
return inputs
def test_lcm_onestep(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
inputs["num_inference_steps"] = 1
output = pipe(**inputs)
image = output.images
assert image.shape == (1, 32, 32, 3)
image_slice = image[0, -3:, -3:, -1]
expected_slice = np.array([0.5865, 0.2854, 0.2828, 0.7473, 0.6006, 0.4580, 0.4397, 0.6415, 0.6069])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
def test_lcm_multistep(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
output = pipe(**inputs)
image = output.images
assert image.shape == (1, 32, 32, 3)
image_slice = image[0, -3:, -3:, -1]
expected_slice = np.array([0.4903, 0.3304, 0.3503, 0.5241, 0.5153, 0.4585, 0.3222, 0.4764, 0.4891])
assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=5e-4)
@slow
@require_torch_gpu
class LatentConsistencyModelImg2ImgPipelineSlowTests(unittest.TestCase):
def setUp(self):
gc.collect()
torch.cuda.empty_cache()
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
init_image = load_image(
"https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
"/stable_diffusion_img2img/sketch-mountains-input.png"
)
init_image = init_image.resize((512, 512))
inputs = {
"prompt": "a photograph of an astronaut riding a horse",
"latents": latents,
"generator": generator,
"num_inference_steps": 3,
"guidance_scale": 7.5,
"output_type": "np",
"image": init_image,
}
return inputs
def test_lcm_onestep(self):
pipe = LatentConsistencyModelImg2ImgPipeline.from_pretrained(
"SimianLuo/LCM_Dreamshaper_v7", safety_checker=None
)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
inputs["num_inference_steps"] = 1
image = pipe(**inputs).images
assert image.shape == (1, 512, 512, 3)
image_slice = image[0, -3:, -3:, -1].flatten()
expected_slice = np.array([0.1025, 0.0911, 0.0984, 0.0981, 0.0901, 0.0918, 0.1055, 0.0940, 0.0730])
assert np.abs(image_slice - expected_slice).max() < 1e-3
def test_lcm_multistep(self):
pipe = LatentConsistencyModelImg2ImgPipeline.from_pretrained(
"SimianLuo/LCM_Dreamshaper_v7", safety_checker=None
)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
image = pipe(**inputs).images
assert image.shape == (1, 512, 512, 3)
image_slice = image[0, -3:, -3:, -1].flatten()
expected_slice = np.array([0.01855, 0.01855, 0.01489, 0.01392, 0.01782, 0.01465, 0.01831, 0.02539, 0.0])
assert np.abs(image_slice - expected_slice).max() < 1e-3