Unverified Commit 072e0089 authored by Patrick von Platen, committed by GitHub

[LCM] Make sure img2img works (#5632)

* [LCM] Clean up implementations

* Add all

* correct more

* correct more

* finish

* up
parent b91d5ddd
@@ -10,6 +10,8 @@ A demo for the [SimianLuo/LCM_Dreamshaper_v7](https://huggingface.co/SimianLuo/L
This pipeline was contributed by [luosiallen](https://luosiallen.github.io/) and [dg845](https://github.com/dg845).
## text-to-image
```python
import torch
from diffusers import DiffusionPipeline
@@ -27,6 +29,27 @@ num_inference_steps = 4
images = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0).images
```
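The hunk above elides the middle of the text-to-image example; for readability, the full snippet looks roughly like the following (a sketch assuming the pre-existing doc example; the prompt is illustrative, not taken from the diff):

```python
import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", torch_dtype=torch.float32)
# To save GPU memory, torch.float16 can be used, but it may compromise image quality.
pipe.to(torch_device="cuda", torch_dtype=torch.float32)

prompt = "Self-portrait oil painting, a beautiful cyborg with golden hair, 8k"  # illustrative prompt

# Can be set to 1~50 steps. LCM supports fast inference even with <= 4 steps. Recommended: 1~8 steps.
num_inference_steps = 4
images = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_scale=8.0).images
```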
## image-to-image
```python
import torch
from diffusers import AutoPipelineForImage2Image
import PIL.Image
pipe = AutoPipelineForImage2Image.from_pretrained("SimianLuo/LCM_Dreamshaper_v7", torch_dtype=torch.float32)
# To save GPU memory, torch.float16 can be used, but it may compromise image quality.
pipe.to(torch_device="cuda", torch_dtype=torch.float32)
prompt = "High altitude snowy mountains"
image = PIL.Image.open("./snowy_mountains.png")
# Can be set to 1~50 steps. LCM supports fast inference even with <= 4 steps. Recommended: 1~8 steps.
num_inference_steps = 4
images = pipe(prompt=prompt, image=image, num_inference_steps=num_inference_steps, guidance_scale=8.0).images
```
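Like diffusers' other img2img pipelines, the LCM img2img call is expected to accept a `strength` argument controlling how strongly the init image is noised before denoising. A hedged variant of the call above (the default and the value shown are assumptions for illustration):

```python
# Higher strength changes the image more, lower strength preserves more of the
# input (default assumed to be 0.8).
images = pipe(
    prompt=prompt,
    image=image,
    num_inference_steps=num_inference_steps,
    guidance_scale=8.0,
    strength=0.6,  # illustrative value
).images
```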
## LatentConsistencyModelPipeline
[[autodoc]] LatentConsistencyModelPipeline
@@ -39,6 +62,16 @@ images = pipe(prompt=prompt, num_inference_steps=num_inference_steps, guidance_s
- enable_vae_tiling
- disable_vae_tiling
[[autodoc]] LatentConsistencyModelImg2ImgPipeline
- all
- __call__
- enable_freeu
- disable_freeu
- enable_vae_slicing
- disable_vae_slicing
- enable_vae_tiling
- disable_vae_tiling
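As a hedged sketch of how the helpers listed above might be toggled on the img2img pipeline (the FreeU coefficients are illustrative, not tuned recommendations):

```python
import torch
from diffusers import LatentConsistencyModelImg2ImgPipeline

pipe = LatentConsistencyModelImg2ImgPipeline.from_pretrained(
    "SimianLuo/LCM_Dreamshaper_v7", torch_dtype=torch.float16
).to("cuda")

# Decode the VAE in slices/tiles to lower peak memory at some speed cost.
pipe.enable_vae_slicing()
pipe.enable_vae_tiling()

# FreeU re-weights UNet skip/backbone features; the coefficients are illustrative.
pipe.enable_freeu(s1=0.9, s2=0.2, b1=1.2, b2=1.4)

# ... run the pipeline, then undo the toggles if desired.
pipe.disable_freeu()
pipe.disable_vae_tiling()
pipe.disable_vae_slicing()
```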
## StableDiffusionPipelineOutput
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
@@ -230,6 +230,7 @@ else:
"KandinskyV22Pipeline",
"KandinskyV22PriorEmb2EmbPipeline",
"KandinskyV22PriorPipeline",
"LatentConsistencyModelImg2ImgPipeline",
"LatentConsistencyModelPipeline", "LatentConsistencyModelPipeline",
"LDMTextToImagePipeline", "LDMTextToImagePipeline",
"MusicLDMPipeline", "MusicLDMPipeline",
...@@ -573,6 +574,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: ...@@ -573,6 +574,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
KandinskyV22Pipeline, KandinskyV22Pipeline,
KandinskyV22PriorEmb2EmbPipeline, KandinskyV22PriorEmb2EmbPipeline,
KandinskyV22PriorPipeline, KandinskyV22PriorPipeline,
LatentConsistencyModelImg2ImgPipeline,
LatentConsistencyModelPipeline,
LDMTextToImagePipeline,
MusicLDMPipeline,
@@ -110,7 +110,10 @@ else:
"KandinskyV22PriorEmb2EmbPipeline",
"KandinskyV22PriorPipeline",
]
_import_structure["latent_consistency_models"] = [
"LatentConsistencyModelImg2ImgPipeline",
"LatentConsistencyModelPipeline",
]
_import_structure["latent_diffusion"].extend(["LDMTextToImagePipeline"]) _import_structure["latent_diffusion"].extend(["LDMTextToImagePipeline"])
_import_structure["musicldm"] = ["MusicLDMPipeline"] _import_structure["musicldm"] = ["MusicLDMPipeline"]
_import_structure["paint_by_example"] = ["PaintByExamplePipeline"] _import_structure["paint_by_example"] = ["PaintByExamplePipeline"]
...@@ -334,7 +337,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT: ...@@ -334,7 +337,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
KandinskyV22PriorEmb2EmbPipeline, KandinskyV22PriorEmb2EmbPipeline,
KandinskyV22PriorPipeline, KandinskyV22PriorPipeline,
) )
from .latent_consistency_models import LatentConsistencyModelPipeline from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
from .latent_diffusion import LDMTextToImagePipeline from .latent_diffusion import LDMTextToImagePipeline
from .musicldm import MusicLDMPipeline from .musicldm import MusicLDMPipeline
from .paint_by_example import PaintByExamplePipeline from .paint_by_example import PaintByExamplePipeline
@@ -42,6 +42,7 @@ from .kandinsky2_2 import (
KandinskyV22InpaintPipeline,
KandinskyV22Pipeline,
)
from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
from .stable_diffusion import (
StableDiffusionImg2ImgPipeline,
StableDiffusionInpaintPipeline,
@@ -65,6 +66,7 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
("stable-diffusion-controlnet", StableDiffusionControlNetPipeline),
("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetPipeline),
("wuerstchen", WuerstchenCombinedPipeline),
("lcm", LatentConsistencyModelPipeline),
]
)
@@ -77,6 +79,7 @@ AUTO_IMAGE2IMAGE_PIPELINES_MAPPING = OrderedDict(
("kandinsky22", KandinskyV22Img2ImgCombinedPipeline),
("stable-diffusion-controlnet", StableDiffusionControlNetImg2ImgPipeline),
("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetImg2ImgPipeline),
("lcm", LatentConsistencyModelImg2ImgPipeline),
]
)
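With the two new `("lcm", ...)` entries, the auto classes should resolve an LCM checkpoint to the matching pipeline for each task. A quick hedged sanity check (the expected class names are in the comment, not captured output):

```python
from diffusers import AutoPipelineForImage2Image, AutoPipelineForText2Image

t2i = AutoPipelineForText2Image.from_pretrained("SimianLuo/LCM_Dreamshaper_v7")
i2i = AutoPipelineForImage2Image.from_pretrained("SimianLuo/LCM_Dreamshaper_v7")

# Expected: LatentConsistencyModelPipeline LatentConsistencyModelImg2ImgPipeline
print(type(t2i).__name__, type(i2i).__name__)
```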
@@ -5,11 +5,15 @@ from ...utils import (
)
_import_structure = {
"pipeline_latent_consistency_img2img": ["LatentConsistencyModelImg2ImgPipeline"],
"pipeline_latent_consistency_text2img": ["LatentConsistencyModelPipeline"],
}
if TYPE_CHECKING:
from .pipeline_latent_consistency_img2img import LatentConsistencyModelImg2ImgPipeline
from .pipeline_latent_consistency_text2img import LatentConsistencyModelPipeline
else:
import sys
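The `else:` branch is truncated by the hunk; for context, diffusers' usual lazy-import tail for such `__init__.py` files looks roughly like this (a sketch of the standard pattern, not the exact lines of this file):

```python
else:
    import sys

    # Replace the module with a lazy proxy so the pipeline classes declared in
    # _import_structure are only imported when they are first accessed.
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )
```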
@@ -324,6 +324,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
num_inference_steps: int,
device: Union[str, torch.device] = None,
original_inference_steps: Optional[int] = None,
strength: float = 1.0,
):
"""
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
@@ -349,7 +350,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
self.num_inference_steps = num_inference_steps
original_steps = (
original_inference_steps if original_inference_steps is not None else self.config.original_inference_steps
)
if original_steps > self.config.num_train_timesteps:
@@ -370,7 +371,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
# Currently, only linear spacing is supported.
c = self.config.num_train_timesteps // original_steps
# LCM Training Steps Schedule
lcm_origin_timesteps = np.asarray(list(range(1, int(original_steps * strength) + 1))) * c - 1
skipping_step = len(lcm_origin_timesteps) // num_inference_steps
# LCM Inference Steps Schedule
timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]
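To see what the new `strength` argument does to the schedule, here is a small standalone sketch of the math from the hunk above (config defaults assumed: 1000 training timesteps, 50 original inference steps):

```python
import numpy as np

num_train_timesteps = 1000  # scheduler config default (assumed)
original_steps = 50         # original_inference_steps default (assumed)
num_inference_steps = 4

for strength in (1.0, 0.5):
    c = num_train_timesteps // original_steps
    # Only the first `original_steps * strength` origin timesteps are kept, so a
    # lower strength starts denoising from a less noisy point in the schedule.
    lcm_origin_timesteps = np.asarray(list(range(1, int(original_steps * strength) + 1))) * c - 1
    skipping_step = len(lcm_origin_timesteps) // num_inference_steps
    timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]
    print(strength, timesteps)  # e.g. 1.0 -> [999 759 519 279], 0.5 -> [499 379 259 139]
```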
@@ -497,6 +497,21 @@ class KandinskyV22PriorPipeline(metaclass=DummyObject):
requires_backends(cls, ["torch", "transformers"])
class LatentConsistencyModelImg2ImgPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class LatentConsistencyModelPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
import gc
import random
import unittest
import numpy as np
import torch
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
from diffusers import (
AutoencoderKL,
LatentConsistencyModelImg2ImgPipeline,
LCMScheduler,
UNet2DConditionModel,
)
from diffusers.utils.testing_utils import (
enable_full_determinism,
floats_tensor,
load_image,
require_torch_gpu,
slow,
torch_device,
)
from ..pipeline_params import (
IMAGE_TO_IMAGE_IMAGE_PARAMS,
TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
)
from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin
enable_full_determinism()
class LatentConsistencyModelImg2ImgPipelineFastTests(
PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
):
pipeline_class = LatentConsistencyModelImg2ImgPipeline
params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "negative_prompt", "negative_prompt_embeds"}
required_optional_params = PipelineTesterMixin.required_optional_params - {"latents", "negative_prompt"}
batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
def get_dummy_components(self):
torch.manual_seed(0)
unet = UNet2DConditionModel(
block_out_channels=(4, 8),
layers_per_block=1,
sample_size=32,
in_channels=4,
out_channels=4,
down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
cross_attention_dim=32,
norm_num_groups=2,
time_cond_proj_dim=32,
)
scheduler = LCMScheduler(
beta_start=0.00085,
beta_end=0.012,
beta_schedule="scaled_linear",
clip_sample=False,
set_alpha_to_one=False,
)
torch.manual_seed(0)
vae = AutoencoderKL(
block_out_channels=[4, 8],
in_channels=3,
out_channels=3,
down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
latent_channels=4,
norm_num_groups=2,
)
torch.manual_seed(0)
text_encoder_config = CLIPTextConfig(
bos_token_id=0,
eos_token_id=2,
hidden_size=32,
intermediate_size=64,
layer_norm_eps=1e-05,
num_attention_heads=8,
num_hidden_layers=3,
pad_token_id=1,
vocab_size=1000,
)
text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
components = {
"unet": unet,
"scheduler": scheduler,
"vae": vae,
"text_encoder": text_encoder,
"tokenizer": tokenizer,
"safety_checker": None,
"feature_extractor": None,
"requires_safety_checker": False,
}
return components
def get_dummy_inputs(self, device, seed=0):
image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
image = image / 2 + 0.5
if str(device).startswith("mps"):
generator = torch.manual_seed(seed)
else:
generator = torch.Generator(device=device).manual_seed(seed)
inputs = {
"prompt": "A painting of a squirrel eating a burger",
"image": image,
"generator": generator,
"num_inference_steps": 2,
"guidance_scale": 6.0,
"output_type": "np",
}
return inputs
def test_lcm_onestep(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
inputs["num_inference_steps"] = 1
output = pipe(**inputs)
image = output.images
assert image.shape == (1, 32, 32, 3)
image_slice = image[0, -3:, -3:, -1]
expected_slice = np.array([0.5865, 0.2854, 0.2828, 0.7473, 0.6006, 0.4580, 0.4397, 0.6415, 0.6069])
assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
def test_lcm_multistep(self):
device = "cpu" # ensure determinism for the device-dependent torch.Generator
components = self.get_dummy_components()
pipe = self.pipeline_class(**components)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_dummy_inputs(device)
output = pipe(**inputs)
image = output.images
assert image.shape == (1, 32, 32, 3)
image_slice = image[0, -3:, -3:, -1]
expected_slice = np.array([0.4903, 0.3304, 0.3503, 0.5241, 0.5153, 0.4585, 0.3222, 0.4764, 0.4891])
assert np.abs(image_slice.flatten() - expected_slice).max() < 2e-2
def test_inference_batch_single_identical(self):
super().test_inference_batch_single_identical(expected_max_diff=5e-4)
@slow
@require_torch_gpu
class LatentConsistencyModelImg2ImgPipelineSlowTests(unittest.TestCase):
def setUp(self):
gc.collect()
torch.cuda.empty_cache()
def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
generator = torch.Generator(device=generator_device).manual_seed(seed)
latents = np.random.RandomState(seed).standard_normal((1, 4, 64, 64))
latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
init_image = load_image(
"https://huggingface.co/datasets/diffusers/test-arrays/resolve/main"
"/stable_diffusion_img2img/sketch-mountains-input.png"
)
init_image = init_image.resize((512, 512))
inputs = {
"prompt": "a photograph of an astronaut riding a horse",
"latents": latents,
"generator": generator,
"num_inference_steps": 3,
"guidance_scale": 7.5,
"output_type": "np",
"image": init_image,
}
return inputs
def test_lcm_onestep(self):
pipe = LatentConsistencyModelImg2ImgPipeline.from_pretrained(
"SimianLuo/LCM_Dreamshaper_v7", safety_checker=None
)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
inputs["num_inference_steps"] = 1
image = pipe(**inputs).images
assert image.shape == (1, 512, 512, 3)
image_slice = image[0, -3:, -3:, -1].flatten()
expected_slice = np.array([0.1025, 0.0911, 0.0984, 0.0981, 0.0901, 0.0918, 0.1055, 0.0940, 0.0730])
assert np.abs(image_slice - expected_slice).max() < 1e-3
def test_lcm_multistep(self):
pipe = LatentConsistencyModelImg2ImgPipeline.from_pretrained(
"SimianLuo/LCM_Dreamshaper_v7", safety_checker=None
)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to(torch_device)
pipe.set_progress_bar_config(disable=None)
inputs = self.get_inputs(torch_device)
image = pipe(**inputs).images
assert image.shape == (1, 512, 512, 3)
image_slice = image[0, -3:, -3:, -1].flatten()
expected_slice = np.array([0.01855, 0.01855, 0.01489, 0.01392, 0.01782, 0.01465, 0.01831, 0.02539, 0.0])
assert np.abs(image_slice - expected_slice).max() < 1e-3