Unverified Commit b134f6a8 authored by takuoko, committed by GitHub

[Community] ControlNet Reference (#3508)



add controlnet reference and bugfix
Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
parent edc65051
@@ -1324,7 +1324,7 @@ image.save('tensorrt_img2img_new_zealand_hills.png')

### Stable Diffusion Reference

This pipeline uses the Reference Control. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236) and the [sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280).

```py

@@ -1365,6 +1365,54 @@ Output Image of `reference_attn=True` and `reference_adain=True`

![output_image](https://github.com/huggingface/diffusers/assets/24734142/3c5255d6-867d-4d35-b202-8dfd30cc6827)
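(The README's own usage example for this pipeline is elided by the diff view; a minimal sketch of the documented call, assuming the community file `stable_diffusion_reference.py` edited below is importable from your working directory, might look like this:)

```py
import torch
from diffusers import UniPCMultistepScheduler
from diffusers.utils import load_image
# assumption: examples/community/stable_diffusion_reference.py is on your path
from stable_diffusion_reference import StableDiffusionReferencePipeline

input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")

pipe = StableDiffusionReferencePipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    safety_checker=None,
    torch_dtype=torch.float16,
).to("cuda:0")
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

# reference image guides both attention (reference_attn) and AdaIN (reference_adain)
result_img = pipe(
    ref_image=input_image,
    prompt="1girl",
    num_inference_steps=20,
    reference_attn=True,
    reference_adain=True,
).images[0]
```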
### Stable Diffusion ControlNet Reference

This pipeline uses the Reference Control with ControlNet. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236) and the [sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280).
```py
import cv2
import torch
import numpy as np
from PIL import Image
from diffusers import ControlNetModel, UniPCMultistepScheduler
from diffusers.utils import load_image
# StableDiffusionControlNetReferencePipeline is the community pipeline added in this PR;
# this import assumes examples/community/stable_diffusion_controlnet_reference.py is on your path.
from stable_diffusion_controlnet_reference import StableDiffusionControlNetReferencePipeline

input_image = load_image("https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png")

# get canny image
image = cv2.Canny(np.array(input_image), 100, 200)
image = image[:, :, None]
image = np.concatenate([image, image, image], axis=2)
canny_image = Image.fromarray(image)

controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
pipe = StableDiffusionControlNetReferencePipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
    controlnet=controlnet,
    safety_checker=None,
    torch_dtype=torch.float16,
).to("cuda:0")

pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)

result_img = pipe(
    ref_image=input_image,
    prompt="1girl",
    image=canny_image,
    num_inference_steps=20,
    reference_attn=True,
    reference_adain=True,
).images[0]
```
Reference Image

![reference_image](https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png)

Output Image

![output_image](https://github.com/huggingface/diffusers/assets/24734142/7b9a5830-f173-4b92-b0cf-73d0e9c01d60)
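The two reference modes can be toggled independently through the `reference_attn` and `reference_adain` call arguments shown above. For example, a minimal variation of the same call (reusing `pipe`, `input_image`, and `canny_image` from the snippet above) that applies attention-only reference:

```py
# reference-only attention, without AdaIN feature injection
result_attn_only = pipe(
    ref_image=input_image,
    prompt="1girl",
    image=canny_image,
    num_inference_steps=20,
    reference_attn=True,
    reference_adain=False,
).images[0]
```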
### Stable Diffusion on IPEX

This diffusion pipeline aims to accelerate the inference of Stable Diffusion on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch).

@@ -1462,4 +1510,3 @@ latency = elapsed_time(pipe4)
print("Latency of StableDiffusionPipeline--fp32",latency)
```
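The `elapsed_time` benchmarking helper used above is defined earlier in the README and elided by the diff view; a hypothetical reconstruction of such a helper (its exact signature and defaults are assumptions) is:

```py
import time

def elapsed_time(pipeline, prompt="a photo", nb_pass=4, num_inference_steps=20):
    # hypothetical sketch of the elided helper: average seconds per pipeline call
    pipeline(prompt, num_inference_steps=num_inference_steps)  # warmup
    start = time.time()
    for _ in range(nb_pass):
        pipeline(prompt, num_inference_steps=num_inference_steps)
    return (time.time() - start) / nb_pass
```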
This diff is collapsed. (The new ControlNet Reference community pipeline file is not shown.)
# Inspired by: https://github.com/Mikubill/sd-webui-controlnet/discussions/1236 and https://github.com/Mikubill/sd-webui-controlnet/discussions/1280
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

import numpy as np
import PIL.Image
@@ -162,7 +162,7 @@ class StableDiffusionReferencePipeline(StableDiffusionPipeline):
    def __call__(
        self,
        prompt: Union[str, List[str]] = None,
        ref_image: Union[torch.FloatTensor, PIL.Image.Image] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        num_inference_steps: int = 50,
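(After this signature change, `ref_image` is annotated as a single tensor or PIL image rather than lists of either. A sketch of the two accepted forms, reusing the README example's names; the tensor shape follows the usual diffusers NCHW convention and is an assumption:)

```py
import torch

# a single PIL image, as in the README example above ...
result = pipe(ref_image=input_image, prompt="1girl", num_inference_steps=20).images[0]

# ... or a pre-processed float tensor in NCHW layout (assumed convention)
ref_tensor = torch.randn(1, 3, 512, 512, dtype=torch.float16, device="cuda")
result = pipe(ref_image=ref_tensor, prompt="1girl", num_inference_steps=20).images[0]
```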
@@ -356,12 +356,13 @@ class StableDiffusionReferencePipeline(StableDiffusionPipeline):
        def hacked_basic_transformer_inner_forward(
            self,
            hidden_states: torch.FloatTensor,
            attention_mask: Optional[torch.FloatTensor] = None,
            encoder_hidden_states: Optional[torch.FloatTensor] = None,
            encoder_attention_mask: Optional[torch.FloatTensor] = None,
            timestep: Optional[torch.LongTensor] = None,
            cross_attention_kwargs: Dict[str, Any] = None,
            class_labels: Optional[torch.LongTensor] = None,
        ):
            if self.use_ada_layer_norm:
                norm_hidden_states = self.norm1(hidden_states, timestep)
@@ -427,7 +428,7 @@ class StableDiffusionReferencePipeline(StableDiffusionPipeline):
                attn_output = self.attn2(
                    norm_hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    attention_mask=encoder_attention_mask,
                    **cross_attention_kwargs,
                )
                hidden_states = attn_output + hidden_states
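(This hunk is the bugfix from the commit message: `attn2` performs cross-attention over `encoder_hidden_states`, so it must receive `encoder_attention_mask` rather than the self-attention `attention_mask`. A toy sketch of why the two masks are not interchangeable; the shapes are illustrative assumptions, not values from the pipeline:)

```py
import torch

# self-attention masks cover the image tokens; cross-attention masks must
# cover the text tokens (encoder_hidden_states), whose length differs
batch, img_len, txt_len, dim = 1, 64, 77, 32
hidden_states = torch.randn(batch, img_len, dim)          # queries: image tokens
encoder_hidden_states = torch.randn(batch, txt_len, dim)  # keys/values: text tokens

attention_mask = torch.ones(batch, img_len, img_len)          # shape for self-attention
encoder_attention_mask = torch.ones(batch, img_len, txt_len)  # shape for cross-attention

# In cross-attention the key/value length is txt_len, so only
# encoder_attention_mask has a compatible shape; passing attention_mask
# there (the pre-fix behavior) would be a shape mismatch.
print(attention_mask.shape, encoder_attention_mask.shape)
```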
@@ -473,11 +474,12 @@ class StableDiffusionReferencePipeline(StableDiffusionPipeline):
        def hack_CrossAttnDownBlock2D_forward(
            self,
            hidden_states: torch.FloatTensor,
            temb: Optional[torch.FloatTensor] = None,
            encoder_hidden_states: Optional[torch.FloatTensor] = None,
            attention_mask: Optional[torch.FloatTensor] = None,
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            encoder_attention_mask: Optional[torch.FloatTensor] = None,
        ):
            eps = 1e-6
@@ -490,6 +492,8 @@ class StableDiffusionReferencePipeline(StableDiffusionPipeline):
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    cross_attention_kwargs=cross_attention_kwargs,
                    attention_mask=attention_mask,
                    encoder_attention_mask=encoder_attention_mask,
                    return_dict=False,
                )[0]
                if MODE == "write":
@@ -566,13 +570,14 @@ class StableDiffusionReferencePipeline(StableDiffusionPipeline):
        def hacked_CrossAttnUpBlock2D_forward(
            self,
            hidden_states: torch.FloatTensor,
            res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
            temb: Optional[torch.FloatTensor] = None,
            encoder_hidden_states: Optional[torch.FloatTensor] = None,
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            upsample_size: Optional[int] = None,
            attention_mask: Optional[torch.FloatTensor] = None,
            encoder_attention_mask: Optional[torch.FloatTensor] = None,
        ):
            eps = 1e-6
            # TODO(Patrick, William) - attention mask is not used
@@ -586,6 +591,8 @@ class StableDiffusionReferencePipeline(StableDiffusionPipeline):
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    cross_attention_kwargs=cross_attention_kwargs,
                    attention_mask=attention_mask,
                    encoder_attention_mask=encoder_attention_mask,
                    return_dict=False,
                )[0]