Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
renzhc
diffusers_dcu
Commits
b288684d
Unverified
Commit
b288684d
authored
Jul 24, 2023
by
Patrick von Platen
Committed by
GitHub
Jul 24, 2023
Browse files
[SDXL] Fix sd xl encode prompt (#4237)
* [SDXL] Fix sd xl encode prompt * add tests
parent
06eda5b2
Changes
6
Show whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
53 additions
and
16 deletions
+53
-16
src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
...ffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
+2
-2
src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
...lines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
+2
-2
src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
...able_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
+2
-2
src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
...able_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
+2
-2
tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
...s/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
+22
-4
tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
...s/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
+23
-4
No files found.
src/diffusers/pipelines/controlnet/pipeline_controlnet_sd_xl.py
View file @
b288684d
...
@@ -360,7 +360,7 @@ class StableDiffusionXLControlNetPipeline(DiffusionPipeline, TextualInversionLoa
...
@@ -360,7 +360,7 @@ class StableDiffusionXLControlNetPipeline(DiffusionPipeline, TextualInversionLoa
negative_prompt_embeds
=
torch
.
concat
(
negative_prompt_embeds_list
,
dim
=-
1
)
negative_prompt_embeds
=
torch
.
concat
(
negative_prompt_embeds_list
,
dim
=-
1
)
prompt_embeds
=
prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
.
dtype
,
device
=
device
)
prompt_embeds
=
prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
_2
.
dtype
,
device
=
device
)
bs_embed
,
seq_len
,
_
=
prompt_embeds
.
shape
bs_embed
,
seq_len
,
_
=
prompt_embeds
.
shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds
=
prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
prompt_embeds
=
prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
...
@@ -369,7 +369,7 @@ class StableDiffusionXLControlNetPipeline(DiffusionPipeline, TextualInversionLoa
...
@@ -369,7 +369,7 @@ class StableDiffusionXLControlNetPipeline(DiffusionPipeline, TextualInversionLoa
if
do_classifier_free_guidance
:
if
do_classifier_free_guidance
:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len
=
negative_prompt_embeds
.
shape
[
1
]
seq_len
=
negative_prompt_embeds
.
shape
[
1
]
negative_prompt_embeds
=
negative_prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
.
dtype
,
device
=
device
)
negative_prompt_embeds
=
negative_prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
_2
.
dtype
,
device
=
device
)
negative_prompt_embeds
=
negative_prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
negative_prompt_embeds
=
negative_prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
negative_prompt_embeds
=
negative_prompt_embeds
.
view
(
batch_size
*
num_images_per_prompt
,
seq_len
,
-
1
)
negative_prompt_embeds
=
negative_prompt_embeds
.
view
(
batch_size
*
num_images_per_prompt
,
seq_len
,
-
1
)
...
...
src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py
View file @
b288684d
...
@@ -375,7 +375,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoad
...
@@ -375,7 +375,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoad
negative_prompt_embeds
=
torch
.
concat
(
negative_prompt_embeds_list
,
dim
=-
1
)
negative_prompt_embeds
=
torch
.
concat
(
negative_prompt_embeds_list
,
dim
=-
1
)
prompt_embeds
=
prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
.
dtype
,
device
=
device
)
prompt_embeds
=
prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
_2
.
dtype
,
device
=
device
)
bs_embed
,
seq_len
,
_
=
prompt_embeds
.
shape
bs_embed
,
seq_len
,
_
=
prompt_embeds
.
shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds
=
prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
prompt_embeds
=
prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
...
@@ -384,7 +384,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoad
...
@@ -384,7 +384,7 @@ class StableDiffusionXLPipeline(DiffusionPipeline, FromSingleFileMixin, LoraLoad
if
do_classifier_free_guidance
:
if
do_classifier_free_guidance
:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len
=
negative_prompt_embeds
.
shape
[
1
]
seq_len
=
negative_prompt_embeds
.
shape
[
1
]
negative_prompt_embeds
=
negative_prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
.
dtype
,
device
=
device
)
negative_prompt_embeds
=
negative_prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
_2
.
dtype
,
device
=
device
)
negative_prompt_embeds
=
negative_prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
negative_prompt_embeds
=
negative_prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
negative_prompt_embeds
=
negative_prompt_embeds
.
view
(
batch_size
*
num_images_per_prompt
,
seq_len
,
-
1
)
negative_prompt_embeds
=
negative_prompt_embeds
.
view
(
batch_size
*
num_images_per_prompt
,
seq_len
,
-
1
)
...
...
src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py
View file @
b288684d
...
@@ -383,7 +383,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin, L
...
@@ -383,7 +383,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin, L
negative_prompt_embeds
=
torch
.
concat
(
negative_prompt_embeds_list
,
dim
=-
1
)
negative_prompt_embeds
=
torch
.
concat
(
negative_prompt_embeds_list
,
dim
=-
1
)
prompt_embeds
=
prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
.
dtype
,
device
=
device
)
prompt_embeds
=
prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
_2
.
dtype
,
device
=
device
)
bs_embed
,
seq_len
,
_
=
prompt_embeds
.
shape
bs_embed
,
seq_len
,
_
=
prompt_embeds
.
shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds
=
prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
prompt_embeds
=
prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
...
@@ -392,7 +392,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin, L
...
@@ -392,7 +392,7 @@ class StableDiffusionXLImg2ImgPipeline(DiffusionPipeline, FromSingleFileMixin, L
if
do_classifier_free_guidance
:
if
do_classifier_free_guidance
:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len
=
negative_prompt_embeds
.
shape
[
1
]
seq_len
=
negative_prompt_embeds
.
shape
[
1
]
negative_prompt_embeds
=
negative_prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
.
dtype
,
device
=
device
)
negative_prompt_embeds
=
negative_prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
_2
.
dtype
,
device
=
device
)
negative_prompt_embeds
=
negative_prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
negative_prompt_embeds
=
negative_prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
negative_prompt_embeds
=
negative_prompt_embeds
.
view
(
batch_size
*
num_images_per_prompt
,
seq_len
,
-
1
)
negative_prompt_embeds
=
negative_prompt_embeds
.
view
(
batch_size
*
num_images_per_prompt
,
seq_len
,
-
1
)
...
...
src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py
View file @
b288684d
...
@@ -489,7 +489,7 @@ class StableDiffusionXLInpaintPipeline(
...
@@ -489,7 +489,7 @@ class StableDiffusionXLInpaintPipeline(
negative_prompt_embeds
=
torch
.
concat
(
negative_prompt_embeds_list
,
dim
=-
1
)
negative_prompt_embeds
=
torch
.
concat
(
negative_prompt_embeds_list
,
dim
=-
1
)
prompt_embeds
=
prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
.
dtype
,
device
=
device
)
prompt_embeds
=
prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
_2
.
dtype
,
device
=
device
)
bs_embed
,
seq_len
,
_
=
prompt_embeds
.
shape
bs_embed
,
seq_len
,
_
=
prompt_embeds
.
shape
# duplicate text embeddings for each generation per prompt, using mps friendly method
# duplicate text embeddings for each generation per prompt, using mps friendly method
prompt_embeds
=
prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
prompt_embeds
=
prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
...
@@ -498,7 +498,7 @@ class StableDiffusionXLInpaintPipeline(
...
@@ -498,7 +498,7 @@ class StableDiffusionXLInpaintPipeline(
if
do_classifier_free_guidance
:
if
do_classifier_free_guidance
:
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
# duplicate unconditional embeddings for each generation per prompt, using mps friendly method
seq_len
=
negative_prompt_embeds
.
shape
[
1
]
seq_len
=
negative_prompt_embeds
.
shape
[
1
]
negative_prompt_embeds
=
negative_prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
.
dtype
,
device
=
device
)
negative_prompt_embeds
=
negative_prompt_embeds
.
to
(
dtype
=
self
.
text_encoder
_2
.
dtype
,
device
=
device
)
negative_prompt_embeds
=
negative_prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
negative_prompt_embeds
=
negative_prompt_embeds
.
repeat
(
1
,
num_images_per_prompt
,
1
)
negative_prompt_embeds
=
negative_prompt_embeds
.
view
(
batch_size
*
num_images_per_prompt
,
seq_len
,
-
1
)
negative_prompt_embeds
=
negative_prompt_embeds
.
view
(
batch_size
*
num_images_per_prompt
,
seq_len
,
-
1
)
...
...
tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_img2img.py
View file @
b288684d
...
@@ -48,7 +48,7 @@ class StableDiffusionXLImg2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipel
...
@@ -48,7 +48,7 @@ class StableDiffusionXLImg2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipel
image_params
=
IMAGE_TO_IMAGE_IMAGE_PARAMS
image_params
=
IMAGE_TO_IMAGE_IMAGE_PARAMS
image_latents_params
=
IMAGE_TO_IMAGE_IMAGE_PARAMS
image_latents_params
=
IMAGE_TO_IMAGE_IMAGE_PARAMS
def
get_dummy_components
(
self
):
def
get_dummy_components
(
self
,
skip_first_text_encoder
=
False
):
torch
.
manual_seed
(
0
)
torch
.
manual_seed
(
0
)
unet
=
UNet2DConditionModel
(
unet
=
UNet2DConditionModel
(
block_out_channels
=
(
32
,
64
),
block_out_channels
=
(
32
,
64
),
...
@@ -65,7 +65,7 @@ class StableDiffusionXLImg2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipel
...
@@ -65,7 +65,7 @@ class StableDiffusionXLImg2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipel
addition_time_embed_dim
=
8
,
addition_time_embed_dim
=
8
,
transformer_layers_per_block
=
(
1
,
2
),
transformer_layers_per_block
=
(
1
,
2
),
projection_class_embeddings_input_dim
=
80
,
# 6 * 8 + 32
projection_class_embeddings_input_dim
=
80
,
# 6 * 8 + 32
cross_attention_dim
=
64
,
cross_attention_dim
=
64
if
not
skip_first_text_encoder
else
32
,
)
)
scheduler
=
EulerDiscreteScheduler
(
scheduler
=
EulerDiscreteScheduler
(
beta_start
=
0.00085
,
beta_start
=
0.00085
,
...
@@ -109,8 +109,8 @@ class StableDiffusionXLImg2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipel
...
@@ -109,8 +109,8 @@ class StableDiffusionXLImg2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipel
"unet"
:
unet
,
"unet"
:
unet
,
"scheduler"
:
scheduler
,
"scheduler"
:
scheduler
,
"vae"
:
vae
,
"vae"
:
vae
,
"text_encoder"
:
text_encoder
,
"text_encoder"
:
text_encoder
if
not
skip_first_text_encoder
else
None
,
"tokenizer"
:
tokenizer
,
"tokenizer"
:
tokenizer
if
not
skip_first_text_encoder
else
None
,
"text_encoder_2"
:
text_encoder_2
,
"text_encoder_2"
:
text_encoder_2
,
"tokenizer_2"
:
tokenizer_2
,
"tokenizer_2"
:
tokenizer_2
,
}
}
...
@@ -151,6 +151,24 @@ class StableDiffusionXLImg2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipel
...
@@ -151,6 +151,24 @@ class StableDiffusionXLImg2ImgPipelineFastTests(PipelineLatentTesterMixin, Pipel
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
def
test_stable_diffusion_xl_refiner
(
self
):
device
=
"cpu"
# ensure determinism for the device-dependent torch.Generator
components
=
self
.
get_dummy_components
(
skip_first_text_encoder
=
True
)
sd_pipe
=
StableDiffusionXLImg2ImgPipeline
(
**
components
)
sd_pipe
=
sd_pipe
.
to
(
device
)
sd_pipe
.
set_progress_bar_config
(
disable
=
None
)
inputs
=
self
.
get_dummy_inputs
(
device
)
image
=
sd_pipe
(
**
inputs
).
images
image_slice
=
image
[
0
,
-
3
:,
-
3
:,
-
1
]
assert
image
.
shape
==
(
1
,
32
,
32
,
3
)
expected_slice
=
np
.
array
([
0.4676
,
0.4865
,
0.4335
,
0.6715
,
0.5578
,
0.4497
,
0.5847
,
0.5967
,
0.5198
])
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
def
test_attention_slicing_forward_pass
(
self
):
def
test_attention_slicing_forward_pass
(
self
):
super
().
test_attention_slicing_forward_pass
(
expected_max_diff
=
3e-3
)
super
().
test_attention_slicing_forward_pass
(
expected_max_diff
=
3e-3
)
...
...
tests/pipelines/stable_diffusion_xl/test_stable_diffusion_xl_inpaint.py
View file @
b288684d
...
@@ -50,7 +50,7 @@ class StableDiffusionXLInpaintPipelineFastTests(PipelineLatentTesterMixin, Pipel
...
@@ -50,7 +50,7 @@ class StableDiffusionXLInpaintPipelineFastTests(PipelineLatentTesterMixin, Pipel
# TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
# TO-DO: update image_params once pipeline is refactored with VaeImageProcessor.preprocess
image_latents_params
=
frozenset
([])
image_latents_params
=
frozenset
([])
def
get_dummy_components
(
self
):
def
get_dummy_components
(
self
,
skip_first_text_encoder
=
False
):
torch
.
manual_seed
(
0
)
torch
.
manual_seed
(
0
)
unet
=
UNet2DConditionModel
(
unet
=
UNet2DConditionModel
(
block_out_channels
=
(
32
,
64
),
block_out_channels
=
(
32
,
64
),
...
@@ -67,7 +67,7 @@ class StableDiffusionXLInpaintPipelineFastTests(PipelineLatentTesterMixin, Pipel
...
@@ -67,7 +67,7 @@ class StableDiffusionXLInpaintPipelineFastTests(PipelineLatentTesterMixin, Pipel
addition_time_embed_dim
=
8
,
addition_time_embed_dim
=
8
,
transformer_layers_per_block
=
(
1
,
2
),
transformer_layers_per_block
=
(
1
,
2
),
projection_class_embeddings_input_dim
=
80
,
# 6 * 8 + 32
projection_class_embeddings_input_dim
=
80
,
# 6 * 8 + 32
cross_attention_dim
=
64
,
cross_attention_dim
=
64
if
not
skip_first_text_encoder
else
32
,
)
)
scheduler
=
EulerDiscreteScheduler
(
scheduler
=
EulerDiscreteScheduler
(
beta_start
=
0.00085
,
beta_start
=
0.00085
,
...
@@ -111,8 +111,8 @@ class StableDiffusionXLInpaintPipelineFastTests(PipelineLatentTesterMixin, Pipel
...
@@ -111,8 +111,8 @@ class StableDiffusionXLInpaintPipelineFastTests(PipelineLatentTesterMixin, Pipel
"unet"
:
unet
,
"unet"
:
unet
,
"scheduler"
:
scheduler
,
"scheduler"
:
scheduler
,
"vae"
:
vae
,
"vae"
:
vae
,
"text_encoder"
:
text_encoder
,
"text_encoder"
:
text_encoder
if
not
skip_first_text_encoder
else
None
,
"tokenizer"
:
tokenizer
,
"tokenizer"
:
tokenizer
if
not
skip_first_text_encoder
else
None
,
"text_encoder_2"
:
text_encoder_2
,
"text_encoder_2"
:
text_encoder_2
,
"tokenizer_2"
:
tokenizer_2
,
"tokenizer_2"
:
tokenizer_2
,
}
}
...
@@ -238,6 +238,25 @@ class StableDiffusionXLInpaintPipelineFastTests(PipelineLatentTesterMixin, Pipel
...
@@ -238,6 +238,25 @@ class StableDiffusionXLInpaintPipelineFastTests(PipelineLatentTesterMixin, Pipel
assert
np
.
abs
(
image_slices
[
0
]
-
image_slices
[
1
]).
max
()
<
1e-3
assert
np
.
abs
(
image_slices
[
0
]
-
image_slices
[
1
]).
max
()
<
1e-3
assert
np
.
abs
(
image_slices
[
0
]
-
image_slices
[
2
]).
max
()
<
1e-3
assert
np
.
abs
(
image_slices
[
0
]
-
image_slices
[
2
]).
max
()
<
1e-3
def
test_stable_diffusion_xl_refiner
(
self
):
device
=
"cpu"
# ensure determinism for the device-dependent torch.Generator
components
=
self
.
get_dummy_components
(
skip_first_text_encoder
=
True
)
sd_pipe
=
self
.
pipeline_class
(
**
components
)
sd_pipe
=
sd_pipe
.
to
(
device
)
sd_pipe
.
set_progress_bar_config
(
disable
=
None
)
inputs
=
self
.
get_dummy_inputs
(
device
)
image
=
sd_pipe
(
**
inputs
).
images
image_slice
=
image
[
0
,
-
3
:,
-
3
:,
-
1
]
print
(
torch
.
from_numpy
(
image_slice
).
flatten
())
assert
image
.
shape
==
(
1
,
64
,
64
,
3
)
expected_slice
=
np
.
array
([
0.9106
,
0.6563
,
0.6766
,
0.6537
,
0.6709
,
0.7367
,
0.6537
,
0.5937
,
0.5418
])
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
def
test_stable_diffusion_two_xl_mixture_of_denoiser
(
self
):
def
test_stable_diffusion_two_xl_mixture_of_denoiser
(
self
):
components
=
self
.
get_dummy_components
()
components
=
self
.
get_dummy_components
()
pipe_1
=
StableDiffusionXLInpaintPipeline
(
**
components
).
to
(
torch_device
)
pipe_1
=
StableDiffusionXLInpaintPipeline
(
**
components
).
to
(
torch_device
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment