renzhc / diffusers_dcu · Commits

Commit 25dfd0f8 (unverified), authored Oct 21, 2022 by Patrick von Platen, committed by GitHub on Oct 21, 2022
Parent: 32bf4fdc

[Tests] Move stable diffusion into their own files (#936)

* [Tests] Move stable diffusion into their own files
* up
Changes: 6 changed files, with 2283 additions and 1680 deletions (+2283 / -1680)
src/diffusers/pipeline_utils.py                                            +1    -1
tests/pipelines/stable_diffusion/test_stable_diffusion.py                  +729  -0
tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py          +601  -0
tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py          +384  -0
tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py   +491  -0
tests/test_pipelines.py                                                    +77   -1679
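With the suite split per pipeline, each relocated file can be targeted on its own; for example, python -m pytest tests/pipelines/stable_diffusion/test_stable_diffusion.py collects just the text-to-image tests shown below (a usage note; the tests themselves are plain unittest.TestCase classes that pytest picks up).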
src/diffusers/pipeline_utils.py (view file @ 25dfd0f8)

...
@@ -122,7 +122,7 @@ class DiffusionPipeline(ConfigMixin):
         library = module.__module__.split(".")[0]

         # check if the module is a pipeline module
-        pipeline_dir = module.__module__.split(".")[-2]
+        pipeline_dir = module.__module__.split(".")[-2] if len(module.__module__.split(".")) > 2 else None
         path = module.__module__.split(".")
         is_pipeline_module = pipeline_dir in path and hasattr(pipelines, pipeline_dir)
...
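The patched line only reads index [-2] when the component's module path is at least three segments deep, so components with short module paths are treated as non-pipeline modules instead of indexing out of range. A small sketch of just that expression (the example paths below are illustrative, not taken from the diff):

    # sketch of the guarded index from the hunk above
    def pipeline_dir_of(module_path: str):
        parts = module_path.split(".")
        # only look two segments from the end when the path is deep enough
        return parts[-2] if len(parts) > 2 else None

    print(pipeline_dir_of("diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion"))  # -> "stable_diffusion"
    print(pipeline_dir_of("transformers"))  # -> None (the unguarded [-2] would raise IndexError here)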
tests/pipelines/stable_diffusion/test_stable_diffusion.py (new file, 0 → 100644, view file @ 25dfd0f8)
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import random
import unittest

import numpy as np
import torch

from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionPipeline,
    UNet2DConditionModel,
    UNet2DModel,
    VQModel,
)
from diffusers.utils import floats_tensor, load_image, slow, torch_device
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer


torch.backends.cuda.matmul.allow_tf32 = False


class PipelineFastTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_image(self):
        batch_size = 1
        num_channels = 3
        sizes = (32, 32)
        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
        return image

    @property
    def dummy_uncond_unet(self):
        torch.manual_seed(0)
        model = UNet2DModel(
            block_out_channels=(32, 64), layers_per_block=2, sample_size=32, in_channels=3, out_channels=3,
            down_block_types=("DownBlock2D", "AttnDownBlock2D"), up_block_types=("AttnUpBlock2D", "UpBlock2D"),
        )
        return model

    @property
    def dummy_cond_unet(self):
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64), layers_per_block=2, sample_size=32, in_channels=4, out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32,
        )
        return model

    @property
    def dummy_cond_unet_inpaint(self):
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64), layers_per_block=2, sample_size=32, in_channels=9, out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), cross_attention_dim=32,
        )
        return model

    @property
    def dummy_vq_model(self):
        torch.manual_seed(0)
        model = VQModel(
            block_out_channels=[32, 64], in_channels=3, out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=3,
        )
        return model

    @property
    def dummy_vae(self):
        torch.manual_seed(0)
        model = AutoencoderKL(
            block_out_channels=[32, 64], in_channels=3, out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], latent_channels=4,
        )
        return model

    @property
    def dummy_text_encoder(self):
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0, eos_token_id=2, hidden_size=32, intermediate_size=37, layer_norm_eps=1e-05,
            num_attention_heads=4, num_hidden_layers=5, pad_token_id=1, vocab_size=1000,
        )
        return CLIPTextModel(config)

    @property
    def dummy_extractor(self):
        def extract(*args, **kwargs):
            class Out:
                def __init__(self):
                    self.pixel_values = torch.ones([0])

                def to(self, device):
                    self.pixel_values.to(device)
                    return self

            return Out()

        return extract

    def test_stable_diffusion_ddim(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = DDIMScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
            clip_sample=False, set_alpha_to_one=False,
        )
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np",
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 128, 128, 3)
        expected_slice = np.array([0.5112, 0.4692, 0.4715, 0.5206, 0.4894, 0.5114, 0.5096, 0.4932, 0.4755])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_ddim_factor_8(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = DDIMScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
            clip_sample=False, set_alpha_to_one=False,
        )
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, height=536, width=536,
            num_inference_steps=2, output_type="np",
        )
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 134, 134, 3)
        expected_slice = np.array([0.7834, 0.5488, 0.5781, 0.46, 0.3609, 0.5369, 0.542, 0.4855, 0.5557])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np",
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 128, 128, 3)
        expected_slice = np.array([0.4937, 0.4649, 0.4716, 0.5145, 0.4889, 0.513, 0.513, 0.4905, 0.4738])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_no_safety_checker(self):
        pipe = StableDiffusionPipeline.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
        )
        assert isinstance(pipe, StableDiffusionPipeline)
        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
        assert pipe.safety_checker is None

        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

    def test_stable_diffusion_k_lms(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np",
            return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 128, 128, 3)
        expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_attention_chunk(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        # make sure chunking the attention yields the same result
        sd_pipe.enable_attention_slicing(slice_size=1)
        generator = torch.Generator(device=device).manual_seed(0)
        output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 1e-4

    def test_stable_diffusion_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        negative_prompt = "french fries"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            prompt, negative_prompt=negative_prompt, generator=generator, guidance_scale=6.0,
            num_inference_steps=2, output_type="np",
        )
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 128, 128, 3)
        expected_slice = np.array([0.4851, 0.4617, 0.4765, 0.5127, 0.4845, 0.5153, 0.5141, 0.4886, 0.4719])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_num_images_per_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        # test num_images_per_prompt=1 (default)
        images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images
        assert images.shape == (1, 128, 128, 3)

        # test num_images_per_prompt=1 (default) for batch of prompts
        batch_size = 2
        images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images
        assert images.shape == (batch_size, 128, 128, 3)

        # test num_images_per_prompt for single prompt
        num_images_per_prompt = 2
        images = sd_pipe(
            prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
        ).images
        assert images.shape == (num_images_per_prompt, 128, 128, 3)

        # test num_images_per_prompt for batch of prompts
        batch_size = 2
        images = sd_pipe(
            [prompt] * batch_size, num_inference_steps=2, output_type="np",
            num_images_per_prompt=num_images_per_prompt,
        ).images
        assert images.shape == (batch_size * num_images_per_prompt, 128, 128, 3)

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_fp16(self):
        """Test that stable diffusion works with fp16"""
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # put models in fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=torch_device).manual_seed(0)
        image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images

        assert image.shape == (1, 128, 128, 3)


@slow
@unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
class PipelineIntegrationTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_stable_diffusion(self):
        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast("cuda"):
            output = sd_pipe(
                [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="np"
            )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.8887, 0.915, 0.91, 0.894, 0.909, 0.912, 0.919, 0.925, 0.883])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_fast_ddim(self):
        sd_pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1")
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        scheduler = DDIMScheduler(
            beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear",
            clip_sample=False, set_alpha_to_one=False,
        )
        sd_pipe.scheduler = scheduler

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast("cuda"):
            output = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="numpy")
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.9326, 0.923, 0.951, 0.9365, 0.9214, 0.951, 0.9365, 0.9414, 0.918])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_lms_stable_diffusion_pipeline(self):
        model_id = "CompVis/stable-diffusion-v1-1"
        pipe = StableDiffusionPipeline.from_pretrained(model_id).to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        scheduler = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
        pipe.scheduler = scheduler

        prompt = "a photograph of an astronaut riding a horse"
        generator = torch.Generator(device=torch_device).manual_seed(0)
        image = pipe(
            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
        ).images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.9077, 0.9254, 0.9181, 0.9227, 0.9213, 0.9367, 0.9399, 0.9406, 0.9024])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_memory_chunking(self):
        torch.cuda.reset_peak_memory_stats()
        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionPipeline.from_pretrained(
            model_id, revision="fp16", torch_dtype=torch.float16
        ).to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        prompt = "a photograph of an astronaut riding a horse"

        # make attention efficient
        pipe.enable_attention_slicing()
        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast(torch_device):
            output_chunked = pipe(
                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
            )
            image_chunked = output_chunked.images

        mem_bytes = torch.cuda.max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        # make sure that less than 3.75 GB is allocated
        assert mem_bytes < 3.75 * 10**9

        # disable chunking
        pipe.disable_attention_slicing()
        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast(torch_device):
            output = pipe(
                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
            )
            image = output.images

        # make sure that more than 3.75 GB is allocated
        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 3.75 * 10**9
        assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-3

    def test_stable_diffusion_text2img_pipeline_fp16(self):
        torch.cuda.reset_peak_memory_stats()
        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionPipeline.from_pretrained(
            model_id, revision="fp16", torch_dtype=torch.float16
        ).to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        prompt = "a photograph of an astronaut riding a horse"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output_chunked = pipe(
            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
        )
        image_chunked = output_chunked.images

        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast(torch_device):
            output = pipe(
                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
            )
            image = output.images

        # Make sure results are close enough
        diff = np.abs(image_chunked.flatten() - image.flatten())
        # They ARE different since ops are not run always at the same precision
        # however, they should be extremely close.
        assert diff.mean() < 2e-2

    def test_stable_diffusion_text2img_pipeline(self):
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/text2img/astronaut_riding_a_horse.png"
        )
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionPipeline.from_pretrained(model_id, safety_checker=None)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "astronaut riding a horse"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(prompt=prompt, strength=0.75, guidance_scale=7.5, generator=generator, output_type="np")
        image = output.images[0]

        assert image.shape == (512, 512, 3)
        assert np.abs(expected_image - image).max() < 1e-2

    def test_stable_diffusion_text2img_intermediate_state(self):
        number_of_steps = 0

        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            test_callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 0:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [1.8285, 1.2857, -0.1024, 1.2406, -2.3068, 1.0747, -0.0818, -0.6520, -2.9506]
                )
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
            elif step == 50:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [1.1078, 1.5803, 0.2773, -0.0589, -1.7928, -0.3665, -0.4695, -1.0727, -1.1601]
                )
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-2

        test_callback_fn.has_been_called = False

        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "Andromeda galaxy in a bottle"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast(torch_device):
            pipe(
                prompt=prompt, num_inference_steps=50, guidance_scale=7.5, generator=generator,
                callback=test_callback_fn, callback_steps=1,
            )
        assert test_callback_fn.has_been_called
        assert number_of_steps == 51
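The intermediate-state test above drives the pipeline's callback and callback_steps arguments; outside a test the same hook can be used to watch the latents during sampling. A minimal sketch along the lines of that call (model id and prompt reused from the tests; the print is only illustrative):

    def log_latents(step: int, timestep: int, latents):
        # invoked every `callback_steps` denoising steps with the current latents
        print(step, timestep, tuple(latents.shape))

    pipe = StableDiffusionPipeline.from_pretrained(
        "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
    ).to(torch_device)
    pipe(prompt="Andromeda galaxy in a bottle", num_inference_steps=50, guidance_scale=7.5,
         callback=log_latents, callback_steps=1)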
tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py (new file, 0 → 100644, view file @ 25dfd0f8)
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import random
import unittest

import numpy as np
import torch

from diffusers import (
    AutoencoderKL,
    LMSDiscreteScheduler,
    PNDMScheduler,
    StableDiffusionImg2ImgPipeline,
    UNet2DConditionModel,
    UNet2DModel,
    VQModel,
)
from diffusers.utils import floats_tensor, load_image, slow, torch_device
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer


torch.backends.cuda.matmul.allow_tf32 = False


class PipelineFastTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    # NOTE: the dummy_image, dummy_uncond_unet, dummy_cond_unet, dummy_cond_unet_inpaint, dummy_vq_model,
    # dummy_vae, dummy_text_encoder and dummy_extractor properties in this file are verbatim copies of the
    # fixtures shown in test_stable_diffusion.py above and are not repeated here.

    def test_stable_diffusion_img2img(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        init_image = self.dummy_image.to(device)

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionImg2ImgPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np",
            init_image=init_image,
        )
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np",
            init_image=init_image, return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4492, 0.3865, 0.4222, 0.5854, 0.5139, 0.4379, 0.4193, 0.48, 0.4218])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_img2img_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        init_image = self.dummy_image.to(device)

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionImg2ImgPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        negative_prompt = "french fries"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            prompt, negative_prompt=negative_prompt, generator=generator, guidance_scale=6.0,
            num_inference_steps=2, output_type="np", init_image=init_image,
        )
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4065, 0.3783, 0.4050, 0.5266, 0.4781, 0.4252, 0.4203, 0.4692, 0.4365])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_img2img_multiple_init_images(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        init_image = self.dummy_image.to(device).repeat(2, 1, 1, 1)

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionImg2ImgPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = 2 * ["A painting of a squirrel eating a burger"]
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            prompt, generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np",
            init_image=init_image,
        )
        image = output.images
        image_slice = image[-1, -3:, -3:, -1]

        assert image.shape == (2, 32, 32, 3)
        expected_slice = np.array([0.5144, 0.4447, 0.4735, 0.6676, 0.5526, 0.5454, 0.645, 0.5149, 0.4689])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_img2img_k_lms(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        init_image = self.dummy_image.to(device)

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionImg2ImgPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np",
            init_image=init_image,
        )
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np",
            init_image=init_image, return_dict=False,
        )
        image_from_tuple = output[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4367, 0.4986, 0.4372, 0.6706, 0.5665, 0.444, 0.5864, 0.6019, 0.5203])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_img2img_num_images_per_prompt(self):
        device = "cpu"
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        init_image = self.dummy_image.to(device)

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionImg2ImgPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        # test num_images_per_prompt=1 (default)
        images = sd_pipe(prompt, num_inference_steps=2, output_type="np", init_image=init_image).images
        assert images.shape == (1, 32, 32, 3)

        # test num_images_per_prompt=1 (default) for batch of prompts
        batch_size = 2
        images = sd_pipe(
            [prompt] * batch_size, num_inference_steps=2, output_type="np", init_image=init_image
        ).images
        assert images.shape == (batch_size, 32, 32, 3)

        # test num_images_per_prompt for single prompt
        num_images_per_prompt = 2
        images = sd_pipe(
            prompt, num_inference_steps=2, output_type="np", init_image=init_image,
            num_images_per_prompt=num_images_per_prompt,
        ).images
        assert images.shape == (num_images_per_prompt, 32, 32, 3)

        # test num_images_per_prompt for batch of prompts
        batch_size = 2
        images = sd_pipe(
            [prompt] * batch_size, num_inference_steps=2, output_type="np", init_image=init_image,
            num_images_per_prompt=num_images_per_prompt,
        ).images
        assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3)

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_img2img_fp16(self):
        """Test that stable diffusion img2img works with fp16"""
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        init_image = self.dummy_image.to(torch_device)

        # put models in fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionImg2ImgPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=torch_device).manual_seed(0)
        image = sd_pipe(
            [prompt], generator=generator, num_inference_steps=2, output_type="np", init_image=init_image
        ).images

        assert image.shape == (1, 32, 32, 3)


@slow
@unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
class PipelineIntegrationTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_stable_diffusion_img2img_pipeline(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/fantasy_landscape.png"
        )
        init_image = init_image.resize((768, 512))
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id, safety_checker=None)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5,
            generator=generator, output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 768, 3)
        # img2img is flaky across GPUs even in fp32, so using MAE here
        assert np.abs(expected_image - image).mean() < 1e-2

    def test_stable_diffusion_img2img_pipeline_k_lms(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/fantasy_landscape_k_lms.png"
        )
        init_image = init_image.resize((768, 512))
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")

        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(model_id, scheduler=lms, safety_checker=None)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt, init_image=init_image, strength=0.75, guidance_scale=7.5,
            generator=generator, output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 768, 3)
        # img2img is flaky across GPUs even in fp32, so using MAE here
        assert np.abs(expected_image - image).mean() < 1e-2

    def test_stable_diffusion_img2img_intermediate_state(self):
        number_of_steps = 0

        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            test_callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 0:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 96)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([0.9052, -0.0184, 0.4810, 0.2898, 0.5851, 1.4920, 0.5362, 1.9838, 0.0530])
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
            elif step == 37:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 96)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([0.7071, 0.7831, 0.8300, 1.8140, 1.7840, 1.9402, 1.3651, 1.6590, 1.2828])
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-2

        test_callback_fn.has_been_called = False

        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        init_image = init_image.resize((768, 512))

        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast(torch_device):
            pipe(
                prompt=prompt, init_image=init_image, strength=0.75, num_inference_steps=50,
                guidance_scale=7.5, generator=generator, callback=test_callback_fn, callback_steps=1,
            )
        assert test_callback_fn.has_been_called
        assert number_of_steps == 38
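For reference, the plain usage pattern that the img2img integration tests exercise looks roughly like this (a minimal sketch; the argument names, in particular the init_image keyword, are the ones used at this revision of the library, and the image URL and prompt are reused from the tests above):

    init_image = load_image(
        "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
        "/img2img/sketch-mountains-input.jpg"
    ).resize((768, 512))
    pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to(torch_device)
    # strength controls how far the init image is noised before denoising starts
    image = pipe(prompt="A fantasy landscape, trending on artstation", init_image=init_image,
                 strength=0.75, guidance_scale=7.5).images[0]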
tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint.py (new file, 0 → 100644, view file @ 25dfd0f8)
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import random
import unittest

import numpy as np
import torch

from diffusers import (
    AutoencoderKL,
    PNDMScheduler,
    StableDiffusionInpaintPipeline,
    UNet2DConditionModel,
    UNet2DModel,
    VQModel,
)
from diffusers.utils import floats_tensor, load_image, slow, torch_device
from PIL import Image
from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer


torch.backends.cuda.matmul.allow_tf32 = False


class PipelineFastTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    # NOTE: the dummy_image, dummy_uncond_unet, dummy_cond_unet, dummy_cond_unet_inpaint, dummy_vq_model,
    # dummy_vae, dummy_text_encoder and dummy_extractor properties in this file are verbatim copies of the
    # fixtures shown in test_stable_diffusion.py above and are not repeated here.

    def test_stable_diffusion_inpaint(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet_inpaint
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionInpaintPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=None,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np",
            image=init_image, mask_image=mask_image,
        )
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np",
            image=init_image, mask_image=mask_image, return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 128, 128, 3)
        expected_slice = np.array([0.5075, 0.4485, 0.4558, 0.5369, 0.5369, 0.5236, 0.5127, 0.4983, 0.4776])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_inpaint_fp16(self):
        """Test that stable diffusion inpaint_legacy works with fp16"""
        unet = self.dummy_cond_unet_inpaint
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))

        # put models in fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionInpaintPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=None,
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=torch_device).manual_seed(0)
        image = sd_pipe(
            [prompt], generator=generator, num_inference_steps=2, output_type="np",
            image=init_image, mask_image=mask_image,
        ).images

        assert image.shape == (1, 128, 128, 3)


@slow
@unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
class PipelineIntegrationTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_stable_diffusion_inpaint_pipeline(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/yellow_cat_sitting_on_a_park_bench.png"
        )
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        model_id = "runwayml/stable-diffusion-inpainting"
        pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt, image=init_image, mask_image=mask_image, generator=generator, output_type="np"
        )
        image = output.images[0]

        assert image.shape == (512, 512, 3)
        assert np.abs(expected_image - image).max() < 1e-2

    def test_stable_diffusion_inpaint_pipeline_fp16(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/yellow_cat_sitting_on_a_park_bench_fp16.png"
        )
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        model_id = "runwayml/stable-diffusion-inpainting"
        pipe = StableDiffusionInpaintPipeline.from_pretrained(
            model_id, revision="fp16", torch_dtype=torch.float16, safety_checker=None
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt, image=init_image, mask_image=mask_image, generator=generator, output_type="np"
        )
        image = output.images[0]

        assert image.shape == (512, 512, 3)
        assert np.abs(expected_image - image).max() < 1e-2

    def test_stable_diffusion_inpaint_pipeline_pndm(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/yellow_cat_sitting_on_a_park_bench_pndm.png"
        )
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        pndm = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True)
        model_id = "runwayml/stable-diffusion-inpainting"
        pipe = StableDiffusionInpaintPipeline.from_pretrained(model_id, safety_checker=None, scheduler=pndm)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt, image=init_image, mask_image=mask_image, generator=generator, output_type="np"
        )
        image = output.images[0]

        assert image.shape == (512, 512, 3)
        assert np.abs(expected_image - image).max() < 1e-2
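Analogously, a minimal inpainting call mirroring the integration tests above (a sketch only; model id, prompt and the image/mask URLs are reused from the tests, and the image / mask_image argument names are the ones used at this revision):

    init_image = load_image(
        "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
        "/in_paint/overture-creations-5sI6fQgYIuo.png"
    )
    mask_image = load_image(
        "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
        "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
    )
    pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting").to(torch_device)
    # white regions of the mask are regenerated, the rest of the init image is kept
    image = pipe(prompt="Face of a yellow cat, high resolution, sitting on a park bench",
                 image=init_image, mask_image=mask_image).images[0]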
tests/pipelines/stable_diffusion/test_stable_diffusion_inpaint_legacy.py (new file, 0 → 100644, view file @ 25dfd0f8)
# coding=utf-8
# Copyright 2022 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
gc
import
random
import
unittest
import
numpy
as
np
import
torch
from
diffusers
import
(
AutoencoderKL
,
LMSDiscreteScheduler
,
PNDMScheduler
,
StableDiffusionInpaintPipeline
,
StableDiffusionInpaintPipelineLegacy
,
UNet2DConditionModel
,
UNet2DModel
,
VQModel
,
)
from
diffusers.utils
import
floats_tensor
,
load_image
,
slow
,
torch_device
from
PIL
import
Image
from
transformers
import
CLIPTextConfig
,
CLIPTextModel
,
CLIPTokenizer
torch
.
backends
.
cuda
.
matmul
.
allow_tf32
=
False
class PipelineFastTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    @property
    def dummy_image(self):
        batch_size = 1
        num_channels = 3
        sizes = (32, 32)

        image = floats_tensor((batch_size, num_channels) + sizes, rng=random.Random(0)).to(torch_device)
        return image

    @property
    def dummy_uncond_unet(self):
        torch.manual_seed(0)
        model = UNet2DModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=3,
            out_channels=3,
            down_block_types=("DownBlock2D", "AttnDownBlock2D"),
            up_block_types=("AttnUpBlock2D", "UpBlock2D"),
        )
        return model

    @property
    def dummy_cond_unet(self):
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        return model

    @property
    def dummy_cond_unet_inpaint(self):
        torch.manual_seed(0)
        model = UNet2DConditionModel(
            block_out_channels=(32, 64),
            layers_per_block=2,
            sample_size=32,
            in_channels=9,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
            cross_attention_dim=32,
        )
        return model

    @property
    def dummy_vq_model(self):
        torch.manual_seed(0)
        model = VQModel(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=3,
        )
        return model

    @property
    def dummy_vae(self):
        torch.manual_seed(0)
        model = AutoencoderKL(
            block_out_channels=[32, 64],
            in_channels=3,
            out_channels=3,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
        )
        return model

    @property
    def dummy_text_encoder(self):
        torch.manual_seed(0)
        config = CLIPTextConfig(
            bos_token_id=0,
            eos_token_id=2,
            hidden_size=32,
            intermediate_size=37,
            layer_norm_eps=1e-05,
            num_attention_heads=4,
            num_hidden_layers=5,
            pad_token_id=1,
            vocab_size=1000,
        )
        return CLIPTextModel(config)

    @property
    def dummy_extractor(self):
        def extract(*args, **kwargs):
            class Out:
                def __init__(self):
                    self.pixel_values = torch.ones([0])

                def to(self, device):
                    self.pixel_values.to(device)
                    return self

            return Out()

        return extract
    def test_stable_diffusion_inpaint_legacy(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionInpaintPipelineLegacy(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", init_image=init_image, mask_image=mask_image,
        )
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", init_image=init_image, mask_image=mask_image, return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4731, 0.5346, 0.4531, 0.6251, 0.5446, 0.4057, 0.5527, 0.5896, 0.5153])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
    def test_stable_diffusion_inpaint_legacy_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionInpaintPipelineLegacy(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        negative_prompt = "french fries"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            prompt, negative_prompt=negative_prompt, generator=generator, guidance_scale=6.0,
            num_inference_steps=2, output_type="np", init_image=init_image, mask_image=mask_image,
        )
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4765, 0.5339, 0.4541, 0.6240, 0.5439, 0.4055, 0.5503, 0.5891, 0.5150])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
    def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
        device = "cpu"
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionInpaintPipelineLegacy(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        # test num_images_per_prompt=1 (default)
        images = sd_pipe(
            prompt, num_inference_steps=2, output_type="np",
            init_image=init_image, mask_image=mask_image,
        ).images

        assert images.shape == (1, 32, 32, 3)

        # test num_images_per_prompt=1 (default) for batch of prompts
        batch_size = 2
        images = sd_pipe(
            [prompt] * batch_size, num_inference_steps=2, output_type="np",
            init_image=init_image, mask_image=mask_image,
        ).images

        assert images.shape == (batch_size, 32, 32, 3)

        # test num_images_per_prompt for single prompt
        num_images_per_prompt = 2
        images = sd_pipe(
            prompt, num_inference_steps=2, output_type="np",
            init_image=init_image, mask_image=mask_image,
            num_images_per_prompt=num_images_per_prompt,
        ).images

        assert images.shape == (num_images_per_prompt, 32, 32, 3)

        # test num_images_per_prompt for batch of prompts
        batch_size = 2
        images = sd_pipe(
            [prompt] * batch_size, num_inference_steps=2, output_type="np",
            init_image=init_image, mask_image=mask_image,
            num_images_per_prompt=num_images_per_prompt,
        ).images

        assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3)
@slow
@unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
class PipelineIntegrationTests(unittest.TestCase):
    def tearDown(self):
        # clean up the VRAM after each test
        super().tearDown()
        gc.collect()
        torch.cuda.empty_cache()

    def test_stable_diffusion_inpaint_legacy_pipeline(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/red_cat_sitting_on_a_park_bench.png"
        )
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionInpaintPipeline.from_pretrained(
            model_id,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A red cat sitting on a park bench"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt,
            init_image=init_image,
            mask_image=mask_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 512, 3)
        assert np.abs(expected_image - image).max() < 1e-2
    def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
        # TODO(Anton, Patrick) - I think we can remove this test soon
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/red_cat_sitting_on_a_park_bench_k_lms.png"
        )
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")

        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionInpaintPipeline.from_pretrained(
            model_id,
            scheduler=lms,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A red cat sitting on a park bench"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt,
            init_image=init_image,
            mask_image=mask_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 512, 3)
        assert np.abs(expected_image - image).max() < 1e-2
    def test_stable_diffusion_inpaint_legacy_intermediate_state(self):
        number_of_steps = 0

        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            test_callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 0:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.5472, 1.1218, -0.5505, -0.9390, -1.0794, 0.4063, 0.5158, 0.6429, -1.5246]
                )
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
            elif step == 37:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array([0.4781, 1.1572, 0.6258, 0.2291, 0.2554, -0.1443, 0.7085, -0.1598, -0.5659])
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3

        test_callback_fn.has_been_called = False

        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
        )

        pipe = StableDiffusionInpaintPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A red cat sitting on a park bench"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast(torch_device):
            pipe(
                prompt=prompt,
                init_image=init_image,
                mask_image=mask_image,
                strength=0.75,
                num_inference_steps=50,
                guidance_scale=7.5,
                generator=generator,
                callback=test_callback_fn,
                callback_steps=1,
            )
        assert test_callback_fn.has_been_called
        assert number_of_steps == 38
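The intermediate-state test above relies on the pipeline's step callback. A minimal sketch of the same mechanism, assuming a pipeline `pipe` and inputs constructed as in the tests; the logging body is illustrative only.

import torch

def log_latents(step: int, timestep: int, latents: torch.FloatTensor) -> None:
    # Called by the pipeline every `callback_steps` steps with the current latents.
    print(f"step={step} timestep={int(timestep)} latent_mean={latents.float().mean().item():.4f}")

# pipe(prompt=prompt, init_image=init_image, mask_image=mask_image,
#      callback=log_latents, callback_steps=5)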
tests/test_pipelines.py
View file @
25dfd0f8
...
...
@@ -36,7 +36,6 @@ from diffusers import (
    KarrasVeScheduler,
    LDMPipeline,
    LDMTextToImagePipeline,
    LMSDiscreteScheduler,
    OnnxStableDiffusionImg2ImgPipeline,
    OnnxStableDiffusionInpaintPipeline,
    OnnxStableDiffusionPipeline,
...
...
@@ -45,7 +44,6 @@ from diffusers import (
    ScoreSdeVePipeline,
    ScoreSdeVeScheduler,
    StableDiffusionImg2ImgPipeline,
    StableDiffusionInpaintPipeline,
    StableDiffusionInpaintPipelineLegacy,
    StableDiffusionPipeline,
    UNet2DConditionModel,
...
...
@@ -248,13 +246,6 @@ class PipelineFastTests(unittest.TestCase):
        )
        return CLIPTextModel(config)

    @property
    def dummy_safety_checker(self):
        def check(images, *args, **kwargs):
            return images, [False] * len(images)

        return check

    @property
    def dummy_extractor(self):
        def extract(*args, **kwargs):
...
...
@@ -364,287 +355,6 @@ class PipelineFastTests(unittest.TestCase):
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_ddim(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )

        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 128, 128, 3)
        expected_slice = np.array([0.5112, 0.4692, 0.4715, 0.5206, 0.4894, 0.5114, 0.5096, 0.4932, 0.4755])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_ddim_factor_8(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = DDIMScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            clip_sample=False,
            set_alpha_to_one=False,
        )

        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, height=536, width=536,
            num_inference_steps=2, output_type="np",
        )
        image = output.images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 134, 134, 3)
        expected_slice = np.array([0.7834, 0.5488, 0.5781, 0.46, 0.3609, 0.5369, 0.542, 0.4855, 0.5557])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_pndm(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 128, 128, 3)
        expected_slice = np.array([0.4937, 0.4649, 0.4716, 0.5145, 0.4889, 0.513, 0.513, 0.4905, 0.4738])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_from_pretrained_error_message_uninstalled_packages(self):
        # TODO(Patrick, Pedro) - need better test here for the future
        pipe = StableDiffusionPipeline.from_pretrained("hf-internal-testing/tiny-stable-diffusion-lms-pipe")
        assert isinstance(pipe, StableDiffusionPipeline)
        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)

    def test_stable_diffusion_no_safety_checker(self):
        pipe = StableDiffusionPipeline.from_pretrained(
            "hf-internal-testing/tiny-stable-diffusion-lms-pipe", safety_checker=None
        )
        assert isinstance(pipe, StableDiffusionPipeline)
        assert isinstance(pipe.scheduler, LMSDiscreteScheduler)
        assert pipe.safety_checker is None

        image = pipe("example prompt", num_inference_steps=2).images[0]
        assert image is not None

    def test_stable_diffusion_k_lms(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 128, 128, 3)
        expected_slice = np.array([0.5067, 0.4689, 0.4614, 0.5233, 0.4903, 0.5112, 0.524, 0.5069, 0.4785])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_attention_chunk(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output_1 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        # make sure chunking the attention yields the same result
        sd_pipe.enable_attention_slicing(slice_size=1)
        generator = torch.Generator(device=device).manual_seed(0)
        output_2 = sd_pipe([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2, output_type="np")

        assert np.abs(output_2.images.flatten() - output_1.images.flatten()).max() < 1e-4

    def test_stable_diffusion_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        negative_prompt = "french fries"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            prompt, negative_prompt=negative_prompt, generator=generator, guidance_scale=6.0,
            num_inference_steps=2, output_type="np",
        )

        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 128, 128, 3)
        expected_slice = np.array([0.4851, 0.4617, 0.4765, 0.5127, 0.4845, 0.5153, 0.5141, 0.4886, 0.4719])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_score_sde_ve_pipeline(self):
        unet = self.dummy_uncond_unet
        scheduler = ScoreSdeVeScheduler()
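The attention-chunk test above checks that enabling attention slicing does not change the output. A minimal usage sketch under that assumption, using a Stable Diffusion checkpoint already referenced in this diff; slicing trades a little speed for lower peak memory.

import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")  # any SD checkpoint works
pipe.to("cuda")

# Compute attention in slices; slice_size=1 is the most memory-frugal setting used in the test.
pipe.enable_attention_slicing(slice_size=1)
image = pipe("A painting of a squirrel eating a burger", num_inference_steps=20).images[0]

# Slicing can be turned off again when memory is not a concern.
pipe.disable_attention_slicing()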
...
...
@@ -719,731 +429,58 @@ class PipelineFastTests(unittest.TestCase):
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_img2img(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        init_image = self.dummy_image.to(device)

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionImg2ImgPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", init_image=init_image,
        )
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", init_image=init_image, return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4492, 0.3865, 0.4222, 0.5854, 0.5139, 0.4379, 0.4193, 0.48, 0.4218])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2

    def test_stable_diffusion_img2img_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        init_image = self.dummy_image.to(device)

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionImg2ImgPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        negative_prompt = "french fries"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            prompt, negative_prompt=negative_prompt, generator=generator, guidance_scale=6.0,
            num_inference_steps=2, output_type="np", init_image=init_image,
        )
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4065, 0.3783, 0.4050, 0.5266, 0.4781, 0.4252, 0.4203, 0.4692, 0.4365])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
    def test_stable_diffusion_img2img_multiple_init_images(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        init_image = self.dummy_image.to(device).repeat(2, 1, 1, 1)

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionImg2ImgPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = 2 * ["A painting of a squirrel eating a burger"]
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            prompt, generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", init_image=init_image,
        )
        image = output.images

        image_slice = image[-1, -3:, -3:, -1]

        assert image.shape == (2, 32, 32, 3)
        expected_slice = np.array([0.5144, 0.4447, 0.4735, 0.6676, 0.5526, 0.5454, 0.645, 0.5149, 0.4689])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
    def test_stable_diffusion_img2img_k_lms(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        init_image = self.dummy_image.to(device)

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionImg2ImgPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", init_image=init_image,
        )
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", init_image=init_image, return_dict=False,
        )
        image_from_tuple = output[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4367, 0.4986, 0.4372, 0.6706, 0.5665, 0.444, 0.5864, 0.6019, 0.5203])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
    def test_stable_diffusion_inpaint_legacy(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionInpaintPipelineLegacy(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", init_image=init_image, mask_image=mask_image,
        )
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", init_image=init_image, mask_image=mask_image, return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4731, 0.5346, 0.4531, 0.6251, 0.5446, 0.4057, 0.5527, 0.5896, 0.5153])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
    def test_stable_diffusion_inpaint(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet_inpaint
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionInpaintPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=None,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", image=init_image, mask_image=mask_image,
        )
        image = output.images

        generator = torch.Generator(device=device).manual_seed(0)
        image_from_tuple = sd_pipe(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=2,
            output_type="np", image=init_image, mask_image=mask_image, return_dict=False,
        )[0]

        image_slice = image[0, -3:, -3:, -1]
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

        assert image.shape == (1, 128, 128, 3)
        expected_slice = np.array([0.5075, 0.4485, 0.4558, 0.5369, 0.5369, 0.5236, 0.5127, 0.4983, 0.4776])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
        assert np.abs(image_from_tuple_slice.flatten() - expected_slice).max() < 1e-2
    def test_stable_diffusion_inpaint_legacy_negative_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionInpaintPipelineLegacy(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        negative_prompt = "french fries"
        generator = torch.Generator(device=device).manual_seed(0)
        output = sd_pipe(
            prompt, negative_prompt=negative_prompt, generator=generator, guidance_scale=6.0,
            num_inference_steps=2, output_type="np", init_image=init_image, mask_image=mask_image,
        )
        image = output.images
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.4765, 0.5339, 0.4541, 0.6240, 0.5439, 0.4055, 0.5503, 0.5891, 0.5150])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
    def test_stable_diffusion_num_images_per_prompt(self):
        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        # test num_images_per_prompt=1 (default)
        images = sd_pipe(prompt, num_inference_steps=2, output_type="np").images

        assert images.shape == (1, 128, 128, 3)

        # test num_images_per_prompt=1 (default) for batch of prompts
        batch_size = 2
        images = sd_pipe([prompt] * batch_size, num_inference_steps=2, output_type="np").images

        assert images.shape == (batch_size, 128, 128, 3)

        # test num_images_per_prompt for single prompt
        num_images_per_prompt = 2
        images = sd_pipe(
            prompt, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
        ).images

        assert images.shape == (num_images_per_prompt, 128, 128, 3)

        # test num_images_per_prompt for batch of prompts
        batch_size = 2
        images = sd_pipe(
            [prompt] * batch_size, num_inference_steps=2, output_type="np", num_images_per_prompt=num_images_per_prompt
        ).images

        assert images.shape == (batch_size * num_images_per_prompt, 128, 128, 3)
    def test_stable_diffusion_img2img_num_images_per_prompt(self):
        device = "cpu"
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        init_image = self.dummy_image.to(device)

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionImg2ImgPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        # test num_images_per_prompt=1 (default)
        images = sd_pipe(
            prompt, num_inference_steps=2, output_type="np", init_image=init_image,
        ).images

        assert images.shape == (1, 32, 32, 3)

        # test num_images_per_prompt=1 (default) for batch of prompts
        batch_size = 2
        images = sd_pipe(
            [prompt] * batch_size, num_inference_steps=2, output_type="np", init_image=init_image,
        ).images

        assert images.shape == (batch_size, 32, 32, 3)

        # test num_images_per_prompt for single prompt
        num_images_per_prompt = 2
        images = sd_pipe(
            prompt, num_inference_steps=2, output_type="np", init_image=init_image,
            num_images_per_prompt=num_images_per_prompt,
        ).images

        assert images.shape == (num_images_per_prompt, 32, 32, 3)

        # test num_images_per_prompt for batch of prompts
        batch_size = 2
        images = sd_pipe(
            [prompt] * batch_size, num_inference_steps=2, output_type="np", init_image=init_image,
            num_images_per_prompt=num_images_per_prompt,
        ).images

        assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3)
    def test_stable_diffusion_inpaint_legacy_num_images_per_prompt(self):
        device = "cpu"
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionInpaintPipelineLegacy(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"

        # test num_images_per_prompt=1 (default)
        images = sd_pipe(
            prompt, num_inference_steps=2, output_type="np",
            init_image=init_image, mask_image=mask_image,
        ).images

        assert images.shape == (1, 32, 32, 3)

        # test num_images_per_prompt=1 (default) for batch of prompts
        batch_size = 2
        images = sd_pipe(
            [prompt] * batch_size, num_inference_steps=2, output_type="np",
            init_image=init_image, mask_image=mask_image,
        ).images

        assert images.shape == (batch_size, 32, 32, 3)

        # test num_images_per_prompt for single prompt
        num_images_per_prompt = 2
        images = sd_pipe(
            prompt, num_inference_steps=2, output_type="np",
            init_image=init_image, mask_image=mask_image,
            num_images_per_prompt=num_images_per_prompt,
        ).images

        assert images.shape == (num_images_per_prompt, 32, 32, 3)

        # test num_images_per_prompt for batch of prompts
        batch_size = 2
        images = sd_pipe(
            [prompt] * batch_size, num_inference_steps=2, output_type="np",
            init_image=init_image, mask_image=mask_image,
            num_images_per_prompt=num_images_per_prompt,
        ).images

        assert images.shape == (batch_size * num_images_per_prompt, 32, 32, 3)
    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_fp16(self):
        """Test that stable diffusion works with fp16"""
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        # put models in fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=torch_device).manual_seed(0)
        image = sd_pipe([prompt], generator=generator, num_inference_steps=2, output_type="np").images

        assert image.shape == (1, 128, 128, 3)
    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_img2img_fp16(self):
        """Test that stable diffusion img2img works with fp16"""
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        init_image = self.dummy_image.to(torch_device)

        # put models in fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionImg2ImgPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=self.dummy_safety_checker, feature_extractor=self.dummy_extractor,
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=torch_device).manual_seed(0)
        image = sd_pipe(
            [prompt], generator=generator, num_inference_steps=2, output_type="np", init_image=init_image,
        ).images

        assert image.shape == (1, 32, 32, 3)
    @unittest.skipIf(torch_device != "cuda", "This test requires a GPU")
    def test_stable_diffusion_inpaint_fp16(self):
        """Test that stable diffusion inpaint_legacy works with fp16"""
        unet = self.dummy_cond_unet_inpaint
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB").resize((128, 128))
        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))

        # put models in fp16
        unet = unet.half()
        vae = vae.half()
        bert = bert.half()

        # make sure here that pndm scheduler skips prk
        sd_pipe = StableDiffusionInpaintPipeline(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=None,
        )
        sd_pipe = sd_pipe.to(torch_device)
        sd_pipe.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.Generator(device=torch_device).manual_seed(0)
        image = sd_pipe(
            [prompt], generator=generator, num_inference_steps=2, output_type="np",
            image=init_image, mask_image=mask_image,
        ).images

        assert image.shape == (1, 128, 128, 3)
    def test_components(self):
        """Test that components property works correctly"""
        unet = self.dummy_cond_unet
        scheduler = PNDMScheduler(skip_prk_steps=True)
        vae = self.dummy_vae
        bert = self.dummy_text_encoder
        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

        image = self.dummy_image.cpu().permute(0, 2, 3, 1)[0]
        init_image = Image.fromarray(np.uint8(image)).convert("RGB")
        mask_image = Image.fromarray(np.uint8(image + 4)).convert("RGB").resize((128, 128))

        # make sure here that pndm scheduler skips prk
        inpaint = StableDiffusionInpaintPipelineLegacy(
            unet=unet, scheduler=scheduler, vae=vae, text_encoder=bert, tokenizer=tokenizer,
            safety_checker=None, feature_extractor=self.dummy_extractor,
        ).to(torch_device)
        img2img = StableDiffusionImg2ImgPipeline(**inpaint.components).to(torch_device)
        text2img = StableDiffusionPipeline(**inpaint.components).to(torch_device)

        prompt = "A painting of a squirrel eating a burger"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        image_inpaint = inpaint(
            [prompt], generator=generator, num_inference_steps=2, output_type="np",
            init_image=init_image, mask_image=mask_image,
        ).images
        image_img2img = img2img(
            [prompt], generator=generator, num_inference_steps=2, output_type="np",
            init_image=init_image,
        ).images
        image_text2img = text2img(
            [prompt], generator=generator, num_inference_steps=2, output_type="np",
        ).images

        assert image_inpaint.shape == (1, 32, 32, 3)
        assert image_img2img.shape == (1, 32, 32, 3)
        assert image_text2img.shape == (1, 128, 128, 3)
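The components property exercised by test_components lets the already-loaded weights of one pipeline be reused in another without reading them from disk again. A minimal sketch under that assumption, using classes imported in this file:

from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipelineLegacy, StableDiffusionPipeline

# Load the weights once, then share them across task-specific pipelines.
text2img = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")
img2img = StableDiffusionImg2ImgPipeline(**text2img.components)
inpaint = StableDiffusionInpaintPipelineLegacy(**text2img.components)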
class PipelineTesterMixin(unittest.TestCase):
...
...
@@ -1483,13 +520,6 @@ class PipelineTesterMixin(unittest.TestCase):
        assert cap_logger.out == "Keyword arguments {'not_used': True} not recognized.\n"

    @property
    def dummy_safety_checker(self):
        def check(images, *args, **kwargs):
            return images, [False] * len(images)

        return check

    def test_from_pretrained_save_pretrained(self):
        # 1. Load models
        model = UNet2DModel(
...
...
@@ -1597,682 +627,214 @@ class PipelineTesterMixin(unittest.TestCase):
ddpm
=
DDPMPipeline
(
unet
=
unet
,
scheduler
=
scheduler
)
ddpm
.
to
(
torch_device
)
ddpm
.
set_progress_bar_config
(
disable
=
None
)
generator
=
torch
.
manual_seed
(
0
)
image
=
ddpm
(
generator
=
generator
,
output_type
=
"numpy"
).
images
image_slice
=
image
[
0
,
-
3
:,
-
3
:,
-
1
]
assert
image
.
shape
==
(
1
,
32
,
32
,
3
)
expected_slice
=
np
.
array
([
0.41995
,
0.35885
,
0.19385
,
0.38475
,
0.3382
,
0.2647
,
0.41545
,
0.3582
,
0.33845
])
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
@
slow
def
test_ddim_lsun
(
self
):
model_id
=
"google/ddpm-ema-bedroom-256"
unet
=
UNet2DModel
.
from_pretrained
(
model_id
)
scheduler
=
DDIMScheduler
.
from_config
(
model_id
)
ddpm
=
DDIMPipeline
(
unet
=
unet
,
scheduler
=
scheduler
)
ddpm
.
to
(
torch_device
)
ddpm
.
set_progress_bar_config
(
disable
=
None
)
generator
=
torch
.
manual_seed
(
0
)
image
=
ddpm
(
generator
=
generator
,
output_type
=
"numpy"
).
images
image_slice
=
image
[
0
,
-
3
:,
-
3
:,
-
1
]
assert
image
.
shape
==
(
1
,
256
,
256
,
3
)
expected_slice
=
np
.
array
([
0.00605
,
0.0201
,
0.0344
,
0.00235
,
0.00185
,
0.00025
,
0.00215
,
0.0
,
0.00685
])
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
@
slow
def
test_ddim_cifar10
(
self
):
model_id
=
"google/ddpm-cifar10-32"
unet
=
UNet2DModel
.
from_pretrained
(
model_id
)
scheduler
=
DDIMScheduler
()
ddim
=
DDIMPipeline
(
unet
=
unet
,
scheduler
=
scheduler
)
ddim
.
to
(
torch_device
)
ddim
.
set_progress_bar_config
(
disable
=
None
)
generator
=
torch
.
manual_seed
(
0
)
image
=
ddim
(
generator
=
generator
,
eta
=
0.0
,
output_type
=
"numpy"
).
images
image_slice
=
image
[
0
,
-
3
:,
-
3
:,
-
1
]
assert
image
.
shape
==
(
1
,
32
,
32
,
3
)
expected_slice
=
np
.
array
([
0.17235
,
0.16175
,
0.16005
,
0.16255
,
0.1497
,
0.1513
,
0.15045
,
0.1442
,
0.1453
])
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
@
slow
def
test_pndm_cifar10
(
self
):
model_id
=
"google/ddpm-cifar10-32"
unet
=
UNet2DModel
.
from_pretrained
(
model_id
)
scheduler
=
PNDMScheduler
()
pndm
=
PNDMPipeline
(
unet
=
unet
,
scheduler
=
scheduler
)
pndm
.
to
(
torch_device
)
pndm
.
set_progress_bar_config
(
disable
=
None
)
generator
=
torch
.
manual_seed
(
0
)
image
=
pndm
(
generator
=
generator
,
output_type
=
"numpy"
).
images
image_slice
=
image
[
0
,
-
3
:,
-
3
:,
-
1
]
assert
image
.
shape
==
(
1
,
32
,
32
,
3
)
expected_slice
=
np
.
array
([
0.1564
,
0.14645
,
0.1406
,
0.14715
,
0.12425
,
0.14045
,
0.13115
,
0.12175
,
0.125
])
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
@
slow
def
test_ldm_text2img
(
self
):
ldm
=
LDMTextToImagePipeline
.
from_pretrained
(
"CompVis/ldm-text2im-large-256"
)
ldm
.
to
(
torch_device
)
ldm
.
set_progress_bar_config
(
disable
=
None
)
prompt
=
"A painting of a squirrel eating a burger"
generator
=
torch
.
manual_seed
(
0
)
image
=
ldm
(
[
prompt
],
generator
=
generator
,
guidance_scale
=
6.0
,
num_inference_steps
=
20
,
output_type
=
"numpy"
).
images
image_slice
=
image
[
0
,
-
3
:,
-
3
:,
-
1
]
assert
image
.
shape
==
(
1
,
256
,
256
,
3
)
expected_slice
=
np
.
array
([
0.9256
,
0.9340
,
0.8933
,
0.9361
,
0.9113
,
0.8727
,
0.9122
,
0.8745
,
0.8099
])
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
@
slow
def
test_ldm_text2img_fast
(
self
):
ldm
=
LDMTextToImagePipeline
.
from_pretrained
(
"CompVis/ldm-text2im-large-256"
)
ldm
.
to
(
torch_device
)
ldm
.
set_progress_bar_config
(
disable
=
None
)
prompt
=
"A painting of a squirrel eating a burger"
generator
=
torch
.
manual_seed
(
0
)
image
=
ldm
(
prompt
,
generator
=
generator
,
num_inference_steps
=
1
,
output_type
=
"numpy"
).
images
image_slice
=
image
[
0
,
-
3
:,
-
3
:,
-
1
]
assert
image
.
shape
==
(
1
,
256
,
256
,
3
)
expected_slice
=
np
.
array
([
0.3163
,
0.8670
,
0.6465
,
0.1865
,
0.6291
,
0.5139
,
0.2824
,
0.3723
,
0.4344
])
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
@
slow
@
unittest
.
skipIf
(
torch_device
==
"cpu"
,
"Stable diffusion is supposed to run on GPU"
)
def
test_stable_diffusion
(
self
):
# make sure here that pndm scheduler skips prk
sd_pipe
=
StableDiffusionPipeline
.
from_pretrained
(
"CompVis/stable-diffusion-v1-1"
)
sd_pipe
=
sd_pipe
.
to
(
torch_device
)
sd_pipe
.
set_progress_bar_config
(
disable
=
None
)
prompt
=
"A painting of a squirrel eating a burger"
generator
=
torch
.
Generator
(
device
=
torch_device
).
manual_seed
(
0
)
with
torch
.
autocast
(
"cuda"
):
output
=
sd_pipe
(
[
prompt
],
generator
=
generator
,
guidance_scale
=
6.0
,
num_inference_steps
=
20
,
output_type
=
"np"
)
image
=
output
.
images
image_slice
=
image
[
0
,
-
3
:,
-
3
:,
-
1
]
assert
image
.
shape
==
(
1
,
512
,
512
,
3
)
expected_slice
=
np
.
array
([
0.8887
,
0.915
,
0.91
,
0.894
,
0.909
,
0.912
,
0.919
,
0.925
,
0.883
])
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
@
slow
@
unittest
.
skipIf
(
torch_device
==
"cpu"
,
"Stable diffusion is supposed to run on GPU"
)
def
test_stable_diffusion_fast_ddim
(
self
):
sd_pipe
=
StableDiffusionPipeline
.
from_pretrained
(
"CompVis/stable-diffusion-v1-1"
)
sd_pipe
=
sd_pipe
.
to
(
torch_device
)
sd_pipe
.
set_progress_bar_config
(
disable
=
None
)
scheduler
=
DDIMScheduler
(
beta_start
=
0.00085
,
beta_end
=
0.012
,
beta_schedule
=
"scaled_linear"
,
clip_sample
=
False
,
set_alpha_to_one
=
False
,
)
sd_pipe
.
scheduler
=
scheduler
prompt
=
"A painting of a squirrel eating a burger"
generator
=
torch
.
Generator
(
device
=
torch_device
).
manual_seed
(
0
)
with
torch
.
autocast
(
"cuda"
):
output
=
sd_pipe
([
prompt
],
generator
=
generator
,
num_inference_steps
=
2
,
output_type
=
"numpy"
)
image
=
output
.
images
image_slice
=
image
[
0
,
-
3
:,
-
3
:,
-
1
]
assert
image
.
shape
==
(
1
,
512
,
512
,
3
)
expected_slice
=
np
.
array
([
0.9326
,
0.923
,
0.951
,
0.9365
,
0.9214
,
0.951
,
0.9365
,
0.9414
,
0.918
])
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
@
slow
def
test_score_sde_ve_pipeline
(
self
):
model_id
=
"google/ncsnpp-church-256"
model
=
UNet2DModel
.
from_pretrained
(
model_id
)
scheduler
=
ScoreSdeVeScheduler
.
from_config
(
model_id
)
sde_ve
=
ScoreSdeVePipeline
(
unet
=
model
,
scheduler
=
scheduler
)
sde_ve
.
to
(
torch_device
)
sde_ve
.
set_progress_bar_config
(
disable
=
None
)
generator
=
torch
.
manual_seed
(
0
)
image
=
sde_ve
(
num_inference_steps
=
10
,
output_type
=
"numpy"
,
generator
=
generator
).
images
image_slice
=
image
[
0
,
-
3
:,
-
3
:,
-
1
]
assert
image
.
shape
==
(
1
,
256
,
256
,
3
)
expected_slice
=
np
.
array
([
0.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
1.0
,
0.0
,
0.0
])
assert
np
.
abs
(
image_slice
.
flatten
()
-
expected_slice
).
max
()
<
1e-2
    @slow
    def test_ldm_uncond(self):
        ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
        ldm.to(torch_device)
        ldm.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.4399, 0.44975, 0.46825, 0.474, 0.4359, 0.4581, 0.45095, 0.4341, 0.4447])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

        # fragment of the parallel DDPM test from the adjacent diff column:
        ddpm.set_progress_bar_config(disable=None)
        image = ddpm(generator=generator, output_type="numpy").images
        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.41995, 0.35885, 0.19385, 0.38475, 0.3382, 0.2647, 0.41545, 0.3582, 0.33845])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
    @slow
    def test_ddpm_ddim_equality(self):
        model_id = "google/ddpm-cifar10-32"

        unet = UNet2DModel.from_pretrained(model_id)
        ddpm_scheduler = DDPMScheduler()
        ddim_scheduler = DDIMScheduler()

        ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler)
        ddpm.to(torch_device)
        ddpm.set_progress_bar_config(disable=None)

        ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
        ddim.to(torch_device)
        ddim.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        ddpm_image = ddpm(generator=generator, output_type="numpy").images

        generator = torch.manual_seed(0)
        ddim_image = ddim(generator=generator, num_inference_steps=1000, eta=1.0, output_type="numpy").images

        # the values aren't exactly equal, but the images look the same visually
        assert np.abs(ddpm_image - ddim_image).max() < 1e-1

    def test_ddim_lsun(self):
        model_id = "google/ddpm-ema-bedroom-256"

        unet = UNet2DModel.from_pretrained(model_id)
        scheduler = DDIMScheduler.from_config(model_id)

        ddpm = DDIMPipeline(unet=unet, scheduler=scheduler)
        ddpm.to(torch_device)
        ddpm.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        image = ddpm(generator=generator, output_type="numpy").images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.00605, 0.0201, 0.0344, 0.00235, 0.00185, 0.00025, 0.00215, 0.0, 0.00685])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
    @unittest.skip("(Anton) The test is failing for large batch sizes, needs investigation")
    def test_ddpm_ddim_equality_batched(self):
        model_id = "google/ddpm-cifar10-32"

        unet = UNet2DModel.from_pretrained(model_id)
        ddpm_scheduler = DDPMScheduler()
        ddim_scheduler = DDIMScheduler()

        ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler)
        ddpm.to(torch_device)
        ddpm.set_progress_bar_config(disable=None)

        ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
        ddim.to(torch_device)
        ddim.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        ddpm_images = ddpm(batch_size=4, generator=generator, output_type="numpy").images

        generator = torch.manual_seed(0)
        ddim_images = ddim(
            batch_size=4, generator=generator, num_inference_steps=1000, eta=1.0, output_type="numpy"
        ).images

        # the values aren't exactly equal, but the images look the same visually
        assert np.abs(ddpm_images - ddim_images).max() < 1e-1

    @slow
    def test_ddim_cifar10(self):
        model_id = "google/ddpm-cifar10-32"

        unet = UNet2DModel.from_pretrained(model_id)
        scheduler = DDIMScheduler()

        ddim = DDIMPipeline(unet=unet, scheduler=scheduler)
        ddim.to(torch_device)
        ddim.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        image = ddim(generator=generator, eta=0.0, output_type="numpy").images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.17235, 0.16175, 0.16005, 0.16255, 0.1497, 0.1513, 0.15045, 0.1442, 0.1453])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
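
As an aside, these equality tests lean on the fact that DDIM sampling with eta=1.0 and the full 1000 inference steps reduces to ancestral DDPM sampling. A minimal standalone sketch of that comparison, assuming the same "google/ddpm-cifar10-32" checkpoint and the helpers used in the tests above:

unet = UNet2DModel.from_pretrained("google/ddpm-cifar10-32")
ddpm = DDPMPipeline(unet=unet, scheduler=DDPMScheduler())
ddim = DDIMPipeline(unet=unet, scheduler=DDIMScheduler())
ddpm.to(torch_device)
ddim.to(torch_device)

generator = torch.manual_seed(0)
ddpm_image = ddpm(generator=generator, output_type="numpy").images

generator = torch.manual_seed(0)
ddim_image = ddim(generator=generator, num_inference_steps=1000, eta=1.0, output_type="numpy").images

# same seed, same UNet: the two samplers should produce visually identical images
assert np.abs(ddpm_image - ddim_image).max() < 1e-1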
    @slow
    def test_karras_ve_pipeline(self):
        model_id = "google/ncsnpp-celebahq-256"
        model = UNet2DModel.from_pretrained(model_id)
        scheduler = KarrasVeScheduler()

        pipe = KarrasVePipeline(unet=model, scheduler=scheduler)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.578, 0.5811, 0.5924, 0.5809, 0.587, 0.5886, 0.5861, 0.5802, 0.586])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_pndm_cifar10(self):
        model_id = "google/ddpm-cifar10-32"

        unet = UNet2DModel.from_pretrained(model_id)
        scheduler = PNDMScheduler()

        pndm = PNDMPipeline(unet=unet, scheduler=scheduler)
        pndm.to(torch_device)
        pndm.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        image = pndm(generator=generator, output_type="numpy").images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
        expected_slice = np.array([0.1564, 0.14645, 0.1406, 0.14715, 0.12425, 0.14045, 0.13115, 0.12175, 0.125])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_lms_stable_diffusion_pipeline(self):
        model_id = "CompVis/stable-diffusion-v1-1"
        pipe = StableDiffusionPipeline.from_pretrained(model_id).to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        scheduler = LMSDiscreteScheduler.from_config(model_id, subfolder="scheduler")
        pipe.scheduler = scheduler

        prompt = "a photograph of an astronaut riding a horse"
        generator = torch.Generator(device=torch_device).manual_seed(0)
        image = pipe(
            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
        ).images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 512, 512, 3)
        expected_slice = np.array([0.9077, 0.9254, 0.9181, 0.9227, 0.9213, 0.9367, 0.9399, 0.9406, 0.9024])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    # fragment of the parallel LDM text2img test from the adjacent diff column:
    def test_ldm_text2img(self):
        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256")
        ldm.to(torch_device)
        ldm.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.manual_seed(0)
        image = ldm(
            [prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20, output_type="numpy"
        ).images
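
For context, swapping the sampler on an already-loaded pipeline is just an attribute assignment, exactly as the LMS test above does. A minimal sketch of the same pattern outside the test harness, assuming the checkpoint ships a "scheduler" subfolder as used above:

pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-1").to(torch_device)
# load the LMS config stored alongside the checkpoint and replace the default scheduler
pipe.scheduler = LMSDiscreteScheduler.from_config("CompVis/stable-diffusion-v1-1", subfolder="scheduler")
image = pipe("a photograph of an astronaut riding a horse", num_inference_steps=10).images[0]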
    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_memory_chunking(self):
        torch.cuda.reset_peak_memory_stats()
        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16).to(
            torch_device
        )
        pipe.set_progress_bar_config(disable=None)

        prompt = "a photograph of an astronaut riding a horse"

        # make attention efficient
        pipe.enable_attention_slicing()
        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast(torch_device):
            output_chunked = pipe(
                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
            )
            image_chunked = output_chunked.images

        mem_bytes = torch.cuda.max_memory_allocated()
        torch.cuda.reset_peak_memory_stats()
        # make sure that less than 3.75 GB is allocated
        assert mem_bytes < 3.75 * 10**9

        # disable chunking
        pipe.disable_attention_slicing()
        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast(torch_device):
            output = pipe(
                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
            )
            image = output.images

        # make sure that more than 3.75 GB is allocated
        mem_bytes = torch.cuda.max_memory_allocated()
        assert mem_bytes > 3.75 * 10**9
        assert np.abs(image_chunked.flatten() - image.flatten()).max() < 1e-3
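
The memory test above exercises the user-facing attention-slicing switch: slicing caps peak VRAM while leaving the output essentially unchanged. A rough sketch of the same pattern outside a test, assuming the fp16 checkpoint loaded above and a CUDA device:

pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
).to("cuda")

# trade a bit of speed for a much smaller peak VRAM footprint
pipe.enable_attention_slicing()
image = pipe("a photograph of an astronaut riding a horse").images[0]

# switch back to full attention when memory is not a concern
pipe.disable_attention_slicing()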
    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_text2img_pipeline_fp16(self):
        torch.cuda.reset_peak_memory_stats()
        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionPipeline.from_pretrained(model_id, revision="fp16", torch_dtype=torch.float16).to(
            torch_device
        )
        pipe.set_progress_bar_config(disable=None)

        prompt = "a photograph of an astronaut riding a horse"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output_chunked = pipe(
            [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
        )
        image_chunked = output_chunked.images

        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast(torch_device):
            output = pipe(
                [prompt], generator=generator, guidance_scale=7.5, num_inference_steps=10, output_type="numpy"
            )
            image = output.images

        # Make sure results are close enough
        diff = np.abs(image_chunked.flatten() - image.flatten())
        # They ARE different since ops are not run always at the same precision
        # however, they should be extremely close.
        assert diff.mean() < 2e-2
    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_text2img_pipeline(self):
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/text2img/astronaut_riding_a_horse.png"
        )
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionPipeline.from_pretrained(
            model_id,
            safety_checker=self.dummy_safety_checker,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "astronaut riding a horse"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(prompt=prompt, strength=0.75, guidance_scale=7.5, generator=generator, output_type="np")
        image = output.images[0]

        assert image.shape == (512, 512, 3)
        assert np.abs(expected_image - image).max() < 1e-2

        # fragment of the parallel LDM text2img test from the adjacent diff column:
        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.9256, 0.9340, 0.8933, 0.9361, 0.9113, 0.8727, 0.9122, 0.8745, 0.8099])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
    def test_ldm_text2img_fast(self):
        ldm = LDMTextToImagePipeline.from_pretrained("CompVis/ldm-text2im-large-256")
        ldm.to(torch_device)
        ldm.set_progress_bar_config(disable=None)

        prompt = "A painting of a squirrel eating a burger"
        generator = torch.manual_seed(0)
        image = ldm(prompt, generator=generator, num_inference_steps=1, output_type="numpy").images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.3163, 0.8670, 0.6465, 0.1865, 0.6291, 0.5139, 0.2824, 0.3723, 0.4344])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_img2img_pipeline(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/fantasy_landscape.png"
        )
        init_image = init_image.resize((768, 512))
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            safety_checker=self.dummy_safety_checker,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt,
            init_image=init_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 768, 3)
        # img2img is flaky across GPUs even in fp32, so using MAE here
        assert np.abs(expected_image - image).mean() < 1e-2
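
The img2img path differs from text2img mainly in that it starts from an init image and a strength value that controls how much of that image is redrawn. A rough standalone sketch using the same checkpoint and input image as the test above:

pipe = StableDiffusionImg2ImgPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to("cuda")
pipe.enable_attention_slicing()

init_image = load_image(
    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
    "/img2img/sketch-mountains-input.jpg"
).resize((768, 512))

image = pipe(
    prompt="A fantasy landscape, trending on artstation",
    init_image=init_image,
    strength=0.75,  # lower strength keeps more of the init image
    guidance_scale=7.5,
).images[0]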
    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_img2img_pipeline_k_lms(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/fantasy_landscape_k_lms.png"
        )
        init_image = init_image.resize((768, 512))
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")

        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            model_id,
            scheduler=lms,
            safety_checker=self.dummy_safety_checker,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt,
            init_image=init_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 768, 3)
        # img2img is flaky across GPUs even in fp32, so using MAE here
        assert np.abs(expected_image - image).mean() < 1e-2

    # fragment of the parallel score_sde_ve test from the adjacent diff column:
    def test_score_sde_ve_pipeline(self):
        model_id = "google/ncsnpp-church-256"
        model = UNet2DModel.from_pretrained(model_id)
        scheduler = ScoreSdeVeScheduler.from_config(model_id)
        sde_ve = ScoreSdeVePipeline(unet=model, scheduler=scheduler)
        sde_ve.to(torch_device)
        sde_ve.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        image = sde_ve(num_inference_steps=10, output_type="numpy", generator=generator).images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_inpaint_pipeline(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/yellow_cat_sitting_on_a_park_bench.png"
        )
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        model_id = "runwayml/stable-diffusion-inpainting"
        pipe = StableDiffusionInpaintPipeline.from_pretrained(
            model_id,
            safety_checker=self.dummy_safety_checker,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt,
            image=init_image,
            mask_image=mask_image,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 512, 3)
        assert np.abs(expected_image - image).max() < 1e-2
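
For reference, the user-facing pattern being pinned down here is the dedicated inpainting checkpoint taking a prompt plus an image/mask pair. A rough sketch with the same checkpoint and images as the test, leaving the safety checker at its default:

pipe = StableDiffusionInpaintPipeline.from_pretrained("runwayml/stable-diffusion-inpainting").to("cuda")
pipe.enable_attention_slicing()

result = pipe(
    prompt="Face of a yellow cat, high resolution, sitting on a park bench",
    image=init_image,        # PIL image to be partially repainted
    mask_image=mask_image,   # the white region of the mask is repainted
)
result.images[0].save("inpainted.png")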
    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_inpaint_pipeline_fp16(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/yellow_cat_sitting_on_a_park_bench_fp16.png"
        )
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        model_id = "runwayml/stable-diffusion-inpainting"
        pipe = StableDiffusionInpaintPipeline.from_pretrained(
            model_id,
            revision="fp16",
            torch_dtype=torch.float16,
            safety_checker=None,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt,
            image=init_image,
            mask_image=mask_image,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 512, 3)
        assert np.abs(expected_image - image).max() < 1e-2

    # fragment of the parallel unconditional LDM test from the adjacent diff column:
    def test_ldm_uncond(self):
        ldm = LDMPipeline.from_pretrained("CompVis/ldm-celebahq-256")
        ldm.to(torch_device)
        ldm.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        image = ldm(generator=generator, num_inference_steps=5, output_type="numpy").images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.4399, 0.44975, 0.46825, 0.474, 0.4359, 0.4581, 0.45095, 0.4341, 0.4447])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_inpaint_legacy_pipeline(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/red_cat_sitting_on_a_park_bench.png"
        )
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionInpaintPipeline.from_pretrained(
            model_id,
            safety_checker=self.dummy_safety_checker,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A red cat sitting on a park bench"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt,
            init_image=init_image,
            mask_image=mask_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 512, 3)
        assert np.abs(expected_image - image).max() < 1e-2

    # fragment of the parallel DDPM/DDIM equality test from the adjacent diff column:
    def test_ddpm_ddim_equality(self):
        model_id = "google/ddpm-cifar10-32"

        unet = UNet2DModel.from_pretrained(model_id)
        ddpm_scheduler = DDPMScheduler()
        ddim_scheduler = DDIMScheduler()

        ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler)
        ddpm.to(torch_device)
        ddpm.set_progress_bar_config(disable=None)

        ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
        ddim.to(torch_device)
        ddim.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        ddpm_image = ddpm(generator=generator, output_type="numpy").images

        generator = torch.manual_seed(0)
        ddim_image = ddim(generator=generator, num_inference_steps=1000, eta=1.0, output_type="numpy").images
        # the values aren't exactly equal, but the images look the same visually
        assert np.abs(ddpm_image - ddim_image).max() < 1e-1

    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_inpaint_pipeline_pndm(self):
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/yellow_cat_sitting_on_a_park_bench_pndm.png"
        )
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        pndm = PNDMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", skip_prk_steps=True)
        model_id = "runwayml/stable-diffusion-inpainting"
        pipe = StableDiffusionInpaintPipeline.from_pretrained(
            model_id, safety_checker=self.dummy_safety_checker, scheduler=pndm
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "Face of a yellow cat, high resolution, sitting on a park bench"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt,
            image=init_image,
            mask_image=mask_image,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 512, 3)
        assert np.abs(expected_image - image).max() < 1e-2

    # fragment of the parallel batched DDPM/DDIM equality test from the adjacent diff column:
    @unittest.skip("(Anton) The test is failing for large batch sizes, needs investigation")
    def test_ddpm_ddim_equality_batched(self):
        model_id = "google/ddpm-cifar10-32"

        unet = UNet2DModel.from_pretrained(model_id)
        ddpm_scheduler = DDPMScheduler()
        ddim_scheduler = DDIMScheduler()

        ddpm = DDPMPipeline(unet=unet, scheduler=ddpm_scheduler)
        ddpm.to(torch_device)
        ddpm.set_progress_bar_config(disable=None)

        ddim = DDIMPipeline(unet=unet, scheduler=ddim_scheduler)
        ddim.to(torch_device)
        ddim.set_progress_bar_config(disable=None)
        generator = torch.manual_seed(0)
        ddpm_images = ddpm(batch_size=4, generator=generator, output_type="numpy").images

        generator = torch.manual_seed(0)
        ddim_images = ddim(
            batch_size=4, generator=generator, num_inference_steps=1000, eta=1.0, output_type="numpy"
        ).images

        # the values aren't exactly equal, but the images look the same visually
        assert np.abs(ddpm_images - ddim_images).max() < 1e-1

    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_inpaint_legacy_pipeline_k_lms(self):
        # TODO(Anton, Patrick) - I think we can remove this test soon
        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
        )
        expected_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/red_cat_sitting_on_a_park_bench_k_lms.png"
        )
        expected_image = np.array(expected_image, dtype=np.float32) / 255.0

        lms = LMSDiscreteScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear")

        model_id = "CompVis/stable-diffusion-v1-4"
        pipe = StableDiffusionInpaintPipeline.from_pretrained(
            model_id,
            scheduler=lms,
            safety_checker=self.dummy_safety_checker,
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A red cat sitting on a park bench"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        output = pipe(
            prompt=prompt,
            init_image=init_image,
            mask_image=mask_image,
            strength=0.75,
            guidance_scale=7.5,
            generator=generator,
            output_type="np",
        )
        image = output.images[0]

        assert image.shape == (512, 512, 3)
        assert np.abs(expected_image - image).max() < 1e-2

    # fragment of the parallel Karras VE test from the adjacent diff column:
    @slow
    def test_karras_ve_pipeline(self):
        model_id = "google/ncsnpp-celebahq-256"
        model = UNet2DModel.from_pretrained(model_id)
        scheduler = KarrasVeScheduler()

        pipe = KarrasVePipeline(unet=model, scheduler=scheduler)
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)

        generator = torch.manual_seed(0)
        image = pipe(num_inference_steps=20, generator=generator, output_type="numpy").images

        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 256, 256, 3)
        expected_slice = np.array([0.578, 0.5811, 0.5924, 0.5809, 0.587, 0.5886, 0.5861, 0.5802, 0.586])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
    @slow
    def test_stable_diffusion_onnx(self):
        ...

    ...
@@ -2356,170 +918,6 @@ class PipelineTesterMixin(unittest.TestCase):
        expected_slice = np.array([0.2951, 0.2955, 0.2922, 0.2036, 0.1977, 0.2279, 0.1716, 0.1641, 0.1799])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3
    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_text2img_intermediate_state(self):
        number_of_steps = 0

        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            test_callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 0:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [1.8285, 1.2857, -0.1024, 1.2406, -2.3068, 1.0747, -0.0818, -0.6520, -2.9506]
                )
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
            elif step == 50:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [1.1078, 1.5803, 0.2773, -0.0589, -1.7928, -0.3665, -0.4695, -1.0727, -1.1601]
                )
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-2

        test_callback_fn.has_been_called = False

        pipe = StableDiffusionPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
        )
        pipe = pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "Andromeda galaxy in a bottle"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast(torch_device):
            pipe(
                prompt=prompt,
                num_inference_steps=50,
                guidance_scale=7.5,
                generator=generator,
                callback=test_callback_fn,
                callback_steps=1,
            )
        assert test_callback_fn.has_been_called
        assert number_of_steps == 51
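
The test above documents the shape of the per-step hook: the callback receives the step index, the scheduler timestep, and the current latents, and callback_steps controls how often it fires. A minimal sketch of using it just to watch progress, with a hypothetical print-only callback:

def log_latents(step: int, timestep: int, latents: torch.FloatTensor) -> None:
    # for a 512x512 image the latents are (1, 4, 64, 64) at every step
    print(f"step {step} (t={timestep}): latent mean {latents.mean().item():.4f}")

pipe("Andromeda galaxy in a bottle", num_inference_steps=50, callback=log_latents, callback_steps=10)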
    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_img2img_intermediate_state(self):
        number_of_steps = 0

        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            test_callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 0:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 96)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [0.9052, -0.0184, 0.4810, 0.2898, 0.5851, 1.4920, 0.5362, 1.9838, 0.0530]
                )
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
            elif step == 37:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 96)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [0.7071, 0.7831, 0.8300, 1.8140, 1.7840, 1.9402, 1.3651, 1.6590, 1.2828]
                )
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-2

        test_callback_fn.has_been_called = False

        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/img2img/sketch-mountains-input.jpg"
        )
        init_image = init_image.resize((768, 512))

        pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A fantasy landscape, trending on artstation"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast(torch_device):
            pipe(
                prompt=prompt,
                init_image=init_image,
                strength=0.75,
                num_inference_steps=50,
                guidance_scale=7.5,
                generator=generator,
                callback=test_callback_fn,
                callback_steps=1,
            )
        assert test_callback_fn.has_been_called
        assert number_of_steps == 38
    @slow
    @unittest.skipIf(torch_device == "cpu", "Stable diffusion is supposed to run on GPU")
    def test_stable_diffusion_inpaint_legacy_intermediate_state(self):
        number_of_steps = 0

        def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> None:
            test_callback_fn.has_been_called = True
            nonlocal number_of_steps
            number_of_steps += 1
            if step == 0:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [-0.5472, 1.1218, -0.5505, -0.9390, -1.0794, 0.4063, 0.5158, 0.6429, -1.5246]
                )
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3
            elif step == 37:
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
                expected_slice = np.array(
                    [0.4781, 1.1572, 0.6258, 0.2291, 0.2554, -0.1443, 0.7085, -0.1598, -0.5659]
                )
                assert np.abs(latents_slice.flatten() - expected_slice).max() < 1e-3

        test_callback_fn.has_been_called = False

        init_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo.png"
        )
        mask_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main"
            "/in_paint/overture-creations-5sI6fQgYIuo_mask.png"
        )

        pipe = StableDiffusionInpaintPipeline.from_pretrained(
            "CompVis/stable-diffusion-v1-4", revision="fp16", torch_dtype=torch.float16
        )
        pipe.to(torch_device)
        pipe.set_progress_bar_config(disable=None)
        pipe.enable_attention_slicing()

        prompt = "A red cat sitting on a park bench"

        generator = torch.Generator(device=torch_device).manual_seed(0)
        with torch.autocast(torch_device):
            pipe(
                prompt=prompt,
                init_image=init_image,
                mask_image=mask_image,
                strength=0.75,
                num_inference_steps=50,
                guidance_scale=7.5,
                generator=generator,
                callback=test_callback_fn,
                callback_steps=1,
            )
        assert test_callback_fn.has_been_called
        assert number_of_steps == 38
    @slow
    def test_stable_diffusion_onnx_intermediate_state(self):
        number_of_steps = 0
        ...

    ...