renzhc/diffusers_dcu — unverified commit 79ea8eb2, authored Apr 22, 2025 by Ishan Modi, committed via GitHub on Apr 21, 2025.
[BUG] fixes in kadinsky pipeline (#11080)
* bug fix kadinsky pipeline
Parent: e7f3a737

Showing 5 changed files with 52 additions and 125 deletions (+52, −125).
Changed files:
- src/diffusers/image_processor.py (+6, −1)
- src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py (+13, −20)
- src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py (+11, −33)
- src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py (+11, −39)
- src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py (+11, −32)
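In short, the commit teaches `VaeImageProcessor` to forward a `reducing_gap` argument to PIL resizing, then has every Kandinsky img2img pipeline build such a processor instead of carrying its own module-level `prepare_image` helper; output conversion likewise moves to `image_processor.postprocess`. Below is a minimal sketch of the old and new preprocessing paths. It is illustrative only: `old_prepare_image` is a stand-in that mirrors the removed helper, and the sketch assumes a diffusers build that already contains this patch (for the `reducing_gap` argument).

```python
import numpy as np
import torch
from PIL import Image

from diffusers.image_processor import VaeImageProcessor


def old_prepare_image(pil_image, w=512, h=512):
    # Mirrors the removed module-level helper: PIL resize + manual normalization to [-1, 1].
    pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
    arr = np.array(pil_image.convert("RGB")).astype(np.float32) / 127.5 - 1
    return torch.from_numpy(np.transpose(arr, [2, 0, 1])).unsqueeze(0)


# New path: the pipelines now build a VaeImageProcessor in __init__ and call preprocess().
processor = VaeImageProcessor(vae_scale_factor=8, vae_latent_channels=4, resample="bicubic", reducing_gap=1)

pil = Image.new("RGB", (768, 512))
old = old_prepare_image(pil, 512, 512)                  # shape (1, 3, 512, 512), values in [-1, 1]
new = processor.preprocess(pil, height=512, width=512)  # same shape and range
print(old.shape, new.shape)
```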
src/diffusers/image_processor.py

```diff
@@ -116,6 +116,7 @@ class VaeImageProcessor(ConfigMixin):
         vae_scale_factor: int = 8,
         vae_latent_channels: int = 4,
         resample: str = "lanczos",
+        reducing_gap: int = None,
         do_normalize: bool = True,
         do_binarize: bool = False,
         do_convert_rgb: bool = False,
```
```diff
@@ -498,7 +499,11 @@ class VaeImageProcessor(ConfigMixin):
             raise ValueError(f"Only PIL image input is supported for resize_mode {resize_mode}")
         if isinstance(image, PIL.Image.Image):
             if resize_mode == "default":
-                image = image.resize((width, height), resample=PIL_INTERPOLATION[self.config.resample])
+                image = image.resize(
+                    (width, height),
+                    resample=PIL_INTERPOLATION[self.config.resample],
+                    reducing_gap=self.config.reducing_gap,
+                )
             elif resize_mode == "fill":
                 image = self._resize_and_fill(image, width, height)
             elif resize_mode == "crop":
```
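For reference, `reducing_gap` is a standard optional argument of `PIL.Image.Image.resize`: it lets Pillow perform a cheap box reduction before the final resample, which speeds up large downscales and matches what the removed Kandinsky `prepare_image` helpers always requested (`reducing_gap=1`). A small standalone illustration, not part of the commit:

```python
from PIL import Image

img = Image.new("RGB", (2048, 2048))

# Plain bicubic resize (what VaeImageProcessor.resize did before this commit).
a = img.resize((512, 512), resample=Image.BICUBIC)

# Resize with reducing_gap, as the patched resize() does when the processor
# is constructed with reducing_gap=1.
b = img.resize((512, 512), resample=Image.BICUBIC, reducing_gap=1)

print(a.size, b.size)  # (512, 512) (512, 512); pixel values may differ slightly
```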
src/diffusers/pipelines/kandinsky/pipeline_kandinsky_img2img.py

```diff
@@ -14,14 +14,13 @@
 from typing import Callable, List, Optional, Union

-import numpy as np
 import PIL.Image
 import torch
-from PIL import Image
 from transformers import (
     XLMRobertaTokenizer,
 )

+from ...image_processor import VaeImageProcessor
 from ...models import UNet2DConditionModel, VQModel
 from ...schedulers import DDIMScheduler
 from ...utils import (
```
```diff
@@ -95,15 +94,6 @@ def get_new_h_w(h, w, scale_factor=8):
     return new_h * scale_factor, new_w * scale_factor


-def prepare_image(pil_image, w=512, h=512):
-    pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
-    arr = np.array(pil_image.convert("RGB"))
-    arr = arr.astype(np.float32) / 127.5 - 1
-    arr = np.transpose(arr, [2, 0, 1])
-    image = torch.from_numpy(arr).unsqueeze(0)
-    return image
-
-
 class KandinskyImg2ImgPipeline(DiffusionPipeline):
     """
     Pipeline for image-to-image generation using Kandinsky
```
```diff
@@ -143,7 +133,16 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
             scheduler=scheduler,
             movq=movq,
         )
-        self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1)
+        self.movq_scale_factor = (
+            2 ** (len(self.movq.config.block_out_channels) - 1) if getattr(self, "movq", None) else 8
+        )
+        movq_latent_channels = self.movq.config.latent_channels if getattr(self, "movq", None) else 4
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=self.movq_scale_factor,
+            vae_latent_channels=movq_latent_channels,
+            resample="bicubic",
+            reducing_gap=1,
+        )

     def get_timesteps(self, num_inference_steps, strength, device):
         # get the original timestep using init_timestep
```
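The `getattr(self, "movq", None)` guards above keep the constructor usable when a pipeline variant is built without a `movq` component, falling back to the conventional defaults of a scale factor of 8 and 4 latent channels. A tiny illustrative sketch of that fallback (the `movq` object here is hypothetical, not taken from the commit):

```python
# Illustrative only: mirrors the fallback used when building the VaeImageProcessor.
movq = None  # e.g. a pipeline constructed without a MoVQ decoder

scale_factor = 2 ** (len(movq.config.block_out_channels) - 1) if movq else 8
latent_channels = movq.config.latent_channels if movq else 4

print(scale_factor, latent_channels)  # 8 4; a MoVQ with four block_out_channels entries would also give 2**3 = 8
```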
```diff
@@ -417,7 +416,7 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
                 f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor"
             )

-        image = torch.cat([prepare_image(i, width, height) for i in image], dim=0)
+        image = torch.cat([self.image_processor.preprocess(i, width, height) for i in image], dim=0)
         image = image.to(dtype=prompt_embeds.dtype, device=device)

         latents = self.movq.encode(image)["latents"]
```
```diff
@@ -498,13 +497,7 @@ class KandinskyImg2ImgPipeline(DiffusionPipeline):
         if output_type not in ["pt", "np", "pil"]:
             raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")

-        if output_type in ["np", "pil"]:
-            image = image * 0.5 + 0.5
-            image = image.clamp(0, 1)
-            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
+        image = self.image_processor.postprocess(image, output_type)

         if not return_dict:
             return (image,)
```
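The single `postprocess` call is meant to reproduce the hand-rolled denormalize/clamp/permute/`numpy_to_pil` chain it replaces. A rough equivalence check, illustrative only and using a random tensor in place of a real MoVQ decode:

```python
import torch

from diffusers.image_processor import VaeImageProcessor

processor = VaeImageProcessor()

# A fake decoded batch in [-1, 1], standing in for movq.decode(...)["sample"].
decoded = torch.rand(1, 3, 512, 512) * 2 - 1

# Old, hand-rolled path for output_type="pil":
manual = (decoded * 0.5 + 0.5).clamp(0, 1).cpu().permute(0, 2, 3, 1).float().numpy()
manual_pil = VaeImageProcessor.numpy_to_pil(manual)

# New path:
new_pil = processor.postprocess(decoded, output_type="pil")

print(manual_pil[0].size, new_pil[0].size)  # effectively identical PIL images
```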
src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_controlnet_img2img.py

```diff
@@ -14,11 +14,10 @@
 from typing import Callable, List, Optional, Union

-import numpy as np
 import PIL.Image
 import torch
-from PIL import Image

+from ...image_processor import VaeImageProcessor
 from ...models import UNet2DConditionModel, VQModel
 from ...schedulers import DDPMScheduler
 from ...utils import (
```
```diff
@@ -105,27 +104,6 @@ EXAMPLE_DOC_STRING = """
 """


-# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width
-def downscale_height_and_width(height, width, scale_factor=8):
-    new_height = height // scale_factor**2
-    if height % scale_factor**2 != 0:
-        new_height += 1
-    new_width = width // scale_factor**2
-    if width % scale_factor**2 != 0:
-        new_width += 1
-    return new_height * scale_factor, new_width * scale_factor
-
-
-# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.prepare_image
-def prepare_image(pil_image, w=512, h=512):
-    pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
-    arr = np.array(pil_image.convert("RGB"))
-    arr = arr.astype(np.float32) / 127.5 - 1
-    arr = np.transpose(arr, [2, 0, 1])
-    image = torch.from_numpy(arr).unsqueeze(0)
-    return image


 class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
     """
     Pipeline for image-to-image generation using Kandinsky
```
```diff
@@ -157,7 +135,14 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
             scheduler=scheduler,
             movq=movq,
         )
-        self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1)
+        movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) if getattr(self, "movq", None) else 8
+        movq_latent_channels = self.movq.config.latent_channels if getattr(self, "movq", None) else 4
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=movq_scale_factor,
+            vae_latent_channels=movq_latent_channels,
+            resample="bicubic",
+            reducing_gap=1,
+        )

     # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.KandinskyImg2ImgPipeline.get_timesteps
     def get_timesteps(self, num_inference_steps, strength, device):
```
```diff
@@ -316,7 +301,7 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
                 f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor"
             )

-        image = torch.cat([prepare_image(i, width, height) for i in image], dim=0)
+        image = torch.cat([self.image_processor.preprocess(i, width, height) for i in image], dim=0)
         image = image.to(dtype=image_embeds.dtype, device=device)

         latents = self.movq.encode(image)["latents"]
```
```diff
@@ -324,7 +309,6 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
         self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
-        height, width = downscale_height_and_width(height, width, self.movq_scale_factor)
         latents = self.prepare_latents(
             latents, latent_timestep, batch_size, num_images_per_prompt, image_embeds.dtype, device, generator
         )
```
```diff
@@ -379,13 +363,7 @@ class KandinskyV22ControlnetImg2ImgPipeline(DiffusionPipeline):
         if output_type not in ["pt", "np", "pil"]:
             raise ValueError(f"Only the output types `pt`, `pil` and `np` are supported not output_type={output_type}")

-        if output_type in ["np", "pil"]:
-            image = image * 0.5 + 0.5
-            image = image.clamp(0, 1)
-            image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-
-        if output_type == "pil":
-            image = self.numpy_to_pil(image)
+        image = self.image_processor.postprocess(image, output_type)

         if not return_dict:
             return (image,)
```
src/diffusers/pipelines/kandinsky2_2/pipeline_kandinsky2_2_img2img.py

```diff
@@ -14,11 +14,10 @@
 from typing import Callable, Dict, List, Optional, Union

-import numpy as np
 import PIL.Image
 import torch
-from PIL import Image

+from ...image_processor import VaeImageProcessor
 from ...models import UNet2DConditionModel, VQModel
 from ...schedulers import DDPMScheduler
 from ...utils import deprecate, is_torch_xla_available, logging
```
```diff
@@ -76,27 +75,6 @@ EXAMPLE_DOC_STRING = """
 """


-# Copied from diffusers.pipelines.kandinsky2_2.pipeline_kandinsky2_2.downscale_height_and_width
-def downscale_height_and_width(height, width, scale_factor=8):
-    new_height = height // scale_factor**2
-    if height % scale_factor**2 != 0:
-        new_height += 1
-    new_width = width // scale_factor**2
-    if width % scale_factor**2 != 0:
-        new_width += 1
-    return new_height * scale_factor, new_width * scale_factor
-
-
-# Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.prepare_image
-def prepare_image(pil_image, w=512, h=512):
-    pil_image = pil_image.resize((w, h), resample=Image.BICUBIC, reducing_gap=1)
-    arr = np.array(pil_image.convert("RGB"))
-    arr = arr.astype(np.float32) / 127.5 - 1
-    arr = np.transpose(arr, [2, 0, 1])
-    image = torch.from_numpy(arr).unsqueeze(0)
-    return image


 class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
     """
     Pipeline for image-to-image generation using Kandinsky
```
```diff
@@ -129,7 +107,14 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
             scheduler=scheduler,
             movq=movq,
         )
-        self.movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1)
+        movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) if getattr(self, "movq", None) else 8
+        movq_latent_channels = self.movq.config.latent_channels if getattr(self, "movq", None) else 4
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=movq_scale_factor,
+            vae_latent_channels=movq_latent_channels,
+            resample="bicubic",
+            reducing_gap=1,
+        )

     # Copied from diffusers.pipelines.kandinsky.pipeline_kandinsky_img2img.KandinskyImg2ImgPipeline.get_timesteps
     def get_timesteps(self, num_inference_steps, strength, device):
```
```diff
@@ -319,7 +304,7 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
                 f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor"
             )

-        image = torch.cat([prepare_image(i, width, height) for i in image], dim=0)
+        image = torch.cat([self.image_processor.preprocess(i, width, height) for i in image], dim=0)
         image = image.to(dtype=image_embeds.dtype, device=device)

         latents = self.movq.encode(image)["latents"]
```
```diff
@@ -327,7 +312,6 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
         self.scheduler.set_timesteps(num_inference_steps, device=device)
         timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
-        height, width = downscale_height_and_width(height, width, self.movq_scale_factor)
         latents = self.prepare_latents(
             latents, latent_timestep, batch_size, num_images_per_prompt, image_embeds.dtype, device, generator
         )
```
```diff
@@ -383,21 +367,9 @@ class KandinskyV22Img2ImgPipeline(DiffusionPipeline):
             if XLA_AVAILABLE:
                 xm.mark_step()

         if output_type not in ["pt", "np", "pil", "latent"]:
             raise ValueError(
                 f"Only the output types `pt`, `pil` ,`np` and `latent` are supported not output_type={output_type}"
             )

         if not output_type == "latent":
             # post-processing
             image = self.movq.decode(latents, force_not_quantize=True)["sample"]
-            if output_type in ["np", "pil"]:
-                image = image * 0.5 + 0.5
-                image = image.clamp(0, 1)
-                image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-
-            if output_type == "pil":
-                image = self.numpy_to_pil(image)
+            image = self.image_processor.postprocess(image, output_type)
         else:
             image = latents
```
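The hunk above also keeps the `"latent"` fast path: when `output_type="latent"`, the pipeline returns the raw MoVQ latents and skips both the decode and the new `postprocess` call. A self-contained sketch of that dispatch, with a stand-in decoder so it runs without model weights (`fake_decode` is obviously not the real MoVQ):

```python
import torch

from diffusers.image_processor import VaeImageProcessor

processor = VaeImageProcessor(vae_scale_factor=8, vae_latent_channels=4, resample="bicubic")


def finalize(latents: torch.Tensor, output_type: str, movq_decode):
    # Mirrors the output handling in the patched pipeline (illustrative only).
    if output_type not in ["pt", "np", "pil", "latent"]:
        raise ValueError(f"Only the output types `pt`, `pil`, `np` and `latent` are supported not output_type={output_type}")
    if output_type != "latent":
        image = movq_decode(latents)  # stands in for self.movq.decode(...)["sample"]
        return processor.postprocess(image, output_type)
    return latents


latents = torch.randn(1, 4, 64, 64)
fake_decode = lambda z: torch.tanh(torch.nn.functional.interpolate(z[:, :3], scale_factor=8))

print(type(finalize(latents, "latent", fake_decode)))  # raw latents pass through untouched
print(type(finalize(latents, "pil", fake_decode)[0]))  # PIL.Image.Image
```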
src/diffusers/pipelines/kandinsky3/pipeline_kandinsky3_img2img.py

```diff
 import inspect
 from typing import Callable, Dict, List, Optional, Union

-import numpy as np
 import PIL
 import PIL.Image
 import torch
 from transformers import T5EncoderModel, T5Tokenizer

+from ...image_processor import VaeImageProcessor
 from ...loaders import StableDiffusionLoraLoaderMixin
 from ...models import Kandinsky3UNet, VQModel
 from ...schedulers import DDPMScheduler
```
```diff
@@ -53,24 +53,6 @@ EXAMPLE_DOC_STRING = """
 """


-def downscale_height_and_width(height, width, scale_factor=8):
-    new_height = height // scale_factor**2
-    if height % scale_factor**2 != 0:
-        new_height += 1
-    new_width = width // scale_factor**2
-    if width % scale_factor**2 != 0:
-        new_width += 1
-    return new_height * scale_factor, new_width * scale_factor
-
-
-def prepare_image(pil_image):
-    arr = np.array(pil_image.convert("RGB"))
-    arr = arr.astype(np.float32) / 127.5 - 1
-    arr = np.transpose(arr, [2, 0, 1])
-    image = torch.from_numpy(arr).unsqueeze(0)
-    return image


 class Kandinsky3Img2ImgPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
     model_cpu_offload_seq = "text_encoder->movq->unet->movq"
     _callback_tensor_inputs = [
```
```diff
@@ -94,6 +76,14 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
         self.register_modules(
             tokenizer=tokenizer, text_encoder=text_encoder, unet=unet, scheduler=scheduler, movq=movq
         )
+        movq_scale_factor = 2 ** (len(self.movq.config.block_out_channels) - 1) if getattr(self, "movq", None) else 8
+        movq_latent_channels = self.movq.config.latent_channels if getattr(self, "movq", None) else 4
+        self.image_processor = VaeImageProcessor(
+            vae_scale_factor=movq_scale_factor,
+            vae_latent_channels=movq_latent_channels,
+            resample="bicubic",
+            reducing_gap=1,
+        )

     def get_timesteps(self, num_inference_steps, strength, device):
         # get the original timestep using init_timestep
```
```diff
@@ -566,7 +556,7 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
                 f"Input is in incorrect format: {[type(i) for i in image]}. Currently, we only support PIL image and pytorch tensor"
             )

-        image = torch.cat([prepare_image(i) for i in image], dim=0)
+        image = torch.cat([self.image_processor.preprocess(i) for i in image], dim=0)
         image = image.to(dtype=prompt_embeds.dtype, device=device)
         # 4. Prepare timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
```
```diff
@@ -630,20 +620,9 @@ class Kandinsky3Img2ImgPipeline(DiffusionPipeline, StableDiffusionLoraLoaderMixin):
                 xm.mark_step()

         # post-processing
         if output_type not in ["pt", "np", "pil", "latent"]:
             raise ValueError(
                 f"Only the output types `pt`, `pil`, `np` and `latent` are supported not output_type={output_type}"
             )
         if not output_type == "latent":
             image = self.movq.decode(latents, force_not_quantize=True)["sample"]
-            if output_type in ["np", "pil"]:
-                image = image * 0.5 + 0.5
-                image = image.clamp(0, 1)
-                image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-
-            if output_type == "pil":
-                image = self.numpy_to_pil(image)
+            image = self.image_processor.postprocess(image, output_type)
         else:
             image = latents
```
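From the caller's side nothing changes; resizing and normalization now simply happen inside the pipeline's `VaeImageProcessor`. A hedged usage sketch for Kandinsky 3 img2img: the checkpoint id is the public Hub repo, the gray placeholder input only keeps the example self-contained, and a real run requires the model download and a capable GPU.

```python
import torch
from PIL import Image

from diffusers import AutoPipelineForImage2Image

pipe = AutoPipelineForImage2Image.from_pretrained(
    "kandinsky-community/kandinsky-3", torch_dtype=torch.float16
)
pipe.enable_model_cpu_offload()

prompt = "A painting of the inside of a subway train with tiny raccoons."
init_image = Image.new("RGB", (1024, 1024), "gray")  # placeholder; use a real image in practice

# The input is resized/normalized by pipe.image_processor (bicubic, reducing_gap=1)
# rather than by the removed module-level prepare_image helper.
result = pipe(
    prompt,
    image=init_image,
    strength=0.75,
    num_inference_steps=25,
    generator=torch.Generator("cpu").manual_seed(0),
)
result.images[0].save("kandinsky3_img2img.png")
```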