OpenDAS / diffusers · Commits · 11631e81

Commit 11631e81, authored Jun 13, 2022 by Patrick von Platen.
Commit message: merge
Merge parents: 13c5a065, b8a67640
Showing 11 changed files with 371 additions and 121 deletions (+371, −121):
- README.md (+85, −46)
- src/diffusers/__init__.py (+1, −1)
- src/diffusers/configuration_utils.py (+4, −4)
- src/diffusers/pipelines/__init__.py (+1, −0)
- src/diffusers/pipelines/conversion_glide.py (+3, −1)
- src/diffusers/pipelines/pipeline_bddm.py (+113, −36)
- src/diffusers/pipelines/pipeline_glide.py (+17, −23)
- src/diffusers/schedulers/glide_ddim.py (deleted)
- src/diffusers/schedulers/scheduling_ddim.py (+17, −9)
- src/diffusers/schedulers/scheduling_ddpm.py (+14, −1)
- src/diffusers/trainers/training_ddpm.py (+116, −0, new file)
README.md (+85, −46):

````diff
@@ -164,7 +164,7 @@ image_pil = PIL.Image.fromarray(image_processed[0])
 image_pil.save("test.png")
 ```
 
-**Text to Image generation with Latent Diffusion**
+#### **Text to Image generation with Latent Diffusion**
 
 ```python
 from diffusers import DiffusionPipeline
````
````diff
@@ -184,59 +184,98 @@ image_pil = PIL.Image.fromarray(image_processed[0])
 # save image
 image_pil.save("test.png")
 ```
+
+#### **Text to speech with BDDM**
+
+_Follow the instructions [here](https://pytorch.org/hub/nvidia_deeplearningexamples_tacotron2/) to load tacotron2 model._
+
+```python
+import torch
+from diffusers import BDDM, DiffusionPipeline
+
+torch_device = "cuda"
+
+# load the BDDM pipeline
+bddm = DiffusionPipeline.from_pretrained("fusing/diffwave-vocoder")
+
+# load tacotron2 to get the mel spectrograms
+tacotron2 = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp16')
+tacotron2 = tacotron2.to(torch_device).eval()
+
+text = "Hello world, I missed you so much."
+
+utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')
+sequences, lengths = utils.prepare_input_sequence([text])
+
+# generate mel spectrograms using text
+with torch.no_grad():
+    mel_spec, _, _ = tacotron2.infer(sequences, lengths)
+
+# generate the speech by passing mel spectrograms to BDDM pipeline
+generator = torch.manual_seed(0)
+audio = bddm(mel_spec, generator, torch_device)
+
+# save generated audio
+from scipy.io.wavfile import write as wavwrite
+sampling_rate = 22050
+wavwrite("generated_audio.wav", sampling_rate, audio.squeeze().cpu().numpy())
+```
 
 ## Library structure:
 
 ```
-├── models
-│   ├── audio
-│   │   └── fastdiff
-│   │       ├── modeling_fastdiff.py
-│   │       ├── README.md
-│   │       └── run_fastdiff.py
-│   ├── __init__.py
-│   └── vision
-│       ├── dalle2
-│       │   ├── modeling_dalle2.py
-│       │   ├── README.md
-│       │   └── run_dalle2.py
-│       ├── ddpm
-│       │   ├── example.py
-│       │   ├── modeling_ddpm.py
-│       │   ├── README.md
-│       │   └── run_ddpm.py
-│       ├── glide
-│       │   ├── modeling_glide.py
-│       │   ├── modeling_vqvae.py.py
-│       │   ├── README.md
-│       │   └── run_glide.py
-│       ├── imagen
-│       │   ├── modeling_dalle2.py
-│       │   ├── README.md
-│       │   └── run_dalle2.py
-│       ├── __init__.py
-│       └── latent_diffusion
-│           ├── modeling_latent_diffusion.py
-│           ├── README.md
-│           └── run_latent_diffusion.py
-├── pyproject.toml
-├── README.md
-├── setup.cfg
-├── setup.py
-├── src
-│   └── diffusers
-│       ├── configuration_utils.py
-│       ├── __init__.py
-│       ├── modeling_utils.py
-│       ├── models
-│       │   ├── __init__.py
-│       │   ├── unet_glide.py
-│       │   └── unet.py
-│       ├── pipeline_utils.py
-│       └── schedulers
-│           ├── gaussian_ddpm.py
-│           ├── __init__.py
-├── tests
-│   └── test_modeling_utils.py
+├── LICENSE
+├── Makefile
+├── pyproject.toml
+├── README.md
+├── setup.cfg
+├── setup.py
+├── src
+│   └── diffusers
+│       ├── __init__.py
+│       ├── configuration_utils.py
+│       ├── dependency_versions_check.py
+│       ├── dependency_versions_table.py
+│       ├── dynamic_modules_utils.py
+│       ├── modeling_utils.py
+│       ├── models
+│       │   ├── __init__.py
+│       │   ├── unet.py
+│       │   ├── unet_glide.py
+│       │   └── unet_ldm.py
+│       ├── pipeline_utils.py
+│       ├── pipelines
+│       │   ├── __init__.py
+│       │   ├── configuration_ldmbert.py
+│       │   ├── conversion_glide.py
+│       │   ├── modeling_vae.py
+│       │   ├── pipeline_bddm.py
+│       │   ├── pipeline_ddim.py
+│       │   ├── pipeline_ddpm.py
+│       │   ├── pipeline_glide.py
+│       │   └── pipeline_latent_diffusion.py
+│       ├── schedulers
+│       │   ├── __init__.py
+│       │   ├── classifier_free_guidance.py
+│       │   ├── scheduling_ddim.py
+│       │   ├── scheduling_ddpm.py
+│       │   ├── scheduling_plms.py
+│       │   └── scheduling_utils.py
+│       ├── testing_utils.py
+│       └── utils
+│           ├── __init__.py
+│           └── logging.py
+├── tests
+│   ├── __init__.py
+│   ├── test_modeling_utils.py
+│   └── test_scheduler.py
+└── utils
+    ├── check_config_docstrings.py
+    ├── check_copies.py
+    ├── check_dummies.py
+    ├── check_inits.py
+    ├── check_repo.py
+    ├── check_table.py
+    └── check_tf_ops.py
 ```
````
src/diffusers/__init__.py (+1, −1):

```diff
@@ -9,6 +9,6 @@ from .models.unet import UNetModel
 from .models.unet_glide import GLIDESuperResUNetModel, GLIDETextToImageUNetModel
 from .models.unet_ldm import UNetLDMModel
 from .pipeline_utils import DiffusionPipeline
-from .pipelines import DDIM, DDPM, GLIDE, LatentDiffusion, PNDM
+from .pipelines import DDIM, DDPM, GLIDE, LatentDiffusion, PNDM, BDDM
 from .schedulers import DDIMScheduler, DDPMScheduler, SchedulerMixin, PNDMScheduler
 from .schedulers.classifier_free_guidance import ClassifierFreeGuidanceScheduler
```
src/diffusers/configuration_utils.py (+4, −4):

```diff
@@ -225,11 +225,11 @@ class ConfigMixin:
             text = reader.read()
         return json.loads(text)
 
-    def __eq__(self, other):
-        return self.__dict__ == other.__dict__
+    # def __eq__(self, other):
+    #     return self.__dict__ == other.__dict__
 
-    def __repr__(self):
-        return f"{self.__class__.__name__} {self.to_json_string()}"
+    # def __repr__(self):
+    #     return f"{self.__class__.__name__} {self.to_json_string()}"
 
     @property
     def config(self) -> Dict[str, Any]:
```
src/diffusers/pipelines/__init__.py (+1, −0):

```diff
@@ -3,3 +3,4 @@ from .pipeline_ddpm import DDPM
 from .pipeline_pndm import PNDM
 from .pipeline_glide import GLIDE
 from .pipeline_latent_diffusion import LatentDiffusion
+from .pipeline_bddm import BDDM
```
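With this export in place, and the matching `BDDM` entry added to the top-level `src/diffusers/__init__.py` above, the new pipeline is importable from the package root like the existing ones. A quick sanity sketch:

```python
# The new text-to-speech pipeline now sits next to the existing exports.
from diffusers import BDDM, DDIM, DDPM, GLIDE
```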
src/diffusers/pipelines/conversion_glide.py (+3, −1):

```diff
@@ -97,7 +97,9 @@ superres_model = GLIDESuperResUNetModel(
 superres_model.load_state_dict(ups_state_dict, strict=False)
 
-upscale_scheduler = DDIMScheduler(timesteps=1000, beta_schedule="linear", beta_start=0.0001, beta_end=0.02)
+upscale_scheduler = DDIMScheduler(
+    timesteps=1000, beta_schedule="linear", beta_start=0.0001, beta_end=0.02, tensor_format="pt"
+)
 
 glide = GLIDE(
     text_unet=text2im_model,
```
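The added `tensor_format="pt"` mirrors the `set_format("pt")` call that the new `BDDM` pipeline applies to its scheduler: the scheduler keeps its internal arrays as PyTorch tensors instead of the default NumPy ones (`tensor_format="np"` in the scheduler constructors below), so its outputs compose directly with the torch-based models. A minimal construction sketch, with the arguments copied from the conversion script:

```python
from diffusers import DDIMScheduler

# tensor_format="pt" makes the scheduler operate on torch tensors
# rather than the default NumPy arrays.
upscale_scheduler = DDIMScheduler(
    timesteps=1000, beta_schedule="linear", beta_start=0.0001, beta_end=0.02, tensor_format="pt"
)
```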
src/diffusers/pipelines/pipeline_bddm.py (+113, −36):

```diff
@@ -13,11 +13,18 @@
 import math
 
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+import tqdm
+
+from ..configuration_utils import ConfigMixin
+from ..modeling_utils import ModelMixin
+from ..pipeline_utils import DiffusionPipeline
 
 
 def calc_diffusion_step_embedding(diffusion_steps, diffusion_step_embed_dim_in):
     """
@@ -41,8 +48,7 @@ def calc_diffusion_step_embedding(diffusion_steps, diffusion_step_embed_dim_in):
     _embed = np.log(10000) / (half_dim - 1)
     _embed = torch.exp(torch.arange(half_dim) * -_embed).cuda()
     _embed = diffusion_steps * _embed
-    diffusion_step_embed = torch.cat((torch.sin(_embed),
-                                      torch.cos(_embed)), 1)
+    diffusion_step_embed = torch.cat((torch.sin(_embed), torch.cos(_embed)), 1)
 
     return diffusion_step_embed
@@ -62,8 +68,7 @@ class Conv(nn.Module):
     def __init__(self, in_channels, out_channels, kernel_size=3, dilation=1):
         super().__init__()
         self.padding = dilation * (kernel_size - 1) // 2
-        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size,
-                              dilation=dilation, padding=self.padding)
+        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, dilation=dilation, padding=self.padding)
         self.conv = nn.utils.weight_norm(self.conv)
         nn.init.kaiming_normal_(self.conv.weight)
@@ -89,8 +94,7 @@ class ZeroConv1d(nn.Module):
 # every residual block (named residual layer in paper)
 # contains one noncausal dilated conv
 class ResidualBlock(nn.Module):
-    def __init__(self, res_channels, skip_channels, dilation,
-                 diffusion_step_embed_dim_out):
+    def __init__(self, res_channels, skip_channels, dilation, diffusion_step_embed_dim_out):
         super().__init__()
         self.res_channels = res_channels
@@ -98,15 +102,12 @@ class ResidualBlock(nn.Module):
         self.fc_t = nn.Linear(diffusion_step_embed_dim_out, self.res_channels)
 
         # Dilated conv layer
-        self.dilated_conv_layer = Conv(self.res_channels, 2 * self.res_channels,
-                                       kernel_size=3, dilation=dilation)
+        self.dilated_conv_layer = Conv(self.res_channels, 2 * self.res_channels, kernel_size=3, dilation=dilation)
 
         # Add mel spectrogram upsampler and conditioner conv1x1 layer
         self.upsample_conv2d = nn.ModuleList()
         for s in [16, 16]:
-            conv_trans2d = nn.ConvTranspose2d(1, 1, (3, 2 * s),
-                                              padding=(1, s // 2),
-                                              stride=(1, s))
+            conv_trans2d = nn.ConvTranspose2d(1, 1, (3, 2 * s), padding=(1, s // 2), stride=(1, s))
             conv_trans2d = nn.utils.weight_norm(conv_trans2d)
             nn.init.kaiming_normal_(conv_trans2d.weight)
             self.upsample_conv2d.append(conv_trans2d)
@@ -152,7 +153,7 @@ class ResidualBlock(nn.Module):
         h += mel_spec
 
         # Gated-tanh nonlinearity
-        out = torch.tanh(h[:, :self.res_channels, :]) * torch.sigmoid(h[:, self.res_channels:, :])
+        out = torch.tanh(h[:, : self.res_channels, :]) * torch.sigmoid(h[:, self.res_channels :, :])
 
         # Residual and skip outputs
         res = self.res_conv(out)
@@ -164,10 +165,16 @@ class ResidualBlock(nn.Module):
 
 class ResidualGroup(nn.Module):
-    def __init__(self, res_channels, skip_channels, num_res_layers, dilation_cycle,
-                 diffusion_step_embed_dim_in,
-                 diffusion_step_embed_dim_mid,
-                 diffusion_step_embed_dim_out):
+    def __init__(
+        self,
+        res_channels,
+        skip_channels,
+        num_res_layers,
+        dilation_cycle,
+        diffusion_step_embed_dim_in,
+        diffusion_step_embed_dim_mid,
+        diffusion_step_embed_dim_out,
+    ):
         super().__init__()
         self.num_res_layers = num_res_layers
         self.diffusion_step_embed_dim_in = diffusion_step_embed_dim_in
@@ -180,16 +187,19 @@ class ResidualGroup(nn.Module):
         self.residual_blocks = nn.ModuleList()
         for n in range(self.num_res_layers):
             self.residual_blocks.append(
-                ResidualBlock(res_channels, skip_channels,
-                              dilation=2 ** (n % dilation_cycle),
-                              diffusion_step_embed_dim_out=diffusion_step_embed_dim_out))
+                ResidualBlock(
+                    res_channels,
+                    skip_channels,
+                    dilation=2 ** (n % dilation_cycle),
+                    diffusion_step_embed_dim_out=diffusion_step_embed_dim_out,
+                )
+            )
 
     def forward(self, input_data):
         x, mel_spectrogram, diffusion_steps = input_data
 
         # Embed diffusion step t
-        diffusion_step_embed = calc_diffusion_step_embedding(
-            diffusion_steps, self.diffusion_step_embed_dim_in)
+        diffusion_step_embed = calc_diffusion_step_embedding(diffusion_steps, self.diffusion_step_embed_dim_in)
         diffusion_step_embed = swish(self.fc_t1(diffusion_step_embed))
         diffusion_step_embed = swish(self.fc_t2(diffusion_step_embed))
@@ -206,27 +216,52 @@ class ResidualGroup(nn.Module):
         return skip * math.sqrt(1.0 / self.num_res_layers)
 
-class DiffWave(nn.Module):
-    def __init__(self, in_channels, res_channels, skip_channels, out_channels,
-                 num_res_layers, dilation_cycle,
-                 diffusion_step_embed_dim_in,
-                 diffusion_step_embed_dim_mid,
-                 diffusion_step_embed_dim_out):
+class DiffWave(ModelMixin, ConfigMixin):
+    def __init__(
+        self,
+        in_channels=1,
+        res_channels=128,
+        skip_channels=128,
+        out_channels=1,
+        num_res_layers=30,
+        dilation_cycle=10,
+        diffusion_step_embed_dim_in=128,
+        diffusion_step_embed_dim_mid=512,
+        diffusion_step_embed_dim_out=512,
+    ):
         super().__init__()
+
+        # register all init arguments with self.register
+        self.register(
+            in_channels=in_channels,
+            res_channels=res_channels,
+            skip_channels=skip_channels,
+            out_channels=out_channels,
+            num_res_layers=num_res_layers,
+            dilation_cycle=dilation_cycle,
+            diffusion_step_embed_dim_in=diffusion_step_embed_dim_in,
+            diffusion_step_embed_dim_mid=diffusion_step_embed_dim_mid,
+            diffusion_step_embed_dim_out=diffusion_step_embed_dim_out,
+        )
 
         # Initial conv1x1 with relu
         self.init_conv = nn.Sequential(Conv(in_channels, res_channels, kernel_size=1), nn.ReLU(inplace=False))
 
         # All residual layers
-        self.residual_layer = ResidualGroup(res_channels,
-                                            skip_channels,
-                                            num_res_layers,
-                                            dilation_cycle,
-                                            diffusion_step_embed_dim_in,
-                                            diffusion_step_embed_dim_mid,
-                                            diffusion_step_embed_dim_out)
+        self.residual_layer = ResidualGroup(
+            res_channels,
+            skip_channels,
+            num_res_layers,
+            dilation_cycle,
+            diffusion_step_embed_dim_in,
+            diffusion_step_embed_dim_mid,
+            diffusion_step_embed_dim_out,
+        )
 
         # Final conv1x1 -> relu -> zeroconv1x1
-        self.final_conv = nn.Sequential(Conv(skip_channels, skip_channels, kernel_size=1),
-                                        nn.ReLU(inplace=False),
-                                        ZeroConv1d(skip_channels, out_channels))
+        self.final_conv = nn.Sequential(
+            Conv(skip_channels, skip_channels, kernel_size=1),
+            nn.ReLU(inplace=False),
+            ZeroConv1d(skip_channels, out_channels),
+        )
 
     def forward(self, input_data):
         audio, mel_spectrogram, diffusion_steps = input_data
@@ -234,3 +269,45 @@ class DiffWave(nn.Module):
         x = self.init_conv(x).clone()
         x = self.residual_layer((x, mel_spectrogram, diffusion_steps))
         return self.final_conv(x)
+
+
+class BDDM(DiffusionPipeline):
+    def __init__(self, diffwave, noise_scheduler):
+        super().__init__()
+        noise_scheduler = noise_scheduler.set_format("pt")
+        self.register_modules(diffwave=diffwave, noise_scheduler=noise_scheduler)
+
+    @torch.no_grad()
+    def __call__(self, mel_spectrogram, generator, torch_device=None):
+        if torch_device is None:
+            torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+
+        self.diffwave.to(torch_device)
+
+        mel_spectrogram = mel_spectrogram.to(torch_device)
+        audio_length = mel_spectrogram.size(-1) * 256
+        audio_size = (1, 1, audio_length)
+
+        # Sample gaussian noise to begin loop
+        audio = torch.normal(0, 1, size=audio_size, generator=generator).to(torch_device)
+
+        timestep_values = self.noise_scheduler.timestep_values
+        num_prediction_steps = len(self.noise_scheduler)
+        for t in tqdm.tqdm(reversed(range(num_prediction_steps)), total=num_prediction_steps):
+            # 1. predict noise residual
+            ts = (torch.tensor(timestep_values[t]) * torch.ones((1, 1))).to(torch_device)
+            residual = self.diffwave((audio, mel_spectrogram, ts))
+
+            # 2. predict previous mean of audio x_t-1
+            pred_prev_audio = self.noise_scheduler.step(residual, audio, t)
+
+            # 3. optionally sample variance
+            variance = 0
+            if t > 0:
+                noise = torch.normal(0, 1, size=audio_size, generator=generator).to(torch_device)
+                variance = self.noise_scheduler.get_variance(t).sqrt() * noise
+
+            # 4. set current audio to prev_audio: x_t -> x_t-1
+            audio = pred_prev_audio + variance
+
+        return audio
```
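Taken together, `DiffWave` (now a `ModelMixin`/`ConfigMixin`, so its init arguments are captured via `self.register`) and the new `BDDM` pipeline give the text-to-speech path shown in the README. One detail worth noting from `__call__`: the waveform length is tied to the conditioning mel spectrogram by a fixed hop of 256 samples per frame (`audio_length = mel_spectrogram.size(-1) * 256`). A minimal usage sketch, assuming a `mel_spec` tensor is already available (e.g. from the tacotron2 example above):

```python
import torch
from diffusers import DiffusionPipeline

# load the pipeline as in the README example
bddm = DiffusionPipeline.from_pretrained("fusing/diffwave-vocoder")

generator = torch.manual_seed(0)
audio = bddm(mel_spec, generator, torch_device="cuda")  # mel_spec assumed: (1, n_mels, n_frames)

# each mel frame expands to 256 waveform samples
assert audio.shape[-1] == mel_spec.size(-1) * 256
```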
src/diffusers/pipelines/pipeline_glide.py (+17, −23):

```diff
@@ -28,13 +28,7 @@ from transformers import CLIPConfig, CLIPModel, CLIPTextConfig, CLIPVisionConfig
 from transformers.activations import ACT2FN
 from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
 from transformers.modeling_utils import PreTrainedModel
-from transformers.utils import (
-    ModelOutput,
-    add_start_docstrings,
-    add_start_docstrings_to_model_forward,
-    logging,
-    replace_return_docstrings,
-)
+from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
 
 from ..models import GLIDESuperResUNetModel, GLIDETextToImageUNetModel
 from ..pipeline_utils import DiffusionPipeline
@@ -872,31 +866,31 @@ class GLIDE(DiffusionPipeline):
         # Sample gaussian noise to begin loop
         image = torch.randn(
-            (batch_size, self.unet.in_channels, self.unet.resolution, self.unet.resolution),
+            (
+                batch_size,
+                self.upscale_unet.in_channels // 2,
+                self.upscale_unet.resolution,
+                self.upscale_unet.resolution,
+            ),
             generator=generator,
         )
-        image = image.to(torch_device)
+        image = image.to(torch_device) * upsample_temp
 
-        # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
-        # Ideally, read DDIM paper in-detail understanding
-        # Notation (<variable name> -> <name in paper>
-        # - pred_noise_t -> e_theta(x_t, t)
-        # - pred_original_image -> f_theta(x_t, t) or x_0
-        # - std_dev_t -> sigma_t
-        # - eta -> η
-        # - pred_image_direction -> "direction pointingc to x_t"
-        # - pred_prev_image -> "x_t-1"
+        num_trained_timesteps = self.upscale_noise_scheduler.timesteps
+        inference_step_times = range(0, num_trained_timesteps, num_trained_timesteps // num_inference_steps_upscale)
+        # adapt the beta schedule to the number of steps
+        # self.upscale_noise_scheduler.rescale_betas(num_inference_steps_upscale)
 
         for t in tqdm.tqdm(reversed(range(num_inference_steps_upscale)), total=num_inference_steps_upscale):
             # 1. predict noise residual
             with torch.no_grad():
-                time_input = torch.tensor([t] * image.shape[0], device=torch_device)
+                time_input = torch.tensor([inference_step_times[t]] * image.shape[0], device=torch_device)
                 model_output = self.upscale_unet(image, time_input, low_res)
                 noise_residual, pred_variance = torch.split(model_output, 3, dim=1)
 
             # 2. predict previous mean of image x_t-1
             pred_prev_image = self.upscale_noise_scheduler.step(
-                noise_residual, image, t, num_inference_steps_upscale, eta
+                noise_residual, image, t, num_inference_steps_upscale, eta, use_clipped_residual=True
             )
 
             # 3. optionally sample variance
@@ -910,6 +904,6 @@ class GLIDE(DiffusionPipeline):
             # 4. set current image to prev_image: x_t -> x_t-1
             image = pred_prev_image + variance
 
-        image = image.permute(0, 2, 3, 1)
+        image = image.clamp(-1, 1).permute(0, 2, 3, 1)
 
         return image
```
src/diffusers/schedulers/glide_ddim.py: deleted (file mode 100644 → 0).
src/diffusers/schedulers/scheduling_ddim.py (+17, −9):

```diff
@@ -26,6 +26,8 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
         beta_start=0.0001,
         beta_end=0.02,
         beta_schedule="linear",
+        trained_betas=None,
+        timestep_values=None,
         clip_predicted_image=True,
         tensor_format="np",
     ):
@@ -37,6 +39,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
             beta_schedule=beta_schedule,
         )
         self.timesteps = int(timesteps)
+        self.timestep_values = timestep_values  # save the fixed timestep values for BDDM
         self.clip_image = clip_predicted_image
 
         if beta_schedule == "linear":
@@ -69,14 +72,15 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
         #
         # self.register_buffer("log_variance", log_variance.to(torch.float32))
 
-    def rescale_betas(self, num_timesteps):
-        if self.beta_schedule == "linear":
-            scale = self.timesteps / num_timesteps
-            self.betas = linear_beta_schedule(
-                num_timesteps, beta_start=self.beta_start * scale, beta_end=self.beta_end * scale
-            )
-            self.alphas = 1.0 - self.betas
-            self.alphas_cumprod = np.cumprod(self.alphas, axis=0)
+    # def rescale_betas(self, num_timesteps):
+    #     # GLIDE scaling
+    #     if self.beta_schedule == "linear":
+    #         scale = self.timesteps / num_timesteps
+    #         self.betas = linear_beta_schedule(
+    #             num_timesteps, beta_start=self.beta_start * scale, beta_end=self.beta_end * scale
+    #         )
+    #         self.alphas = 1.0 - self.betas
+    #         self.alphas_cumprod = np.cumprod(self.alphas, axis=0)
 
     def get_alpha(self, time_step):
         return self.alphas[time_step]
@@ -107,7 +111,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
         return variance
 
-    def step(self, residual, image, t, num_inference_steps, eta):
+    def step(self, residual, image, t, num_inference_steps, eta, use_clipped_residual=False):
         # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
         # Ideally, read DDIM paper in-detail understanding
@@ -141,6 +145,10 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
         variance = self.get_variance(t, num_inference_steps)
         std_dev_t = eta * variance ** (0.5)
 
+        if use_clipped_residual:
+            # the residual is always re-derived from the clipped x_0 in GLIDE
+            residual = (image - alpha_prod_t ** (0.5) * pred_original_image) / beta_prod_t ** (0.5)
+
         # 6. compute "direction pointing to x_t" of formula (12) from https://arxiv.org/pdf/2010.02502.pdf
         pred_image_direction = (1 - alpha_prod_t_prev - std_dev_t**2) ** (0.5) * residual
```
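The `use_clipped_residual` branch keeps the DDIM update self-consistent after `x_0` clipping: GLIDE clamps the predicted original image, so the noise residual is re-derived by inverting the forward relation rather than reusing the raw network output. With `alpha_prod_t` standing for $\bar\alpha_t$ and `beta_prod_t` for $1-\bar\alpha_t$, the added line implements:

```latex
% forward relation and its inversion for the clipped x_0 prediction
x_t = \sqrt{\bar\alpha_t}\, x_0 + \sqrt{1-\bar\alpha_t}\, \epsilon
\quad\Longrightarrow\quad
\epsilon = \frac{x_t - \sqrt{\bar\alpha_t}\, \hat{x}_0^{\mathrm{clipped}}}{\sqrt{1-\bar\alpha_t}}
```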
src/diffusers/schedulers/scheduling_ddpm.py (+14, −1):

```diff
@@ -26,6 +26,8 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
         beta_start=0.0001,
         beta_end=0.02,
         beta_schedule="linear",
+        trained_betas=None,
+        timestep_values=None,
         variance_type="fixed_small",
         clip_predicted_image=True,
         tensor_format="np",
@@ -36,14 +38,19 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
             beta_start=beta_start,
             beta_end=beta_end,
             beta_schedule=beta_schedule,
+            trained_betas=trained_betas,
+            timestep_values=timestep_values,
             variance_type=variance_type,
             clip_predicted_image=clip_predicted_image,
         )
         self.timesteps = int(timesteps)
+        self.timestep_values = timestep_values  # save the fixed timestep values for BDDM
         self.clip_image = clip_predicted_image
         self.variance_type = variance_type
 
-        if beta_schedule == "linear":
+        if trained_betas is not None:
+            self.betas = np.asarray(trained_betas)
+        elif beta_schedule == "linear":
             self.betas = linear_beta_schedule(timesteps, beta_start=beta_start, beta_end=beta_end)
         elif beta_schedule == "squaredcos_cap_v2":
             # GLIDE cosine schedule
@@ -56,6 +63,8 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
         self.alphas = 1.0 - self.betas
         self.alphas_cumprod = np.cumprod(self.alphas, axis=0)
+        self.sqrt_alphas_cumprod = np.sqrt(self.alphas_cumprod)
+        self.sqrt_one_minus_alphas_cumprod = np.sqrt(1 - self.alphas_cumprod)
         self.one = np.array(1.0)
 
         self.set_format(tensor_format=tensor_format)
@@ -131,5 +140,9 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
         return pred_prev_image
 
+    def forward_step(self, original_image, noise, t):
+        noisy_image = self.sqrt_alphas_cumprod[t] * original_image + self.sqrt_one_minus_alphas_cumprod[t] * noise
+        return noisy_image
+
     def __len__(self):
         return self.timesteps
```
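The new `forward_step` is the forward (noising) process q(x_t | x_0) that the training script below relies on, built from the precomputed `sqrt_alphas_cumprod` buffers. A small behavioral sketch under the scheduler's default NumPy format (the array shapes here are illustrative only):

```python
import numpy as np
from diffusers import DDPMScheduler

scheduler = DDPMScheduler(timesteps=1000)  # tensor_format="np" by default

x0 = np.ones((3, 64, 64))           # a clean sample
noise = np.random.randn(3, 64, 64)  # epsilon ~ N(0, 1)

# x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * noise
x_t = scheduler.forward_step(x0, noise, 500)
assert x_t.shape == x0.shape  # same shape, progressively noisier as t grows
```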
src/diffusers/trainers/training_ddpm.py (new file, +116, −0):

```python
import random

import numpy as np
import torch
import torch.nn.functional as F

import PIL.Image
from accelerate import Accelerator
from datasets import load_dataset
from diffusers import DDPM, DDPMScheduler, UNetModel
from torchvision.transforms import CenterCrop, Compose, Lambda, RandomHorizontalFlip, Resize, ToTensor
from tqdm.auto import tqdm
from transformers import get_linear_schedule_with_warmup


def set_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)


set_seed(0)

accelerator = Accelerator(mixed_precision="fp16")
model = UNetModel(ch=128, ch_mult=(1, 2, 4, 8), resolution=64)
noise_scheduler = DDPMScheduler(timesteps=1000)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

num_epochs = 100
batch_size = 8
gradient_accumulation_steps = 8

augmentations = Compose(
    [
        Resize(64),
        CenterCrop(64),
        RandomHorizontalFlip(),
        ToTensor(),
        Lambda(lambda x: x * 2 - 1),
    ]
)
dataset = load_dataset("huggan/pokemon", split="train")


def transforms(examples):
    images = [augmentations(image.convert("RGB")) for image in examples["image"]]
    return {"input": images}


dataset = dataset.shuffle(seed=0)
dataset.set_transform(transforms)
train_dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=False)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer=optimizer,
    num_warmup_steps=1000,
    num_training_steps=(len(train_dataloader) * num_epochs) // gradient_accumulation_steps,
)

model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, lr_scheduler
)

for epoch in range(num_epochs):
    model.train()
    pbar = tqdm(total=len(train_dataloader), unit="ba")
    pbar.set_description(f"Epoch {epoch}")
    for step, batch in enumerate(train_dataloader):
        clean_images = batch["input"]
        noisy_images = torch.empty_like(clean_images)
        bsz = clean_images.shape[0]
        timesteps = torch.randint(0, noise_scheduler.timesteps, (bsz,), device=clean_images.device).long()
        for idx in range(bsz):
            noise = torch.randn_like(clean_images[0]).to(clean_images.device)
            noisy_images[idx] = noise_scheduler.forward_step(clean_images[idx], noise, timesteps[idx])

        if step % gradient_accumulation_steps == 0:
            with accelerator.no_sync(model):
                output = model(noisy_images, timesteps)
                loss = F.l1_loss(output, clean_images)
                accelerator.backward(loss)
        else:
            output = model(noisy_images, timesteps)
            loss = F.l1_loss(output, clean_images)
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
        pbar.update(1)
        pbar.set_postfix(loss=loss.detach().item(), lr=optimizer.param_groups[0]["lr"])
    optimizer.step()

    # eval
    model.eval()
    with torch.no_grad():
        pipeline = DDPM(unet=model, noise_scheduler=noise_scheduler)
        generator = torch.Generator()
        generator = generator.manual_seed(0)
        # run pipeline in inference (sample random noise and denoise)
        image = pipeline(generator=generator)

        # process image to PIL
        image_processed = image.cpu().permute(0, 2, 3, 1)
        image_processed = (image_processed + 1.0) * 127.5
        image_processed = image_processed.type(torch.uint8).numpy()
        image_pil = PIL.Image.fromarray(image_processed[0])

        # save image
        pipeline.save_pretrained("./poke-ddpm")
        image_pil.save(f"./poke-ddpm/test_{epoch}.png")
```
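Since the epoch loop calls `pipeline.save_pretrained("./poke-ddpm")`, the trained pipeline can presumably be reloaded the same way the README examples load hub checkpoints; a sketch, assuming the training run above has completed:

```python
import torch
from diffusers import DDPM

# reload the locally saved pipeline and sample from it
pipeline = DDPM.from_pretrained("./poke-ddpm")
image = pipeline(generator=torch.manual_seed(0))  # expected: a (1, 3, 64, 64) tensor in [-1, 1]
```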