renzhc / diffusers_dcu · Commits

Commit b09b152f, authored Jun 21, 2022 by anton-l

Merge branch 'main' of github.com:huggingface/diffusers

Parents: a2117cb7, 4497e78d
Changes: 49

Showing 20 changed files with 111 additions and 3014 deletions (+111 −3014)
src/diffusers/pipelines/old/glide/README.md  +0 −4
src/diffusers/pipelines/old/glide/convert_weights.py  +0 −111
src/diffusers/pipelines/old/glide/modeling_glide.py  +0 −923
src/diffusers/pipelines/old/glide/run_glide.py  +0 −24
src/diffusers/pipelines/old/latent_diffusion/README.md  +0 −0
src/diffusers/pipelines/old/latent_diffusion/configuration_ldmbert.py  +0 −146
src/diffusers/pipelines/old/latent_diffusion/modeling_latent_diffusion.py  +0 −107
src/diffusers/pipelines/old/latent_diffusion/modeling_ldmbert.py  +0 −706
src/diffusers/pipelines/old/latent_diffusion/modeling_vae.py  +0 −859
src/diffusers/pipelines/old/latent_diffusion/modeling_vqvae.py  +0 −0
src/diffusers/pipelines/old/latent_diffusion/run_latent_diffusion.py  +0 −0
src/diffusers/pipelines/pipeline_bddm.py  +1 −1
src/diffusers/pipelines/pipeline_ddim.py  +1 −1
src/diffusers/pipelines/pipeline_glide.py  +5 −11
src/diffusers/pipelines/pipeline_grad_tts.py  +1 −1
src/diffusers/pipelines/pipeline_latent_diffusion.py  +1 −1
src/diffusers/pipelines/pipeline_pndm.py  +3 −3
src/diffusers/schedulers/classifier_free_guidance.py  +1 −2
src/diffusers/schedulers/scheduling_ddim.py  +45 −59
src/diffusers/schedulers/scheduling_ddpm.py  +53 −55
src/diffusers/pipelines/old/glide/README.md  (deleted, 100644 → 0)

# References

[GLIDE: Towards Photorealistic Image Generation and Editing with Text-Guided Diffusion Models](https://arxiv.org/pdf/2112.10741.pdf)

[Diffusion Models Beat GANs on Image Synthesis](https://arxiv.org/pdf/2105.05233.pdf)
src/diffusers/pipelines/old/glide/convert_weights.py  (deleted, 100644 → 0)

```python
import torch
from torch import nn

from diffusers import ClassifierFreeGuidanceScheduler, DDIMScheduler, GLIDESuperResUNetModel, GLIDETextToImageUNetModel
from modeling_glide import GLIDE, CLIPTextModel
from transformers import CLIPTextConfig, GPT2Tokenizer


# wget https://openaipublic.blob.core.windows.net/diffusion/dec-2021/base.pt
state_dict = torch.load("base.pt", map_location="cpu")
state_dict = {k: nn.Parameter(v) for k, v in state_dict.items()}

### Convert the text encoder

config = CLIPTextConfig(
    vocab_size=50257,
    max_position_embeddings=128,
    hidden_size=512,
    intermediate_size=2048,
    num_hidden_layers=16,
    num_attention_heads=8,
    use_padding_embeddings=True,
)
model = CLIPTextModel(config).eval()
tokenizer = GPT2Tokenizer(
    "./glide-base/tokenizer/vocab.json", "./glide-base/tokenizer/merges.txt", pad_token="<|endoftext|>"
)

hf_encoder = model.text_model

hf_encoder.embeddings.token_embedding.weight = state_dict["token_embedding.weight"]
hf_encoder.embeddings.position_embedding.weight.data = state_dict["positional_embedding"]
hf_encoder.embeddings.padding_embedding.weight.data = state_dict["padding_embedding"]

hf_encoder.final_layer_norm.weight = state_dict["final_ln.weight"]
hf_encoder.final_layer_norm.bias = state_dict["final_ln.bias"]

for layer_idx in range(config.num_hidden_layers):
    hf_layer = hf_encoder.encoder.layers[layer_idx]
    hf_layer.self_attn.qkv_proj.weight = state_dict[f"transformer.resblocks.{layer_idx}.attn.c_qkv.weight"]
    hf_layer.self_attn.qkv_proj.bias = state_dict[f"transformer.resblocks.{layer_idx}.attn.c_qkv.bias"]
    hf_layer.self_attn.out_proj.weight = state_dict[f"transformer.resblocks.{layer_idx}.attn.c_proj.weight"]
    hf_layer.self_attn.out_proj.bias = state_dict[f"transformer.resblocks.{layer_idx}.attn.c_proj.bias"]

    hf_layer.layer_norm1.weight = state_dict[f"transformer.resblocks.{layer_idx}.ln_1.weight"]
    hf_layer.layer_norm1.bias = state_dict[f"transformer.resblocks.{layer_idx}.ln_1.bias"]
    hf_layer.layer_norm2.weight = state_dict[f"transformer.resblocks.{layer_idx}.ln_2.weight"]
    hf_layer.layer_norm2.bias = state_dict[f"transformer.resblocks.{layer_idx}.ln_2.bias"]

    hf_layer.mlp.fc1.weight = state_dict[f"transformer.resblocks.{layer_idx}.mlp.c_fc.weight"]
    hf_layer.mlp.fc1.bias = state_dict[f"transformer.resblocks.{layer_idx}.mlp.c_fc.bias"]
    hf_layer.mlp.fc2.weight = state_dict[f"transformer.resblocks.{layer_idx}.mlp.c_proj.weight"]
    hf_layer.mlp.fc2.bias = state_dict[f"transformer.resblocks.{layer_idx}.mlp.c_proj.bias"]

### Convert the Text-to-Image UNet

text2im_model = GLIDETextToImageUNetModel(
    in_channels=3,
    model_channels=192,
    out_channels=6,
    num_res_blocks=3,
    attention_resolutions=(2, 4, 8),
    dropout=0.1,
    channel_mult=(1, 2, 3, 4),
    num_heads=1,
    num_head_channels=64,
    num_heads_upsample=1,
    use_scale_shift_norm=True,
    resblock_updown=True,
    transformer_dim=512,
)

text2im_model.load_state_dict(state_dict, strict=False)

text_scheduler = ClassifierFreeGuidanceScheduler(timesteps=1000, beta_schedule="squaredcos_cap_v2")

### Convert the Super-Resolution UNet

# wget https://openaipublic.blob.core.windows.net/diffusion/dec-2021/upsample.pt
ups_state_dict = torch.load("upsample.pt", map_location="cpu")

superres_model = GLIDESuperResUNetModel(
    in_channels=6,
    model_channels=192,
    out_channels=6,
    num_res_blocks=2,
    attention_resolutions=(8, 16, 32),
    dropout=0.1,
    channel_mult=(1, 1, 2, 2, 4, 4),
    num_heads=1,
    num_head_channels=64,
    num_heads_upsample=1,
    use_scale_shift_norm=True,
    resblock_updown=True,
)

superres_model.load_state_dict(ups_state_dict, strict=False)

upscale_scheduler = DDIMScheduler(timesteps=1000, beta_schedule="linear")

glide = GLIDE(
    text_unet=text2im_model,
    text_noise_scheduler=text_scheduler,
    text_encoder=model,
    tokenizer=tokenizer,
    upscale_unet=superres_model,
    upscale_noise_scheduler=upscale_scheduler,
)

glide.save_pretrained("./glide-base")
```
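Because both `load_state_dict` calls above use `strict=False`, missing and unexpected keys are silently ignored, so a quick post-conversion check makes renamed or dropped parameters visible. A minimal sketch reusing the names from the script above (the printed counts and the spot-checked tensor are illustrative additions, not part of the original file):

```python
# Report which checkpoint keys were not consumed by the converted text-to-image UNet.
load_result = text2im_model.load_state_dict(state_dict, strict=False)
print(f"missing keys:    {len(load_result.missing_keys)}")
print(f"unexpected keys: {len(load_result.unexpected_keys)}")

# Spot-check one converted text-encoder tensor against the original checkpoint.
converted = hf_encoder.embeddings.token_embedding.weight
original = state_dict["token_embedding.weight"]
assert converted.shape == original.shape
assert torch.allclose(converted, original)
```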
src/diffusers/pipelines/old/glide/modeling_glide.py  (deleted, 100644 → 0; diff collapsed)
src/diffusers/pipelines/old/glide/run_glide.py  (deleted, 100644 → 0)

```python
import torch

import PIL.Image
from diffusers import DiffusionPipeline


generator = torch.Generator()
generator = generator.manual_seed(0)

model_id = "fusing/glide-base"

# load model and scheduler
pipeline = DiffusionPipeline.from_pretrained(model_id)

# run inference (text-conditioned denoising + upscaling)
img = pipeline("a crayon drawing of a corgi", generator)

# process image to PIL
img = img.squeeze(0)
img = ((img + 1) * 127.5).round().clamp(0, 255).to(torch.uint8).cpu().numpy()
image_pil = PIL.Image.fromarray(img)

# save image
image_pil.save("test.png")
```
src/diffusers/pipelines/old/latent_diffusion/README.md  (deleted, 100644 → 0)
src/diffusers/pipelines/old/latent_diffusion/configuration_ldmbert.py  (deleted, 100644 → 0)

````python
# coding=utf-8
# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" LDMBERT model configuration"""

from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)

LDMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "ldm-bert": "https://huggingface.co/ldm-bert/resolve/main/config.json",
}


class LDMBertConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`LDMBertModel`]. It is used to instantiate a
    LDMBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the LDMBERT
    [facebook/ldmbert-large](https://huggingface.co/facebook/ldmbert-large) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 50265):
            Vocabulary size of the LDMBERT model. Defines the number of different tokens that can be represented by
            the `inputs_ids` passed when calling [`LDMBertModel`] or [`TFLDMBertModel`].
        d_model (`int`, *optional*, defaults to 1024):
            Dimensionality of the layers and the pooler layer.
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers.
        decoder_layers (`int`, *optional*, defaults to 12):
            Number of decoder layers.
        encoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the encoder.
        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for activations inside the fully connected layer.
        classifier_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the classifier.
        max_position_embeddings (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        scale_embedding (`bool`, *optional*, defaults to `False`):
            Scale embeddings by dividing by sqrt(d_model).
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models).
        num_labels (`int`, *optional*, defaults to 3):
            The number of labels to use in [`LDMBertForSequenceClassification`].
        forced_eos_token_id (`int`, *optional*, defaults to 2):
            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
            `eos_token_id`.

    Example:

    ```python
    >>> from transformers import LDMBertModel, LDMBertConfig

    >>> # Initializing a LDMBERT facebook/ldmbert-large style configuration
    >>> configuration = LDMBertConfig()

    >>> # Initializing a model from the facebook/ldmbert-large style configuration
    >>> model = LDMBertModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "ldmbert"
    keys_to_ignore_at_inference = ["past_key_values"]
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=30522,
        max_position_embeddings=77,
        encoder_layers=32,
        encoder_ffn_dim=5120,
        encoder_attention_heads=8,
        head_dim=64,
        encoder_layerdrop=0.0,
        activation_function="gelu",
        d_model=1280,
        dropout=0.1,
        attention_dropout=0.0,
        activation_dropout=0.0,
        init_std=0.02,
        classifier_dropout=0.0,
        scale_embedding=False,
        use_cache=True,
        pad_token_id=0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.d_model = d_model
        self.encoder_ffn_dim = encoder_ffn_dim
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.head_dim = head_dim
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.encoder_layerdrop = encoder_layerdrop
        self.classifier_dropout = classifier_dropout
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True

        super().__init__(pad_token_id=pad_token_id, **kwargs)
````
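One detail worth noting in the deleted config above: `attribute_map` lets the generic `transformers` attribute names resolve to the LDMBERT-specific ones. A small illustrative check, assuming the old module were still importable (the import path below is hypothetical):

```python
from configuration_ldmbert import LDMBertConfig  # hypothetical local import of the deleted module

config = LDMBertConfig()
# attribute_map routes "hidden_size" -> "d_model" and "num_attention_heads" -> "encoder_attention_heads"
assert config.hidden_size == config.d_model == 1280
assert config.num_attention_heads == config.encoder_attention_heads == 8
# num_hidden_layers is set explicitly from encoder_layers in __init__
assert config.num_hidden_layers == config.encoder_layers == 32
```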
src/diffusers/pipelines/old/latent_diffusion/modeling_latent_diffusion.py  (deleted, 100644 → 0)

```python
import torch

import tqdm
from diffusers import DiffusionPipeline

from .configuration_ldmbert import LDMBertConfig  # NOQA
from .modeling_ldmbert import LDMBertModel  # NOQA

# add these relative imports here, so we can load from hub
from .modeling_vae import AutoencoderKL  # NOQA


class LatentDiffusion(DiffusionPipeline):
    def __init__(self, vqvae, bert, tokenizer, unet, noise_scheduler):
        super().__init__()
        self.register_modules(vqvae=vqvae, bert=bert, tokenizer=tokenizer, unet=unet, noise_scheduler=noise_scheduler)

    @torch.no_grad()
    def __call__(
        self,
        prompt,
        batch_size=1,
        generator=None,
        torch_device=None,
        eta=0.0,
        guidance_scale=1.0,
        num_inference_steps=50,
    ):
        # eta corresponds to η in the DDIM paper and should be between [0, 1]

        if torch_device is None:
            torch_device = "cuda" if torch.cuda.is_available() else "cpu"

        self.unet.to(torch_device)
        self.vqvae.to(torch_device)
        self.bert.to(torch_device)

        # get unconditional embeddings for classifier-free guidance
        if guidance_scale != 1.0:
            uncond_input = self.tokenizer([""], padding="max_length", max_length=77, return_tensors="pt").to(
                torch_device
            )
            uncond_embeddings = self.bert(uncond_input.input_ids)[0]

        # get text embedding
        text_input = self.tokenizer(prompt, padding="max_length", max_length=77, return_tensors="pt").to(torch_device)
        text_embedding = self.bert(text_input.input_ids)[0]

        num_trained_timesteps = self.noise_scheduler.timesteps
        inference_step_times = range(0, num_trained_timesteps, num_trained_timesteps // num_inference_steps)

        image = self.noise_scheduler.sample_noise(
            (batch_size, self.unet.in_channels, self.unet.image_size, self.unet.image_size),
            device=torch_device,
            generator=generator,
        )

        # See formulas (12) and (16) of DDIM paper https://arxiv.org/pdf/2010.02502.pdf
        # Ideally, read the DDIM paper in detail for a full understanding
        # Notation (<variable name> -> <name in paper>)
        # - pred_noise_t -> e_theta(x_t, t)
        # - pred_original_image -> f_theta(x_t, t) or x_0
        # - std_dev_t -> sigma_t
        # - eta -> η
        # - pred_image_direction -> "direction pointing to x_t"
        # - pred_prev_image -> "x_t-1"
        for t in tqdm.tqdm(reversed(range(num_inference_steps)), total=num_inference_steps):
            # guidance_scale of 1 means no guidance
            if guidance_scale == 1.0:
                image_in = image
                context = text_embedding
                timesteps = torch.tensor([inference_step_times[t]] * image.shape[0], device=torch_device)
            else:
                # for classifier-free guidance, we need two forward passes;
                # here we concatenate the conditional and unconditional embeddings into a single batch
                # to avoid doing two forward passes
                image_in = torch.cat([image] * 2)
                context = torch.cat([uncond_embeddings, text_embedding])
                timesteps = torch.tensor([inference_step_times[t]] * image.shape[0], device=torch_device)

            # 1. predict noise residual
            pred_noise_t = self.unet(image_in, timesteps, context=context)

            # perform guidance
            if guidance_scale != 1.0:
                pred_noise_t_uncond, pred_noise_t = pred_noise_t.chunk(2)
                pred_noise_t = pred_noise_t_uncond + guidance_scale * (pred_noise_t - pred_noise_t_uncond)

            # 2. predict previous mean of image x_t-1
            pred_prev_image = self.noise_scheduler.step(pred_noise_t, image, t, num_inference_steps, eta)

            # 3. optionally sample variance
            variance = 0
            if eta > 0:
                noise = self.noise_scheduler.sample_noise(image.shape, device=image.device, generator=generator)
                variance = self.noise_scheduler.get_variance(t, num_inference_steps).sqrt() * eta * noise

            # 4. set current image to prev_image: x_t -> x_t-1
            image = pred_prev_image + variance

        # scale and decode image with vae
        image = 1 / 0.18215 * image
        image = self.vqvae.decode(image)
        image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0)

        return image
```
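For reference, the comment block in `__call__` points at formulas (12) and (16) of the DDIM paper (https://arxiv.org/pdf/2010.02502.pdf). Below is a self-contained NumPy sketch of that update using the same notation as the comments; it illustrates what `noise_scheduler.step` plus the optional variance term compute, and is not the scheduler implementation itself:

```python
import numpy as np


def ddim_step(sample, pred_noise_t, alpha_prod_t, alpha_prod_t_prev, eta=0.0, noise=None):
    # "predicted x_0": f_theta(x_t, t), from formula (12)
    pred_original_image = (sample - (1 - alpha_prod_t) ** 0.5 * pred_noise_t) / alpha_prod_t ** 0.5
    # sigma_t(eta), formula (16)
    std_dev_t = eta * ((1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)) ** 0.5
    # "direction pointing to x_t", formula (12)
    pred_image_direction = (1 - alpha_prod_t_prev - std_dev_t ** 2) ** 0.5 * pred_noise_t
    # x_{t-1}
    pred_prev_image = alpha_prod_t_prev ** 0.5 * pred_original_image + pred_image_direction
    if eta > 0 and noise is not None:
        pred_prev_image = pred_prev_image + std_dev_t * noise
    return pred_prev_image
```

With `eta=0` the update is deterministic, which is why the pipeline above only samples extra noise when `eta > 0`.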
src/diffusers/pipelines/old/latent_diffusion/modeling_ldmbert.py  (deleted, 100644 → 0; diff collapsed)

src/diffusers/pipelines/old/latent_diffusion/modeling_vae.py  (deleted, 100644 → 0; diff collapsed)

src/diffusers/pipelines/old/latent_diffusion/modeling_vqvae.py  (deleted, 100644 → 0)

src/diffusers/pipelines/old/latent_diffusion/run_latent_diffusion.py  (deleted, 100644 → 0)
src/diffusers/pipelines/pipeline_bddm.py

```diff
@@ -291,7 +291,7 @@ class BDDM(DiffusionPipeline):
         # Sample gaussian noise to begin loop
         audio = torch.normal(0, 1, size=audio_size, generator=generator).to(torch_device)

-        timestep_values = self.noise_scheduler.timestep_values
+        timestep_values = self.noise_scheduler.config.timestep_values
         num_prediction_steps = len(self.noise_scheduler)
         for t in tqdm.tqdm(reversed(range(num_prediction_steps)), total=num_prediction_steps):
             # 1. predict noise residual
```
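The one-line change above is representative of the recurring edit in the diffs that follow: hyperparameters are read from the scheduler's registered config (`self.noise_scheduler.config.<name>`, `self.config.<name>`) instead of from mirrored instance attributes. A toy illustration of the pattern (the class and names below are illustrative only, not the diffusers `ConfigMixin` API):

```python
from types import SimpleNamespace


class ToyScheduler:
    def __init__(self, timesteps=1000, timestep_values=None):
        # registration step: constructor arguments are kept on a single .config namespace
        self.config = SimpleNamespace(timesteps=timesteps, timestep_values=timestep_values)

    def __len__(self):
        # read from the config, as the diffs above and below now do
        return self.config.timesteps


scheduler = ToyScheduler(timesteps=50)
assert len(scheduler) == 50 and scheduler.config.timestep_values is None
```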
src/diffusers/pipelines/pipeline_ddim.py

```diff
@@ -32,7 +32,7 @@ class DDIM(DiffusionPipeline):
         if torch_device is None:
             torch_device = "cuda" if torch.cuda.is_available() else "cpu"

-        num_trained_timesteps = self.noise_scheduler.timesteps
+        num_trained_timesteps = self.noise_scheduler.config.timesteps
         inference_step_times = range(0, num_trained_timesteps, num_trained_timesteps // num_inference_steps)

         self.unet.to(torch_device)
```
src/diffusers/pipelines/pipeline_glide.py

```diff
@@ -24,17 +24,11 @@ import torch.utils.checkpoint
 from torch import nn

 import tqdm

-try:
-    from transformers import CLIPConfig, CLIPModel, CLIPTextConfig, CLIPVisionConfig, GPT2Tokenizer
-    from transformers.activations import ACT2FN
-    from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
-    from transformers.modeling_utils import PreTrainedModel
-    from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings
-except:
-    print("Transformers is not installed")
-    pass
+from transformers import CLIPConfig, CLIPModel, CLIPTextConfig, CLIPVisionConfig, GPT2Tokenizer
+from transformers.activations import ACT2FN
+from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from transformers.modeling_utils import PreTrainedModel
+from transformers.utils import ModelOutput, add_start_docstrings_to_model_forward, replace_return_docstrings

 from ..models import GLIDESuperResUNetModel, GLIDETextToImageUNetModel
 from ..pipeline_utils import DiffusionPipeline
```
src/diffusers/pipelines/pipeline_grad_tts.py

```diff
@@ -472,7 +472,7 @@ class GradTTS(DiffusionPipeline):
             t = (1.0 - (t + 0.5) * h) * torch.ones(z.shape[0], dtype=z.dtype, device=z.device)
             time = t.unsqueeze(-1).unsqueeze(-1)

-            residual = self.unet(xt, y_mask, mu_y, t, speaker_id)
+            residual = self.unet(xt, t, mu_y, y_mask, speaker_id)

             xt = self.noise_scheduler.step(xt, residual, mu_y, h, time)
             xt = xt * y_mask
```
src/diffusers/pipelines/pipeline_latent_diffusion.py

```diff
@@ -897,7 +897,7 @@ class LatentDiffusion(DiffusionPipeline):
         text_input = self.tokenizer(prompt, padding="max_length", max_length=77, return_tensors="pt").to(torch_device)
         text_embedding = self.bert(text_input.input_ids)[0]

-        num_trained_timesteps = self.noise_scheduler.timesteps
+        num_trained_timesteps = self.noise_scheduler.config.timesteps
         inference_step_times = range(0, num_trained_timesteps, num_trained_timesteps // num_inference_steps)

         image = torch.randn(
```
src/diffusers/pipelines/pipeline_pndm.py

```diff
@@ -42,9 +42,9 @@ class PNDM(DiffusionPipeline):
         )
         image = image.to(torch_device)

-        warmup_time_steps = self.noise_scheduler.get_warmup_time_steps(num_inference_steps)
-        for t in tqdm.tqdm(range(len(warmup_time_steps))):
-            t_orig = warmup_time_steps[t]
+        prk_time_steps = self.noise_scheduler.get_prk_time_steps(num_inference_steps)
+        for t in tqdm.tqdm(range(len(prk_time_steps))):
+            t_orig = prk_time_steps[t]
             residual = self.unet(image, t_orig)

             image = self.noise_scheduler.step_prk(residual, image, t, num_inference_steps)
```
src/diffusers/schedulers/classifier_free_guidance.py

```diff
@@ -61,7 +61,6 @@ class ClassifierFreeGuidanceScheduler(nn.Module, ConfigMixin):
             timesteps=timesteps,
             beta_schedule=beta_schedule,
         )
-        self.timesteps = int(timesteps)

         if beta_schedule == "squaredcos_cap_v2":
             # GLIDE cosine schedule
@@ -94,4 +93,4 @@ class ClassifierFreeGuidanceScheduler(nn.Module, ConfigMixin):
         return torch.randn(shape, generator=generator).to(device)

     def __len__(self):
-        return self.timesteps
+        return self.config.timesteps
```
src/diffusers/schedulers/scheduling_ddim.py

```diff
-# Copyright 2022 The HuggingFace Team. All rights reserved.
+# Copyright 2022 Stanford University Team and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,12 +11,40 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+# DISCLAIMER: This code is strongly influenced by https://github.com/pesser/pytorch_diffusion
+# and https://github.com/hojonathanho/diffusion
+
 import math

 import numpy as np

 from ..configuration_utils import ConfigMixin
-from .scheduling_utils import SchedulerMixin, betas_for_alpha_bar, linear_beta_schedule
+from .scheduling_utils import SchedulerMixin
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function,
+    which defines the cumulative product of (1-beta) over time from t = [0,1].
+
+    :param num_diffusion_timesteps: the number of betas to produce.
+    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+                      produces the cumulative product of (1-beta) up to that
+                      part of the diffusion process.
+    :param max_beta: the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+    """
+
+    def alpha_bar(time_step):
+        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return np.array(betas, dtype=np.float32)


 class DDIMScheduler(SchedulerMixin, ConfigMixin):
@@ -37,19 +65,16 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
             beta_start=beta_start,
             beta_end=beta_end,
             beta_schedule=beta_schedule,
             trained_betas=trained_betas,
             timestep_values=timestep_values,
             clip_sample=clip_sample,
         )
-        self.timesteps = int(timesteps)
-        self.timestep_values = timestep_values  # save the fixed timestep values for BDDM
-        self.clip_sample = clip_sample

         if beta_schedule == "linear":
-            self.betas = linear_beta_schedule(timesteps, beta_start=beta_start, beta_end=beta_end)
+            self.betas = np.linspace(beta_start, beta_end, timesteps, dtype=np.float32)
         elif beta_schedule == "squaredcos_cap_v2":
             # GLIDE cosine schedule
-            self.betas = betas_for_alpha_bar(
-                timesteps,
-                lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
-            )
+            self.betas = betas_for_alpha_bar(timesteps)
         else:
             raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
@@ -59,51 +84,12 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):

         self.set_format(tensor_format=tensor_format)

-    #        alphas_cumprod_prev = torch.nn.functional.pad(alphas_cumprod[:-1], (1, 0), value=1.0)
-    #        TODO(PVP) - check how much of these is actually necessary!
-    #        LDM only uses "fixed_small"; glide seems to use a weird mix of the two, ...
-    #        https://github.com/openai/glide-text2im/blob/69b530740eb6cef69442d6180579ef5ba9ef063e/glide_text2im/gaussian_diffusion.py#L246
-    #        variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
-    #        if variance_type == "fixed_small":
-    #            log_variance = torch.log(variance.clamp(min=1e-20))
-    #        elif variance_type == "fixed_large":
-    #            log_variance = torch.log(torch.cat([variance[1:2], betas[1:]], dim=0))
-    #
-    #
-    #        self.register_buffer("log_variance", log_variance.to(torch.float32))
-
-    #    def rescale_betas(self, num_timesteps):
-    #        # GLIDE scaling
-    #        if self.beta_schedule == "linear":
-    #            scale = self.timesteps / num_timesteps
-    #            self.betas = linear_beta_schedule(
-    #                num_timesteps, beta_start=self.beta_start * scale, beta_end=self.beta_end * scale
-    #            )
-    #            self.alphas = 1.0 - self.betas
-    #            self.alphas_cumprod = np.cumprod(self.alphas, axis=0)
-
-    def get_alpha(self, time_step):
-        return self.alphas[time_step]
-
-    def get_beta(self, time_step):
-        return self.betas[time_step]
-
-    def get_alpha_prod(self, time_step):
-        if time_step < 0:
-            return self.one
-        return self.alphas_cumprod[time_step]
-
-    def get_orig_t(self, t, num_inference_steps):
-        if t < 0:
-            return -1
-        return self.timesteps // num_inference_steps * t
-
     def get_variance(self, t, num_inference_steps):
-        orig_t = self.get_orig_t(t, num_inference_steps)
-        orig_prev_t = self.get_orig_t(t - 1, num_inference_steps)
+        orig_t = self.config.timesteps // num_inference_steps * t
+        orig_prev_t = self.config.timesteps // num_inference_steps * (t - 1) if t > 0 else -1

-        alpha_prod_t = self.get_alpha_prod(orig_t)
-        alpha_prod_t_prev = self.get_alpha_prod(orig_prev_t)
+        alpha_prod_t = self.alphas_cumprod[orig_t]
+        alpha_prod_t_prev = self.alphas_cumprod[orig_prev_t] if orig_prev_t >= 0 else self.one
         beta_prod_t = 1 - alpha_prod_t
         beta_prod_t_prev = 1 - alpha_prod_t_prev
@@ -124,12 +110,12 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
         # - pred_prev_sample -> "x_t-1"

         # 1. get actual t and t-1
-        orig_t = self.get_orig_t(t, num_inference_steps)
-        orig_prev_t = self.get_orig_t(t - 1, num_inference_steps)
+        orig_t = self.config.timesteps // num_inference_steps * t
+        orig_prev_t = self.config.timesteps // num_inference_steps * (t - 1) if t > 0 else -1

         # 2. compute alphas, betas
-        alpha_prod_t = self.get_alpha_prod(orig_t)
-        alpha_prod_t_prev = self.get_alpha_prod(orig_prev_t)
+        alpha_prod_t = self.alphas_cumprod[orig_t]
+        alpha_prod_t_prev = self.alphas_cumprod[orig_prev_t] if orig_prev_t >= 0 else self.one
         beta_prod_t = 1 - alpha_prod_t

         # 3. compute predicted original sample from predicted noise also called
@@ -137,7 +123,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
         pred_original_sample = (sample - beta_prod_t ** (0.5) * residual) / alpha_prod_t ** (0.5)

         # 4. Clip "predicted x_0"
-        if self.clip_sample:
+        if self.config.clip_sample:
             pred_original_sample = self.clip(pred_original_sample, -1, 1)

         # 5. compute variance: "sigma_t(η)" -> see formula (16)
@@ -158,4 +144,4 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
         return pred_prev_sample

     def __len__(self):
-        return self.timesteps
+        return self.config.timesteps
```
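As a quick, self-contained check of the cosine (`"squaredcos_cap_v2"`) schedule that `betas_for_alpha_bar` now builds locally: the per-step betas are defined so that the cumulative product of `1 - beta` telescopes back onto the `alpha_bar` curve. All names below are local to this sketch (the `max_beta` clamp and the float32 cast are omitted for clarity):

```python
import math

import numpy as np

T = 1000


def alpha_bar(s):
    return math.cos((s + 0.008) / 1.008 * math.pi / 2) ** 2


# betas following the same construction as betas_for_alpha_bar(T) above
betas = np.array([1 - alpha_bar((i + 1) / T) / alpha_bar(i / T) for i in range(T)])
alphas_cumprod = np.cumprod(1.0 - betas)

# the product telescopes: alphas_cumprod[t] == alpha_bar((t + 1) / T) / alpha_bar(0)
target = np.array([alpha_bar((t + 1) / T) / alpha_bar(0.0) for t in range(T)])
assert np.allclose(alphas_cumprod, target)
```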
src/diffusers/schedulers/scheduling_ddpm.py

```diff
-# Copyright 2022 The HuggingFace Team. All rights reserved.
+# Copyright 2022 UC Berkely Team and The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,12 +11,39 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
+
 import math

 import numpy as np

 from ..configuration_utils import ConfigMixin
-from .scheduling_utils import SchedulerMixin, betas_for_alpha_bar, linear_beta_schedule
+from .scheduling_utils import SchedulerMixin
+
+
+def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
+    """
+    Create a beta schedule that discretizes the given alpha_t_bar function,
+    which defines the cumulative product of (1-beta) over time from t = [0,1].
+
+    :param num_diffusion_timesteps: the number of betas to produce.
+    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+                      produces the cumulative product of (1-beta) up to that
+                      part of the diffusion process.
+    :param max_beta: the maximum beta to use; use values lower than 1 to
+                     prevent singularities.
+    """
+
+    def alpha_bar(time_step):
+        return math.cos((time_step + 0.008) / 1.008 * math.pi / 2) ** 2
+
+    betas = []
+    for i in range(num_diffusion_timesteps):
+        t1 = i / num_diffusion_timesteps
+        t2 = (i + 1) / num_diffusion_timesteps
+        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
+    return np.array(betas, dtype=np.float32)


 class DDPMScheduler(SchedulerMixin, ConfigMixin):
@@ -43,21 +70,14 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
             variance_type=variance_type,
             clip_sample=clip_sample,
         )
-        self.timesteps = int(timesteps)
-        self.timestep_values = timestep_values  # save the fixed timestep values for BDDM
-        self.clip_sample = clip_sample
-        self.variance_type = variance_type

         if trained_betas is not None:
             self.betas = np.asarray(trained_betas)
         elif beta_schedule == "linear":
-            self.betas = linear_beta_schedule(timesteps, beta_start=beta_start, beta_end=beta_end)
+            self.betas = np.linspace(beta_start, beta_end, timesteps, dtype=np.float32)
         elif beta_schedule == "squaredcos_cap_v2":
             # GLIDE cosine schedule
-            self.betas = betas_for_alpha_bar(
-                timesteps,
-                lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2,
-            )
+            self.betas = betas_for_alpha_bar(timesteps)
         else:
             raise NotImplementedError(f"{beta_schedule} does is not implemented for {self.__class__}")
@@ -67,70 +87,48 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):

         self.set_format(tensor_format=tensor_format)

-    #        self.register_buffer("betas", betas.to(torch.float32))
-    #        self.register_buffer("alphas", alphas.to(torch.float32))
-    #        self.register_buffer("alphas_cumprod", alphas_cumprod.to(torch.float32))
-
-    #        alphas_cumprod_prev = torch.nn.functional.pad(alphas_cumprod[:-1], (1, 0), value=1.0)
-    #        TODO(PVP) - check how much of these is actually necessary!
-    #        LDM only uses "fixed_small"; glide seems to use a weird mix of the two, ...
-    #        https://github.com/openai/glide-text2im/blob/69b530740eb6cef69442d6180579ef5ba9ef063e/glide_text2im/gaussian_diffusion.py#L246
-    #        variance = betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod)
-    #        if variance_type == "fixed_small":
-    #            log_variance = torch.log(variance.clamp(min=1e-20))
-    #        elif variance_type == "fixed_large":
-    #            log_variance = torch.log(torch.cat([variance[1:2], betas[1:]], dim=0))
-    #
-    #
-    #        self.register_buffer("log_variance", log_variance.to(torch.float32))
-
-    def get_alpha(self, time_step):
-        return self.alphas[time_step]
-
-    def get_beta(self, time_step):
-        return self.betas[time_step]
-
-    def get_alpha_prod(self, time_step):
-        if time_step < 0:
-            return self.one
-        return self.alphas_cumprod[time_step]
-
     def get_variance(self, t):
-        alpha_prod_t = self.get_alpha_prod(t)
-        alpha_prod_t_prev = self.get_alpha_prod(t - 1)
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one

         # For t > 0, compute predicted variance βt (see formala (6) and (7) from https://arxiv.org/pdf/2006.11239.pdf)
         # and sample from it to get previous sample
         # x_{t-1} ~ N(pred_prev_sample, variance) == add variane to pred_sample
-        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.get_beta(t)
+        variance = (1 - alpha_prod_t_prev) / (1 - alpha_prod_t) * self.betas[t]

         # hacks - were probs added for training stability
-        if self.variance_type == "fixed_small":
+        if self.config.variance_type == "fixed_small":
             variance = self.clip(variance, min_value=1e-20)
-        elif self.variance_type == "fixed_large":
-            variance = self.get_beta(t)
+        # for rl-diffuser https://arxiv.org/abs/2205.09991
+        elif self.config.variance_type == "fixed_small_log":
+            variance = self.log(self.clip(variance, min_value=1e-20))
+        elif self.config.variance_type == "fixed_large":
+            variance = self.betas[t]

         return variance

-    def step(self, residual, sample, t):
+    def step(self, residual, sample, t, predict_epsilon=True):
         # 1. compute alphas, betas
-        alpha_prod_t = self.get_alpha_prod(t)
-        alpha_prod_t_prev = self.get_alpha_prod(t - 1)
+        alpha_prod_t = self.alphas_cumprod[t]
+        alpha_prod_t_prev = self.alphas_cumprod[t - 1] if t > 0 else self.one
         beta_prod_t = 1 - alpha_prod_t
         beta_prod_t_prev = 1 - alpha_prod_t_prev

         # 2. compute predicted original sample from predicted noise also called
         # "predicted x_0" of formula (15) from https://arxiv.org/pdf/2006.11239.pdf
-        pred_original_sample = (sample - beta_prod_t ** (0.5) * residual) / alpha_prod_t ** (0.5)
+        if predict_epsilon:
+            pred_original_sample = (sample - beta_prod_t ** (0.5) * residual) / alpha_prod_t ** (0.5)
+        else:
+            pred_original_sample = residual

         # 3. Clip "predicted x_0"
-        if self.clip_sample:
+        if self.config.clip_sample:
             pred_original_sample = self.clip(pred_original_sample, -1, 1)

         # 4. Compute coefficients for pred_original_sample x_0 and current sample x_t
         # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
-        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * self.get_beta(t)) / beta_prod_t
-        current_sample_coeff = self.get_alpha(t) ** (0.5) * beta_prod_t_prev / beta_prod_t
+        pred_original_sample_coeff = (alpha_prod_t_prev ** (0.5) * self.betas[t]) / beta_prod_t
+        current_sample_coeff = self.alphas[t] ** (0.5) * beta_prod_t_prev / beta_prod_t

         # 5. Compute predicted previous sample µ_t
         # See formula (7) from https://arxiv.org/pdf/2006.11239.pdf
@@ -139,10 +137,10 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
         return pred_prev_sample

     def forward_step(self, original_sample, noise, t):
-        sqrt_alpha_prod = self.get_alpha_prod(t) ** 0.5
-        sqrt_one_minus_alpha_prod = (1 - self.get_alpha_prod(t)) ** 0.5
+        sqrt_alpha_prod = self.alpha_prod_t[t] ** 0.5
+        sqrt_one_minus_alpha_prod = (1 - self.alpha_prod_t[t]) ** 0.5
         noisy_sample = sqrt_alpha_prod * original_sample + sqrt_one_minus_alpha_prod * noise
         return noisy_sample

     def __len__(self):
-        return self.timesteps
+        return self.config.timesteps
```
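The DDPM scheduler's `step` and `forward_step` implement formulas (4), (7) and (15) of https://arxiv.org/pdf/2006.11239.pdf. A small NumPy sketch of that round trip, with the standard linear-schedule endpoints (1e-4, 0.02) used purely for illustration:

```python
import numpy as np

rng = np.random.default_rng(0)
T = 1000
betas = np.linspace(1e-4, 0.02, T)  # illustrative linear schedule
alphas = 1.0 - betas
alphas_cumprod = np.cumprod(alphas)

t = 500
x0 = rng.standard_normal((4, 4))
eps = rng.standard_normal((4, 4))

# forward_step / formula (4): x_t = sqrt(abar_t) * x_0 + sqrt(1 - abar_t) * eps
xt = alphas_cumprod[t] ** 0.5 * x0 + (1 - alphas_cumprod[t]) ** 0.5 * eps

# step with predict_epsilon=True, formula (15): the true noise recovers "predicted x_0" exactly
pred_x0 = (xt - (1 - alphas_cumprod[t]) ** 0.5 * eps) / alphas_cumprod[t] ** 0.5
assert np.allclose(pred_x0, x0)

# formula (7): posterior mean coefficients, mirroring DDPMScheduler.step above
abar_prev = alphas_cumprod[t - 1]
pred_x0_coeff = abar_prev ** 0.5 * betas[t] / (1 - alphas_cumprod[t])
xt_coeff = alphas[t] ** 0.5 * (1 - abar_prev) / (1 - alphas_cumprod[t])
mu_t = pred_x0_coeff * pred_x0 + xt_coeff * xt

# formula (7): posterior variance, mirroring DDPMScheduler.get_variance above
variance = (1 - abar_prev) / (1 - alphas_cumprod[t]) * betas[t]
```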