Make style

4261c3aa · Patrick von Platen · 932ce05d · 4261c3aa · 4261c3aa · 4261c3aa
Commit 4261c3aa authored Jun 27, 2022 by Patrick von Platen
20 changed files
--- a/Makefile
+++ b/Makefile
@@ -34,13 +34,9 @@ autogenerate_code: deps_table_update
 # Check that the repo is in a good state
 repo-consistency:
-	python utils/check_copies.py
-	python utils/check_table.py
 	python utils/check_dummies.py
 	python utils/check_repo.py
 	python utils/check_inits.py
-	python utils/check_config_docstrings.py
-	python utils/tests_fetcher.py --sanity_check
 # this target runs checks on all files
@@ -48,14 +44,13 @@ quality:
 	black --check --preview $(check_dirs)
 	isort --check-only $(check_dirs)
 	flake8 $(check_dirs)
-	doc-builder style src/transformers docs/source --max_len 119 --check_only --path_to_docs docs/source
+	doc-builder style src/diffusers docs/source --max_len 119 --check_only --path_to_docs docs/source
 # Format source code automatically and check if there are any problems left that need manual fixing
 extra_style_checks:
 	python utils/custom_init_isort.py
-	python utils/sort_auto_mappings.py
+	doc-builder style src/diffusers docs/source --max_len 119 --path_to_docs docs/source
-	doc-builder style src/transformers docs/source --max_len 119 --path_to_docs docs/source
 # this target runs checks on all files and potentially modifies some of them
@@ -73,8 +68,6 @@ fixup: modified_only_fixup extra_style_checks autogenerate_code repo-consistency
 fix-copies:
 	python utils/check_dummies.py --fix_and_overwrite
-	python utils/check_table.py --fix_and_overwrite
-	python utils/check_copies.py --fix_and_overwrite
 # Run tests for the library

--- a/src/diffusers/hub_utils.py
+++ b/src/diffusers/hub_utils.py
@@ -47,12 +47,11 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
 def init_git_repo(args, at_init: bool = False):
    """
-    Initializes a git repo in `args.hub_model_id`.
    Args:
+    Initializes a git repo in `args.hub_model_id`.
        at_init (`bool`, *optional*, defaults to `False`):
-            Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is
+            Whether this function is called before any training or not. If `self.args.overwrite_output_dir` is `True`
-            `True` and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped
+            and `at_init` is `True`, the path to the repo (which is `self.args.output_dir`) might be wiped out.
-            out.
    """
    if args.local_rank not in [-1, 0]:
        return
@@ -102,8 +101,8 @@ def push_to_hub(
    **kwargs,
 ) -> str:
    """
-    Upload *self.model* and *self.tokenizer* to the 🤗 model hub on the repo *self.args.hub_model_id*.
    Parameters:
+    Upload *self.model* and *self.tokenizer* to the 🤗 model hub on the repo *self.args.hub_model_id*.
        commit_message (`str`, *optional*, defaults to `"End of training"`):
            Message to commit while pushing.
        blocking (`bool`, *optional*, defaults to `True`):
@@ -111,8 +110,8 @@ def push_to_hub(
        kwargs:
            Additional keyword arguments passed along to [`create_model_card`].
    Returns:
-        The url of the commit of your model in the given repository if `blocking=False`, a tuple with the url of
+        The url of the commit of your model in the given repository if `blocking=False`, a tuple with the url of the
-        the commit and an object to track the progress of the commit if `blocking=True`
+        commit and an object to track the progress of the commit if `blocking=True`
    """
    if args.hub_model_id is None:

--- a/src/diffusers/modeling_utils.py
+++ b/src/diffusers/modeling_utils.py
@@ -123,16 +123,16 @@ class ModelMixin(torch.nn.Module):
    r"""
    Base class for all models.
-    [`ModelMixin`] takes care of storing the configuration of the models and handles methods for loading,
+    [`ModelMixin`] takes care of storing the configuration of the models and handles methods for loading, downloading
-    downloading and saving models as well as a few methods common to all models to:
+    and saving models as well as a few methods common to all models to:
        - resize the input embeddings,
        - prune heads in the self-attention heads.
    Class attributes (overridden by derived classes):
-        - **config_class** ([`ConfigMixin`]) -- A subclass of [`ConfigMixin`] to use as configuration class
+        - **config_class** ([`ConfigMixin`]) -- A subclass of [`ConfigMixin`] to use as configuration class for this
-          for this model architecture.
+          model architecture.
        - **load_tf_weights** (`Callable`) -- A python *method* for loading a TensorFlow checkpoint in a PyTorch model,
          taking as arguments:
@@ -227,8 +227,8 @@ class ModelMixin(torch.nn.Module):
                    - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co.
                      Valid model ids can be located at the root-level, like `bert-base-uncased`, or namespaced under a
                      user or organization name, like `dbmdz/bert-base-german-cased`.
-                    - A path to a *directory* containing model weights saved using
+                    - A path to a *directory* containing model weights saved using [`~ModelMixin.save_pretrained`],
-                      [`~ModelMixin.save_pretrained`], e.g., `./my_model_directory/`.
+                      e.g., `./my_model_directory/`.
            config (`Union[ConfigMixin, str, os.PathLike]`, *optional*):
                Can be either:
@@ -236,13 +236,13 @@ class ModelMixin(torch.nn.Module):
                    - an instance of a class derived from [`ConfigMixin`],
                    - a string or path valid as input to [`~ConfigMixin.from_pretrained`].
-                Configuration for the model to use instead of an automatically loaded configuration. Configuration can
+                Configuration for the model to use instead of an automatically loaded configuration.
-                be automatically loaded when:
+                Configuration can be automatically loaded when:
                    - The model is a model provided by the library (loaded with the *model id* string of a pretrained
                      model).
-                    - The model was saved using [`~ModelMixin.save_pretrained`] and is reloaded by supplying the
+                    - The model was saved using [`~ModelMixin.save_pretrained`] and is reloaded by supplying the save
-                      save directory.
+                      directory.
                    - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a
                      configuration JSON file named *config.json* is found in the directory.
            cache_dir (`Union[str, os.PathLike]`, *optional*):
@@ -292,10 +292,10 @@ class ModelMixin(torch.nn.Module):
                      underlying model's `__init__` method (we assume all relevant updates to the configuration have
                      already been done)
                    - If a configuration is not provided, `kwargs` will be first passed to the configuration class
-                      initialization function ([`~ConfigMixin.from_pretrained`]). Each key of `kwargs` that
+                      initialization function ([`~ConfigMixin.from_pretrained`]). Each key of `kwargs` that corresponds
-                      corresponds to a configuration attribute will be used to override said attribute with the
+                      to a configuration attribute will be used to override said attribute with the supplied `kwargs`
-                      supplied `kwargs` value. Remaining keys that do not correspond to any configuration attribute
+                      value. Remaining keys that do not correspond to any configuration attribute will be passed to the
-                      will be passed to the underlying model's `__init__` function.
+                      underlying model's `__init__` function.
        <Tip>

--- a/src/diffusers/models/embeddings.py
+++ b/src/diffusers/models/embeddings.py
@@ -22,14 +22,12 @@ def get_timestep_embedding(
    timesteps, embedding_dim, flip_sin_to_cos=False, downscale_freq_shift=1, scale=1, max_period=10000
 ):
    """
-    This matches the implementation in Denoising Diffusion Probabilistic Models:
+    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
-    Create sinusoidal timestep embeddings.
    :param timesteps: a 1-D Tensor of N indices, one per batch element.
                      These may be fractional.
-    :param embedding_dim: the dimension of the output.
+    :param embedding_dim: the dimension of the output. :param max_period: controls the minimum frequency of the
-    :param max_period: controls the minimum frequency of the embeddings.
+    embeddings. :return: an [N x dim] Tensor of positional embeddings.
-    :return: an [N x dim] Tensor of positional embeddings.
    """
    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"

--- a/src/diffusers/models/resnet.py
+++ b/src/diffusers/models/resnet.py
@@ -58,9 +58,8 @@ class Upsample(nn.Module):
    """
    An upsampling layer with an optional convolution.
-    :param channels: channels in the inputs and outputs.
+    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
-    :param use_conv: a bool determining if a convolution is applied.
+    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
-    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    """
@@ -97,9 +96,8 @@ class Downsample(nn.Module):
    """
    A downsampling layer with an optional convolution.
-    :param channels: channels in the inputs and outputs.
+    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
-    :param use_conv: a bool determining if a convolution is applied.
+    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
-    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    """
@@ -143,9 +141,8 @@ class GlideUpsample(nn.Module):
    """
    An upsampling layer with an optional convolution.
-    :param channels: channels in the inputs and outputs.
+    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
-    :param use_conv: a bool determining if a convolution is applied.
+    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
-    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    """
@@ -171,10 +168,9 @@ class GlideUpsample(nn.Module):
 class LDMUpsample(nn.Module):
    """
-    An upsampling layer with an optional convolution.
+    An upsampling layer with an optional convolution. :param channels: channels in the inputs and outputs. :param
-    :param channels: channels in the inputs and outputs.
+    use_conv: a bool determining if a convolution is applied. :param dims: determines if the signal is 1D, 2D, or 3D.
-    :param use_conv: a bool determining if a convolution is applied.
+    If 3D, then
-    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 upsampling occurs in the inner-two dimensions.
    """

--- a/src/diffusers/models/unet_glide.py
+++ b/src/diffusers/models/unet_glide.py
@@ -82,8 +82,7 @@ def normalization(channels, swish=0.0):
    """
    Make a standard normalization layer, with an optional swish activation.
-    :param channels: number of input channels.
+    :param channels: number of input channels. :return: an nn.Module for normalization.
-    :return: an nn.Module for normalization.
    """
    return GroupNorm32(num_channels=channels, num_groups=32, swish=swish)
@@ -111,8 +110,7 @@ class TimestepBlock(nn.Module):
 class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
-    A sequential module that passes timestep embeddings to the children that
+    A sequential module that passes timestep embeddings to the children that support it as an extra input.
-    support it as an extra input.
    """
    def forward(self, x, emb, encoder_out=None):
@@ -130,9 +128,8 @@ class Downsample(nn.Module):
    """
    A downsampling layer with an optional convolution.
-    :param channels: channels in the inputs and outputs.
+    :param channels: channels in the inputs and outputs. :param use_conv: a bool determining if a convolution is
-    :param use_conv: a bool determining if a convolution is applied.
+    applied. :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
-    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    """
@@ -158,17 +155,13 @@ class ResBlock(TimestepBlock):
    """
    A residual block that can optionally change the number of channels.
-    :param channels: the number of input channels.
+    :param channels: the number of input channels. :param emb_channels: the number of timestep embedding channels.
-    :param emb_channels: the number of timestep embedding channels.
+    :param dropout: the rate of dropout. :param out_channels: if specified, the number of out channels. :param
-    :param dropout: the rate of dropout.
+    use_conv: if True and out_channels is specified, use a spatial
-    :param out_channels: if specified, the number of out channels.
+        convolution instead of a smaller 1x1 convolution to change the channels in the skip connection.
-    :param use_conv: if True and out_channels is specified, use a spatial
+    :param dims: determines if the signal is 1D, 2D, or 3D. :param use_checkpoint: if True, use gradient checkpointing
-        convolution instead of a smaller 1x1 convolution to change the
+    on this module. :param up: if True, use this block for upsampling. :param down: if True, use this block for
-        channels in the skip connection.
+    downsampling.
-    :param dims: determines if the signal is 1D, 2D, or 3D.
-    :param use_checkpoint: if True, use gradient checkpointing on this module.
-    :param up: if True, use this block for upsampling.
-    :param down: if True, use this block for downsampling.
    """
    def __init__(
@@ -235,8 +228,7 @@ class ResBlock(TimestepBlock):
        """
        Apply the block to a Tensor, conditioned on a timestep embedding.
-        :param x: an [N x C x ...] Tensor of features.
+        :param x: an [N x C x ...] Tensor of features. :param emb: an [N x emb_channels] Tensor of timestep embeddings.
-        :param emb: an [N x emb_channels] Tensor of timestep embeddings.
        :return: an [N x C x ...] Tensor of outputs.
        """
        if self.updown:
@@ -320,8 +312,8 @@ class QKVAttention(nn.Module):
        """
        Apply QKV attention.
-        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
+        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. :return: an [N x (H * C) x T] tensor after
-        :return: an [N x (H * C) x T] tensor after attention.
+        attention.
        """
        bs, width, length = qkv.shape
        assert width % (3 * self.n_heads) == 0
@@ -343,29 +335,24 @@ class GlideUNetModel(ModelMixin, ConfigMixin):
    """
    The full UNet model with attention and timestep embedding.
-    :param in_channels: channels in the input Tensor.
+    :param in_channels: channels in the input Tensor. :param model_channels: base channel count for the model. :param
-    :param model_channels: base channel count for the model.
+    out_channels: channels in the output Tensor. :param num_res_blocks: number of residual blocks per downsample.
-    :param out_channels: channels in the output Tensor.
-    :param num_res_blocks: number of residual blocks per downsample.
    :param attention_resolutions: a collection of downsample rates at which
-        attention will take place. May be a set, list, or tuple.
+        attention will take place. May be a set, list, or tuple. For example, if this contains 4, then at 4x
-        For example, if this contains 4, then at 4x downsampling, attention
+        downsampling, attention will be used.
-        will be used.
+    :param dropout: the dropout probability. :param channel_mult: channel multiplier for each level of the UNet. :param
-    :param dropout: the dropout probability.
+    conv_resample: if True, use learned convolutions for upsampling and
-    :param channel_mult: channel multiplier for each level of the UNet.
-    :param conv_resample: if True, use learned convolutions for upsampling and
        downsampling.
-    :param dims: determines if the signal is 1D, 2D, or 3D.
+    :param dims: determines if the signal is 1D, 2D, or 3D. :param num_classes: if specified (as an int), then this
-    :param num_classes: if specified (as an int), then this model will be
+    model will be
        class-conditional with `num_classes` classes.
-    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
+    :param use_checkpoint: use gradient checkpointing to reduce memory usage. :param num_heads: the number of attention
-    :param num_heads: the number of attention heads in each attention layer.
+    heads in each attention layer. :param num_heads_channels: if specified, ignore num_heads and instead use
-    :param num_heads_channels: if specified, ignore num_heads and instead use
                               a fixed channel width per attention head.
    :param num_heads_upsample: works with num_heads to set a different number
                               of heads for upsampling. Deprecated.
-    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
+    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. :param resblock_updown: use residual blocks
-    :param resblock_updown: use residual blocks for up/downsampling.
+    for up/downsampling.
    """
    def __init__(
@@ -571,10 +558,8 @@ class GlideUNetModel(ModelMixin, ConfigMixin):
        """
        Apply the model to an input batch.
-        :param x: an [N x C x ...] Tensor of inputs.
+        :param x: an [N x C x ...] Tensor of inputs. :param timesteps: a 1-D batch of timesteps. :param y: an [N]
-        :param timesteps: a 1-D batch of timesteps.
+        Tensor of labels, if class-conditional. :return: an [N x C x ...] Tensor of outputs.
-        :param y: an [N] Tensor of labels, if class-conditional.
-        :return: an [N x C x ...] Tensor of outputs.
        """
        hs = []

--- a/src/diffusers/models/unet_ldm.py
+++ b/src/diffusers/models/unet_ldm.py
@@ -222,11 +222,8 @@ class BasicTransformerBlock(nn.Module):
 class SpatialTransformer(nn.Module):
    """
-    Transformer block for image-like data.
+    Transformer block for image-like data. First, project the input (aka embedding) and reshape to b, t, d. Then apply
-    First, project the input (aka embedding)
+    standard transformer action. Finally, reshape to image
-    and reshape to b, t, d.
-    Then apply standard transformer action.
-    Finally, reshape to image
    """
    def __init__(self, in_channels, n_heads, d_head, depth=1, dropout=0.0, context_dim=None):
@@ -331,8 +328,7 @@ def normalization(channels, swish=0.0):
    """
    Make a standard normalization layer, with an optional swish activation.
-    :param channels: number of input channels.
+    :param channels: number of input channels. :return: an nn.Module for normalization.
-    :return: an nn.Module for normalization.
    """
    return GroupNorm32(num_channels=channels, num_groups=32, swish=swish)
@@ -382,8 +378,7 @@ class TimestepBlock(nn.Module):
 class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
    """
-    A sequential module that passes timestep embeddings to the children that
+    A sequential module that passes timestep embeddings to the children that support it as an extra input.
-    support it as an extra input.
    """
    def forward(self, x, emb, context=None):
@@ -399,10 +394,9 @@ class TimestepEmbedSequential(nn.Sequential, TimestepBlock):
 class Downsample(nn.Module):
    """
-    A downsampling layer with an optional convolution.
+    A downsampling layer with an optional convolution. :param channels: channels in the inputs and outputs. :param
-    :param channels: channels in the inputs and outputs.
+    use_conv: a bool determining if a convolution is applied. :param dims: determines if the signal is 1D, 2D, or 3D.
-    :param use_conv: a bool determining if a convolution is applied.
+    If 3D, then
-    :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then
                 downsampling occurs in the inner-two dimensions.
    """
@@ -426,18 +420,14 @@ class Downsample(nn.Module):
 class ResBlock(TimestepBlock):
    """
-    A residual block that can optionally change the number of channels.
+    A residual block that can optionally change the number of channels. :param channels: the number of input channels.
-    :param channels: the number of input channels.
+    :param emb_channels: the number of timestep embedding channels. :param dropout: the rate of dropout. :param
-    :param emb_channels: the number of timestep embedding channels.
+    out_channels: if specified, the number of out channels. :param use_conv: if True and out_channels is specified, use
-    :param dropout: the rate of dropout.
+    a spatial
-    :param out_channels: if specified, the number of out channels.
+        convolution instead of a smaller 1x1 convolution to change the channels in the skip connection.
-    :param use_conv: if True and out_channels is specified, use a spatial
+    :param dims: determines if the signal is 1D, 2D, or 3D. :param use_checkpoint: if True, use gradient checkpointing
-        convolution instead of a smaller 1x1 convolution to change the
+    on this module. :param up: if True, use this block for upsampling. :param down: if True, use this block for
-        channels in the skip connection.
+    downsampling.
-    :param dims: determines if the signal is 1D, 2D, or 3D.
-    :param use_checkpoint: if True, use gradient checkpointing on this module.
-    :param up: if True, use this block for upsampling.
-    :param down: if True, use this block for downsampling.
    """
    def __init__(
@@ -525,8 +515,8 @@ class ResBlock(TimestepBlock):
 class AttentionBlock(nn.Module):
    """
-    An attention block that allows spatial positions to attend to each other.
+    An attention block that allows spatial positions to attend to each other. Originally ported from here, but adapted
-    Originally ported from here, but adapted to the N-d case.
+    to the N-d case.
    https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66.
    """
@@ -575,9 +565,8 @@ class QKVAttention(nn.Module):
    def forward(self, qkv):
        """
-        Apply QKV attention.
+        Apply QKV attention. :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. :return: an [N x (H * C) x
-        :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs.
+        T] tensor after attention.
-        :return: an [N x (H * C) x T] tensor after attention.
        """
        bs, width, length = qkv.shape
        assert width % (3 * self.n_heads) == 0
@@ -600,13 +589,9 @@ class QKVAttention(nn.Module):
 def count_flops_attn(model, _x, y):
    """
-    A counter for the `thop` package to count the operations in an
+    A counter for the `thop` package to count the operations in an attention operation. Meant to be used like:
-    attention operation.
-    Meant to be used like:
        macs, params = thop.profile(
-            model,
+            model, inputs=(inputs, timestamps), custom_ops={QKVAttention: QKVAttention.count_flops},
-            inputs=(inputs, timestamps),
-            custom_ops={QKVAttention: QKVAttention.count_flops},
        )
    """
    b, c, *spatial = y[0].shape
@@ -629,9 +614,8 @@ class QKVAttentionLegacy(nn.Module):
    def forward(self, qkv):
        """
-        Apply QKV attention.
+        Apply QKV attention. :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. :return: an [N x (H * C) x
-        :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs.
+        T] tensor after attention.
-        :return: an [N x (H * C) x T] tensor after attention.
        """
        bs, width, length = qkv.shape
        assert width % (3 * self.n_heads) == 0
@@ -650,31 +634,25 @@ class QKVAttentionLegacy(nn.Module):
 class UNetLDMModel(ModelMixin, ConfigMixin):
    """
-    The full UNet model with attention and timestep embedding.
+    The full UNet model with attention and timestep embedding. :param in_channels: channels in the input Tensor. :param
-    :param in_channels: channels in the input Tensor.
+    model_channels: base channel count for the model. :param out_channels: channels in the output Tensor. :param
-    :param model_channels: base channel count for the model.
+    num_res_blocks: number of residual blocks per downsample. :param attention_resolutions: a collection of downsample
-    :param out_channels: channels in the output Tensor.
+    rates at which
-    :param num_res_blocks: number of residual blocks per downsample.
+        attention will take place. May be a set, list, or tuple. For example, if this contains 4, then at 4x
-    :param attention_resolutions: a collection of downsample rates at which
+        downsampling, attention will be used.
-        attention will take place. May be a set, list, or tuple.
+    :param dropout: the dropout probability. :param channel_mult: channel multiplier for each level of the UNet. :param
-        For example, if this contains 4, then at 4x downsampling, attention
+    conv_resample: if True, use learned convolutions for upsampling and
-        will be used.
-    :param dropout: the dropout probability.
-    :param channel_mult: channel multiplier for each level of the UNet.
-    :param conv_resample: if True, use learned convolutions for upsampling and
        downsampling.
-    :param dims: determines if the signal is 1D, 2D, or 3D.
+    :param dims: determines if the signal is 1D, 2D, or 3D. :param num_classes: if specified (as an int), then this
-    :param num_classes: if specified (as an int), then this model will be
+    model will be
        class-conditional with `num_classes` classes.
-    :param use_checkpoint: use gradient checkpointing to reduce memory usage.
+    :param use_checkpoint: use gradient checkpointing to reduce memory usage. :param num_heads: the number of attention
-    :param num_heads: the number of attention heads in each attention layer.
+    heads in each attention layer. :param num_heads_channels: if specified, ignore num_heads and instead use
-    :param num_heads_channels: if specified, ignore num_heads and instead use
                               a fixed channel width per attention head.
    :param num_heads_upsample: works with num_heads to set a different number
                               of heads for upsampling. Deprecated.
-    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism.
+    :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. :param resblock_updown: use residual blocks
-    :param resblock_updown: use residual blocks for up/downsampling.
+    for up/downsampling. :param use_new_attention_order: use a different attention pattern for potentially
-    :param use_new_attention_order: use a different attention pattern for potentially
                                    increased efficiency.
    """
@@ -975,12 +953,9 @@ class UNetLDMModel(ModelMixin, ConfigMixin):
    def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
        """
-        Apply the model to an input batch.
+        Apply the model to an input batch. :param x: an [N x C x ...] Tensor of inputs. :param timesteps: a 1-D batch
-        :param x: an [N x C x ...] Tensor of inputs.
+        of timesteps. :param context: conditioning plugged in via crossattn :param y: an [N] Tensor of labels, if
-        :param timesteps: a 1-D batch of timesteps.
+        class-conditional. :return: an [N x C x ...] Tensor of outputs.
-        :param context: conditioning plugged in via crossattn
-        :param y: an [N] Tensor of labels, if class-conditional.
-        :return: an [N x C x ...] Tensor of outputs.
        """
        assert (y is not None) == (
            self.num_classes is not None
@@ -1012,8 +987,7 @@ class UNetLDMModel(ModelMixin, ConfigMixin):
 class EncoderUNetModel(nn.Module):
    """
-    The half UNet model with attention and timestep embedding.
+    The half UNet model with attention and timestep embedding. For usage, see UNet.
-    For usage, see UNet.
    """
    def __init__(
@@ -1197,10 +1171,8 @@ class EncoderUNetModel(nn.Module):
    def forward(self, x, timesteps):
        """
-        Apply the model to an input batch.
+        Apply the model to an input batch. :param x: an [N x C x ...] Tensor of inputs. :param timesteps: a 1-D batch
-        :param x: an [N x C x ...] Tensor of inputs.
+        of timesteps. :return: an [N x K] Tensor of outputs.
-        :param timesteps: a 1-D batch of timesteps.
-        :return: an [N x K] Tensor of outputs.
        """
        emb = self.time_embed(
            get_timestep_embedding(timesteps, self.model_channels, flip_sin_to_cos=True, downscale_freq_shift=0)

--- a/src/diffusers/models/unet_rl.py
+++ b/src/diffusers/models/unet_rl.py
@@ -111,10 +111,8 @@ class ResidualTemporalBlock(nn.Module):
    def forward(self, x, t):
        """
-        x : [ batch_size x inp_channels x horizon ]
+        x : [ batch_size x inp_channels x horizon ] t : [ batch_size x embed_dim ] returns: out : [ batch_size x
-        t : [ batch_size x embed_dim ]
+        out_channels x horizon ]
-        returns:
-        out : [ batch_size x out_channels x horizon ]
        """
        out = self.blocks[0](x) + self.time_mlp(t)
        out = self.blocks[1](out)

--- a/src/diffusers/models/unet_sde_score_estimation.py
+++ b/src/diffusers/models/unet_sde_score_estimation.py
@@ -136,26 +136,21 @@ def naive_downsample_2d(x, factor=2):
 def upsample_conv_2d(x, w, k=None, factor=2, gain=1):
    """Fused `upsample_2d()` followed by `tf.nn.conv2d()`.
-    Padding is performed only once at the beginning, not between the
-    operations.
-    The fused op is considerably more efficient than performing the same
-    calculation
-    using standard TensorFlow ops. It supports gradients of arbitrary order.
    Args:
-      x:            Input tensor of the shape `[N, C, H, W]` or `[N, H, W,
+    Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
+    efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of arbitrary
+    order.
+      x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W,
        C]`.
-      w:            Weight tensor of the shape `[filterH, filterW, inChannels,
+      w: Weight tensor of the shape `[filterH, filterW, inChannels,
-        outChannels]`. Grouped convolution can be performed by `inChannels =
+        outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`.
-        x.shape[0] // numGroups`.
+      k: FIR filter of the shape `[firH, firW]` or `[firN]`
-      k:            FIR filter of the shape `[firH, firW]` or `[firN]`
+        (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling.
-        (separable). The default is `[1] * factor`, which corresponds to
+      factor: Integer upsampling factor (default: 2). gain: Scaling factor for signal magnitude (default: 1.0).
-        nearest-neighbor upsampling.
-      factor:       Integer upsampling factor (default: 2).
-      gain:         Scaling factor for signal magnitude (default: 1.0).
    Returns:
-      Tensor of the shape `[N, C, H * factor, W * factor]` or
+      Tensor of the shape `[N, C, H * factor, W * factor]` or `[N, H * factor, W * factor, C]`, and same datatype as
-      `[N, H * factor, W * factor, C]`, and same datatype as `x`.
+      `x`.
    """
    assert isinstance(factor, int) and factor >= 1
@@ -208,25 +203,21 @@ def upsample_conv_2d(x, w, k=None, factor=2, gain=1):
 def conv_downsample_2d(x, w, k=None, factor=2, gain=1):
    """Fused `tf.nn.conv2d()` followed by `downsample_2d()`.
-    Padding is performed only once at the beginning, not between the operations.
-    The fused op is considerably more efficient than performing the same
-    calculation
-    using standard TensorFlow ops. It supports gradients of arbitrary order.
    Args:
-        x:            Input tensor of the shape `[N, C, H, W]` or `[N, H, W,
+    Padding is performed only once at the beginning, not between the operations. The fused op is considerably more
+    efficient than performing the same calculation using standard TensorFlow ops. It supports gradients of arbitrary
+    order.
+        x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W,
          C]`.
-        w:            Weight tensor of the shape `[filterH, filterW, inChannels,
+        w: Weight tensor of the shape `[filterH, filterW, inChannels,
-          outChannels]`. Grouped convolution can be performed by `inChannels =
+          outChannels]`. Grouped convolution can be performed by `inChannels = x.shape[0] // numGroups`.
-          x.shape[0] // numGroups`.
+        k: FIR filter of the shape `[firH, firW]` or `[firN]`
-        k:            FIR filter of the shape `[firH, firW]` or `[firN]`
+          (separable). The default is `[1] * factor`, which corresponds to average pooling.
-          (separable). The default is `[1] * factor`, which corresponds to
+        factor: Integer downsampling factor (default: 2). gain: Scaling factor for signal magnitude (default: 1.0).
-          average pooling.
-        factor:       Integer downsampling factor (default: 2).
-        gain:         Scaling factor for signal magnitude (default: 1.0).
    Returns:
-        Tensor of the shape `[N, C, H // factor, W // factor]` or
+        Tensor of the shape `[N, C, H // factor, W // factor]` or `[N, H // factor, W // factor, C]`, and same datatype
-        `[N, H // factor, W // factor, C]`, and same datatype as `x`.
+        as `x`.
    """
    assert isinstance(factor, int) and factor >= 1
@@ -258,22 +249,16 @@ def _shape(x, dim):
 def upsample_2d(x, k=None, factor=2, gain=1):
    r"""Upsample a batch of 2D images with the given filter.
-    Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]`
-    and upsamples each image with the given filter. The filter is normalized so
-    that
-    if the input pixels are constant, they will be scaled by the specified
-    `gain`.
-    Pixels outside the image are assumed to be zero, and the filter is padded
-    with
-    zeros so that its shape is a multiple of the upsampling factor.
    Args:
-        x:            Input tensor of the shape `[N, C, H, W]` or `[N, H, W,
+    Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and upsamples each image with the given
+    filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the specified
+    `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its shape is a
+    multiple of the upsampling factor.
+        x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W,
          C]`.
-        k:            FIR filter of the shape `[firH, firW]` or `[firN]`
+        k: FIR filter of the shape `[firH, firW]` or `[firN]`
-          (separable). The default is `[1] * factor`, which corresponds to
+          (separable). The default is `[1] * factor`, which corresponds to nearest-neighbor upsampling.
-          nearest-neighbor upsampling.
+        factor: Integer upsampling factor (default: 2). gain: Scaling factor for signal magnitude (default: 1.0).
-        factor:       Integer upsampling factor (default: 2).
-        gain:         Scaling factor for signal magnitude (default: 1.0).
    Returns:
        Tensor of the shape `[N, C, H * factor, W * factor]`
@@ -289,22 +274,16 @@ def upsample_2d(x, k=None, factor=2, gain=1):
 def downsample_2d(x, k=None, factor=2, gain=1):
    r"""Downsample a batch of 2D images with the given filter.
-    Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]`
-    and downsamples each image with the given filter. The filter is normalized
-    so that
-    if the input pixels are constant, they will be scaled by the specified
-    `gain`.
-    Pixels outside the image are assumed to be zero, and the filter is padded
-    with
-    zeros so that its shape is a multiple of the downsampling factor.
    Args:
-        x:            Input tensor of the shape `[N, C, H, W]` or `[N, H, W,
+    Accepts a batch of 2D images of the shape `[N, C, H, W]` or `[N, H, W, C]` and downsamples each image with the
+    given filter. The filter is normalized so that if the input pixels are constant, they will be scaled by the
+    specified `gain`. Pixels outside the image are assumed to be zero, and the filter is padded with zeros so that its
+    shape is a multiple of the downsampling factor.
+        x: Input tensor of the shape `[N, C, H, W]` or `[N, H, W,
          C]`.
-        k:            FIR filter of the shape `[firH, firW]` or `[firN]`
+        k: FIR filter of the shape `[firH, firW]` or `[firN]`
-          (separable). The default is `[1] * factor`, which corresponds to
+          (separable). The default is `[1] * factor`, which corresponds to average pooling.
-          average pooling.
+        factor: Integer downsampling factor (default: 2). gain: Scaling factor for signal magnitude (default: 1.0).
-        factor:       Integer downsampling factor (default: 2).
-        gain:         Scaling factor for signal magnitude (default: 1.0).
    Returns:
        Tensor of the shape `[N, C, H // factor, W // factor]`

--- a/src/diffusers/pipelines/grad_tts_utils.py
+++ b/src/diffusers/pipelines/grad_tts_utils.py
@@ -290,7 +290,7 @@ def normalize_numbers(text):
    return text
-""" from https://github.com/keithito/tacotron """
+""" from https://github.com/keithito/tacotron"""
 _pad = "_"
@@ -322,8 +322,8 @@ def get_arpabet(word, dictionary):
 def text_to_sequence(text, cleaner_names=[english_cleaners], dictionary=None):
    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
-    The text can optionally have ARPAbet sequences enclosed in curly braces embedded
+    The text can optionally have ARPAbet sequences enclosed in curly braces embedded in it. For example, "Turn left on
-    in it. For example, "Turn left on {HH AW1 S S T AH0 N} Street."
+    {HH AW1 S S T AH0 N} Street."
    Args:
      text: string to convert to a sequence

--- a/src/diffusers/pipelines/pipeline_bddm.py
+++ b/src/diffusers/pipelines/pipeline_bddm.py
@@ -29,8 +29,7 @@ from ..pipeline_utils import DiffusionPipeline
 def calc_diffusion_step_embedding(diffusion_steps, diffusion_step_embed_dim_in):
    """
    Embed a diffusion step $t$ into a higher dimensional space
-        E.g. the embedding vector in the 128-dimensional space is
+        E.g. the embedding vector in the 128-dimensional space is [sin(t * 10^(0*4/63)), ... , sin(t * 10^(63*4/63)),
-        [sin(t * 10^(0*4/63)), ... , sin(t * 10^(63*4/63)),
         cos(t * 10^(0*4/63)), ... , cos(t * 10^(63*4/63))]
    Parameters:
@@ -53,8 +52,7 @@ def calc_diffusion_step_embedding(diffusion_steps, diffusion_step_embed_dim_in):
 """
-Below scripts were borrowed from
+Below scripts were borrowed from https://github.com/philsyn/DiffWave-Vocoder/blob/master/WaveNet.py
-https://github.com/philsyn/DiffWave-Vocoder/blob/master/WaveNet.py
 """

--- a/src/diffusers/pipelines/pipeline_glide.py
+++ b/src/diffusers/pipelines/pipeline_glide.py
@@ -699,9 +699,8 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape):
    """
    Extract values from a 1-D numpy array for a batch of indices.
-    :param arr: the 1-D numpy array.
+    :param arr: the 1-D numpy array. :param timesteps: a tensor of indices into the array to extract. :param
-    :param timesteps: a tensor of indices into the array to extract.
+    broadcast_shape: a larger shape of K dimensions with the batch
-    :param broadcast_shape: a larger shape of K dimensions with the batch
                            dimension equal to the length of timesteps.
    :return: a tensor of shape [batch_size, 1, ...] where the shape has K dims.
    """

--- a/src/diffusers/pipelines/pipeline_grad_tts.py
+++ b/src/diffusers/pipelines/pipeline_grad_tts.py
-""" from https://github.com/jaywalnut310/glow-tts """
+""" from https://github.com/jaywalnut310/glow-tts"""
 import math

--- a/src/diffusers/pipelines/pipeline_latent_diffusion.py
+++ b/src/diffusers/pipelines/pipeline_latent_diffusion.py
@@ -554,11 +554,9 @@ class LDMBertModel(LDMBertPreTrainedModel):
 def get_timestep_embedding(timesteps, embedding_dim):
    """
-    This matches the implementation in Denoising Diffusion Probabilistic Models:
+    This matches the implementation in Denoising Diffusion Probabilistic Models: From Fairseq. Build sinusoidal
-    From Fairseq.
+    embeddings. This matches the implementation in tensor2tensor, but differs slightly from the description in Section
-    Build sinusoidal embeddings.
+    3.5 of "Attention Is All You Need".
-    This matches the implementation in tensor2tensor, but differs slightly
-    from the description in Section 3.5 of "Attention Is All You Need".
    """
    assert len(timesteps.shape) == 1
@@ -1055,8 +1053,8 @@ class Decoder(nn.Module):
 class VectorQuantizer(nn.Module):
    """
-    Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly
+    Improved version over VectorQuantizer, can be used as a drop-in replacement. Mostly avoids costly matrix
-    avoids costly matrix multiplications and allows for post-hoc remapping of indices.
+    multiplications and allows for post-hoc remapping of indices.
    """
    # NOTE: due to a bug the beta term was applied to the wrong term. for

--- a/src/diffusers/schedulers/scheduling_ddim.py
+++ b/src/diffusers/schedulers/scheduling_ddim.py
@@ -25,13 +25,12 @@ from .scheduling_utils import SchedulerMixin
 def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
    """
-    Create a beta schedule that discretizes the given alpha_t_bar function,
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
-    which defines the cumulative product of (1-beta) over time from t = [0,1].
+    (1-beta) over time from t = [0,1].
-    :param num_diffusion_timesteps: the number of betas to produce.
+    :param num_diffusion_timesteps: the number of betas to produce. :param alpha_bar: a lambda that takes an argument t
-    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+    from 0 to 1 and
-                      produces the cumulative product of (1-beta) up to that
+                      produces the cumulative product of (1-beta) up to that part of the diffusion process.
-                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    """

--- a/src/diffusers/schedulers/scheduling_ddpm.py
+++ b/src/diffusers/schedulers/scheduling_ddpm.py
@@ -25,13 +25,12 @@ from .scheduling_utils import SchedulerMixin
 def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
    """
-    Create a beta schedule that discretizes the given alpha_t_bar function,
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
-    which defines the cumulative product of (1-beta) over time from t = [0,1].
+    (1-beta) over time from t = [0,1].
-    :param num_diffusion_timesteps: the number of betas to produce.
+    :param num_diffusion_timesteps: the number of betas to produce. :param alpha_bar: a lambda that takes an argument t
-    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+    from 0 to 1 and
-                      produces the cumulative product of (1-beta) up to that
+                      produces the cumulative product of (1-beta) up to that part of the diffusion process.
-                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    """

--- a/src/diffusers/schedulers/scheduling_pndm.py
+++ b/src/diffusers/schedulers/scheduling_pndm.py
@@ -24,13 +24,12 @@ from .scheduling_utils import SchedulerMixin
 def betas_for_alpha_bar(num_diffusion_timesteps, max_beta=0.999):
    """
-    Create a beta schedule that discretizes the given alpha_t_bar function,
+    Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
-    which defines the cumulative product of (1-beta) over time from t = [0,1].
+    (1-beta) over time from t = [0,1].
-    :param num_diffusion_timesteps: the number of betas to produce.
+    :param num_diffusion_timesteps: the number of betas to produce. :param alpha_bar: a lambda that takes an argument t
-    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
+    from 0 to 1 and
-                      produces the cumulative product of (1-beta) up to that
+                      produces the cumulative product of (1-beta) up to that part of the diffusion process.
-                      part of the diffusion process.
    :param max_beta: the maximum beta to use; use values lower than 1 to
                     prevent singularities.
    """

--- a/src/diffusers/training_utils.py
+++ b/src/diffusers/training_utils.py
@@ -20,11 +20,10 @@ class EMAModel:
    ):
        """
        @crowsonkb's notes on EMA Warmup:
-            If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are
+            If gamma=1 and power=1, implements a simple average. gamma=1, power=2/3 are good values for models you plan
-            good values for models you plan to train for a million or more steps (reaches decay
+            to train for a million or more steps (reaches decay factor 0.999 at 31.6K steps, 0.9999 at 1M steps),
-            factor 0.999 at 31.6K steps, 0.9999 at 1M steps), gamma=1, power=3/4 for models
+            gamma=1, power=3/4 for models you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999
-            you plan to train for less (reaches decay factor 0.999 at 10K steps, 0.9999 at
+            at 215.4k steps).
-            215.4k steps).
        Args:
            inv_gamma (float): Inverse multiplicative factor of EMA warmup. Default: 1.
            power (float): Exponential factor of EMA warmup. Default: 2/3.

--- a/src/diffusers/utils/__init__.py
+++ b/src/diffusers/utils/__init__.py
@@ -89,20 +89,20 @@ class RevisionNotFoundError(HTTPError):
 TRANSFORMERS_IMPORT_ERROR = """
-{0} requires the transformers library but it was not found in your environment. You can install it with pip:
+{0} requires the transformers library but it was not found in your environment. You can install it with pip: `pip
-`pip install transformers`
+install transformers`
 """
 UNIDECODE_IMPORT_ERROR = """
-{0} requires the unidecode library but it was not found in your environment. You can install it with pip:
+{0} requires the unidecode library but it was not found in your environment. You can install it with pip: `pip install
-`pip install Unidecode`
+Unidecode`
 """
 INFLECT_IMPORT_ERROR = """
-{0} requires the inflect library but it was not found in your environment. You can install it with pip:
+{0} requires the inflect library but it was not found in your environment. You can install it with pip: `pip install
-`pip install inflect`
+inflect`
 """

--- a/src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py
+++ b/src/diffusers/utils/dummy_transformers_and_inflect_and_unidecode_objects.py
@@ -3,7 +3,7 @@
 from ..utils import DummyObject, requires_backends
-class GradTTS(metaclass=DummyObject):
+class GradTTSPipeline(metaclass=DummyObject):
    _backends = ["transformers", "inflect", "unidecode"]
    def __init__(self, *args, **kwargs):