renzhc / diffusers_dcu

Commit a326d611 (unverified)
Authored Feb 13, 2024 by Dhruv Nair; committed by GitHub on Feb 12, 2024
Parent: e7696e20

Fix configuring VAE from single file mixin (#6950)

* update
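In practice, this change lets callers of `from_single_file` point at a local YAML via the new `config_file` argument, or override the VAE's `scaling_factor` directly. A minimal sketch of the intended call pattern, assuming `AutoencoderKL` as the loading class; both file paths are placeholders, not part of the commit:

    from diffusers import AutoencoderKL

    # Load a VAE from a single-file checkpoint (placeholder paths).
    vae = AutoencoderKL.from_single_file(
        "path/to/vae.safetensors",          # single-file checkpoint
        config_file="path/to/config.yaml",  # new in this commit: local YAML config
        scaling_factor=0.18215,             # new in this commit: explicit override
    )

Passing both `config_file` and `original_config_file` is rejected with a `ValueError`, as enforced in `autoencoder.py` below.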
Showing 2 changed files, with 28 additions and 3 deletions (+28 −3):

    src/diffusers/loaders/autoencoder.py        +22 −1
    src/diffusers/loaders/single_file_utils.py   +6 −2
src/diffusers/loaders/autoencoder.py
@@ -38,6 +38,9 @@ class FromOriginalVAEMixin:
                 - A link to the `.ckpt` file (for example
                   `"https://huggingface.co/<repo_id>/blob/main/<path_to_file>.ckpt"`) on the Hub.
                 - A path to a *file* containing all pipeline weights.
+            config_file (`str`, *optional*):
+                Filepath to the configuration YAML file associated with the model. If not provided it will default to:
+                https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml
             torch_dtype (`str` or `torch.dtype`, *optional*):
                 Override the default `torch.dtype` and load the model with another dtype. If `"auto"` is passed, the
                 dtype is automatically derived from the model's weights.
@@ -65,6 +68,13 @@ class FromOriginalVAEMixin:
             image_size (`int`, *optional*, defaults to 512):
                 The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
                 Diffusion v2 base model. Use 768 for Stable Diffusion v2.
+            scaling_factor (`float`, *optional*, defaults to 0.18215):
+                The component-wise standard deviation of the trained latent space computed using the first batch of the
+                training set. This is used to scale the latent space to have unit variance when training the diffusion
+                model. The latents are scaled with the formula `z = z * scaling_factor` before being passed to the
+                diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z
+                = 1 / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution
+                Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
             use_safetensors (`bool`, *optional*, defaults to `None`):
                 If set to `None`, the safetensors weights are downloaded if they're available **and** if the
                 safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
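The two formulas in the new `scaling_factor` docstring amount to a simple multiplicative round trip. A toy sketch in plain PyTorch, with shapes and values chosen only for illustration:

    import torch

    scaling_factor = 0.18215
    z = torch.randn(1, 4, 64, 64)               # latents from the VAE encoder

    z_scaled = z * scaling_factor                # scaled before the diffusion model
    z_restored = 1 / scaling_factor * z_scaled   # unscaled again before decoding

    assert torch.allclose(z, z_restored)         # round trip recovers the latents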
@@ -92,6 +102,7 @@ class FromOriginalVAEMixin:
         """
         original_config_file = kwargs.pop("original_config_file", None)
+        config_file = kwargs.pop("config_file", None)
         resume_download = kwargs.pop("resume_download", False)
         force_download = kwargs.pop("force_download", False)
         proxies = kwargs.pop("proxies", None)
@@ -103,6 +114,13 @@ class FromOriginalVAEMixin:
         use_safetensors = kwargs.pop("use_safetensors", True)

         class_name = cls.__name__
+
+        if (config_file is not None) and (original_config_file is not None):
+            raise ValueError(
+                "You cannot pass both `config_file` and `original_config_file` to `from_single_file`. Please use only one of these arguments."
+            )
+
+        original_config_file = original_config_file or config_file
         original_config, checkpoint = fetch_ldm_config_and_checkpoint(
             pretrained_model_link_or_path=pretrained_model_link_or_path,
             class_name=class_name,
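Because this guard runs before `fetch_ldm_config_and_checkpoint`, conflicting arguments fail fast, before any download is attempted. A sketch of the failure mode (placeholder paths again):

    from diffusers import AutoencoderKL

    try:
        AutoencoderKL.from_single_file(
            "path/to/vae.safetensors",
            config_file="config.yaml",
            original_config_file="v1-inference.yaml",  # conflicts with config_file
        )
    except ValueError as err:
        print(err)  # "You cannot pass both `config_file` and `original_config_file` ..."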
@@ -118,7 +136,10 @@ class FromOriginalVAEMixin:
         )

         image_size = kwargs.pop("image_size", None)
-        component = create_diffusers_vae_model_from_ldm(class_name, original_config, checkpoint, image_size=image_size)
+        scaling_factor = kwargs.pop("scaling_factor", None)
+        component = create_diffusers_vae_model_from_ldm(
+            class_name, original_config, checkpoint, image_size=image_size, scaling_factor=scaling_factor
+        )
         vae = component["vae"]
         if torch_dtype is not None:
             vae = vae.to(torch_dtype)
src/diffusers/loaders/single_file_utils.py
@@ -175,6 +175,7 @@ DIFFUSERS_TO_LDM_MAPPING = {
 }

 LDM_VAE_KEY = "first_stage_model."
+LDM_VAE_DEFAULT_SCALING_FACTOR = 0.18215
 LDM_UNET_KEY = "model.diffusion_model."
 LDM_CONTROLNET_KEY = "control_model."
 LDM_CLIP_PREFIX_TO_REMOVE = ["cond_stage_model.transformer.", "conditioner.embedders.0.transformer."]
@@ -518,7 +519,10 @@ def create_vae_diffusers_config(original_config, image_size, scaling_factor=None):
     Creates a config for the diffusers based on the config of the LDM model.
     """
     vae_params = original_config["model"]["params"]["first_stage_config"]["params"]["ddconfig"]
-    scaling_factor = scaling_factor or original_config["model"]["params"]["scale_factor"]
+    if scaling_factor is None and "scale_factor" in original_config["model"]["params"]:
+        scaling_factor = original_config["model"]["params"]["scale_factor"]
+    elif scaling_factor is None:
+        scaling_factor = LDM_VAE_DEFAULT_SCALING_FACTOR

     block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
     down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
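The replacement logic resolves the scaling factor with a clear precedence: an explicit argument wins, then the YAML's `model.params.scale_factor`, then the new `LDM_VAE_DEFAULT_SCALING_FACTOR`. A standalone sketch of that precedence; the helper name is hypothetical, not part of the diff:

    LDM_VAE_DEFAULT_SCALING_FACTOR = 0.18215

    def resolve_scaling_factor(original_config, scaling_factor=None):
        # Mirrors the fallback order introduced in create_vae_diffusers_config.
        if scaling_factor is None and "scale_factor" in original_config["model"]["params"]:
            return original_config["model"]["params"]["scale_factor"]
        if scaling_factor is None:
            return LDM_VAE_DEFAULT_SCALING_FACTOR
        return scaling_factor

    print(resolve_scaling_factor({"model": {"params": {"scale_factor": 0.13025}}}))  # 0.13025 (from YAML)
    print(resolve_scaling_factor({"model": {"params": {}}}))                         # 0.18215 (default)
    print(resolve_scaling_factor({"model": {"params": {}}}, scaling_factor=0.5))     # 0.5 (explicit)

Previously, a config without `scale_factor` would raise a `KeyError`, which is the bug this hunk fixes.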
@@ -1173,7 +1177,7 @@ def create_diffusers_unet_model_from_ldm(


 def create_diffusers_vae_model_from_ldm(
-    pipeline_class_name, original_config, checkpoint, image_size=None, scaling_factor=0.18125
+    pipeline_class_name, original_config, checkpoint, image_size=None, scaling_factor=None
 ):
     # import here to avoid circular imports
     from ..models import AutoencoderKL