Commit 3d1b9667 authored by wangwei990215

Update the diffusers folder

parent b6a53272
accelerate>=0.16.0
torchvision
transformers>=4.25.1
wandb
bitsandbytes
deepspeed
peft>=0.6.0
[tool.ruff]
# Never enforce `E501` (line length violations).
ignore = ["C901", "E501", "E741", "F402", "F823"]
select = ["C", "E", "F", "I", "W"]
line-length = 119
# Ignore import violations in all `__init__.py` files.
[tool.ruff.per-file-ignores]
"__init__.py" = ["E402", "F401", "F403", "F811"]
"src/diffusers/utils/dummy_*.py" = ["F401"]
[tool.ruff.isort]
lines-after-imports = 2
known-first-party = ["diffusers"]
[tool.ruff.format]
# Like Black, use double quotes for strings.
quote-style = "double"
# Like Black, indent with spaces, rather than tabs.
indent-style = "space"
# Like Black, respect magic trailing commas.
skip-magic-trailing-comma = false
# Like Black, automatically detect the appropriate line ending.
line-ending = "auto"
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Conversion script for the LDM checkpoints. """
import argparse
import json
import os
import torch
from transformers.file_utils import has_file
from diffusers import UNet2DConditionModel, UNet2DModel
do_only_config = False
do_only_weights = True
do_only_renaming = False
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--repo_path",
default=None,
type=str,
required=True,
help="The config json file corresponding to the architecture.",
)
parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
args = parser.parse_args()
config_parameters_to_change = {
"image_size": "sample_size",
"num_res_blocks": "layers_per_block",
"block_channels": "block_out_channels",
"down_blocks": "down_block_types",
"up_blocks": "up_block_types",
"downscale_freq_shift": "freq_shift",
"resnet_num_groups": "norm_num_groups",
"resnet_act_fn": "act_fn",
"resnet_eps": "norm_eps",
"num_head_channels": "attention_head_dim",
}
key_parameters_to_change = {
"time_steps": "time_proj",
"mid": "mid_block",
"downsample_blocks": "down_blocks",
"upsample_blocks": "up_blocks",
}
subfolder = "" if has_file(args.repo_path, "config.json") else "unet"
with open(os.path.join(args.repo_path, subfolder, "config.json"), "r", encoding="utf-8") as reader:
text = reader.read()
config = json.loads(text)
if do_only_config:
for key in config_parameters_to_change.keys():
config.pop(key, None)
if has_file(args.repo_path, "config.json"):
model = UNet2DModel(**config)
else:
class_name = UNet2DConditionModel if "ldm-text2im-large-256" in args.repo_path else UNet2DModel
model = class_name(**config)
if do_only_config:
model.save_config(os.path.join(args.repo_path, subfolder))
config = dict(model.config)
if do_only_renaming:
for key, value in config_parameters_to_change.items():
if key in config:
config[value] = config[key]
del config[key]
config["down_block_types"] = [k.replace("UNetRes", "") for k in config["down_block_types"]]
config["up_block_types"] = [k.replace("UNetRes", "") for k in config["up_block_types"]]
if do_only_weights:
state_dict = torch.load(os.path.join(args.repo_path, subfolder, "diffusion_pytorch_model.bin"))
new_state_dict = {}
for param_key, param_value in state_dict.items():
if param_key.endswith(".op.bias") or param_key.endswith(".op.weight"):
continue
has_changed = False
for key, new_key in key_parameters_to_change.items():
if not has_changed and param_key.split(".")[0] == key:
new_state_dict[".".join([new_key] + param_key.split(".")[1:])] = param_value
has_changed = True
if not has_changed:
new_state_dict[param_key] = param_value
model.load_state_dict(new_state_dict)
model.save_pretrained(os.path.join(args.repo_path, subfolder))
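# A minimal sketch of loading the converted weights back to verify the key renames,
# assuming the script above was run with --repo_path ./my-ldm-repo and that the repo
# keeps its UNet in a "unet" subfolder; both are placeholder assumptions.
from diffusers import UNet2DModel

converted_unet = UNet2DModel.from_pretrained("./my-ldm-repo", subfolder="unet")
print(converted_unet.config.down_block_types)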
import argparse
import torch
import yaml
from diffusers import DDIMScheduler, LDMPipeline, UNetLDMModel, VQModel
def convert_ldm_original(checkpoint_path, config_path, output_path):
    with open(config_path, "r") as f:
        config = yaml.safe_load(f)
state_dict = torch.load(checkpoint_path, map_location="cpu")["model"]
keys = list(state_dict.keys())
# extract state_dict for VQVAE
first_stage_dict = {}
first_stage_key = "first_stage_model."
for key in keys:
if key.startswith(first_stage_key):
first_stage_dict[key.replace(first_stage_key, "")] = state_dict[key]
# extract state_dict for UNetLDM
unet_state_dict = {}
unet_key = "model.diffusion_model."
for key in keys:
if key.startswith(unet_key):
unet_state_dict[key.replace(unet_key, "")] = state_dict[key]
vqvae_init_args = config["model"]["params"]["first_stage_config"]["params"]
unet_init_args = config["model"]["params"]["unet_config"]["params"]
vqvae = VQModel(**vqvae_init_args).eval()
vqvae.load_state_dict(first_stage_dict)
unet = UNetLDMModel(**unet_init_args).eval()
unet.load_state_dict(unet_state_dict)
noise_scheduler = DDIMScheduler(
timesteps=config["model"]["params"]["timesteps"],
beta_schedule="scaled_linear",
beta_start=config["model"]["params"]["linear_start"],
beta_end=config["model"]["params"]["linear_end"],
clip_sample=False,
)
pipeline = LDMPipeline(vqvae, unet, noise_scheduler)
pipeline.save_pretrained(output_path)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--checkpoint_path", type=str, required=True)
parser.add_argument("--config_path", type=str, required=True)
parser.add_argument("--output_path", type=str, required=True)
args = parser.parse_args()
convert_ldm_original(args.checkpoint_path, args.config_path, args.output_path)
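# A minimal sketch of sampling from the converted pipeline, assuming --output_path
# ./ldm-converted was used above; the path and step count are placeholder assumptions.
from diffusers import LDMPipeline

pipe = LDMPipeline.from_pretrained("./ldm-converted")
image = pipe(num_inference_steps=50).images[0]
image.save("ldm_sample.png")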
import argparse
import torch
from safetensors.torch import save_file
def convert_motion_module(original_state_dict):
converted_state_dict = {}
for k, v in original_state_dict.items():
if "pos_encoder" in k:
continue
else:
converted_state_dict[
k.replace(".norms.0", ".norm1")
.replace(".norms.1", ".norm2")
.replace(".ff_norm", ".norm3")
.replace(".attention_blocks.0", ".attn1")
.replace(".attention_blocks.1", ".attn2")
.replace(".temporal_transformer", "")
] = v
return converted_state_dict
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--ckpt_path", type=str, required=True)
parser.add_argument("--output_path", type=str, required=True)
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
state_dict = torch.load(args.ckpt_path, map_location="cpu")
if "state_dict" in state_dict.keys():
state_dict = state_dict["state_dict"]
conv_state_dict = convert_motion_module(state_dict)
# convert to new format
output_dict = {}
for module_name, params in conv_state_dict.items():
if type(params) is not torch.Tensor:
continue
output_dict.update({f"unet.{module_name}": params})
save_file(output_dict, f"{args.output_path}/diffusion_pytorch_model.safetensors")
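# A quick round-trip check of the exported file, assuming --output_path ./motion-module
# was used above; the path is a placeholder assumption.
from safetensors.torch import load_file

tensors = load_file("./motion-module/diffusion_pytorch_model.safetensors")
print(len(tensors), "tensors, all under the 'unet.' prefix:", all(k.startswith("unet.") for k in tensors))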
import argparse
import torch
from diffusers import MotionAdapter
def convert_motion_module(original_state_dict):
converted_state_dict = {}
for k, v in original_state_dict.items():
if "pos_encoder" in k:
continue
else:
converted_state_dict[
k.replace(".norms.0", ".norm1")
.replace(".norms.1", ".norm2")
.replace(".ff_norm", ".norm3")
.replace(".attention_blocks.0", ".attn1")
.replace(".attention_blocks.1", ".attn2")
.replace(".temporal_transformer", "")
] = v
return converted_state_dict
def get_args():
parser = argparse.ArgumentParser()
parser.add_argument("--ckpt_path", type=str, required=True)
parser.add_argument("--output_path", type=str, required=True)
parser.add_argument("--use_motion_mid_block", action="store_true")
parser.add_argument("--motion_max_seq_length", type=int, default=32)
parser.add_argument("--save_fp16", action="store_true")
return parser.parse_args()
if __name__ == "__main__":
args = get_args()
state_dict = torch.load(args.ckpt_path, map_location="cpu")
if "state_dict" in state_dict.keys():
state_dict = state_dict["state_dict"]
conv_state_dict = convert_motion_module(state_dict)
adapter = MotionAdapter(
use_motion_mid_block=args.use_motion_mid_block, motion_max_seq_length=args.motion_max_seq_length
)
# skip loading position embeddings
adapter.load_state_dict(conv_state_dict, strict=False)
adapter.save_pretrained(args.output_path)
if args.save_fp16:
adapter.to(torch.float16).save_pretrained(args.output_path, variant="fp16")
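# A minimal sketch of using the converted adapter with AnimateDiff, assuming
# --output_path ./motion-adapter was used above; the adapter path and the SD 1.5
# base checkpoint ID are placeholder assumptions.
from diffusers import AnimateDiffPipeline, MotionAdapter

adapter = MotionAdapter.from_pretrained("./motion-adapter")
pipe = AnimateDiffPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", motion_adapter=adapter)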
import argparse
import time
from pathlib import Path
from typing import Any, Dict, Literal
import torch
from diffusers import AsymmetricAutoencoderKL
ASYMMETRIC_AUTOENCODER_KL_x_1_5_CONFIG = {
"in_channels": 3,
"out_channels": 3,
"down_block_types": [
"DownEncoderBlock2D",
"DownEncoderBlock2D",
"DownEncoderBlock2D",
"DownEncoderBlock2D",
],
"down_block_out_channels": [128, 256, 512, 512],
"layers_per_down_block": 2,
"up_block_types": [
"UpDecoderBlock2D",
"UpDecoderBlock2D",
"UpDecoderBlock2D",
"UpDecoderBlock2D",
],
"up_block_out_channels": [192, 384, 768, 768],
"layers_per_up_block": 3,
"act_fn": "silu",
"latent_channels": 4,
"norm_num_groups": 32,
"sample_size": 256,
"scaling_factor": 0.18215,
}
ASYMMETRIC_AUTOENCODER_KL_x_2_CONFIG = {
"in_channels": 3,
"out_channels": 3,
"down_block_types": [
"DownEncoderBlock2D",
"DownEncoderBlock2D",
"DownEncoderBlock2D",
"DownEncoderBlock2D",
],
"down_block_out_channels": [128, 256, 512, 512],
"layers_per_down_block": 2,
"up_block_types": [
"UpDecoderBlock2D",
"UpDecoderBlock2D",
"UpDecoderBlock2D",
"UpDecoderBlock2D",
],
"up_block_out_channels": [256, 512, 1024, 1024],
"layers_per_up_block": 5,
"act_fn": "silu",
"latent_channels": 4,
"norm_num_groups": 32,
"sample_size": 256,
"scaling_factor": 0.18215,
}
def convert_asymmetric_autoencoder_kl_state_dict(original_state_dict: Dict[str, Any]) -> Dict[str, Any]:
converted_state_dict = {}
for k, v in original_state_dict.items():
if k.startswith("encoder."):
converted_state_dict[
k.replace("encoder.down.", "encoder.down_blocks.")
.replace("encoder.mid.", "encoder.mid_block.")
.replace("encoder.norm_out.", "encoder.conv_norm_out.")
.replace(".downsample.", ".downsamplers.0.")
.replace(".nin_shortcut.", ".conv_shortcut.")
.replace(".block.", ".resnets.")
.replace(".block_1.", ".resnets.0.")
.replace(".block_2.", ".resnets.1.")
.replace(".attn_1.k.", ".attentions.0.to_k.")
.replace(".attn_1.q.", ".attentions.0.to_q.")
.replace(".attn_1.v.", ".attentions.0.to_v.")
.replace(".attn_1.proj_out.", ".attentions.0.to_out.0.")
.replace(".attn_1.norm.", ".attentions.0.group_norm.")
] = v
elif k.startswith("decoder.") and "up_layers" not in k:
converted_state_dict[
k.replace("decoder.encoder.", "decoder.condition_encoder.")
.replace(".norm_out.", ".conv_norm_out.")
.replace(".up.0.", ".up_blocks.3.")
.replace(".up.1.", ".up_blocks.2.")
.replace(".up.2.", ".up_blocks.1.")
.replace(".up.3.", ".up_blocks.0.")
.replace(".block.", ".resnets.")
.replace("mid", "mid_block")
.replace(".0.upsample.", ".0.upsamplers.0.")
.replace(".1.upsample.", ".1.upsamplers.0.")
.replace(".2.upsample.", ".2.upsamplers.0.")
.replace(".nin_shortcut.", ".conv_shortcut.")
.replace(".block_1.", ".resnets.0.")
.replace(".block_2.", ".resnets.1.")
.replace(".attn_1.k.", ".attentions.0.to_k.")
.replace(".attn_1.q.", ".attentions.0.to_q.")
.replace(".attn_1.v.", ".attentions.0.to_v.")
.replace(".attn_1.proj_out.", ".attentions.0.to_out.0.")
.replace(".attn_1.norm.", ".attentions.0.group_norm.")
] = v
elif k.startswith("quant_conv."):
converted_state_dict[k] = v
elif k.startswith("post_quant_conv."):
converted_state_dict[k] = v
else:
print(f" skipping key `{k}`")
# fix weights shape
for k, v in converted_state_dict.items():
if (
(k.startswith("encoder.mid_block.attentions.0") or k.startswith("decoder.mid_block.attentions.0"))
and k.endswith("weight")
and ("to_q" in k or "to_k" in k or "to_v" in k or "to_out" in k)
):
converted_state_dict[k] = converted_state_dict[k][:, :, 0, 0]
return converted_state_dict
def get_asymmetric_autoencoder_kl_from_original_checkpoint(
scale: Literal["1.5", "2"], original_checkpoint_path: str, map_location: torch.device
) -> AsymmetricAutoencoderKL:
print("Loading original state_dict")
original_state_dict = torch.load(original_checkpoint_path, map_location=map_location)
original_state_dict = original_state_dict["state_dict"]
print("Converting state_dict")
converted_state_dict = convert_asymmetric_autoencoder_kl_state_dict(original_state_dict)
kwargs = ASYMMETRIC_AUTOENCODER_KL_x_1_5_CONFIG if scale == "1.5" else ASYMMETRIC_AUTOENCODER_KL_x_2_CONFIG
print("Initializing AsymmetricAutoencoderKL model")
asymmetric_autoencoder_kl = AsymmetricAutoencoderKL(**kwargs)
print("Loading weight from converted state_dict")
asymmetric_autoencoder_kl.load_state_dict(converted_state_dict)
asymmetric_autoencoder_kl.eval()
print("AsymmetricAutoencoderKL successfully initialized")
return asymmetric_autoencoder_kl
if __name__ == "__main__":
start = time.time()
parser = argparse.ArgumentParser()
parser.add_argument(
"--scale",
default=None,
type=str,
required=True,
help="Asymmetric VQGAN scale: `1.5` or `2`",
)
parser.add_argument(
"--original_checkpoint_path",
default=None,
type=str,
required=True,
help="Path to the original Asymmetric VQGAN checkpoint",
)
parser.add_argument(
"--output_path",
default=None,
type=str,
required=True,
help="Path to save pretrained AsymmetricAutoencoderKL model",
)
parser.add_argument(
"--map_location",
default="cpu",
type=str,
required=False,
help="The device passed to `map_location` when loading the checkpoint",
)
args = parser.parse_args()
    assert args.scale in ["1.5", "2"], f"{args.scale} should be `1.5` or `2`"
assert Path(args.original_checkpoint_path).is_file()
asymmetric_autoencoder_kl = get_asymmetric_autoencoder_kl_from_original_checkpoint(
scale=args.scale,
original_checkpoint_path=args.original_checkpoint_path,
map_location=torch.device(args.map_location),
)
print("Saving pretrained AsymmetricAutoencoderKL")
asymmetric_autoencoder_kl.save_pretrained(args.output_path)
print(f"Done in {time.time() - start:.2f} seconds")
# Script for converting a Hugging Face Diffusers trained SDXL LoRAs to Kohya format
# This means that you can take your diffusers-trained LoRAs and
# get the output to work with WebUIs such as AUTOMATIC1111, ComfyUI, SD.Next and others.
# To get started you can find some cool `diffusers` trained LoRAs such as this cute Corgy
# https://huggingface.co/ignasbud/corgy_dog_LoRA/, download its `pytorch_lora_weights.safetensors` file
# and run the script:
# python convert_diffusers_sdxl_lora_to_webui.py --input_lora pytorch_lora_weights.safetensors --output_lora corgy.safetensors
# now you can use corgy.safetensors in your WebUI of choice!
# To train your own, here are some diffusers training scripts and utils that you can use and then convert:
# LoRA Ease - no code SDXL Dreambooth LoRA trainer: https://huggingface.co/spaces/multimodalart/lora-ease
# Dreambooth Advanced Training Script - state of the art techniques such as pivotal tuning and prodigy optimizer:
# - Script: https://github.com/huggingface/diffusers/blob/main/examples/advanced_diffusion_training/train_dreambooth_lora_sdxl_advanced.py
# - Colab (only on Pro): https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_Dreambooth_LoRA_advanced_example.ipynb
# Canonical diffusers training scripts:
# - Script: https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora_sdxl.py
# - Colab (runs on free tier): https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/SDXL_DreamBooth_LoRA_.ipynb
import argparse
import os
from safetensors.torch import load_file, save_file
from diffusers.utils import convert_all_state_dict_to_peft, convert_state_dict_to_kohya
def convert_and_save(input_lora, output_lora=None):
if output_lora is None:
base_name = os.path.splitext(input_lora)[0]
output_lora = f"{base_name}_webui.safetensors"
diffusers_state_dict = load_file(input_lora)
peft_state_dict = convert_all_state_dict_to_peft(diffusers_state_dict)
kohya_state_dict = convert_state_dict_to_kohya(peft_state_dict)
save_file(kohya_state_dict, output_lora)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Convert LoRA model to PEFT and then to Kohya format.")
parser.add_argument(
"--input_lora",
type=str,
required=True,
help="Path to the input LoRA model file in the diffusers format.",
)
parser.add_argument(
"--output_lora",
type=str,
required=False,
help="Path for the converted LoRA (safetensors format for AUTOMATIC1111, ComfyUI, etc.). Optional, defaults to input name with a _webui suffix.",
)
args = parser.parse_args()
convert_and_save(args.input_lora, args.output_lora)
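# For reference, the diffusers-format LoRA can also be loaded directly into an SDXL
# pipeline without conversion; the base model ID is a placeholder assumption.
from diffusers import StableDiffusionXLPipeline

pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
pipe.load_lora_weights("pytorch_lora_weights.safetensors")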