# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Conversion script for stable diffusion checkpoints which _only_ contain a controlnet."""

import argparse

from diffusers.pipelines.stable_diffusion.convert_from_ckpt import download_controlnet_from_original_ckpt


def parse_bool(string):
    """Convert the literal string ``"True"`` or ``"False"`` into a bool.

    Intended as an argparse ``type=`` callable so that a flag can be given an
    explicit true *or* false value on the command line.

    Args:
        string: The raw command-line token.

    Returns:
        ``True`` for ``"True"``, ``False`` for ``"False"``.

    Raises:
        argparse.ArgumentTypeError: If ``string`` is any other value. argparse
            shows this exception's message to the user, whereas a plain
            ``ValueError`` is replaced by a generic "invalid ... value" text.
    """
    if string == "True":
        return True
    if string == "False":
        return False
    raise argparse.ArgumentTypeError(f"could not parse string as bool {string}")


# Script entry point: parse the command line, convert the original ControlNet
# checkpoint with `download_controlnet_from_original_ckpt`, and write the
# result to `--dump_path` via `save_pretrained`.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--checkpoint_path", default=None, type=str, required=True, help="Path to the checkpoint to convert."
    )
    parser.add_argument(
        "--original_config_file",
        type=str,
        required=True,
        help="The YAML config file corresponding to the original architecture.",
    )
    parser.add_argument(
        "--num_in_channels",
        default=None,
        type=int,
        help="The number of input channels. If `None` number of input channels will be automatically inferred.",
    )
    parser.add_argument(
        "--image_size",
        default=512,
        type=int,
        help=(
            "The image size that the model was trained on. Use 512 for Stable Diffusion v1.X and Stable Diffusion v2"
            " Base. Use 768 for Stable Diffusion v2."
        ),
    )
    parser.add_argument(
        "--extract_ema",
        action="store_true",
        help=(
            "Only relevant for checkpoints that have both EMA and non-EMA weights. Whether to extract the EMA weights"
            " or not. Defaults to `False`. Add `--extract_ema` to extract the EMA weights. EMA weights usually yield"
            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
        ),
    )
    parser.add_argument(
        "--upcast_attention",
        action="store_true",
        help=(
            "Whether the attention computation should always be upcasted. This is necessary when running stable"
            " diffusion 2.1."
        ),
    )
    parser.add_argument(
        "--from_safetensors",
        action="store_true",
        help="If `--checkpoint_path` is in `safetensors` format, load checkpoint with safetensors instead of PyTorch.",
    )
    parser.add_argument(
        "--to_safetensors",
        action="store_true",
        help="Whether to store pipeline in safetensors format or not.",
    )
    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
    parser.add_argument("--device", type=str, help="Device to use (e.g. cpu, cuda:0, cuda:1, etc.)")

    # `type=parse_bool` lets the user pass an explicit "True" or "False";
    # when the flag is omitted the value stays `None`.
    parser.add_argument(
        "--use_linear_projection", help="Override for use linear projection", required=False, type=parse_bool
    )

    parser.add_argument("--cross_attention_dim", help="Override for cross attention_dim", required=False, type=int)

    args = parser.parse_args()

    controlnet = download_controlnet_from_original_ckpt(
        checkpoint_path=args.checkpoint_path,
        original_config_file=args.original_config_file,
        image_size=args.image_size,
        extract_ema=args.extract_ema,
        num_in_channels=args.num_in_channels,
        upcast_attention=args.upcast_attention,
        from_safetensors=args.from_safetensors,
        device=args.device,
        use_linear_projection=args.use_linear_projection,
        cross_attention_dim=args.cross_attention_dim,
    )

    controlnet.save_pretrained(args.dump_path, safe_serialization=args.to_safetensors)