Add text encoder conversion (#1559)

* Initial code for attempt at improving SD <--> diffusers conversions for v2.0
* Updates to support round-trip between orig. SD 2.0 and diffusers models
* Corrected formatting to Black standard
* Correcting import formatting
* Fixed imports (properly this time)
* add some corrections
* remove inference files

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>

Add text encoder conversion (#1559)
* Initial code for attempt at improving SD <--> diffusers conversions for v2.0
* Updates to support round-trip between orig. SD 2.0 and diffusers models
* Corrected formatting to Black standard
* Correcting import formatting
* Fixed imports (properly this time)
* add some corrections
* remove inference files

Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
31444f57 · lawfordp2017 · GitHub · c3b2f975 · 31444f57 · 31444f57
Unverified Commit 31444f57 authored Dec 12, 2022 by lawfordp2017 Committed by GitHub Dec 12, 2022
2 changed files
--- a/scripts/convert_diffusers_to_original_stable_diffusion.py
+++ b/scripts/convert_diffusers_to_original_stable_diffusion.py
@@ -4,6 +4,7 @@
 import argparse
 import os.path as osp
+import re
 import torch
@@ -187,13 +188,80 @@ def convert_vae_state_dict(vae_state_dict):
 # =========================#
 # Text Encoder Conversion #
 # =========================#
-# pretty much a no-op
-def convert_text_enc_state_dict(text_enc_dict):
+# Equivalent key fragments, written as (stable-diffusion name, HF Diffusers name).
+textenc_conversion_lst = [
+    ("resblocks.", "text_model.encoder.layers."),
+    ("ln_1", "layer_norm1"),
+    ("ln_2", "layer_norm2"),
+    (".c_fc.", ".fc1."),
+    (".c_proj.", ".fc2."),
+    (".attn", ".self_attn"),
+    ("ln_final.", "transformer.text_model.final_layer_norm."),
+    ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
+    ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
+]
+# Regex-escaped HF Diffusers fragment -> stable-diffusion fragment. The escaped keys double as the
+# alternatives of `textenc_pattern`, so a match can be looked up again via `re.escape(match.group(0))`.
+protected = {re.escape(hf_name): sd_name for sd_name, hf_name in textenc_conversion_lst}
+textenc_pattern = re.compile("|".join(protected))
+# Position of q / k / v inside a fused projection.
+# Ordering is from https://github.com/pytorch/pytorch/blob/master/test/cpp/api/modules.cpp
+code2idx = dict(zip("qkv", range(3)))
+def convert_text_enc_state_dict_v20(text_enc_dict: "dict[str, torch.Tensor]"):
+    """Relabel an HF Diffusers text-encoder state dict with the original stable-diffusion v2.0 key names.
+
+    The separate ``q_proj`` / ``k_proj`` / ``v_proj`` tensors of every ``self_attn`` module are concatenated
+    (in the order given by ``code2idx``) into one ``in_proj_weight`` / ``in_proj_bias`` entry. Every other key
+    is renamed fragment by fragment through ``textenc_pattern``.
+
+    Args:
+        text_enc_dict: mapping from parameter name to tensor; it is only read, never modified.
+
+    Returns:
+        A new dict holding the relabelled keys.
+
+    Raises:
+        Exception: if a ``self_attn`` module lacks one of its q/k/v weights or biases.
+    """
+    # NOTE: the annotation above is a string on purpose. A bare `dict[str, ...]` is evaluated when the
+    # function is defined and raises TypeError on Python < 3.9.
+    def relabel(key):
+        return textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], key)
+
+    qkv_suffixes = {
+        kind: tuple(f".self_attn.{code}_proj.{kind}" for code in code2idx) for kind in ("weight", "bias")
+    }
+    # kind ("weight" / "bias") -> attention-module prefix -> [q, k, v] tensors
+    captured = {kind: {} for kind in qkv_suffixes}
+    new_state_dict = {}
+    for k, v in text_enc_dict.items():
+        kind = next((name for name, suffixes in qkv_suffixes.items() if k.endswith(suffixes)), None)
+        if kind is None:
+            new_state_dict[relabel(k)] = v
+            continue
+        # Each suffix ends in "<code>_proj.<kind>", so the q/k/v letter sits right after the module prefix.
+        tail_len = len("q_proj.") + len(kind)
+        k_pre, k_code = k[: -tail_len - 1], k[-tail_len]
+        captured[kind].setdefault(k_pre, [None, None, None])[code2idx[k_code]] = v
+    # Fused weights are emitted before fused biases, after all plainly renamed keys.
+    for kind, modules in captured.items():
+        for k_pre, tensors in modules.items():
+            # Identity check: `None in tensors` would fall back to `==` comparisons against tensors.
+            if any(tensor is None for tensor in tensors):
+                raise Exception("CORRUPTED MODEL: one of the q-k-v values for the text encoder was missing")
+            new_state_dict[relabel(k_pre) + ".in_proj_" + kind] = torch.cat(tensors)
+    return new_state_dict
+def convert_text_enc_state_dict(text_enc_dict: "dict[str, torch.Tensor]"):
+    """Return ``text_enc_dict`` unchanged (used for checkpoints that are not detected as v2.0).
+
+    The annotation is kept as a string so that defining the function does not evaluate ``dict[...]``,
+    which raises TypeError on Python < 3.9.
+    """
    return text_enc_dict
+# NOTE(review): not referenced in any of the visible hunks. The main block derives `is_v20_model` from the
+# checkpoint keys instead; confirm whether this constant is still needed.
+IS_V20_MODEL = True
 if __name__ == "__main__":
    parser = argparse.ArgumentParser()
@@ -223,8 +291,18 @@ if __name__ == "__main__":
    # Convert the text encoder model
    text_enc_dict = torch.load(text_enc_path, map_location="cpu")
-    text_enc_dict = convert_text_enc_state_dict(text_enc_dict)
-    text_enc_dict = {"cond_stage_model.transformer." + k: v for k, v in text_enc_dict.items()}
+    # Easiest way to identify v2.0 model seems to be that the text encoder (OpenCLIP) is deeper
+    is_v20_model = "text_model.encoder.layers.22.layer_norm2.bias" in text_enc_dict
+    if is_v20_model:
+        # Need to add the tag 'transformer' in advance so we can knock it out from the final layer-norm
+        text_enc_dict = {"transformer." + k: v for k, v in text_enc_dict.items()}
+        text_enc_dict = convert_text_enc_state_dict_v20(text_enc_dict)
+        text_enc_dict = {"cond_stage_model.model." + k: v for k, v in text_enc_dict.items()}
+    else:
+        text_enc_dict = convert_text_enc_state_dict(text_enc_dict)
+        text_enc_dict = {"cond_stage_model.transformer." + k: v for k, v in text_enc_dict.items()}
    # Put together new checkpoint
    state_dict = {**unet_state_dict, **vae_state_dict, **text_enc_dict}

--- a/scripts/convert_original_stable_diffusion_to_diffusers.py
+++ b/scripts/convert_original_stable_diffusion_to_diffusers.py
@@ -16,6 +16,7 @@
 import argparse
 import os
+import re
 import torch
@@ -648,6 +649,30 @@ def convert_ldm_clip_checkpoint(checkpoint):
    return text_model
+# Whole-key renames, written as (stable-diffusion key, HF Diffusers key).
+textenc_conversion_lst = [
+    ("cond_stage_model.model.positional_embedding", "text_model.embeddings.position_embedding.weight"),
+    ("cond_stage_model.model.token_embedding.weight", "text_model.embeddings.token_embedding.weight"),
+    ("cond_stage_model.model.ln_final.weight", "text_model.final_layer_norm.weight"),
+    ("cond_stage_model.model.ln_final.bias", "text_model.final_layer_norm.bias"),
+]
+textenc_conversion_map = dict(textenc_conversion_lst)
+# Key-fragment renames, written as (stable-diffusion fragment, HF Diffusers fragment).
+textenc_transformer_conversion_lst = [
+    ("resblocks.", "text_model.encoder.layers."),
+    ("ln_1", "layer_norm1"),
+    ("ln_2", "layer_norm2"),
+    (".c_fc.", ".fc1."),
+    (".c_proj.", ".fc2."),
+    (".attn", ".self_attn"),
+    ("ln_final.", "transformer.text_model.final_layer_norm."),
+    ("token_embedding.weight", "transformer.text_model.embeddings.token_embedding.weight"),
+    ("positional_embedding", "transformer.text_model.embeddings.position_embedding.weight"),
+]
+# Regex-escaped stable-diffusion fragment -> HF Diffusers fragment. The escaped keys are also the
+# alternatives of `textenc_pattern`, so a match is looked up again via `re.escape(match.group(0))`.
+protected = {re.escape(sd_name): hf_name for sd_name, hf_name in textenc_transformer_conversion_lst}
+textenc_pattern = re.compile("|".join(protected))
 def convert_paint_by_example_checkpoint(checkpoint):
    config = CLIPVisionConfig.from_pretrained("openai/clip-vit-large-patch14")
    model = PaintByExampleImageEncoder(config)
@@ -718,15 +743,39 @@ def convert_paint_by_example_checkpoint(checkpoint):
 def convert_open_clip_checkpoint(checkpoint):
+    # Builds a CLIPTextModel and fills it from the `cond_stage_model.model.*` entries of `checkpoint`:
+    # whole-key renames come from `textenc_conversion_map`, transformer keys are renamed through
+    # `textenc_pattern`, and fused `in_proj_*` tensors are split into separate q/k/v projections.
    text_model = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2", subfolder="text_encoder")
-    # SKIP for now - need openclip -> HF conversion script here
+    keys = list(checkpoint.keys())
-    #    keys = list(checkpoint.keys())
-    #
+    text_model_dict = {}
-    #    text_model_dict = {}
-    #    for key in keys:
+    # Size of one q/k/v slice of a fused projection, read from the first dim of `text_projection`.
+    d_model = int(checkpoint["cond_stage_model.model.text_projection"].shape[0])
-    #        if key.startswith("cond_stage_model.model.transformer"):
-    #            text_model_dict[key[len("cond_stage_model.model.transformer.") :]] = checkpoint[key]
+    # Taken from the instantiated model rather than from `checkpoint` (presumably the original checkpoint
+    # has no such entry; confirm).
+    text_model_dict["text_model.embeddings.position_ids"] = text_model.text_model.embeddings.get_buffer("position_ids")
-    #
-    #    text_model.load_state_dict(text_model_dict)
+    for key in keys:
+        if "resblocks.23" in key:  # Diffusers drops the final layer and only uses the penultimate layer
+            continue
+        if key in textenc_conversion_map:
+            text_model_dict[textenc_conversion_map[key]] = checkpoint[key]
+        if key.startswith("cond_stage_model.model.transformer."):
+            # Strip the prefix first; the fragment renames operate on the remainder of the key.
+            new_key = key[len("cond_stage_model.model.transformer.") :]
+            if new_key.endswith(".in_proj_weight"):
+                new_key = new_key[: -len(".in_proj_weight")]
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+                # Rows [0, d_model) -> q, [d_model, 2 * d_model) -> k, the remainder -> v.
+                text_model_dict[new_key + ".q_proj.weight"] = checkpoint[key][:d_model, :]
+                text_model_dict[new_key + ".k_proj.weight"] = checkpoint[key][d_model : d_model * 2, :]
+                text_model_dict[new_key + ".v_proj.weight"] = checkpoint[key][d_model * 2 :, :]
+            elif new_key.endswith(".in_proj_bias"):
+                new_key = new_key[: -len(".in_proj_bias")]
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+                # Same q / k / v split as for the weights, along the only dimension.
+                text_model_dict[new_key + ".q_proj.bias"] = checkpoint[key][:d_model]
+                text_model_dict[new_key + ".k_proj.bias"] = checkpoint[key][d_model : d_model * 2]
+                text_model_dict[new_key + ".v_proj.bias"] = checkpoint[key][d_model * 2 :]
+            else:
+                new_key = textenc_pattern.sub(lambda m: protected[re.escape(m.group(0))], new_key)
+                text_model_dict[new_key] = checkpoint[key]
+    text_model.load_state_dict(text_model_dict)
    return text_model
@@ -789,6 +838,15 @@ if __name__ == "__main__":
            " higher quality images for inference. Non-EMA weights are usually better to continue fine-tuning."
        ),
    )
+    parser.add_argument(
+        "--upcast_attn",
+        default=False,
+        # A bare `--upcast_attn` enables the option; an explicit value (`--upcast_attn True` / `False`) is
+        # still accepted so existing command lines keep working.
+        nargs="?",
+        const=True,
+        # `type=bool` would map every non-empty string, including "False", to True, so parse the text.
+        type=lambda value: value.strip().lower() in ("1", "true", "yes"),
+        help=(
+            "Whether the attention computation should always be upcasted. This is necessary when running stable"
+            " diffusion 2.1."
+        ),
+    )
    parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output model.")
    args = parser.parse_args()
@@ -799,6 +857,7 @@ if __name__ == "__main__":
    global_step = checkpoint["global_step"]
    checkpoint = checkpoint["state_dict"]
+    # Start from the --upcast_attn command-line value instead of ignoring it; the global_step check for
+    # v2.1 checkpoints below can still switch upcasting on.
+    upcast_attention = args.upcast_attn
    if args.original_config_file is None:
        key_name = "model.diffusion_model.input_blocks.2.1.transformer_blocks.0.attn2.to_k.weight"
@@ -808,6 +867,10 @@ if __name__ == "__main__":
                "wget https://raw.githubusercontent.com/Stability-AI/stablediffusion/main/configs/stable-diffusion/v2-inference-v.yaml"
            )
            args.original_config_file = "./v2-inference-v.yaml"
+            if global_step == 110000:
+                # v2.1 needs to upcast attention
+                upcast_attention = True
        else:
            # model_type = "v1"
            os.system(
@@ -852,6 +915,7 @@ if __name__ == "__main__":
        set_alpha_to_one=False,
        prediction_type=prediction_type,
    )
    if args.scheduler_type == "pndm":
        config = dict(scheduler.config)
        config["skip_prk_steps"] = True
@@ -873,6 +937,7 @@ if __name__ == "__main__":
    # Convert the UNet2DConditionModel model.
    unet_config = create_unet_diffusers_config(original_config, image_size=image_size)
+    unet_config["upcast_attention"] = upcast_attention
    unet = UNet2DConditionModel(**unet_config)
    converted_unet_checkpoint = convert_ldm_unet_checkpoint(