import torch
from . import model_base
from . import utils
from . import latent_formats

class ClipTarget:
    """Pairs a tokenizer class with its matching text-encoder (CLIP) class."""

    def __init__(self, tokenizer, clip):
        # Keep the pair together; any extra construction options live in params.
        self.tokenizer = tokenizer
        self.clip = clip
        self.params = {}

class BASE:
    # Base description of a supported model family.  Subclasses override the
    # class attributes below to control how a checkpoint is detected
    # (matches()) and which model / latent-format / state-dict processing is
    # used for it.

    # Key/value pairs that the detected unet config must contain, with equal
    # values, for matches() to accept this model type.
    unet_config = {}
    # Entries force-merged into the unet config in __init__ (they override any
    # detected values).
    unet_extra_config = {
        "num_heads": -1,
        "num_head_channels": 64,
    }

    # State-dict keys that must all be present for matches() to accept a
    # checkpoint (only checked when a state_dict is supplied).
    required_keys = {}

    clip_prefix = []
    # Prefix of CLIP-vision weights in the checkpoint, or None when the model
    # has no CLIP-vision component (see process_clip_vision_state_dict_for_saving).
    clip_vision_prefix = None
    # When not None, get_model() builds the SD21UNCLIP wrapper with this config.
    noise_aug_config = None
    sampling_settings = {}
    # Class here; replaced by an instance of itself in __init__.
    latent_format = latent_formats.LatentFormat
    # Key prefixes used to locate VAE / text-encoder weights in a checkpoint.
    vae_key_prefix = ["first_stage_model."]
    text_encoder_key_prefix = ["cond_stage_model."]
    supported_inference_dtypes = [torch.float16, torch.bfloat16, torch.float32]

    # When set by set_inference_dtype(), presumably enables on-the-fly casting
    # at inference time — behavior lives in the consumers of this attribute.
    manual_cast_dtype = None

    @classmethod
    def matches(s, unet_config, state_dict=None):
        """Return True when unet_config (and optionally state_dict) is
        compatible with this model type.

        Every key in s.unet_config must appear in unet_config with an equal
        value; if state_dict is given, every key in s.required_keys must also
        be present in it.  NOTE: 's' is the class itself (unconventional name
        for 'cls').
        """
        for k in s.unet_config:
            if k not in unet_config or s.unet_config[k] != unet_config[k]:
                return False
        if state_dict is not None:
            for k in s.required_keys:
                if k not in state_dict:
                    return False
        return True

    def model_type(self, state_dict, prefix=""):
        """Prediction type for this checkpoint; the base default is epsilon
        (noise) prediction.  Subclasses may inspect state_dict to decide."""
        return model_base.ModelType.EPS

    def inpaint_model(self):
        # Heuristic: more than 4 unet input channels is treated as an inpaint
        # model (presumably extra mask/latent channels — grounded only in this
        # comparison; confirm against model_base.set_inpaint()).
        return self.unet_config["in_channels"] > 4

    def __init__(self, unet_config):
        # Copy the mutable defaults so instances never mutate the shared
        # class-level dicts.
        self.unet_config = unet_config.copy()
        self.sampling_settings = self.sampling_settings.copy()
        # Instantiate the latent format class declared at class level.
        self.latent_format = self.latent_format()
        # Apply forced overrides last so they win over detected values.
        for x in self.unet_extra_config:
            self.unet_config[x] = self.unet_extra_config[x]

    def get_model(self, state_dict, prefix="", device=None):
        """Instantiate the diffusion model wrapper for this checkpoint.

        Uses the SD21UNCLIP wrapper when noise_aug_config is set, otherwise
        the plain BaseModel; flags inpaint support when inpaint_model() says so.
        """
        if self.noise_aug_config is not None:
            out = model_base.SD21UNCLIP(self, self.noise_aug_config, model_type=self.model_type(state_dict, prefix), device=device)
        else:
            out = model_base.BaseModel(self, model_type=self.model_type(state_dict, prefix), device=device)
        if self.inpaint_model():
            out.set_inpaint()
        return out

    def process_clip_state_dict(self, state_dict):
        # Strip the text-encoder prefix(es); filter_keys=True drops keys that
        # do not carry any of the prefixes.
        state_dict = utils.state_dict_prefix_replace(state_dict, {k: "" for k in self.text_encoder_key_prefix}, filter_keys=True)
        return state_dict

    def process_unet_state_dict(self, state_dict):
        # Hook for subclasses; base implementation is the identity.
        return state_dict

    def process_vae_state_dict(self, state_dict):
        # Hook for subclasses; base implementation is the identity.
        return state_dict

    def process_clip_state_dict_for_saving(self, state_dict):
        """Re-add the primary text-encoder prefix for checkpoint export."""
        replace_prefix = {"": self.text_encoder_key_prefix[0]}
        return utils.state_dict_prefix_replace(state_dict, replace_prefix)

    def process_clip_vision_state_dict_for_saving(self, state_dict):
        """Re-add the CLIP-vision prefix for export, when one is defined."""
        replace_prefix = {}
        if self.clip_vision_prefix is not None:
            replace_prefix[""] = self.clip_vision_prefix
        return utils.state_dict_prefix_replace(state_dict, replace_prefix)

    def process_unet_state_dict_for_saving(self, state_dict):
        """Re-add the diffusion-model prefix for checkpoint export."""
        replace_prefix = {"": "model.diffusion_model."}
        return utils.state_dict_prefix_replace(state_dict, replace_prefix)

    def process_vae_state_dict_for_saving(self, state_dict):
        """Re-add the VAE prefix for checkpoint export."""
        replace_prefix = {"": self.vae_key_prefix[0]}
        return utils.state_dict_prefix_replace(state_dict, replace_prefix)

    def set_inference_dtype(self, dtype, manual_cast_dtype):
        # Record the inference dtype in the unet config and remember the
        # manual-cast dtype for downstream consumers.
        self.unet_config['dtype'] = dtype
        self.manual_cast_dtype = manual_cast_dtype