Replace chainner_models with Spandrel package (#2146)

* Replace chainner_models with Spandrel
* Update to latest spandrel
* Use spandrel_foss instead
* update spandrel to new FOSS-compliant version

Replace chainner_models with Spandrel package (#2146)
* Replace chainner_models with Spandrel
* Update to latest spandrel
* Use spandrel_foss instead
* update spandrel to new FOSS-compliant version
8cfd677c · Joey Ballentine · GitHub · ffc4b7c3 · ffc4b7c3 · ffc4b7c3
Unverified commit 8cfd677c, authored May 26, 2024 by Joey Ballentine; committed by GitHub on May 26, 2024.
13 changed files
--- a/comfy_extras/chainner_models/architecture/face/restoreformer_arch.py
+++ b/comfy_extras/chainner_models/architecture/face/restoreformer_arch.py
-# pylint: skip-file
-# type: ignore
-"""Modified from https://github.com/wzhouxiff/RestoreFormer
-"""
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-class VectorQuantizer(nn.Module):
-    """
-    see https://github.com/MishaLaskin/vqvae/blob/d761a999e2267766400dc646d82d3ac3657771d4/models/quantizer.py
-    ____________________________________________
-    Discretization bottleneck part of the VQ-VAE.
-    Inputs:
-    - n_e : number of embeddings
-    - e_dim : dimension of embedding
-    - beta : commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2
-    _____________________________________________
-    """
-    def __init__(self, n_e, e_dim, beta):
-        super(VectorQuantizer, self).__init__()
-        self.n_e = n_e
-        self.e_dim = e_dim
-        self.beta = beta
-        self.embedding = nn.Embedding(self.n_e, self.e_dim)
-        self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e)
-    def forward(self, z):
-        """
-        Inputs the output of the encoder network z and maps it to a discrete
-        one-hot vector that is the index of the closest embedding vector e_j
-        z (continuous) -> z_q (discrete)
-        z.shape = (batch, channel, height, width)
-        quantization pipeline:
-            1. get encoder input (B,C,H,W)
-            2. flatten input to (B*H*W,C)
-        """
-        # reshape z -> (batch, height, width, channel) and flatten
-        z = z.permute(0, 2, 3, 1).contiguous()
-        z_flattened = z.view(-1, self.e_dim)
-        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
-        d = (
-            torch.sum(z_flattened**2, dim=1, keepdim=True)
-            + torch.sum(self.embedding.weight**2, dim=1)
-            - 2 * torch.matmul(z_flattened, self.embedding.weight.t())
-        )
-        # could possible replace this here
-        # #\start...
-        # find closest encodings
-        min_value, min_encoding_indices = torch.min(d, dim=1)
-        min_encoding_indices = min_encoding_indices.unsqueeze(1)
-        min_encodings = torch.zeros(min_encoding_indices.shape[0], self.n_e).to(z)
-        min_encodings.scatter_(1, min_encoding_indices, 1)
-        # dtype min encodings: torch.float32
-        # min_encodings shape: torch.Size([2048, 512])
-        # min_encoding_indices.shape: torch.Size([2048, 1])
-        # get quantized latent vectors
-        z_q = torch.matmul(min_encodings, self.embedding.weight).view(z.shape)
-        # .........\end
-        # with:
-        # .........\start
-        # min_encoding_indices = torch.argmin(d, dim=1)
-        # z_q = self.embedding(min_encoding_indices)
-        # ......\end......... (TODO)
-        # compute loss for embedding
-        loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean(
-            (z_q - z.detach()) ** 2
-        )
-        # preserve gradients
-        z_q = z + (z_q - z).detach()
-        # perplexity
-        e_mean = torch.mean(min_encodings, dim=0)
-        perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10)))
-        # reshape back to match original input shape
-        z_q = z_q.permute(0, 3, 1, 2).contiguous()
-        return z_q, loss, (perplexity, min_encodings, min_encoding_indices, d)
-    def get_codebook_entry(self, indices, shape):
-        # shape specifying (batch, height, width, channel)
-        # TODO: check for more easy handling with nn.Embedding
-        min_encodings = torch.zeros(indices.shape[0], self.n_e).to(indices)
-        min_encodings.scatter_(1, indices[:, None], 1)
-        # get quantized latent vectors
-        z_q = torch.matmul(min_encodings.float(), self.embedding.weight)
-        if shape is not None:
-            z_q = z_q.view(shape)
-            # reshape back to match original input shape
-            z_q = z_q.permute(0, 3, 1, 2).contiguous()
-        return z_q
-# pytorch_diffusion + derived encoder decoder
-def nonlinearity(x):
-    # swish
-    return x * torch.sigmoid(x)
-def Normalize(in_channels):
-    return torch.nn.GroupNorm(
-        num_groups=32, num_channels=in_channels, eps=1e-6, affine=True
-    )
-class Upsample(nn.Module):
-    def __init__(self, in_channels, with_conv):
-        super().__init__()
-        self.with_conv = with_conv
-        if self.with_conv:
-            self.conv = torch.nn.Conv2d(
-                in_channels, in_channels, kernel_size=3, stride=1, padding=1
-            )
-    def forward(self, x):
-        x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
-        if self.with_conv:
-            x = self.conv(x)
-        return x
-class Downsample(nn.Module):
-    def __init__(self, in_channels, with_conv):
-        super().__init__()
-        self.with_conv = with_conv
-        if self.with_conv:
-            # no asymmetric padding in torch conv, must do it ourselves
-            self.conv = torch.nn.Conv2d(
-                in_channels, in_channels, kernel_size=3, stride=2, padding=0
-            )
-    def forward(self, x):
-        if self.with_conv:
-            pad = (0, 1, 0, 1)
-            x = torch.nn.functional.pad(x, pad, mode="constant", value=0)
-            x = self.conv(x)
-        else:
-            x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2)
-        return x
-class ResnetBlock(nn.Module):
-    def __init__(
-        self,
-        *,
-        in_channels,
-        out_channels=None,
-        conv_shortcut=False,
-        dropout,
-        temb_channels=512
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        out_channels = in_channels if out_channels is None else out_channels
-        self.out_channels = out_channels
-        self.use_conv_shortcut = conv_shortcut
-        self.norm1 = Normalize(in_channels)
-        self.conv1 = torch.nn.Conv2d(
-            in_channels, out_channels, kernel_size=3, stride=1, padding=1
-        )
-        if temb_channels > 0:
-            self.temb_proj = torch.nn.Linear(temb_channels, out_channels)
-        self.norm2 = Normalize(out_channels)
-        self.dropout = torch.nn.Dropout(dropout)
-        self.conv2 = torch.nn.Conv2d(
-            out_channels, out_channels, kernel_size=3, stride=1, padding=1
-        )
-        if self.in_channels != self.out_channels:
-            if self.use_conv_shortcut:
-                self.conv_shortcut = torch.nn.Conv2d(
-                    in_channels, out_channels, kernel_size=3, stride=1, padding=1
-                )
-            else:
-                self.nin_shortcut = torch.nn.Conv2d(
-                    in_channels, out_channels, kernel_size=1, stride=1, padding=0
-                )
-    def forward(self, x, temb):
-        h = x
-        h = self.norm1(h)
-        h = nonlinearity(h)
-        h = self.conv1(h)
-        if temb is not None:
-            h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None]
-        h = self.norm2(h)
-        h = nonlinearity(h)
-        h = self.dropout(h)
-        h = self.conv2(h)
-        if self.in_channels != self.out_channels:
-            if self.use_conv_shortcut:
-                x = self.conv_shortcut(x)
-            else:
-                x = self.nin_shortcut(x)
-        return x + h
-class MultiHeadAttnBlock(nn.Module):
-    def __init__(self, in_channels, head_size=1):
-        super().__init__()
-        self.in_channels = in_channels
-        self.head_size = head_size
-        self.att_size = in_channels // head_size
-        assert (
-            in_channels % head_size == 0
-        ), "The size of head should be divided by the number of channels."
-        self.norm1 = Normalize(in_channels)
-        self.norm2 = Normalize(in_channels)
-        self.q = torch.nn.Conv2d(
-            in_channels, in_channels, kernel_size=1, stride=1, padding=0
-        )
-        self.k = torch.nn.Conv2d(
-            in_channels, in_channels, kernel_size=1, stride=1, padding=0
-        )
-        self.v = torch.nn.Conv2d(
-            in_channels, in_channels, kernel_size=1, stride=1, padding=0
-        )
-        self.proj_out = torch.nn.Conv2d(
-            in_channels, in_channels, kernel_size=1, stride=1, padding=0
-        )
-        self.num = 0
-    def forward(self, x, y=None):
-        h_ = x
-        h_ = self.norm1(h_)
-        if y is None:
-            y = h_
-        else:
-            y = self.norm2(y)
-        q = self.q(y)
-        k = self.k(h_)
-        v = self.v(h_)
-        # compute attention
-        b, c, h, w = q.shape
-        q = q.reshape(b, self.head_size, self.att_size, h * w)
-        q = q.permute(0, 3, 1, 2)  # b, hw, head, att
-        k = k.reshape(b, self.head_size, self.att_size, h * w)
-        k = k.permute(0, 3, 1, 2)
-        v = v.reshape(b, self.head_size, self.att_size, h * w)
-        v = v.permute(0, 3, 1, 2)
-        q = q.transpose(1, 2)
-        v = v.transpose(1, 2)
-        k = k.transpose(1, 2).transpose(2, 3)
-        scale = int(self.att_size) ** (-0.5)
-        q.mul_(scale)
-        w_ = torch.matmul(q, k)
-        w_ = F.softmax(w_, dim=3)
-        w_ = w_.matmul(v)
-        w_ = w_.transpose(1, 2).contiguous()  # [b, h*w, head, att]
-        w_ = w_.view(b, h, w, -1)
-        w_ = w_.permute(0, 3, 1, 2)
-        w_ = self.proj_out(w_)
-        return x + w_
-class MultiHeadEncoder(nn.Module):
-    def __init__(
-        self,
-        ch,
-        out_ch,
-        ch_mult=(1, 2, 4, 8),
-        num_res_blocks=2,
-        attn_resolutions=(16,),
-        dropout=0.0,
-        resamp_with_conv=True,
-        in_channels=3,
-        resolution=512,
-        z_channels=256,
-        double_z=True,
-        enable_mid=True,
-        head_size=1,
-        **ignore_kwargs
-    ):
-        super().__init__()
-        self.ch = ch
-        self.temb_ch = 0
-        self.num_resolutions = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        self.resolution = resolution
-        self.in_channels = in_channels
-        self.enable_mid = enable_mid
-        # downsampling
-        self.conv_in = torch.nn.Conv2d(
-            in_channels, self.ch, kernel_size=3, stride=1, padding=1
-        )
-        curr_res = resolution
-        in_ch_mult = (1,) + tuple(ch_mult)
-        self.down = nn.ModuleList()
-        for i_level in range(self.num_resolutions):
-            block = nn.ModuleList()
-            attn = nn.ModuleList()
-            block_in = ch * in_ch_mult[i_level]
-            block_out = ch * ch_mult[i_level]
-            for i_block in range(self.num_res_blocks):
-                block.append(
-                    ResnetBlock(
-                        in_channels=block_in,
-                        out_channels=block_out,
-                        temb_channels=self.temb_ch,
-                        dropout=dropout,
-                    )
-                )
-                block_in = block_out
-                if curr_res in attn_resolutions:
-                    attn.append(MultiHeadAttnBlock(block_in, head_size))
-            down = nn.Module()
-            down.block = block
-            down.attn = attn
-            if i_level != self.num_resolutions - 1:
-                down.downsample = Downsample(block_in, resamp_with_conv)
-                curr_res = curr_res // 2
-            self.down.append(down)
-        # middle
-        if self.enable_mid:
-            self.mid = nn.Module()
-            self.mid.block_1 = ResnetBlock(
-                in_channels=block_in,
-                out_channels=block_in,
-                temb_channels=self.temb_ch,
-                dropout=dropout,
-            )
-            self.mid.attn_1 = MultiHeadAttnBlock(block_in, head_size)
-            self.mid.block_2 = ResnetBlock(
-                in_channels=block_in,
-                out_channels=block_in,
-                temb_channels=self.temb_ch,
-                dropout=dropout,
-            )
-        # end
-        self.norm_out = Normalize(block_in)
-        self.conv_out = torch.nn.Conv2d(
-            block_in,
-            2 * z_channels if double_z else z_channels,
-            kernel_size=3,
-            stride=1,
-            padding=1,
-        )
-    def forward(self, x):
-        hs = {}
-        # timestep embedding
-        temb = None
-        # downsampling
-        h = self.conv_in(x)
-        hs["in"] = h
-        for i_level in range(self.num_resolutions):
-            for i_block in range(self.num_res_blocks):
-                h = self.down[i_level].block[i_block](h, temb)
-                if len(self.down[i_level].attn) > 0:
-                    h = self.down[i_level].attn[i_block](h)
-            if i_level != self.num_resolutions - 1:
-                # hs.append(h)
-                hs["block_" + str(i_level)] = h
-                h = self.down[i_level].downsample(h)
-        # middle
-        # h = hs[-1]
-        if self.enable_mid:
-            h = self.mid.block_1(h, temb)
-            hs["block_" + str(i_level) + "_atten"] = h
-            h = self.mid.attn_1(h)
-            h = self.mid.block_2(h, temb)
-            hs["mid_atten"] = h
-        # end
-        h = self.norm_out(h)
-        h = nonlinearity(h)
-        h = self.conv_out(h)
-        # hs.append(h)
-        hs["out"] = h
-        return hs
-class MultiHeadDecoder(nn.Module):
-    def __init__(
-        self,
-        ch,
-        out_ch,
-        ch_mult=(1, 2, 4, 8),
-        num_res_blocks=2,
-        attn_resolutions=(16,),
-        dropout=0.0,
-        resamp_with_conv=True,
-        in_channels=3,
-        resolution=512,
-        z_channels=256,
-        give_pre_end=False,
-        enable_mid=True,
-        head_size=1,
-        **ignorekwargs
-    ):
-        super().__init__()
-        self.ch = ch
-        self.temb_ch = 0
-        self.num_resolutions = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        self.resolution = resolution
-        self.in_channels = in_channels
-        self.give_pre_end = give_pre_end
-        self.enable_mid = enable_mid
-        # compute in_ch_mult, block_in and curr_res at lowest res
-        block_in = ch * ch_mult[self.num_resolutions - 1]
-        curr_res = resolution // 2 ** (self.num_resolutions - 1)
-        self.z_shape = (1, z_channels, curr_res, curr_res)
-        print(
-            "Working with z of shape {} = {} dimensions.".format(
-                self.z_shape, np.prod(self.z_shape)
-            )
-        )
-        # z to block_in
-        self.conv_in = torch.nn.Conv2d(
-            z_channels, block_in, kernel_size=3, stride=1, padding=1
-        )
-        # middle
-        if self.enable_mid:
-            self.mid = nn.Module()
-            self.mid.block_1 = ResnetBlock(
-                in_channels=block_in,
-                out_channels=block_in,
-                temb_channels=self.temb_ch,
-                dropout=dropout,
-            )
-            self.mid.attn_1 = MultiHeadAttnBlock(block_in, head_size)
-            self.mid.block_2 = ResnetBlock(
-                in_channels=block_in,
-                out_channels=block_in,
-                temb_channels=self.temb_ch,
-                dropout=dropout,
-            )
-        # upsampling
-        self.up = nn.ModuleList()
-        for i_level in reversed(range(self.num_resolutions)):
-            block = nn.ModuleList()
-            attn = nn.ModuleList()
-            block_out = ch * ch_mult[i_level]
-            for i_block in range(self.num_res_blocks + 1):
-                block.append(
-                    ResnetBlock(
-                        in_channels=block_in,
-                        out_channels=block_out,
-                        temb_channels=self.temb_ch,
-                        dropout=dropout,
-                    )
-                )
-                block_in = block_out
-                if curr_res in attn_resolutions:
-                    attn.append(MultiHeadAttnBlock(block_in, head_size))
-            up = nn.Module()
-            up.block = block
-            up.attn = attn
-            if i_level != 0:
-                up.upsample = Upsample(block_in, resamp_with_conv)
-                curr_res = curr_res * 2
-            self.up.insert(0, up)  # prepend to get consistent order
-        # end
-        self.norm_out = Normalize(block_in)
-        self.conv_out = torch.nn.Conv2d(
-            block_in, out_ch, kernel_size=3, stride=1, padding=1
-        )
-    def forward(self, z):
-        # assert z.shape[1:] == self.z_shape[1:]
-        self.last_z_shape = z.shape
-        # timestep embedding
-        temb = None
-        # z to block_in
-        h = self.conv_in(z)
-        # middle
-        if self.enable_mid:
-            h = self.mid.block_1(h, temb)
-            h = self.mid.attn_1(h)
-            h = self.mid.block_2(h, temb)
-        # upsampling
-        for i_level in reversed(range(self.num_resolutions)):
-            for i_block in range(self.num_res_blocks + 1):
-                h = self.up[i_level].block[i_block](h, temb)
-                if len(self.up[i_level].attn) > 0:
-                    h = self.up[i_level].attn[i_block](h)
-            if i_level != 0:
-                h = self.up[i_level].upsample(h)
-        # end
-        if self.give_pre_end:
-            return h
-        h = self.norm_out(h)
-        h = nonlinearity(h)
-        h = self.conv_out(h)
-        return h
-class MultiHeadDecoderTransformer(nn.Module):
-    def __init__(
-        self,
-        ch,
-        out_ch,
-        ch_mult=(1, 2, 4, 8),
-        num_res_blocks=2,
-        attn_resolutions=(16,),
-        dropout=0.0,
-        resamp_with_conv=True,
-        in_channels=3,
-        resolution=512,
-        z_channels=256,
-        give_pre_end=False,
-        enable_mid=True,
-        head_size=1,
-        **ignorekwargs
-    ):
-        super().__init__()
-        self.ch = ch
-        self.temb_ch = 0
-        self.num_resolutions = len(ch_mult)
-        self.num_res_blocks = num_res_blocks
-        self.resolution = resolution
-        self.in_channels = in_channels
-        self.give_pre_end = give_pre_end
-        self.enable_mid = enable_mid
-        # compute in_ch_mult, block_in and curr_res at lowest res
-        block_in = ch * ch_mult[self.num_resolutions - 1]
-        curr_res = resolution // 2 ** (self.num_resolutions - 1)
-        self.z_shape = (1, z_channels, curr_res, curr_res)
-        print(
-            "Working with z of shape {} = {} dimensions.".format(
-                self.z_shape, np.prod(self.z_shape)
-            )
-        )
-        # z to block_in
-        self.conv_in = torch.nn.Conv2d(
-            z_channels, block_in, kernel_size=3, stride=1, padding=1
-        )
-        # middle
-        if self.enable_mid:
-            self.mid = nn.Module()
-            self.mid.block_1 = ResnetBlock(
-                in_channels=block_in,
-                out_channels=block_in,
-                temb_channels=self.temb_ch,
-                dropout=dropout,
-            )
-            self.mid.attn_1 = MultiHeadAttnBlock(block_in, head_size)
-            self.mid.block_2 = ResnetBlock(
-                in_channels=block_in,
-                out_channels=block_in,
-                temb_channels=self.temb_ch,
-                dropout=dropout,
-            )
-        # upsampling
-        self.up = nn.ModuleList()
-        for i_level in reversed(range(self.num_resolutions)):
-            block = nn.ModuleList()
-            attn = nn.ModuleList()
-            block_out = ch * ch_mult[i_level]
-            for i_block in range(self.num_res_blocks + 1):
-                block.append(
-                    ResnetBlock(
-                        in_channels=block_in,
-                        out_channels=block_out,
-                        temb_channels=self.temb_ch,
-                        dropout=dropout,
-                    )
-                )
-                block_in = block_out
-                if curr_res in attn_resolutions:
-                    attn.append(MultiHeadAttnBlock(block_in, head_size))
-            up = nn.Module()
-            up.block = block
-            up.attn = attn
-            if i_level != 0:
-                up.upsample = Upsample(block_in, resamp_with_conv)
-                curr_res = curr_res * 2
-            self.up.insert(0, up)  # prepend to get consistent order
-        # end
-        self.norm_out = Normalize(block_in)
-        self.conv_out = torch.nn.Conv2d(
-            block_in, out_ch, kernel_size=3, stride=1, padding=1
-        )
-    def forward(self, z, hs):
-        # assert z.shape[1:] == self.z_shape[1:]
-        # self.last_z_shape = z.shape
-        # timestep embedding
-        temb = None
-        # z to block_in
-        h = self.conv_in(z)
-        # middle
-        if self.enable_mid:
-            h = self.mid.block_1(h, temb)
-            h = self.mid.attn_1(h, hs["mid_atten"])
-            h = self.mid.block_2(h, temb)
-        # upsampling
-        for i_level in reversed(range(self.num_resolutions)):
-            for i_block in range(self.num_res_blocks + 1):
-                h = self.up[i_level].block[i_block](h, temb)
-                if len(self.up[i_level].attn) > 0:
-                    h = self.up[i_level].attn[i_block](
-                        h, hs["block_" + str(i_level) + "_atten"]
-                    )
-                    # hfeature = h.clone()
-            if i_level != 0:
-                h = self.up[i_level].upsample(h)
-        # end
-        if self.give_pre_end:
-            return h
-        h = self.norm_out(h)
-        h = nonlinearity(h)
-        h = self.conv_out(h)
-        return h
-class RestoreFormer(nn.Module):
-    def __init__(
-        self,
-        state_dict,
-    ):
-        super(RestoreFormer, self).__init__()
-        n_embed = 1024
-        embed_dim = 256
-        ch = 64
-        out_ch = 3
-        ch_mult = (1, 2, 2, 4, 4, 8)
-        num_res_blocks = 2
-        attn_resolutions = (16,)
-        dropout = 0.0
-        in_channels = 3
-        resolution = 512
-        z_channels = 256
-        double_z = False
-        enable_mid = True
-        fix_decoder = False
-        fix_codebook = True
-        fix_encoder = False
-        head_size = 8
-        self.model_arch = "RestoreFormer"
-        self.sub_type = "Face SR"
-        self.scale = 8
-        self.in_nc = 3
-        self.out_nc = out_ch
-        self.state = state_dict
-        self.supports_fp16 = False
-        self.supports_bf16 = True
-        self.min_size_restriction = 16
-        self.encoder = MultiHeadEncoder(
-            ch=ch,
-            out_ch=out_ch,
-            ch_mult=ch_mult,
-            num_res_blocks=num_res_blocks,
-            attn_resolutions=attn_resolutions,
-            dropout=dropout,
-            in_channels=in_channels,
-            resolution=resolution,
-            z_channels=z_channels,
-            double_z=double_z,
-            enable_mid=enable_mid,
-            head_size=head_size,
-        )
-        self.decoder = MultiHeadDecoderTransformer(
-            ch=ch,
-            out_ch=out_ch,
-            ch_mult=ch_mult,
-            num_res_blocks=num_res_blocks,
-            attn_resolutions=attn_resolutions,
-            dropout=dropout,
-            in_channels=in_channels,
-            resolution=resolution,
-            z_channels=z_channels,
-            enable_mid=enable_mid,
-            head_size=head_size,
-        )
-        self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25)
-        self.quant_conv = torch.nn.Conv2d(z_channels, embed_dim, 1)
-        self.post_quant_conv = torch.nn.Conv2d(embed_dim, z_channels, 1)
-        if fix_decoder:
-            for _, param in self.decoder.named_parameters():
-                param.requires_grad = False
-            for _, param in self.post_quant_conv.named_parameters():
-                param.requires_grad = False
-            for _, param in self.quantize.named_parameters():
-                param.requires_grad = False
-        elif fix_codebook:
-            for _, param in self.quantize.named_parameters():
-                param.requires_grad = False
-        if fix_encoder:
-            for _, param in self.encoder.named_parameters():
-                param.requires_grad = False
-        self.load_state_dict(state_dict)
-    def encode(self, x):
-        hs = self.encoder(x)
-        h = self.quant_conv(hs["out"])
-        quant, emb_loss, info = self.quantize(h)
-        return quant, emb_loss, info, hs
-    def decode(self, quant, hs):
-        quant = self.post_quant_conv(quant)
-        dec = self.decoder(quant, hs)
-        return dec
-    def forward(self, input, **kwargs):
-        quant, diff, info, hs = self.encode(input)
-        dec = self.decode(quant, hs)
-        return dec, None
--- a/comfy_extras/chainner_models/architecture/face/stylegan2_arch.py
+++ b/comfy_extras/chainner_models/architecture/face/stylegan2_arch.py
-# pylint: skip-file
-# type: ignore
-import math
-import random
-import torch
-from torch import nn
-from torch.nn import functional as F
-from .fused_act import FusedLeakyReLU, fused_leaky_relu
-from .upfirdn2d import upfirdn2d
-class NormStyleCode(nn.Module):
-    def forward(self, x):
-        """Normalize the style codes.
-        Args:
-            x (Tensor): Style codes with shape (b, c).
-        Returns:
-            Tensor: Normalized tensor.
-        """
-        return x * torch.rsqrt(torch.mean(x**2, dim=1, keepdim=True) + 1e-8)
-def make_resample_kernel(k):
-    """Make resampling kernel for UpFirDn.
-    Args:
-        k (list[int]): A list indicating the 1D resample kernel magnitude.
-    Returns:
-        Tensor: 2D resampled kernel.
-    """
-    k = torch.tensor(k, dtype=torch.float32)
-    if k.ndim == 1:
-        k = k[None, :] * k[:, None]  # to 2D kernel, outer product
-    # normalize
-    k /= k.sum()
-    return k
-class UpFirDnUpsample(nn.Module):
-    """Upsample, FIR filter, and downsample (upsampole version).
-    References:
-    1. https://docs.scipy.org/doc/scipy/reference/generated/scipy.signal.upfirdn.html  # noqa: E501
-    2. http://www.ece.northwestern.edu/local-apps/matlabhelp/toolbox/signal/upfirdn.html  # noqa: E501
-    Args:
-        resample_kernel (list[int]): A list indicating the 1D resample kernel
-            magnitude.
-        factor (int): Upsampling scale factor. Default: 2.
-    """
-    def __init__(self, resample_kernel, factor=2):
-        super(UpFirDnUpsample, self).__init__()
-        self.kernel = make_resample_kernel(resample_kernel) * (factor**2)
-        self.factor = factor
-        pad = self.kernel.shape[0] - factor
-        self.pad = ((pad + 1) // 2 + factor - 1, pad // 2)
-    def forward(self, x):
-        out = upfirdn2d(x, self.kernel.type_as(x), up=self.factor, down=1, pad=self.pad)
-        return out
-    def __repr__(self):
-        return f"{self.__class__.__name__}(factor={self.factor})"
-class UpFirDnDownsample(nn.Module):
-    """Upsample, FIR filter, and downsample (downsampole version).
-    Args:
-        resample_kernel (list[int]): A list indicating the 1D resample kernel
-            magnitude.
-        factor (int): Downsampling scale factor. Default: 2.
-    """
-    def __init__(self, resample_kernel, factor=2):
-        super(UpFirDnDownsample, self).__init__()
-        self.kernel = make_resample_kernel(resample_kernel)
-        self.factor = factor
-        pad = self.kernel.shape[0] - factor
-        self.pad = ((pad + 1) // 2, pad // 2)
-    def forward(self, x):
-        out = upfirdn2d(x, self.kernel.type_as(x), up=1, down=self.factor, pad=self.pad)
-        return out
-    def __repr__(self):
-        return f"{self.__class__.__name__}(factor={self.factor})"
-class UpFirDnSmooth(nn.Module):
-    """Upsample, FIR filter, and downsample (smooth version).
-    Args:
-        resample_kernel (list[int]): A list indicating the 1D resample kernel
-            magnitude.
-        upsample_factor (int): Upsampling scale factor. Default: 1.
-        downsample_factor (int): Downsampling scale factor. Default: 1.
-        kernel_size (int): Kernel size: Default: 1.
-    """
-    def __init__(
-        self, resample_kernel, upsample_factor=1, downsample_factor=1, kernel_size=1
-    ):
-        super(UpFirDnSmooth, self).__init__()
-        self.upsample_factor = upsample_factor
-        self.downsample_factor = downsample_factor
-        self.kernel = make_resample_kernel(resample_kernel)
-        if upsample_factor > 1:
-            self.kernel = self.kernel * (upsample_factor**2)
-        if upsample_factor > 1:
-            pad = (self.kernel.shape[0] - upsample_factor) - (kernel_size - 1)
-            self.pad = ((pad + 1) // 2 + upsample_factor - 1, pad // 2 + 1)
-        elif downsample_factor > 1:
-            pad = (self.kernel.shape[0] - downsample_factor) + (kernel_size - 1)
-            self.pad = ((pad + 1) // 2, pad // 2)
-        else:
-            raise NotImplementedError
-    def forward(self, x):
-        out = upfirdn2d(x, self.kernel.type_as(x), up=1, down=1, pad=self.pad)
-        return out
-    def __repr__(self):
-        return (
-            f"{self.__class__.__name__}(upsample_factor={self.upsample_factor}"
-            f", downsample_factor={self.downsample_factor})"
-        )
-class EqualLinear(nn.Module):
-    """Equalized-learning-rate linear layer, as used in StyleGAN2.
-    The weight is initialized as ``randn / lr_mul`` and multiplied by
-    ``lr_mul / sqrt(in_channels)`` on every forward pass; the bias is
-    multiplied by ``lr_mul``.
-    Args:
-        in_channels (int): Size of each input sample.
-        out_channels (int): Size of each output sample.
-        bias (bool): If set to ``False``, the layer will not learn an additive
-            bias. Default: ``True``.
-        bias_init_val (float): Bias initialized value. Default: 0.
-        lr_mul (float): Learning rate multiplier. Default: 1.
-        activation (None | str): The activation after ``linear`` operation.
-            Supported: 'fused_lrelu', None. Default: None.
-    Raises:
-        ValueError: If ``activation`` is not one of the supported values.
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        bias=True,
-        bias_init_val=0,
-        lr_mul=1,
-        activation=None,
-    ):
-        super(EqualLinear, self).__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.lr_mul = lr_mul
-        self.activation = activation
-        if self.activation not in ["fused_lrelu", None]:
-            # The two literals are concatenated, so the separator after the
-            # offending value has to be part of the first one.
-            raise ValueError(
-                f"Wrong activation value in EqualLinear: {activation}. "
-                "Supported ones are: ['fused_lrelu', None]."
-            )
-        self.scale = (1 / math.sqrt(in_channels)) * lr_mul
-        self.weight = nn.Parameter(torch.randn(out_channels, in_channels).div_(lr_mul))
-        if bias:
-            self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val))
-        else:
-            # Keep ``self.bias`` defined (as None) so forward/__repr__ can test it.
-            self.register_parameter("bias", None)
-    def forward(self, x):
-        """Apply the scaled linear map and, if configured, the fused activation.
-        Args:
-            x (Tensor): Input whose last dimension is ``in_channels``.
-        Returns:
-            Tensor: Output whose last dimension is ``out_channels``.
-        """
-        if self.bias is None:
-            bias = None
-        else:
-            bias = self.bias * self.lr_mul
-        if self.activation == "fused_lrelu":
-            # The bias is handed to the fused op instead of F.linear.
-            out = F.linear(x, self.weight * self.scale)
-            out = fused_leaky_relu(out, bias)
-        else:
-            out = F.linear(x, self.weight * self.scale, bias=bias)
-        return out
-    def __repr__(self):
-        return (
-            f"{self.__class__.__name__}(in_channels={self.in_channels}, "
-            f"out_channels={self.out_channels}, bias={self.bias is not None})"
-        )
-class ModulatedConv2d(nn.Module):
-    """Modulated Conv2d used in StyleGAN2.
-    There is no bias in ModulatedConv2d.
-    The convolution weights are scaled per sample by a style-dependent
-    factor (and optionally demodulated) before a grouped convolution is
-    applied, so every sample in the batch is convolved with its own kernel.
-    Args:
-        in_channels (int): Channel number of the input.
-        out_channels (int): Channel number of the output.
-        kernel_size (int): Size of the convolving kernel.
-        num_style_feat (int): Channel number of style features.
-        demodulate (bool): Whether to demodulate in the conv layer.
-            Default: True.
-        sample_mode (str | None): Indicating 'upsample', 'downsample' or None.
-            Default: None.
-        resample_kernel (list[int]): A list indicating the 1D resample kernel
-            magnitude. Default: (1, 3, 3, 1).
-        eps (float): A value added to the denominator for numerical stability.
-            Default: 1e-8.
-    Raises:
-        ValueError: If ``sample_mode`` is not 'upsample', 'downsample' or None.
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        num_style_feat,
-        demodulate=True,
-        sample_mode=None,
-        resample_kernel=(1, 3, 3, 1),
-        eps=1e-8,
-    ):
-        super(ModulatedConv2d, self).__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.demodulate = demodulate
-        self.sample_mode = sample_mode
-        self.eps = eps
-        # Only the two resampling modes create a smoothing filter; with
-        # sample_mode=None no ``self.smooth`` attribute is defined.
-        if self.sample_mode == "upsample":
-            self.smooth = UpFirDnSmooth(
-                resample_kernel,
-                upsample_factor=2,
-                downsample_factor=1,
-                kernel_size=kernel_size,
-            )
-        elif self.sample_mode == "downsample":
-            self.smooth = UpFirDnSmooth(
-                resample_kernel,
-                upsample_factor=1,
-                downsample_factor=2,
-                kernel_size=kernel_size,
-            )
-        elif self.sample_mode is None:
-            pass
-        else:
-            raise ValueError(
-                f"Wrong sample mode {self.sample_mode}, "
-                "supported ones are ['upsample', 'downsample', None]."
-            )
-        # Constant weight scale: 1 / sqrt(fan_in).
-        self.scale = 1 / math.sqrt(in_channels * kernel_size**2)
-        # modulation inside each modulated conv
-        self.modulation = EqualLinear(
-            num_style_feat,
-            in_channels,
-            bias=True,
-            bias_init_val=1,
-            lr_mul=1,
-            activation=None,
-        )
-        # Leading dimension of 1 broadcasts against the per-sample style.
-        self.weight = nn.Parameter(
-            torch.randn(1, out_channels, in_channels, kernel_size, kernel_size)
-        )
-        self.padding = kernel_size // 2
-    def forward(self, x, style):
-        """Forward function.
-        Args:
-            x (Tensor): Tensor with shape (b, c, h, w).
-            style (Tensor): Tensor with shape (b, num_style_feat).
-        Returns:
-            Tensor: Modulated tensor after convolution.
-        """
-        b, c, h, w = x.shape  # c = c_in
-        # weight modulation
-        style = self.modulation(style).view(b, 1, c, 1, 1)
-        # self.weight: (1, c_out, c_in, k, k); style: (b, 1, c, 1, 1)
-        weight = self.scale * self.weight * style  # (b, c_out, c_in, k, k)
-        if self.demodulate:
-            # Normalize each (sample, output-channel) kernel to unit L2 norm.
-            demod = torch.rsqrt(weight.pow(2).sum([2, 3, 4]) + self.eps)
-            weight = weight * demod.view(b, self.out_channels, 1, 1, 1)
-        # Fold the batch into the output-channel axis for the grouped conv.
-        weight = weight.view(
-            b * self.out_channels, c, self.kernel_size, self.kernel_size
-        )
-        if self.sample_mode == "upsample":
-            x = x.view(1, b * c, h, w)
-            # The transposed conv takes weights with the input-channel axis
-            # first, so swap c_out and c_in before folding the batch back in.
-            weight = weight.view(
-                b, self.out_channels, c, self.kernel_size, self.kernel_size
-            )
-            weight = weight.transpose(1, 2).reshape(
-                b * c, self.out_channels, self.kernel_size, self.kernel_size
-            )
-            out = F.conv_transpose2d(x, weight, padding=0, stride=2, groups=b)
-            out = out.view(b, self.out_channels, *out.shape[2:4])
-            # Smoothing is applied after the stride-2 transposed conv.
-            out = self.smooth(out)
-        elif self.sample_mode == "downsample":
-            # Smoothing is applied before the stride-2 conv.
-            x = self.smooth(x)
-            x = x.view(1, b * c, *x.shape[2:4])
-            out = F.conv2d(x, weight, padding=0, stride=2, groups=b)
-            out = out.view(b, self.out_channels, *out.shape[2:4])
-        else:
-            x = x.view(1, b * c, h, w)
-            # weight: (b*c_out, c_in, k, k), groups=b
-            out = F.conv2d(x, weight, padding=self.padding, groups=b)
-            out = out.view(b, self.out_channels, *out.shape[2:4])
-        return out
-    def __repr__(self):
-        return (
-            f"{self.__class__.__name__}(in_channels={self.in_channels}, "
-            f"out_channels={self.out_channels}, "
-            f"kernel_size={self.kernel_size}, "
-            f"demodulate={self.demodulate}, sample_mode={self.sample_mode})"
-        )
-class StyleConv(nn.Module):
-    """Modulated convolution followed by noise injection and activation.
-    Args:
-        in_channels (int): Channel number of the input.
-        out_channels (int): Channel number of the output.
-        kernel_size (int): Size of the convolving kernel.
-        num_style_feat (int): Channel number of style features.
-        demodulate (bool): Whether demodulate in the conv layer. Default: True.
-        sample_mode (str | None): Indicating 'upsample', 'downsample' or None.
-            Default: None.
-        resample_kernel (list[int]): A list indicating the 1D resample kernel
-            magnitude. Default: (1, 3, 3, 1).
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        num_style_feat,
-        demodulate=True,
-        sample_mode=None,
-        resample_kernel=(1, 3, 3, 1),
-    ):
-        super().__init__()
-        conv_options = dict(
-            demodulate=demodulate,
-            sample_mode=sample_mode,
-            resample_kernel=resample_kernel,
-        )
-        self.modulated_conv = ModulatedConv2d(
-            in_channels, out_channels, kernel_size, num_style_feat, **conv_options
-        )
-        # Learnable strength of the injected noise; starts at zero.
-        self.weight = nn.Parameter(torch.zeros(1))
-        self.activate = FusedLeakyReLU(out_channels)
-    def forward(self, x, style, noise=None):
-        """Run the styled convolution.
-        Args:
-            x (Tensor): Input feature map.
-            style (Tensor): Style code passed to the modulated conv.
-            noise (Tensor | None): Noise to inject. When None, fresh normal
-                noise with one channel is sampled. Default: None.
-        Returns:
-            Tensor: Activated feature map.
-        """
-        feat = self.modulated_conv(x, style)
-        if noise is None:
-            # One noise channel, broadcast across all feature channels.
-            batch, _, height, width = feat.shape
-            noise = feat.new_empty(batch, 1, height, width).normal_()
-        # The activation module also applies the bias.
-        return self.activate(feat + self.weight * noise)
-class ToRGB(nn.Module):
-    """Project features to a 3-channel RGB image.
-    Args:
-        in_channels (int): Channel number of input.
-        num_style_feat (int): Channel number of style features.
-        upsample (bool): Whether to upsample. Default: True.
-        resample_kernel (list[int]): A list indicating the 1D resample kernel
-            magnitude. Default: (1, 3, 3, 1).
-    """
-    def __init__(
-        self, in_channels, num_style_feat, upsample=True, resample_kernel=(1, 3, 3, 1)
-    ):
-        super().__init__()
-        # Upsampler for the skip image, or None when upsampling is disabled.
-        self.upsample = UpFirDnUpsample(resample_kernel, factor=2) if upsample else None
-        # 1x1 modulated conv without demodulation, producing 3 channels.
-        self.modulated_conv = ModulatedConv2d(
-            in_channels,
-            3,
-            kernel_size=1,
-            num_style_feat=num_style_feat,
-            demodulate=False,
-            sample_mode=None,
-        )
-        self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1))
-    def forward(self, x, style, skip=None):
-        """Forward function.
-        Args:
-            x (Tensor): Feature tensor with shape (b, c, h, w).
-            style (Tensor): Tensor with shape (b, num_style_feat).
-            skip (Tensor): Base/skip tensor. Default: None.
-        Returns:
-            Tensor: RGB images.
-        """
-        rgb = self.modulated_conv(x, style) + self.bias
-        if skip is None:
-            return rgb
-        if self.upsample:
-            skip = self.upsample(skip)
-        return rgb + skip
-class ConstantInput(nn.Module):
-    """Learned constant tensor that is tiled along the batch dimension.
-    Args:
-        num_channel (int): Channel number of constant input.
-        size (int): Spatial size of constant input.
-    """
-    def __init__(self, num_channel, size):
-        super().__init__()
-        initial_value = torch.randn(1, num_channel, size, size)
-        self.weight = nn.Parameter(initial_value)
-    def forward(self, batch):
-        """Return the constant repeated ``batch`` times along dim 0."""
-        return self.weight.repeat(batch, 1, 1, 1)
-class StyleGAN2Generator(nn.Module):
-    """StyleGAN2 Generator.
-    Args:
-        out_size (int): The spatial size of outputs.
-        num_style_feat (int): Channel number of style features. Default: 512.
-        num_mlp (int): Layer number of MLP style layers. Default: 8.
-        channel_multiplier (int): Channel multiplier for large networks of
-            StyleGAN2. Default: 2.
-        resample_kernel (list[int]): A list indicating the 1D resample kernel
-            magnitude. A cross production will be applied to extend 1D resample
-            kernel to 2D resample kernel. Default: (1, 3, 3, 1).
-        lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01.
-        narrow (float): Narrow ratio for channels. Default: 1.0.
-    """
-    def __init__(
-        self,
-        out_size,
-        num_style_feat=512,
-        num_mlp=8,
-        channel_multiplier=2,
-        resample_kernel=(1, 3, 3, 1),
-        lr_mlp=0.01,
-        narrow=1,
-    ):
-        super(StyleGAN2Generator, self).__init__()
-        # Style MLP layers
-        self.num_style_feat = num_style_feat
-        style_mlp_layers = [NormStyleCode()]
-        for _ in range(num_mlp):
-            style_mlp_layers.append(
-                EqualLinear(
-                    num_style_feat,
-                    num_style_feat,
-                    bias=True,
-                    bias_init_val=0,
-                    lr_mul=lr_mlp,
-                    activation="fused_lrelu",
-                )
-            )
-        self.style_mlp = nn.Sequential(*style_mlp_layers)
-        # Channel count per resolution, keyed by the resolution as a string.
-        channels = {
-            "4": int(512 * narrow),
-            "8": int(512 * narrow),
-            "16": int(512 * narrow),
-            "32": int(512 * narrow),
-            "64": int(256 * channel_multiplier * narrow),
-            "128": int(128 * channel_multiplier * narrow),
-            "256": int(64 * channel_multiplier * narrow),
-            "512": int(32 * channel_multiplier * narrow),
-            "1024": int(16 * channel_multiplier * narrow),
-        }
-        self.channels = channels
-        self.constant_input = ConstantInput(channels["4"], size=4)
-        self.style_conv1 = StyleConv(
-            channels["4"],
-            channels["4"],
-            kernel_size=3,
-            num_style_feat=num_style_feat,
-            demodulate=True,
-            sample_mode=None,
-            resample_kernel=resample_kernel,
-        )
-        self.to_rgb1 = ToRGB(
-            channels["4"],
-            num_style_feat,
-            upsample=False,
-            resample_kernel=resample_kernel,
-        )
-        self.log_size = int(math.log(out_size, 2))
-        # One style conv at 4x4 plus two per doubling of the resolution.
-        self.num_layers = (self.log_size - 2) * 2 + 1
-        self.num_latent = self.log_size * 2 - 2
-        self.style_convs = nn.ModuleList()
-        self.to_rgbs = nn.ModuleList()
-        # Bare container module that only holds the fixed noise buffers.
-        self.noises = nn.Module()
-        in_channels = channels["4"]
-        # noise
-        for layer_idx in range(self.num_layers):
-            resolution = 2 ** ((layer_idx + 5) // 2)
-            shape = [1, 1, resolution, resolution]
-            self.noises.register_buffer(f"noise{layer_idx}", torch.randn(*shape))
-        # style convs and to_rgbs
-        for i in range(3, self.log_size + 1):
-            out_channels = channels[f"{2**i}"]
-            self.style_convs.append(
-                StyleConv(
-                    in_channels,
-                    out_channels,
-                    kernel_size=3,
-                    num_style_feat=num_style_feat,
-                    demodulate=True,
-                    sample_mode="upsample",
-                    resample_kernel=resample_kernel,
-                )
-            )
-            self.style_convs.append(
-                StyleConv(
-                    out_channels,
-                    out_channels,
-                    kernel_size=3,
-                    num_style_feat=num_style_feat,
-                    demodulate=True,
-                    sample_mode=None,
-                    resample_kernel=resample_kernel,
-                )
-            )
-            self.to_rgbs.append(
-                ToRGB(
-                    out_channels,
-                    num_style_feat,
-                    upsample=True,
-                    resample_kernel=resample_kernel,
-                )
-            )
-            in_channels = out_channels
-    def make_noise(self):
-        """Make noise for noise injection."""
-        device = self.constant_input.weight.device
-        noises = [torch.randn(1, 1, 4, 4, device=device)]
-        for i in range(3, self.log_size + 1):
-            # Two style convs run at every resolution above 4x4.
-            for _ in range(2):
-                noises.append(torch.randn(1, 1, 2**i, 2**i, device=device))
-        return noises
-    def get_latent(self, x):
-        """Map style codes to latents with the style MLP."""
-        return self.style_mlp(x)
-    def mean_latent(self, num_latent):
-        """Average the style-MLP output over ``num_latent`` random codes."""
-        latent_in = torch.randn(
-            num_latent, self.num_style_feat, device=self.constant_input.weight.device
-        )
-        latent = self.style_mlp(latent_in).mean(0, keepdim=True)
-        return latent
-    def forward(
-        self,
-        styles,
-        input_is_latent=False,
-        noise=None,
-        randomize_noise=True,
-        truncation=1,
-        truncation_latent=None,
-        inject_index=None,
-        return_latents=False,
-    ):
-        """Forward function for StyleGAN2Generator.
-        Args:
-            styles (list[Tensor]): Sample codes of styles. Must contain one
-                or two entries.
-            input_is_latent (bool): Whether input is latent style.
-                Default: False.
-            noise (list[Tensor | None] | None): Per-layer input noise or None.
-                Default: None.
-            randomize_noise (bool): Randomize noise, used when 'noise' is
-                None. Default: True.
-            truncation (float): Interpolation factor towards
-                ``truncation_latent``; only applied when below 1. Default: 1.
-            truncation_latent (Tensor | None): Latent the styles are pulled
-                towards; needed when ``truncation`` is below 1. Default: None.
-            inject_index (int | None): The injection index for mixing noise.
-                Default: None.
-            return_latents (bool): Whether to return style latents.
-                Default: False.
-        Returns:
-            tuple: ``(image, latent)`` when ``return_latents`` is True,
-            otherwise ``(image, None)``.
-        Raises:
-            ValueError: If ``styles`` does not contain exactly one or two
-                entries.
-        """
-        # style codes -> latents with Style MLP layer
-        if not input_is_latent:
-            styles = [self.style_mlp(s) for s in styles]
-        # noises
-        if noise is None:
-            if randomize_noise:
-                noise = [None] * self.num_layers  # for each style conv layer
-            else:  # use the stored noise
-                noise = [
-                    getattr(self.noises, f"noise{i}") for i in range(self.num_layers)
-                ]
-        # style truncation
-        if truncation < 1:
-            styles = [
-                truncation_latent + truncation * (style - truncation_latent)
-                for style in styles
-            ]
-        # get style latent with injection
-        num_styles = len(styles)
-        if num_styles == 1:
-            inject_index = self.num_latent
-            if styles[0].ndim < 3:
-                # repeat latent code for all the layers
-                latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
-            else:  # used for encoder with different latent code for each layer
-                latent = styles[0]
-        elif num_styles == 2:  # mixing noises
-            if inject_index is None:
-                inject_index = random.randint(1, self.num_latent - 1)
-            latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
-            latent2 = (
-                styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1)
-            )
-            latent = torch.cat([latent1, latent2], 1)
-        else:
-            # Without this branch ``latent`` would be unbound below.
-            raise ValueError(f"Expected 1 or 2 style codes, but got {num_styles}.")
-        # main generation
-        out = self.constant_input(latent.shape[0])
-        out = self.style_conv1(out, latent[:, 0], noise=noise[0])
-        skip = self.to_rgb1(out, latent[:, 1])
-        i = 1
-        # Each resolution step consumes two style convs, two noise maps and
-        # one to-RGB layer.
-        for conv1, conv2, noise1, noise2, to_rgb in zip(
-            self.style_convs[::2],
-            self.style_convs[1::2],
-            noise[1::2],
-            noise[2::2],
-            self.to_rgbs,
-        ):
-            out = conv1(out, latent[:, i], noise=noise1)
-            out = conv2(out, latent[:, i + 1], noise=noise2)
-            skip = to_rgb(out, latent[:, i + 2], skip)
-            i += 2
-        image = skip
-        if return_latents:
-            return image, latent
-        else:
-            return image, None
-class ScaledLeakyReLU(nn.Module):
-    """LeakyReLU whose output is multiplied by sqrt(2).
-    Args:
-        negative_slope (float): Negative slope. Default: 0.2.
-    """
-    def __init__(self, negative_slope=0.2):
-        super().__init__()
-        self.negative_slope = negative_slope
-    def forward(self, x):
-        """Apply the leaky ReLU and rescale the result by sqrt(2)."""
-        activated = F.leaky_relu(x, negative_slope=self.negative_slope)
-        return activated * math.sqrt(2)
-class EqualConv2d(nn.Module):
-    """Equalized Conv2d as StyleGAN2.
-    The stored weight is multiplied by ``1 / sqrt(in_channels * kernel_size**2)``
-    on every forward pass.
-    Args:
-        in_channels (int): Channel number of the input.
-        out_channels (int): Channel number of the output.
-        kernel_size (int): Size of the convolving kernel.
-        stride (int): Stride of the convolution. Default: 1
-        padding (int): Zero-padding added to both sides of the input.
-            Default: 0.
-        bias (bool): If ``True``, adds a learnable bias to the output.
-            Default: ``True``.
-        bias_init_val (float): Bias initialized value. Default: 0.
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        stride=1,
-        padding=0,
-        bias=True,
-        bias_init_val=0,
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.padding = padding
-        fan_in = in_channels * kernel_size**2
-        self.scale = 1 / math.sqrt(fan_in)
-        weight_shape = (out_channels, in_channels, kernel_size, kernel_size)
-        self.weight = nn.Parameter(torch.randn(*weight_shape))
-        if not bias:
-            # Keep ``self.bias`` defined (as None) for forward/__repr__.
-            self.register_parameter("bias", None)
-        else:
-            self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val))
-    def forward(self, x):
-        """Convolve ``x`` with the rescaled weight."""
-        scaled_weight = self.weight * self.scale
-        return F.conv2d(
-            x,
-            scaled_weight,
-            bias=self.bias,
-            stride=self.stride,
-            padding=self.padding,
-        )
-    def __repr__(self):
-        fields = ", ".join(
-            [
-                f"in_channels={self.in_channels}",
-                f"out_channels={self.out_channels}",
-                f"kernel_size={self.kernel_size}",
-                f"stride={self.stride}",
-                f"padding={self.padding}",
-                f"bias={self.bias is not None}",
-            ]
-        )
-        return f"{self.__class__.__name__}({fields})"
-class ConvLayer(nn.Sequential):
-    """Conv Layer used in StyleGAN2 Discriminator.
-    Builds, in order: an optional smoothing filter (when downsampling), an
-    equalized conv, and an optional activation.
-    Args:
-        in_channels (int): Channel number of the input.
-        out_channels (int): Channel number of the output.
-        kernel_size (int): Kernel size.
-        downsample (bool): Whether downsample by a factor of 2.
-            Default: False.
-        resample_kernel (list[int]): A list indicating the 1D resample
-            kernel magnitude. A cross production will be applied to
-            extend 1D resample kernel to 2D resample kernel.
-            Default: (1, 3, 3, 1).
-        bias (bool): Whether with bias. Default: True.
-        activate (bool): Whether use activation. Default: True.
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        downsample=False,
-        resample_kernel=(1, 3, 3, 1),
-        bias=True,
-        activate=True,
-    ):
-        layers = []
-        # downsample
-        if downsample:
-            layers.append(
-                UpFirDnSmooth(
-                    resample_kernel,
-                    upsample_factor=1,
-                    downsample_factor=2,
-                    kernel_size=kernel_size,
-                )
-            )
-            stride = 2
-            padding = 0
-        else:
-            stride = 1
-            padding = kernel_size // 2
-        # conv
-        layers.append(
-            EqualConv2d(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride=stride,
-                padding=padding,
-                # With an activation the bias is applied by FusedLeakyReLU.
-                bias=bias and not activate,
-            )
-        )
-        # activation
-        if activate:
-            if bias:
-                layers.append(FusedLeakyReLU(out_channels))
-            else:
-                layers.append(ScaledLeakyReLU(0.2))
-        super(ConvLayer, self).__init__(*layers)
-        # Attributes are assigned only after nn.Module.__init__ has run, as
-        # the nn.Module documentation requires.
-        self.padding = padding
-class ResBlock(nn.Module):
-    """Residual block used in StyleGAN2 Discriminator.
-    The main path is a 3x3 conv followed by a downsampling 3x3 conv; the
-    shortcut is a downsampling 1x1 conv without bias or activation.
-    Args:
-        in_channels (int): Channel number of the input.
-        out_channels (int): Channel number of the output.
-        resample_kernel (list[int]): A list indicating the 1D resample
-            kernel magnitude. A cross production will be applied to
-            extend 1D resample kernel to 2D resample kernel.
-            Default: (1, 3, 3, 1).
-    """
-    def __init__(self, in_channels, out_channels, resample_kernel=(1, 3, 3, 1)):
-        super().__init__()
-        down_options = dict(downsample=True, resample_kernel=resample_kernel)
-        self.conv1 = ConvLayer(in_channels, in_channels, 3, bias=True, activate=True)
-        self.conv2 = ConvLayer(
-            in_channels, out_channels, 3, bias=True, activate=True, **down_options
-        )
-        self.skip = ConvLayer(
-            in_channels, out_channels, 1, bias=False, activate=False, **down_options
-        )
-    def forward(self, x):
-        """Sum the main path and the shortcut, then divide by sqrt(2)."""
-        main = self.conv2(self.conv1(x))
-        shortcut = self.skip(x)
-        return (main + shortcut) / math.sqrt(2)
--- a/comfy_extras/chainner_models/architecture/face/stylegan2_bilinear_arch.py
+++ b/comfy_extras/chainner_models/architecture/face/stylegan2_bilinear_arch.py
-# pylint: skip-file
-# type: ignore
-import math
-import random
-import torch
-from torch import nn
-from torch.nn import functional as F
-from .fused_act import FusedLeakyReLU, fused_leaky_relu
-class NormStyleCode(nn.Module):
-    def forward(self, x):
-        """Normalize the style codes.
-        Each row is divided by the root mean square of its entries along
-        dim 1 (with 1e-8 added under the root).
-        Args:
-            x (Tensor): Style codes with shape (b, c).
-        Returns:
-            Tensor: Normalized tensor.
-        """
-        mean_square = torch.mean(x**2, dim=1, keepdim=True)
-        inv_rms = torch.rsqrt(mean_square + 1e-8)
-        return x * inv_rms
-class EqualLinear(nn.Module):
-    """Equalized-learning-rate linear layer, as used in StyleGAN2.
-    The weight is initialized as ``randn / lr_mul`` and multiplied by
-    ``lr_mul / sqrt(in_channels)`` on every forward pass; the bias is
-    multiplied by ``lr_mul``.
-    Args:
-        in_channels (int): Size of each input sample.
-        out_channels (int): Size of each output sample.
-        bias (bool): If set to ``False``, the layer will not learn an additive
-            bias. Default: ``True``.
-        bias_init_val (float): Bias initialized value. Default: 0.
-        lr_mul (float): Learning rate multiplier. Default: 1.
-        activation (None | str): The activation after ``linear`` operation.
-            Supported: 'fused_lrelu', None. Default: None.
-    Raises:
-        ValueError: If ``activation`` is not one of the supported values.
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        bias=True,
-        bias_init_val=0,
-        lr_mul=1,
-        activation=None,
-    ):
-        super(EqualLinear, self).__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.lr_mul = lr_mul
-        self.activation = activation
-        if self.activation not in ["fused_lrelu", None]:
-            # The two literals are concatenated, so the separator after the
-            # offending value has to be part of the first one.
-            raise ValueError(
-                f"Wrong activation value in EqualLinear: {activation}. "
-                "Supported ones are: ['fused_lrelu', None]."
-            )
-        self.scale = (1 / math.sqrt(in_channels)) * lr_mul
-        self.weight = nn.Parameter(torch.randn(out_channels, in_channels).div_(lr_mul))
-        if bias:
-            self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val))
-        else:
-            # Keep ``self.bias`` defined (as None) so forward/__repr__ can test it.
-            self.register_parameter("bias", None)
-    def forward(self, x):
-        """Apply the scaled linear map and, if configured, the fused activation.
-        Args:
-            x (Tensor): Input whose last dimension is ``in_channels``.
-        Returns:
-            Tensor: Output whose last dimension is ``out_channels``.
-        """
-        if self.bias is None:
-            bias = None
-        else:
-            bias = self.bias * self.lr_mul
-        if self.activation == "fused_lrelu":
-            # The bias is handed to the fused op instead of F.linear.
-            out = F.linear(x, self.weight * self.scale)
-            out = fused_leaky_relu(out, bias)
-        else:
-            out = F.linear(x, self.weight * self.scale, bias=bias)
-        return out
-    def __repr__(self):
-        return (
-            f"{self.__class__.__name__}(in_channels={self.in_channels}, "
-            f"out_channels={self.out_channels}, bias={self.bias is not None})"
-        )
-class ModulatedConv2d(nn.Module):
-    """Modulated Conv2d used in StyleGAN2 (interpolation-based resampling).
-    There is no bias in ModulatedConv2d.
-    Args:
-        in_channels (int): Channel number of the input.
-        out_channels (int): Channel number of the output.
-        kernel_size (int): Size of the convolving kernel.
-        num_style_feat (int): Channel number of style features.
-        demodulate (bool): Whether to demodulate in the conv layer.
-            Default: True.
-        sample_mode (str | None): Indicating 'upsample', 'downsample' or None.
-            Default: None.
-        eps (float): A value added to the denominator for numerical stability.
-            Default: 1e-8.
-        interpolation_mode (str): Mode passed to ``F.interpolate`` when
-            resampling. Default: 'bilinear'.
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        num_style_feat,
-        demodulate=True,
-        sample_mode=None,
-        eps=1e-8,
-        interpolation_mode="bilinear",
-    ):
-        super().__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.demodulate = demodulate
-        self.sample_mode = sample_mode
-        self.eps = eps
-        self.interpolation_mode = interpolation_mode
-        # 'nearest' gets align_corners=None; every other mode gets False.
-        self.align_corners = None if self.interpolation_mode == "nearest" else False
-        fan_in = in_channels * kernel_size**2
-        self.scale = 1 / math.sqrt(fan_in)
-        # modulation inside each modulated conv
-        self.modulation = EqualLinear(
-            num_style_feat,
-            in_channels,
-            bias=True,
-            bias_init_val=1,
-            lr_mul=1,
-            activation=None,
-        )
-        weight_shape = (1, out_channels, in_channels, kernel_size, kernel_size)
-        self.weight = nn.Parameter(torch.randn(*weight_shape))
-        self.padding = kernel_size // 2
-    def forward(self, x, style):
-        """Forward function.
-        Args:
-            x (Tensor): Tensor with shape (b, c, h, w).
-            style (Tensor): Tensor with shape (b, num_style_feat).
-        Returns:
-            Tensor: Modulated tensor after convolution.
-        """
-        batch, chans, _, _ = x.shape  # chans = c_in
-        ksize = self.kernel_size
-        # Per-sample scale for every input channel: (b, 1, c_in, 1, 1).
-        mod = self.modulation(style).view(batch, 1, chans, 1, 1)
-        # (1, c_out, c_in, k, k) * (b, 1, c_in, 1, 1) -> (b, c_out, c_in, k, k)
-        kernels = self.scale * self.weight * mod
-        if self.demodulate:
-            inv_norm = torch.rsqrt(kernels.pow(2).sum([2, 3, 4]) + self.eps)
-            kernels = kernels * inv_norm.view(batch, self.out_channels, 1, 1, 1)
-        kernels = kernels.view(batch * self.out_channels, chans, ksize, ksize)
-        # Resample the input by interpolation before the convolution.
-        if self.sample_mode == "upsample":
-            resize_factor = 2
-        elif self.sample_mode == "downsample":
-            resize_factor = 0.5
-        else:
-            resize_factor = None
-        if resize_factor is not None:
-            x = F.interpolate(
-                x,
-                scale_factor=resize_factor,
-                mode=self.interpolation_mode,
-                align_corners=self.align_corners,
-            )
-        batch, chans, height, width = x.shape
-        # Fold the batch into channels; groups=batch gives each sample its
-        # own set of kernels.
-        folded = x.view(1, batch * chans, height, width)
-        out = F.conv2d(folded, kernels, padding=self.padding, groups=batch)
-        return out.view(batch, self.out_channels, *out.shape[2:4])
-    def __repr__(self):
-        fields = ", ".join(
-            [
-                f"in_channels={self.in_channels}",
-                f"out_channels={self.out_channels}",
-                f"kernel_size={self.kernel_size}",
-                f"demodulate={self.demodulate}",
-                f"sample_mode={self.sample_mode}",
-            ]
-        )
-        return f"{self.__class__.__name__}({fields})"
-class StyleConv(nn.Module):
-    """Modulated convolution followed by noise injection and activation.
-    Args:
-        in_channels (int): Channel number of the input.
-        out_channels (int): Channel number of the output.
-        kernel_size (int): Size of the convolving kernel.
-        num_style_feat (int): Channel number of style features.
-        demodulate (bool): Whether demodulate in the conv layer. Default: True.
-        sample_mode (str | None): Indicating 'upsample', 'downsample' or None.
-            Default: None.
-        interpolation_mode (str): Interpolation mode forwarded to the
-            modulated conv. Default: 'bilinear'.
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        num_style_feat,
-        demodulate=True,
-        sample_mode=None,
-        interpolation_mode="bilinear",
-    ):
-        super().__init__()
-        conv_options = dict(
-            demodulate=demodulate,
-            sample_mode=sample_mode,
-            interpolation_mode=interpolation_mode,
-        )
-        self.modulated_conv = ModulatedConv2d(
-            in_channels, out_channels, kernel_size, num_style_feat, **conv_options
-        )
-        # Learnable strength of the injected noise; starts at zero.
-        self.weight = nn.Parameter(torch.zeros(1))
-        self.activate = FusedLeakyReLU(out_channels)
-    def forward(self, x, style, noise=None):
-        """Run the styled convolution.
-        Args:
-            x (Tensor): Input feature map.
-            style (Tensor): Style code passed to the modulated conv.
-            noise (Tensor | None): Noise to inject. When None, fresh normal
-                noise with one channel is sampled. Default: None.
-        Returns:
-            Tensor: Activated feature map.
-        """
-        feat = self.modulated_conv(x, style)
-        if noise is None:
-            # One noise channel, broadcast across all feature channels.
-            batch, _, height, width = feat.shape
-            noise = feat.new_empty(batch, 1, height, width).normal_()
-        # The activation module also applies the bias.
-        return self.activate(feat + self.weight * noise)
-class ToRGB(nn.Module):
-    """Project features to a 3-channel RGB image.
-    Args:
-        in_channels (int): Channel number of input.
-        num_style_feat (int): Channel number of style features.
-        upsample (bool): Whether to upsample. Default: True.
-        interpolation_mode (str): Mode used to upsample the skip image.
-            Default: 'bilinear'.
-    """
-    def __init__(
-        self, in_channels, num_style_feat, upsample=True, interpolation_mode="bilinear"
-    ):
-        super().__init__()
-        self.upsample = upsample
-        self.interpolation_mode = interpolation_mode
-        # 'nearest' gets align_corners=None; every other mode gets False.
-        self.align_corners = None if self.interpolation_mode == "nearest" else False
-        # 1x1 modulated conv without demodulation, producing 3 channels.
-        self.modulated_conv = ModulatedConv2d(
-            in_channels,
-            3,
-            kernel_size=1,
-            num_style_feat=num_style_feat,
-            demodulate=False,
-            sample_mode=None,
-            interpolation_mode=interpolation_mode,
-        )
-        self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1))
-    def forward(self, x, style, skip=None):
-        """Forward function.
-        Args:
-            x (Tensor): Feature tensor with shape (b, c, h, w).
-            style (Tensor): Tensor with shape (b, num_style_feat).
-            skip (Tensor): Base/skip tensor. Default: None.
-        Returns:
-            Tensor: RGB images.
-        """
-        rgb = self.modulated_conv(x, style) + self.bias
-        if skip is None:
-            return rgb
-        if self.upsample:
-            # Bring the previous RGB image up to the current resolution.
-            skip = F.interpolate(
-                skip,
-                scale_factor=2,
-                mode=self.interpolation_mode,
-                align_corners=self.align_corners,
-            )
-        return rgb + skip
-class ConstantInput(nn.Module):
-    """Constant input.
-    Args:
-        num_channel (int): Channel number of constant input.
-        size (int): Spatial size of constant input.
-    """
-    def __init__(self, num_channel, size):
-        super(ConstantInput, self).__init__()
-        self.weight = nn.Parameter(torch.randn(1, num_channel, size, size))
-    def forward(self, batch):
-        out = self.weight.repeat(batch, 1, 1, 1)
-        return out
-class StyleGAN2GeneratorBilinear(nn.Module):
-    """StyleGAN2 Generator.
-    Args:
-        out_size (int): The spatial size of outputs.
-        num_style_feat (int): Channel number of style features. Default: 512.
-        num_mlp (int): Layer number of MLP style layers. Default: 8.
-        channel_multiplier (int): Channel multiplier for large networks of
-            StyleGAN2. Default: 2.
-        lr_mlp (float): Learning rate multiplier for mlp layers. Default: 0.01.
-        narrow (float): Narrow ratio for channels. Default: 1.0.
-    """
-    def __init__(
-        self,
-        out_size,
-        num_style_feat=512,
-        num_mlp=8,
-        channel_multiplier=2,
-        lr_mlp=0.01,
-        narrow=1,
-        interpolation_mode="bilinear",
-    ):
-        super(StyleGAN2GeneratorBilinear, self).__init__()
-        # Style MLP layers
-        self.num_style_feat = num_style_feat
-        style_mlp_layers = [NormStyleCode()]
-        for i in range(num_mlp):
-            style_mlp_layers.append(
-                EqualLinear(
-                    num_style_feat,
-                    num_style_feat,
-                    bias=True,
-                    bias_init_val=0,
-                    lr_mul=lr_mlp,
-                    activation="fused_lrelu",
-                )
-            )
-        self.style_mlp = nn.Sequential(*style_mlp_layers)
-        channels = {
-            "4": int(512 * narrow),
-            "8": int(512 * narrow),
-            "16": int(512 * narrow),
-            "32": int(512 * narrow),
-            "64": int(256 * channel_multiplier * narrow),
-            "128": int(128 * channel_multiplier * narrow),
-            "256": int(64 * channel_multiplier * narrow),
-            "512": int(32 * channel_multiplier * narrow),
-            "1024": int(16 * channel_multiplier * narrow),
-        }
-        self.channels = channels
-        self.constant_input = ConstantInput(channels["4"], size=4)
-        self.style_conv1 = StyleConv(
-            channels["4"],
-            channels["4"],
-            kernel_size=3,
-            num_style_feat=num_style_feat,
-            demodulate=True,
-            sample_mode=None,
-            interpolation_mode=interpolation_mode,
-        )
-        self.to_rgb1 = ToRGB(
-            channels["4"],
-            num_style_feat,
-            upsample=False,
-            interpolation_mode=interpolation_mode,
-        )
-        self.log_size = int(math.log(out_size, 2))
-        self.num_layers = (self.log_size - 2) * 2 + 1
-        self.num_latent = self.log_size * 2 - 2
-        self.style_convs = nn.ModuleList()
-        self.to_rgbs = nn.ModuleList()
-        self.noises = nn.Module()
-        in_channels = channels["4"]
-        # noise
-        for layer_idx in range(self.num_layers):
-            resolution = 2 ** ((layer_idx + 5) // 2)
-            shape = [1, 1, resolution, resolution]
-            self.noises.register_buffer(f"noise{layer_idx}", torch.randn(*shape))
-        # style convs and to_rgbs
-        for i in range(3, self.log_size + 1):
-            out_channels = channels[f"{2**i}"]
-            self.style_convs.append(
-                StyleConv(
-                    in_channels,
-                    out_channels,
-                    kernel_size=3,
-                    num_style_feat=num_style_feat,
-                    demodulate=True,
-                    sample_mode="upsample",
-                    interpolation_mode=interpolation_mode,
-                )
-            )
-            self.style_convs.append(
-                StyleConv(
-                    out_channels,
-                    out_channels,
-                    kernel_size=3,
-                    num_style_feat=num_style_feat,
-                    demodulate=True,
-                    sample_mode=None,
-                    interpolation_mode=interpolation_mode,
-                )
-            )
-            self.to_rgbs.append(
-                ToRGB(
-                    out_channels,
-                    num_style_feat,
-                    upsample=True,
-                    interpolation_mode=interpolation_mode,
-                )
-            )
-            in_channels = out_channels
-    def make_noise(self):
-        """Make noise for noise injection."""
-        device = self.constant_input.weight.device
-        noises = [torch.randn(1, 1, 4, 4, device=device)]
-        for i in range(3, self.log_size + 1):
-            for _ in range(2):
-                noises.append(torch.randn(1, 1, 2**i, 2**i, device=device))
-        return noises
-    def get_latent(self, x):
-        return self.style_mlp(x)
-    def mean_latent(self, num_latent):
-        latent_in = torch.randn(
-            num_latent, self.num_style_feat, device=self.constant_input.weight.device
-        )
-        latent = self.style_mlp(latent_in).mean(0, keepdim=True)
-        return latent
-    def forward(
-        self,
-        styles,
-        input_is_latent=False,
-        noise=None,
-        randomize_noise=True,
-        truncation=1,
-        truncation_latent=None,
-        inject_index=None,
-        return_latents=False,
-    ):
-        """Forward function for StyleGAN2Generator.
-        Args:
-            styles (list[Tensor]): Sample codes of styles.
-            input_is_latent (bool): Whether input is latent style.
-                Default: False.
-            noise (Tensor | None): Input noise or None. Default: None.
-            randomize_noise (bool): Randomize noise, used when 'noise' is
-                False. Default: True.
-            truncation (float): TODO. Default: 1.
-            truncation_latent (Tensor | None): TODO. Default: None.
-            inject_index (int | None): The injection index for mixing noise.
-                Default: None.
-            return_latents (bool): Whether to return style latents.
-                Default: False.
-        """
-        # style codes -> latents with Style MLP layer
-        if not input_is_latent:
-            styles = [self.style_mlp(s) for s in styles]
-        # noises
-        if noise is None:
-            if randomize_noise:
-                noise = [None] * self.num_layers  # for each style conv layer
-            else:  # use the stored noise
-                noise = [
-                    getattr(self.noises, f"noise{i}") for i in range(self.num_layers)
-                ]
-        # style truncation
-        if truncation < 1:
-            style_truncation = []
-            for style in styles:
-                style_truncation.append(
-                    truncation_latent + truncation * (style - truncation_latent)
-                )
-            styles = style_truncation
-        # get style latent with injection
-        if len(styles) == 1:
-            inject_index = self.num_latent
-            if styles[0].ndim < 3:
-                # repeat latent code for all the layers
-                latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
-            else:  # used for encoder with different latent code for each layer
-                latent = styles[0]
-        elif len(styles) == 2:  # mixing noises
-            if inject_index is None:
-                inject_index = random.randint(1, self.num_latent - 1)
-            latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
-            latent2 = (
-                styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1)
-            )
-            latent = torch.cat([latent1, latent2], 1)
-        # main generation
-        out = self.constant_input(latent.shape[0])
-        out = self.style_conv1(out, latent[:, 0], noise=noise[0])
-        skip = self.to_rgb1(out, latent[:, 1])
-        i = 1
-        for conv1, conv2, noise1, noise2, to_rgb in zip(
-            self.style_convs[::2],
-            self.style_convs[1::2],
-            noise[1::2],
-            noise[2::2],
-            self.to_rgbs,
-        ):
-            out = conv1(out, latent[:, i], noise=noise1)
-            out = conv2(out, latent[:, i + 1], noise=noise2)
-            skip = to_rgb(out, latent[:, i + 2], skip)
-            i += 2
-        image = skip
-        if return_latents:
-            return image, latent
-        else:
-            return image, None
-class ScaledLeakyReLU(nn.Module):
-    """Scaled LeakyReLU.
-    Args:
-        negative_slope (float): Negative slope. Default: 0.2.
-    """
-    def __init__(self, negative_slope=0.2):
-        super(ScaledLeakyReLU, self).__init__()
-        self.negative_slope = negative_slope
-    def forward(self, x):
-        out = F.leaky_relu(x, negative_slope=self.negative_slope)
-        return out * math.sqrt(2)
-class EqualConv2d(nn.Module):
-    """Equalized Linear as StyleGAN2.
-    Args:
-        in_channels (int): Channel number of the input.
-        out_channels (int): Channel number of the output.
-        kernel_size (int): Size of the convolving kernel.
-        stride (int): Stride of the convolution. Default: 1
-        padding (int): Zero-padding added to both sides of the input.
-            Default: 0.
-        bias (bool): If ``True``, adds a learnable bias to the output.
-            Default: ``True``.
-        bias_init_val (float): Bias initialized value. Default: 0.
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        stride=1,
-        padding=0,
-        bias=True,
-        bias_init_val=0,
-    ):
-        super(EqualConv2d, self).__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.stride = stride
-        self.padding = padding
-        self.scale = 1 / math.sqrt(in_channels * kernel_size**2)
-        self.weight = nn.Parameter(
-            torch.randn(out_channels, in_channels, kernel_size, kernel_size)
-        )
-        if bias:
-            self.bias = nn.Parameter(torch.zeros(out_channels).fill_(bias_init_val))
-        else:
-            self.register_parameter("bias", None)
-    def forward(self, x):
-        out = F.conv2d(
-            x,
-            self.weight * self.scale,
-            bias=self.bias,
-            stride=self.stride,
-            padding=self.padding,
-        )
-        return out
-    def __repr__(self):
-        return (
-            f"{self.__class__.__name__}(in_channels={self.in_channels}, "
-            f"out_channels={self.out_channels}, "
-            f"kernel_size={self.kernel_size},"
-            f" stride={self.stride}, padding={self.padding}, "
-            f"bias={self.bias is not None})"
-        )
-class ConvLayer(nn.Sequential):
-    """Conv Layer used in StyleGAN2 Discriminator.
-    Args:
-        in_channels (int): Channel number of the input.
-        out_channels (int): Channel number of the output.
-        kernel_size (int): Kernel size.
-        downsample (bool): Whether downsample by a factor of 2.
-            Default: False.
-        bias (bool): Whether with bias. Default: True.
-        activate (bool): Whether use activateion. Default: True.
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        downsample=False,
-        bias=True,
-        activate=True,
-        interpolation_mode="bilinear",
-    ):
-        layers = []
-        self.interpolation_mode = interpolation_mode
-        # downsample
-        if downsample:
-            if self.interpolation_mode == "nearest":
-                self.align_corners = None
-            else:
-                self.align_corners = False
-            layers.append(
-                torch.nn.Upsample(
-                    scale_factor=0.5,
-                    mode=interpolation_mode,
-                    align_corners=self.align_corners,
-                )
-            )
-        stride = 1
-        self.padding = kernel_size // 2
-        # conv
-        layers.append(
-            EqualConv2d(
-                in_channels,
-                out_channels,
-                kernel_size,
-                stride=stride,
-                padding=self.padding,
-                bias=bias and not activate,
-            )
-        )
-        # activation
-        if activate:
-            if bias:
-                layers.append(FusedLeakyReLU(out_channels))
-            else:
-                layers.append(ScaledLeakyReLU(0.2))
-        super(ConvLayer, self).__init__(*layers)
-class ResBlock(nn.Module):
-    """Residual block used in StyleGAN2 Discriminator.
-    Args:
-        in_channels (int): Channel number of the input.
-        out_channels (int): Channel number of the output.
-    """
-    def __init__(self, in_channels, out_channels, interpolation_mode="bilinear"):
-        super(ResBlock, self).__init__()
-        self.conv1 = ConvLayer(in_channels, in_channels, 3, bias=True, activate=True)
-        self.conv2 = ConvLayer(
-            in_channels,
-            out_channels,
-            3,
-            downsample=True,
-            interpolation_mode=interpolation_mode,
-            bias=True,
-            activate=True,
-        )
-        self.skip = ConvLayer(
-            in_channels,
-            out_channels,
-            1,
-            downsample=True,
-            interpolation_mode=interpolation_mode,
-            bias=False,
-            activate=False,
-        )
-    def forward(self, x):
-        out = self.conv1(x)
-        out = self.conv2(out)
-        skip = self.skip(x)
-        out = (out + skip) / math.sqrt(2)
-        return out
--- a/comfy_extras/chainner_models/architecture/face/stylegan2_clean_arch.py
+++ b/comfy_extras/chainner_models/architecture/face/stylegan2_clean_arch.py
-# pylint: skip-file
-# type: ignore
-import math
-import torch
-from torch import nn
-from torch.nn import functional as F
-from torch.nn import init
-from torch.nn.modules.batchnorm import _BatchNorm
-@torch.no_grad()
-def default_init_weights(module_list, scale=1, bias_fill=0, **kwargs):
-    """Initialize network weights.
-    Args:
-        module_list (list[nn.Module] | nn.Module): Modules to be initialized.
-        scale (float): Scale initialized weights, especially for residual
-            blocks. Default: 1.
-        bias_fill (float): The value to fill bias. Default: 0
-        kwargs (dict): Other arguments for initialization function.
-    """
-    if not isinstance(module_list, list):
-        module_list = [module_list]
-    for module in module_list:
-        for m in module.modules():
-            if isinstance(m, nn.Conv2d):
-                init.kaiming_normal_(m.weight, **kwargs)
-                m.weight.data *= scale
-                if m.bias is not None:
-                    m.bias.data.fill_(bias_fill)
-            elif isinstance(m, nn.Linear):
-                init.kaiming_normal_(m.weight, **kwargs)
-                m.weight.data *= scale
-                if m.bias is not None:
-                    m.bias.data.fill_(bias_fill)
-            elif isinstance(m, _BatchNorm):
-                init.constant_(m.weight, 1)
-                if m.bias is not None:
-                    m.bias.data.fill_(bias_fill)
-class NormStyleCode(nn.Module):
-    def forward(self, x):
-        """Normalize the style codes.
-        Args:
-            x (Tensor): Style codes with shape (b, c).
-        Returns:
-            Tensor: Normalized tensor.
-        """
-        return x * torch.rsqrt(torch.mean(x**2, dim=1, keepdim=True) + 1e-8)
-class ModulatedConv2d(nn.Module):
-    """Modulated Conv2d used in StyleGAN2.
-    There is no bias in ModulatedConv2d.
-    Args:
-        in_channels (int): Channel number of the input.
-        out_channels (int): Channel number of the output.
-        kernel_size (int): Size of the convolving kernel.
-        num_style_feat (int): Channel number of style features.
-        demodulate (bool): Whether to demodulate in the conv layer. Default: True.
-        sample_mode (str | None): Indicating 'upsample', 'downsample' or None. Default: None.
-        eps (float): A value added to the denominator for numerical stability. Default: 1e-8.
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        num_style_feat,
-        demodulate=True,
-        sample_mode=None,
-        eps=1e-8,
-    ):
-        super(ModulatedConv2d, self).__init__()
-        self.in_channels = in_channels
-        self.out_channels = out_channels
-        self.kernel_size = kernel_size
-        self.demodulate = demodulate
-        self.sample_mode = sample_mode
-        self.eps = eps
-        # modulation inside each modulated conv
-        self.modulation = nn.Linear(num_style_feat, in_channels, bias=True)
-        # initialization
-        default_init_weights(
-            self.modulation,
-            scale=1,
-            bias_fill=1,
-            a=0,
-            mode="fan_in",
-            nonlinearity="linear",
-        )
-        self.weight = nn.Parameter(
-            torch.randn(1, out_channels, in_channels, kernel_size, kernel_size)
-            / math.sqrt(in_channels * kernel_size**2)
-        )
-        self.padding = kernel_size // 2
-    def forward(self, x, style):
-        """Forward function.
-        Args:
-            x (Tensor): Tensor with shape (b, c, h, w).
-            style (Tensor): Tensor with shape (b, num_style_feat).
-        Returns:
-            Tensor: Modulated tensor after convolution.
-        """
-        b, c, h, w = x.shape  # c = c_in
-        # weight modulation
-        style = self.modulation(style).view(b, 1, c, 1, 1)
-        # self.weight: (1, c_out, c_in, k, k); style: (b, 1, c, 1, 1)
-        weight = self.weight * style  # (b, c_out, c_in, k, k)
-        if self.demodulate:
-            demod = torch.rsqrt(weight.pow(2).sum([2, 3, 4]) + self.eps)
-            weight = weight * demod.view(b, self.out_channels, 1, 1, 1)
-        weight = weight.view(
-            b * self.out_channels, c, self.kernel_size, self.kernel_size
-        )
-        # upsample or downsample if necessary
-        if self.sample_mode == "upsample":
-            x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=False)
-        elif self.sample_mode == "downsample":
-            x = F.interpolate(x, scale_factor=0.5, mode="bilinear", align_corners=False)
-        b, c, h, w = x.shape
-        x = x.view(1, b * c, h, w)
-        # weight: (b*c_out, c_in, k, k), groups=b
-        out = F.conv2d(x, weight, padding=self.padding, groups=b)
-        out = out.view(b, self.out_channels, *out.shape[2:4])
-        return out
-    def __repr__(self):
-        return (
-            f"{self.__class__.__name__}(in_channels={self.in_channels}, out_channels={self.out_channels}, "
-            f"kernel_size={self.kernel_size}, demodulate={self.demodulate}, sample_mode={self.sample_mode})"
-        )
-class StyleConv(nn.Module):
-    """Style conv used in StyleGAN2.
-    Args:
-        in_channels (int): Channel number of the input.
-        out_channels (int): Channel number of the output.
-        kernel_size (int): Size of the convolving kernel.
-        num_style_feat (int): Channel number of style features.
-        demodulate (bool): Whether demodulate in the conv layer. Default: True.
-        sample_mode (str | None): Indicating 'upsample', 'downsample' or None. Default: None.
-    """
-    def __init__(
-        self,
-        in_channels,
-        out_channels,
-        kernel_size,
-        num_style_feat,
-        demodulate=True,
-        sample_mode=None,
-    ):
-        super(StyleConv, self).__init__()
-        self.modulated_conv = ModulatedConv2d(
-            in_channels,
-            out_channels,
-            kernel_size,
-            num_style_feat,
-            demodulate=demodulate,
-            sample_mode=sample_mode,
-        )
-        self.weight = nn.Parameter(torch.zeros(1))  # for noise injection
-        self.bias = nn.Parameter(torch.zeros(1, out_channels, 1, 1))
-        self.activate = nn.LeakyReLU(negative_slope=0.2, inplace=True)
-    def forward(self, x, style, noise=None):
-        # modulate
-        out = self.modulated_conv(x, style) * 2**0.5  # for conversion
-        # noise injection
-        if noise is None:
-            b, _, h, w = out.shape
-            noise = out.new_empty(b, 1, h, w).normal_()
-        out = out + self.weight * noise
-        # add bias
-        out = out + self.bias
-        # activation
-        out = self.activate(out)
-        return out
-class ToRGB(nn.Module):
-    """To RGB (image space) from features.
-    Args:
-        in_channels (int): Channel number of input.
-        num_style_feat (int): Channel number of style features.
-        upsample (bool): Whether to upsample. Default: True.
-    """
-    def __init__(self, in_channels, num_style_feat, upsample=True):
-        super(ToRGB, self).__init__()
-        self.upsample = upsample
-        self.modulated_conv = ModulatedConv2d(
-            in_channels,
-            3,
-            kernel_size=1,
-            num_style_feat=num_style_feat,
-            demodulate=False,
-            sample_mode=None,
-        )
-        self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1))
-    def forward(self, x, style, skip=None):
-        """Forward function.
-        Args:
-            x (Tensor): Feature tensor with shape (b, c, h, w).
-            style (Tensor): Tensor with shape (b, num_style_feat).
-            skip (Tensor): Base/skip tensor. Default: None.
-        Returns:
-            Tensor: RGB images.
-        """
-        out = self.modulated_conv(x, style)
-        out = out + self.bias
-        if skip is not None:
-            if self.upsample:
-                skip = F.interpolate(
-                    skip, scale_factor=2, mode="bilinear", align_corners=False
-                )
-            out = out + skip
-        return out
-class ConstantInput(nn.Module):
-    """Constant input.
-    Args:
-        num_channel (int): Channel number of constant input.
-        size (int): Spatial size of constant input.
-    """
-    def __init__(self, num_channel, size):
-        super(ConstantInput, self).__init__()
-        self.weight = nn.Parameter(torch.randn(1, num_channel, size, size))
-    def forward(self, batch):
-        out = self.weight.repeat(batch, 1, 1, 1)
-        return out
-class StyleGAN2GeneratorClean(nn.Module):
-    """Clean version of StyleGAN2 Generator.
-    Args:
-        out_size (int): The spatial size of outputs.
-        num_style_feat (int): Channel number of style features. Default: 512.
-        num_mlp (int): Layer number of MLP style layers. Default: 8.
-        channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2.
-        narrow (float): Narrow ratio for channels. Default: 1.0.
-    """
-    def __init__(
-        self, out_size, num_style_feat=512, num_mlp=8, channel_multiplier=2, narrow=1
-    ):
-        super(StyleGAN2GeneratorClean, self).__init__()
-        # Style MLP layers
-        self.num_style_feat = num_style_feat
-        style_mlp_layers = [NormStyleCode()]
-        for i in range(num_mlp):
-            style_mlp_layers.extend(
-                [
-                    nn.Linear(num_style_feat, num_style_feat, bias=True),
-                    nn.LeakyReLU(negative_slope=0.2, inplace=True),
-                ]
-            )
-        self.style_mlp = nn.Sequential(*style_mlp_layers)
-        # initialization
-        default_init_weights(
-            self.style_mlp,
-            scale=1,
-            bias_fill=0,
-            a=0.2,
-            mode="fan_in",
-            nonlinearity="leaky_relu",
-        )
-        # channel list
-        channels = {
-            "4": int(512 * narrow),
-            "8": int(512 * narrow),
-            "16": int(512 * narrow),
-            "32": int(512 * narrow),
-            "64": int(256 * channel_multiplier * narrow),
-            "128": int(128 * channel_multiplier * narrow),
-            "256": int(64 * channel_multiplier * narrow),
-            "512": int(32 * channel_multiplier * narrow),
-            "1024": int(16 * channel_multiplier * narrow),
-        }
-        self.channels = channels
-        self.constant_input = ConstantInput(channels["4"], size=4)
-        self.style_conv1 = StyleConv(
-            channels["4"],
-            channels["4"],
-            kernel_size=3,
-            num_style_feat=num_style_feat,
-            demodulate=True,
-            sample_mode=None,
-        )
-        self.to_rgb1 = ToRGB(channels["4"], num_style_feat, upsample=False)
-        self.log_size = int(math.log(out_size, 2))
-        self.num_layers = (self.log_size - 2) * 2 + 1
-        self.num_latent = self.log_size * 2 - 2
-        self.style_convs = nn.ModuleList()
-        self.to_rgbs = nn.ModuleList()
-        self.noises = nn.Module()
-        in_channels = channels["4"]
-        # noise
-        for layer_idx in range(self.num_layers):
-            resolution = 2 ** ((layer_idx + 5) // 2)
-            shape = [1, 1, resolution, resolution]
-            self.noises.register_buffer(f"noise{layer_idx}", torch.randn(*shape))
-        # style convs and to_rgbs
-        for i in range(3, self.log_size + 1):
-            out_channels = channels[f"{2**i}"]
-            self.style_convs.append(
-                StyleConv(
-                    in_channels,
-                    out_channels,
-                    kernel_size=3,
-                    num_style_feat=num_style_feat,
-                    demodulate=True,
-                    sample_mode="upsample",
-                )
-            )
-            self.style_convs.append(
-                StyleConv(
-                    out_channels,
-                    out_channels,
-                    kernel_size=3,
-                    num_style_feat=num_style_feat,
-                    demodulate=True,
-                    sample_mode=None,
-                )
-            )
-            self.to_rgbs.append(ToRGB(out_channels, num_style_feat, upsample=True))
-            in_channels = out_channels
-    def make_noise(self):
-        """Make noise for noise injection."""
-        device = self.constant_input.weight.device
-        noises = [torch.randn(1, 1, 4, 4, device=device)]
-        for i in range(3, self.log_size + 1):
-            for _ in range(2):
-                noises.append(torch.randn(1, 1, 2**i, 2**i, device=device))
-        return noises
-    def get_latent(self, x):
-        return self.style_mlp(x)
-    def mean_latent(self, num_latent):
-        latent_in = torch.randn(
-            num_latent, self.num_style_feat, device=self.constant_input.weight.device
-        )
-        latent = self.style_mlp(latent_in).mean(0, keepdim=True)
-        return latent
-    def forward(
-        self,
-        styles,
-        input_is_latent=False,
-        noise=None,
-        randomize_noise=True,
-        truncation=1,
-        truncation_latent=None,
-        inject_index=None,
-        return_latents=False,
-    ):
-        """Forward function for StyleGAN2GeneratorClean.
-        Args:
-            styles (list[Tensor]): Sample codes of styles.
-            input_is_latent (bool): Whether input is latent style. Default: False.
-            noise (Tensor | None): Input noise or None. Default: None.
-            randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True.
-            truncation (float): The truncation ratio. Default: 1.
-            truncation_latent (Tensor | None): The truncation latent tensor. Default: None.
-            inject_index (int | None): The injection index for mixing noise. Default: None.
-            return_latents (bool): Whether to return style latents. Default: False.
-        """
-        # style codes -> latents with Style MLP layer
-        if not input_is_latent:
-            styles = [self.style_mlp(s) for s in styles]
-        # noises
-        if noise is None:
-            if randomize_noise:
-                noise = [None] * self.num_layers  # for each style conv layer
-            else:  # use the stored noise
-                noise = [
-                    getattr(self.noises, f"noise{i}") for i in range(self.num_layers)
-                ]
-        # style truncation
-        if truncation < 1:
-            style_truncation = []
-            for style in styles:
-                style_truncation.append(
-                    truncation_latent + truncation * (style - truncation_latent)
-                )
-            styles = style_truncation
-        # get style latents with injection
-        if len(styles) == 1:
-            inject_index = self.num_latent
-            if styles[0].ndim < 3:
-                # repeat latent code for all the layers
-                latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
-            else:  # used for encoder with different latent code for each layer
-                latent = styles[0]
-        elif len(styles) == 2:  # mixing noises
-            if inject_index is None:
-                inject_index = random.randint(1, self.num_latent - 1)
-            latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1)
-            latent2 = (
-                styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1)
-            )
-            latent = torch.cat([latent1, latent2], 1)
-        # main generation
-        out = self.constant_input(latent.shape[0])
-        out = self.style_conv1(out, latent[:, 0], noise=noise[0])
-        skip = self.to_rgb1(out, latent[:, 1])
-        i = 1
-        for conv1, conv2, noise1, noise2, to_rgb in zip(
-            self.style_convs[::2],
-            self.style_convs[1::2],
-            noise[1::2],
-            noise[2::2],
-            self.to_rgbs,
-        ):
-            out = conv1(out, latent[:, i], noise=noise1)
-            out = conv2(out, latent[:, i + 1], noise=noise2)
-            skip = to_rgb(out, latent[:, i + 2], skip)  # feature back to the rgb space
-            i += 2
-        image = skip
-        if return_latents:
-            return image, latent
-        else:
-            return image, None
--- a/comfy_extras/chainner_models/architecture/face/upfirdn2d.py
+++ b/comfy_extras/chainner_models/architecture/face/upfirdn2d.py
-# pylint: skip-file
-# type: ignore
-# modify from https://github.com/rosinality/stylegan2-pytorch/blob/master/op/upfirdn2d.py  # noqa:E501
-import os
-import torch
-from torch.autograd import Function
-from torch.nn import functional as F
-upfirdn2d_ext = None
-class UpFirDn2dBackward(Function):
-    """Autograd function computing the input gradient of ``UpFirDn2d``.
-    Both methods call ``upfirdn2d_ext.upfirdn2d``. This module binds
-    ``upfirdn2d_ext`` to ``None``, so unless an extension object is assigned
-    to that name elsewhere, calling either method raises ``AttributeError``.
-    """
-    @staticmethod
-    def forward(
-        ctx, grad_output, kernel, grad_kernel, up, down, pad, g_pad, in_size, out_size
-    ):
-        """Return the gradient w.r.t. the original input of ``UpFirDn2d.forward``.
-        ``up`` and ``down`` are 2-tuples ``(x, y)``; ``pad`` and ``g_pad`` are
-        4-tuples ``(x0, x1, y0, y1)``; ``in_size`` is indexed as a 4-element
-        shape and ``out_size`` as ``(out_h, out_w)``.
-        """
-        up_x, up_y = up
-        down_x, down_y = down
-        g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1 = g_pad
-        # Fold every leading dimension into one and append a trailing axis of 1.
-        grad_output = grad_output.reshape(-1, out_size[0], out_size[1], 1)
-        # Note the factor order: the down factors are passed before the up
-        # factors here, the reverse of the order used in UpFirDn2d.forward,
-        # and the filter is grad_kernel with the g_pad padding.
-        grad_input = upfirdn2d_ext.upfirdn2d(
-            grad_output,
-            grad_kernel,
-            down_x,
-            down_y,
-            up_x,
-            up_y,
-            g_pad_x0,
-            g_pad_x1,
-            g_pad_y0,
-            g_pad_y1,
-        )
-        grad_input = grad_input.view(in_size[0], in_size[1], in_size[2], in_size[3])
-        # Only the un-flipped kernel is needed by backward() below.
-        ctx.save_for_backward(kernel)
-        pad_x0, pad_x1, pad_y0, pad_y1 = pad
-        # Stash the non-tensor arguments on ctx for the double-backward pass.
-        ctx.up_x = up_x
-        ctx.up_y = up_y
-        ctx.down_x = down_x
-        ctx.down_y = down_y
-        ctx.pad_x0 = pad_x0
-        ctx.pad_x1 = pad_x1
-        ctx.pad_y0 = pad_y0
-        ctx.pad_y1 = pad_y1
-        ctx.in_size = in_size
-        ctx.out_size = out_size
-        return grad_input
-    @staticmethod
-    def backward(ctx, gradgrad_input):
-        """Propagate a gradient through ``forward`` (second-order gradient).
-        Returns a 9-tuple, one entry per ``forward`` argument after ``ctx``;
-        only the entry for ``grad_output`` is non-``None``.
-        """
-        (kernel,) = ctx.saved_tensors
-        gradgrad_input = gradgrad_input.reshape(-1, ctx.in_size[2], ctx.in_size[3], 1)
-        # Same call pattern as UpFirDn2d.forward: saved kernel, up factors
-        # before down factors, and the original (non-gradient) padding.
-        gradgrad_out = upfirdn2d_ext.upfirdn2d(
-            gradgrad_input,
-            kernel,
-            ctx.up_x,
-            ctx.up_y,
-            ctx.down_x,
-            ctx.down_y,
-            ctx.pad_x0,
-            ctx.pad_x1,
-            ctx.pad_y0,
-            ctx.pad_y1,
-        )
-        # gradgrad_out = gradgrad_out.view(ctx.in_size[0], ctx.out_size[0],
-        #                                  ctx.out_size[1], ctx.in_size[3])
-        gradgrad_out = gradgrad_out.view(
-            ctx.in_size[0], ctx.in_size[1], ctx.out_size[0], ctx.out_size[1]
-        )
-        return gradgrad_out, None, None, None, None, None, None, None, None
-class UpFirDn2d(Function):
-    """Autograd function wrapping ``upfirdn2d_ext.upfirdn2d``.
-    ``forward`` calls the extension object bound to the module-level name
-    ``upfirdn2d_ext``; this module initialises that name to ``None``.
-    """
-    @staticmethod
-    def forward(ctx, input, kernel, up, down, pad):
-        """Run the extension op on ``input`` and record what backward needs.
-        ``input`` is unpacked as a 4-D shape and ``kernel`` as a 2-D shape;
-        ``up`` and ``down`` are ``(x, y)`` pairs and ``pad`` is
-        ``(x0, x1, y0, y1)``. The result is viewed as
-        ``(-1, channel, out_h, out_w)``.
-        """
-        up_x, up_y = up
-        down_x, down_y = down
-        pad_x0, pad_x1, pad_y0, pad_y1 = pad
-        kernel_h, kernel_w = kernel.shape
-        _, channel, in_h, in_w = input.shape
-        ctx.in_size = input.shape
-        # Fold the two leading dimensions together and append a trailing axis of 1.
-        input = input.reshape(-1, in_h, in_w, 1)
-        # The flipped kernel is handed to UpFirDn2dBackward as grad_kernel.
-        ctx.save_for_backward(kernel, torch.flip(kernel, [0, 1]))
-        # Output extent after upsampling, padding, filtering and striding.
-        out_h = (in_h * up_y + pad_y0 + pad_y1 - kernel_h) // down_y + 1
-        out_w = (in_w * up_x + pad_x0 + pad_x1 - kernel_w) // down_x + 1
-        ctx.out_size = (out_h, out_w)
-        ctx.up = (up_x, up_y)
-        ctx.down = (down_x, down_y)
-        ctx.pad = (pad_x0, pad_x1, pad_y0, pad_y1)
-        # Padding passed to the extension by the backward pass.
-        g_pad_x0 = kernel_w - pad_x0 - 1
-        g_pad_y0 = kernel_h - pad_y0 - 1
-        g_pad_x1 = in_w * up_x - out_w * down_x + pad_x0 - up_x + 1
-        g_pad_y1 = in_h * up_y - out_h * down_y + pad_y0 - up_y + 1
-        ctx.g_pad = (g_pad_x0, g_pad_x1, g_pad_y0, g_pad_y1)
-        out = upfirdn2d_ext.upfirdn2d(
-            input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1
-        )
-        # out = out.view(major, out_h, out_w, minor)
-        out = out.view(-1, channel, out_h, out_w)
-        return out
-    @staticmethod
-    def backward(ctx, grad_output):
-        """Return the gradient for ``input``; the other four inputs get ``None``."""
-        kernel, grad_kernel = ctx.saved_tensors
-        # Delegating to another Function keeps this gradient differentiable.
-        grad_input = UpFirDn2dBackward.apply(
-            grad_output,
-            kernel,
-            grad_kernel,
-            ctx.up,
-            ctx.down,
-            ctx.pad,
-            ctx.g_pad,
-            ctx.in_size,
-            ctx.out_size,
-        )
-        return grad_input, None, None, None, None
-def upfirdn2d(input, kernel, up=1, down=1, pad=(0, 0)):
-    """Upsample, FIR-filter and downsample ``input`` with a 2-D ``kernel``.
-    Args:
-        input: tensor whose shape unpacks as four dimensions.
-        kernel: 2-D filter tensor.
-        up: integer upsampling factor, applied to both spatial axes.
-        down: integer downsampling factor, applied to both spatial axes.
-        pad: pair ``(pad0, pad1)`` used for both the x and the y axis.
-    Returns:
-        The filtered tensor.
-    """
-    # The extension-backed autograd path is only usable when an extension
-    # object is bound to ``upfirdn2d_ext``. This module initialises it to
-    # None, in which case the extension path would fail with AttributeError
-    # for every non-CPU tensor, so fall back to the pure-PyTorch routine.
-    if input.device.type == "cpu" or upfirdn2d_ext is None:
-        return upfirdn2d_native(
-            input, kernel, up, up, down, down, pad[0], pad[1], pad[0], pad[1]
-        )
-    return UpFirDn2d.apply(
-        input, kernel, (up, up), (down, down), (pad[0], pad[1], pad[0], pad[1])
-    )
-def upfirdn2d_native(
-    input, kernel, up_x, up_y, down_x, down_y, pad_x0, pad_x1, pad_y0, pad_y1
-):
-    """Pure-PyTorch upsample / FIR filter / downsample built on pad and conv2d.
-    ``input`` must unpack as ``(_, channel, in_h, in_w)`` and ``kernel`` as
-    ``(kernel_h, kernel_w)``. Positive pads add zeros, negative pads crop.
-    The result is viewed as ``(-1, channel, out_h, out_w)``.
-    """
-    channel, in_h, in_w = input.shape[1:]
-    kernel_h, kernel_w = kernel.shape
-    # One single-channel plane per (batch, channel) pair, trailing axis of 1.
-    flat = input.reshape(-1, in_h, in_w, 1)
-    minor = flat.shape[3]
-    up_h = in_h * up_y
-    up_w = in_w * up_x
-    padded_h = up_h + pad_y0 + pad_y1
-    padded_w = up_w + pad_x0 + pad_x1
-    # Upsample by inserting (up - 1) zeros after every sample along y and x.
-    grid = flat.view(-1, in_h, 1, in_w, 1, minor)
-    grid = F.pad(grid, [0, 0, 0, up_x - 1, 0, 0, 0, up_y - 1])
-    grid = grid.view(-1, up_h, up_w, minor)
-    # Non-negative pads grow the grid with zeros ...
-    grow = [max(amount, 0) for amount in (pad_x0, pad_x1, pad_y0, pad_y1)]
-    grid = F.pad(grid, [0, 0] + grow)
-    # ... and negative pads are realised by cropping.
-    top = max(-pad_y0, 0)
-    bottom = grid.shape[1] - max(-pad_y1, 0)
-    left = max(-pad_x0, 0)
-    right = grid.shape[2] - max(-pad_x1, 0)
-    grid = grid[:, top:bottom, left:right, :]
-    planes = grid.permute(0, 3, 1, 2).reshape([-1, 1, padded_h, padded_w])
-    # conv2d computes a correlation, so flip the kernel to get a convolution.
-    weight = torch.flip(kernel, [0, 1]).view(1, 1, kernel_h, kernel_w)
-    filtered = F.conv2d(planes, weight)
-    filtered = filtered.reshape(
-        -1, minor, padded_h - kernel_h + 1, padded_w - kernel_w + 1
-    )
-    # Downsample by striding over the two spatial axes.
-    filtered = filtered.permute(0, 2, 3, 1)[:, ::down_y, ::down_x, :]
-    out_h = (padded_h - kernel_h) // down_y + 1
-    out_w = (padded_w - kernel_w) // down_x + 1
-    return filtered.view(-1, channel, out_h, out_w)
--- a/comfy_extras/chainner_models/architecture/timm/LICENSE
+++ b/comfy_extras/chainner_models/architecture/timm/LICENSE
-                                 Apache License
-                           Version 2.0, January 2004
-                        http://www.apache.org/licenses/
-   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
-   1. Definitions.
-      "License" shall mean the terms and conditions for use, reproduction,
-      and distribution as defined by Sections 1 through 9 of this document.
-      "Licensor" shall mean the copyright owner or entity authorized by
-      the copyright owner that is granting the License.
-      "Legal Entity" shall mean the union of the acting entity and all
-      other entities that control, are controlled by, or are under common
-      control with that entity. For the purposes of this definition,
-      "control" means (i) the power, direct or indirect, to cause the
-      direction or management of such entity, whether by contract or
-      otherwise, or (ii) ownership of fifty percent (50%) or more of the
-      outstanding shares, or (iii) beneficial ownership of such entity.
-      "You" (or "Your") shall mean an individual or Legal Entity
-      exercising permissions granted by this License.
-      "Source" form shall mean the preferred form for making modifications,
-      including but not limited to software source code, documentation
-      source, and configuration files.
-      "Object" form shall mean any form resulting from mechanical
-      transformation or translation of a Source form, including but
-      not limited to compiled object code, generated documentation,
-      and conversions to other media types.
-      "Work" shall mean the work of authorship, whether in Source or
-      Object form, made available under the License, as indicated by a
-      copyright notice that is included in or attached to the work
-      (an example is provided in the Appendix below).
-      "Derivative Works" shall mean any work, whether in Source or Object
-      form, that is based on (or derived from) the Work and for which the
-      editorial revisions, annotations, elaborations, or other modifications
-      represent, as a whole, an original work of authorship. For the purposes
-      of this License, Derivative Works shall not include works that remain
-      separable from, or merely link (or bind by name) to the interfaces of,
-      the Work and Derivative Works thereof.
-      "Contribution" shall mean any work of authorship, including
-      the original version of the Work and any modifications or additions
-      to that Work or Derivative Works thereof, that is intentionally
-      submitted to Licensor for inclusion in the Work by the copyright owner
-      or by an individual or Legal Entity authorized to submit on behalf of
-      the copyright owner. For the purposes of this definition, "submitted"
-      means any form of electronic, verbal, or written communication sent
-      to the Licensor or its representatives, including but not limited to
-      communication on electronic mailing lists, source code control systems,
-      and issue tracking systems that are managed by, or on behalf of, the
-      Licensor for the purpose of discussing and improving the Work, but
-      excluding communication that is conspicuously marked or otherwise
-      designated in writing by the copyright owner as "Not a Contribution."
-      "Contributor" shall mean Licensor and any individual or Legal Entity
-      on behalf of whom a Contribution has been received by Licensor and
-      subsequently incorporated within the Work.
-   2. Grant of Copyright License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      copyright license to reproduce, prepare Derivative Works of,
-      publicly display, publicly perform, sublicense, and distribute the
-      Work and such Derivative Works in Source or Object form.
-   3. Grant of Patent License. Subject to the terms and conditions of
-      this License, each Contributor hereby grants to You a perpetual,
-      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
-      (except as stated in this section) patent license to make, have made,
-      use, offer to sell, sell, import, and otherwise transfer the Work,
-      where such license applies only to those patent claims licensable
-      by such Contributor that are necessarily infringed by their
-      Contribution(s) alone or by combination of their Contribution(s)
-      with the Work to which such Contribution(s) was submitted. If You
-      institute patent litigation against any entity (including a
-      cross-claim or counterclaim in a lawsuit) alleging that the Work
-      or a Contribution incorporated within the Work constitutes direct
-      or contributory patent infringement, then any patent licenses
-      granted to You under this License for that Work shall terminate
-      as of the date such litigation is filed.
-   4. Redistribution. You may reproduce and distribute copies of the
-      Work or Derivative Works thereof in any medium, with or without
-      modifications, and in Source or Object form, provided that You
-      meet the following conditions:
-      (a) You must give any other recipients of the Work or
-          Derivative Works a copy of this License; and
-      (b) You must cause any modified files to carry prominent notices
-          stating that You changed the files; and
-      (c) You must retain, in the Source form of any Derivative Works
-          that You distribute, all copyright, patent, trademark, and
-          attribution notices from the Source form of the Work,
-          excluding those notices that do not pertain to any part of
-          the Derivative Works; and
-      (d) If the Work includes a "NOTICE" text file as part of its
-          distribution, then any Derivative Works that You distribute must
-          include a readable copy of the attribution notices contained
-          within such NOTICE file, excluding those notices that do not
-          pertain to any part of the Derivative Works, in at least one
-          of the following places: within a NOTICE text file distributed
-          as part of the Derivative Works; within the Source form or
-          documentation, if provided along with the Derivative Works; or,
-          within a display generated by the Derivative Works, if and
-          wherever such third-party notices normally appear. The contents
-          of the NOTICE file are for informational purposes only and
-          do not modify the License. You may add Your own attribution
-          notices within Derivative Works that You distribute, alongside
-          or as an addendum to the NOTICE text from the Work, provided
-          that such additional attribution notices cannot be construed
-          as modifying the License.
-      You may add Your own copyright statement to Your modifications and
-      may provide additional or different license terms and conditions
-      for use, reproduction, or distribution of Your modifications, or
-      for any such Derivative Works as a whole, provided Your use,
-      reproduction, and distribution of the Work otherwise complies with
-      the conditions stated in this License.
-   5. Submission of Contributions. Unless You explicitly state otherwise,
-      any Contribution intentionally submitted for inclusion in the Work
-      by You to the Licensor shall be under the terms and conditions of
-      this License, without any additional terms or conditions.
-      Notwithstanding the above, nothing herein shall supersede or modify
-      the terms of any separate license agreement you may have executed
-      with Licensor regarding such Contributions.
-   6. Trademarks. This License does not grant permission to use the trade
-      names, trademarks, service marks, or product names of the Licensor,
-      except as required for reasonable and customary use in describing the
-      origin of the Work and reproducing the content of the NOTICE file.
-   7. Disclaimer of Warranty. Unless required by applicable law or
-      agreed to in writing, Licensor provides the Work (and each
-      Contributor provides its Contributions) on an "AS IS" BASIS,
-      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
-      implied, including, without limitation, any warranties or conditions
-      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
-      PARTICULAR PURPOSE. You are solely responsible for determining the
-      appropriateness of using or redistributing the Work and assume any
-      risks associated with Your exercise of permissions under this License.
-   8. Limitation of Liability. In no event and under no legal theory,
-      whether in tort (including negligence), contract, or otherwise,
-      unless required by applicable law (such as deliberate and grossly
-      negligent acts) or agreed to in writing, shall any Contributor be
-      liable to You for damages, including any direct, indirect, special,
-      incidental, or consequential damages of any character arising as a
-      result of this License or out of the use or inability to use the
-      Work (including but not limited to damages for loss of goodwill,
-      work stoppage, computer failure or malfunction, or any and all
-      other commercial damages or losses), even if such Contributor
-      has been advised of the possibility of such damages.
-   9. Accepting Warranty or Additional Liability. While redistributing
-      the Work or Derivative Works thereof, You may choose to offer,
-      and charge a fee for, acceptance of support, warranty, indemnity,
-      or other liability obligations and/or rights consistent with this
-      License. However, in accepting such obligations, You may act only
-      on Your own behalf and on Your sole responsibility, not on behalf
-      of any other Contributor, and only if You agree to indemnify,
-      defend, and hold each Contributor harmless for any liability
-      incurred by, or claims asserted against, such Contributor by reason
-      of your accepting any such warranty or additional liability.
-   END OF TERMS AND CONDITIONS
-   APPENDIX: How to apply the Apache License to your work.
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "{}"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-   Copyright 2019 Ross Wightman
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-       http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
\ No newline at end of file
--- a/comfy_extras/chainner_models/architecture/timm/drop.py
+++ b/comfy_extras/chainner_models/architecture/timm/drop.py
-""" DropBlock, DropPath
-PyTorch implementations of DropBlock and DropPath (Stochastic Depth) regularization layers.
-Papers:
-DropBlock: A regularization method for convolutional networks (https://arxiv.org/abs/1810.12890)
-Deep Networks with Stochastic Depth (https://arxiv.org/abs/1603.09382)
-Code:
-DropBlock impl inspired by two Tensorflow impl that I liked:
- - https://github.com/tensorflow/tpu/blob/master/models/official/resnet/resnet_model.py#L74
- - https://github.com/clovaai/assembled-cnn/blob/master/nets/blocks.py
-Hacked together by / Copyright 2020 Ross Wightman
-"""
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-def drop_block_2d(
-    x,
-    drop_prob: float = 0.1,
-    block_size: int = 7,
-    gamma_scale: float = 1.0,
-    with_noise: bool = False,
-    inplace: bool = False,
-    batchwise: bool = False,
-):
-    """DropBlock. See https://arxiv.org/pdf/1810.12890.pdf
-    DropBlock with an experimental gaussian noise option. This layer has been tested on a few training
-    runs with success, but needs further validation and possibly optimization for lower runtime impact.
-    Args:
-        x: 4-D input tensor, unpacked as ``(_, C, H, W)``.
-        drop_prob: target fraction of activations to drop.
-        block_size: side of the dropped square; clipped to ``min(W, H)``.
-        gamma_scale: multiplier applied to the seed drop rate.
-        with_noise: fill dropped regions with gaussian noise instead of rescaling.
-        inplace: modify ``x`` in place instead of allocating a new tensor.
-        batchwise: draw one random mask of shape ``(1, C, H, W)`` for the whole batch.
-    Returns:
-        The masked tensor (``x`` itself when ``inplace`` is set).
-    """
-    _, C, H, W = x.shape
-    total_size = W * H
-    clipped_block_size = min(block_size, min(W, H))
-    # seed_drop_rate, the gamma parameter
-    # The count of valid block centres must use the clipped size: with the raw
-    # block_size the divisor becomes zero or changes sign whenever the feature
-    # map is smaller than block_size.
-    gamma = (
-        gamma_scale
-        * drop_prob
-        * total_size
-        / clipped_block_size**2
-        / ((W - clipped_block_size + 1) * (H - clipped_block_size + 1))
-    )
-    # Forces the block to be inside the feature map.
-    # Per-axis validity is combined by broadcasting straight into (H, W);
-    # reshaping a (W, H) meshgrid instead would scramble non-square inputs.
-    lead = clipped_block_size // 2
-    trail = (clipped_block_size - 1) // 2
-    w_i = torch.arange(W, device=x.device)
-    h_i = torch.arange(H, device=x.device)
-    valid_w = (w_i >= lead) & (w_i < W - trail)
-    valid_h = (h_i >= lead) & (h_i < H - trail)
-    valid_block = valid_h[:, None] & valid_w[None, :]
-    valid_block = torch.reshape(valid_block, (1, 1, H, W)).to(dtype=x.dtype)
-    if batchwise:
-        # one mask for whole batch, quite a bit faster
-        uniform_noise = torch.rand((1, C, H, W), dtype=x.dtype, device=x.device)
-    else:
-        uniform_noise = torch.rand_like(x)
-    # 1 keeps a position; 0 marks a block centre (only possible where valid).
-    block_mask = ((2 - gamma - valid_block + uniform_noise) >= 1).to(dtype=x.dtype)
-    # Negated max-pool is a min-pool: it spreads each 0 seed over a full block.
-    block_mask = -F.max_pool2d(
-        -block_mask,
-        kernel_size=clipped_block_size,  # block_size,
-        stride=1,
-        padding=clipped_block_size // 2,
-    )
-    if with_noise:
-        normal_noise = (
-            torch.randn((1, C, H, W), dtype=x.dtype, device=x.device)
-            if batchwise
-            else torch.randn_like(x)
-        )
-        if inplace:
-            x.mul_(block_mask).add_(normal_noise * (1 - block_mask))
-        else:
-            x = x * block_mask + normal_noise * (1 - block_mask)
-    else:
-        # Rescale by (total / kept); the epsilon guards an all-dropped mask.
-        normalize_scale = (
-            block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-7)
-        ).to(x.dtype)
-        if inplace:
-            x.mul_(block_mask * normalize_scale)
-        else:
-            x = x * block_mask * normalize_scale
-    return x
-def drop_block_fast_2d(
-    x: torch.Tensor,
-    drop_prob: float = 0.1,
-    block_size: int = 7,
-    gamma_scale: float = 1.0,
-    with_noise: bool = False,
-    inplace: bool = False,
-):
-    """DropBlock. See https://arxiv.org/pdf/1810.12890.pdf
-    DropBlock with an experimental gaussian noise option. Simplified from drop_block_2d without
-    concern for valid block mask at edges.
-    Args:
-        x: 4-D input tensor, unpacked as ``(_, _, H, W)``.
-        drop_prob: target fraction of activations to drop.
-        block_size: side of the dropped square; clipped to ``min(W, H)``.
-        gamma_scale: multiplier applied to the seed drop rate.
-        with_noise: fill dropped regions with gaussian noise instead of rescaling.
-        inplace: modify ``x`` in place instead of allocating a new tensor.
-    Returns:
-        The masked tensor (``x`` itself when ``inplace`` is set).
-    """
-    _, _, H, W = x.shape
-    total_size = W * H
-    clipped_block_size = min(block_size, min(W, H))
-    # Use the clipped size in the divisor as well: with the raw block_size it
-    # becomes zero or changes sign when the feature map is smaller than the block.
-    gamma = (
-        gamma_scale
-        * drop_prob
-        * total_size
-        / clipped_block_size**2
-        / ((W - clipped_block_size + 1) * (H - clipped_block_size + 1))
-    )
-    # 1 marks a block seed; max-pooling grows each seed into a full block.
-    block_mask = torch.empty_like(x).bernoulli_(gamma)
-    block_mask = F.max_pool2d(
-        block_mask.to(x.dtype),
-        kernel_size=clipped_block_size,
-        stride=1,
-        padding=clipped_block_size // 2,
-    )
-    if with_noise:
-        normal_noise = torch.empty_like(x).normal_()
-        if inplace:
-            x.mul_(1.0 - block_mask).add_(normal_noise * block_mask)
-        else:
-            x = x * (1.0 - block_mask) + normal_noise * block_mask
-    else:
-        # Invert so that 1 means "keep", then rescale by (total / kept).
-        block_mask = 1 - block_mask
-        normalize_scale = (
-            block_mask.numel() / block_mask.to(dtype=torch.float32).sum().add(1e-6)
-        ).to(dtype=x.dtype)
-        if inplace:
-            x.mul_(block_mask * normalize_scale)
-        else:
-            x = x * block_mask * normalize_scale
-    return x
-class DropBlock2d(nn.Module):
-    """Module wrapper around the DropBlock functions.
-    See https://arxiv.org/pdf/1810.12890.pdf
-    """
-    def __init__(
-        self,
-        drop_prob: float = 0.1,
-        block_size: int = 7,
-        gamma_scale: float = 1.0,
-        with_noise: bool = False,
-        inplace: bool = False,
-        batchwise: bool = False,
-        fast: bool = True,
-    ):
-        super().__init__()
-        self.drop_prob = drop_prob
-        self.block_size = block_size
-        self.gamma_scale = gamma_scale
-        self.with_noise = with_noise
-        self.inplace = inplace
-        self.batchwise = batchwise
-        self.fast = fast  # FIXME finish comparisons of fast vs not
-    def forward(self, x):
-        """Apply DropBlock while training with a non-zero rate; otherwise return ``x``."""
-        if not (self.training and self.drop_prob):
-            return x
-        # Arguments shared by both implementations, in positional order.
-        common = (
-            x,
-            self.drop_prob,
-            self.block_size,
-            self.gamma_scale,
-            self.with_noise,
-            self.inplace,
-        )
-        if self.fast:
-            return drop_block_fast_2d(*common)
-        # Only the full implementation accepts the batchwise flag.
-        return drop_block_2d(*common, self.batchwise)
-def drop_path(
-    x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True
-):
-    """Stochastic depth: zero whole samples of ``x`` with probability ``drop_prob``.
-    Returns ``x`` untouched when not training or when ``drop_prob`` is zero.
-    Otherwise one Bernoulli(keep_prob) value is drawn per entry of the first
-    dimension and broadcast over the remaining ones; with ``scale_by_keep``
-    the kept samples are divided by ``keep_prob``.
-    """
-    if not training or drop_prob == 0.0:
-        return x
-    keep_prob = 1 - drop_prob
-    # One mask value per sample, broadcastable over any number of trailing dims.
-    trailing_ones = (1,) * (x.ndim - 1)
-    mask = x.new_empty((x.shape[0],) + trailing_ones).bernoulli_(keep_prob)
-    if scale_by_keep and keep_prob > 0.0:
-        mask.div_(keep_prob)
-    return x * mask
-class DropPath(nn.Module):
-    """Module form of ``drop_path`` (stochastic depth applied per sample)."""
-    def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True):
-        super().__init__()
-        self.scale_by_keep = scale_by_keep
-        self.drop_prob = drop_prob
-    def forward(self, x):
-        """Delegate to ``drop_path`` using this module's training flag."""
-        return drop_path(
-            x,
-            drop_prob=self.drop_prob,
-            training=self.training,
-            scale_by_keep=self.scale_by_keep,
-        )
-    def extra_repr(self):
-        """Return the drop probability, rounded to three decimals, for ``repr``."""
-        return f"drop_prob={round(self.drop_prob,3):0.3f}"
--- a/comfy_extras/chainner_models/architecture/timm/helpers.py
+++ b/comfy_extras/chainner_models/architecture/timm/helpers.py
-""" Layer/Module Helpers
-Hacked together by / Copyright 2020 Ross Wightman
-"""
-import collections.abc
-from itertools import repeat
-# From PyTorch internals
-def _ntuple(n):
-    """Build a converter that repeats a scalar ``n`` times into a tuple.
-    Non-string iterables are passed through by the converter unchanged.
-    """
-    def parse(x):
-        passthrough = isinstance(x, collections.abc.Iterable) and not isinstance(
-            x, str
-        )
-        return x if passthrough else tuple(repeat(x, n))
-    return parse
-to_1tuple = _ntuple(1)
-to_2tuple = _ntuple(2)
-to_3tuple = _ntuple(3)
-to_4tuple = _ntuple(4)
-to_ntuple = _ntuple
-def make_divisible(v, divisor=8, min_value=None, round_limit=0.9):
-    """Round ``v`` to a multiple of ``divisor``, never going below ``min_value``.
-    ``min_value`` falls back to ``divisor`` when falsy. If the rounded value
-    is smaller than ``round_limit * v``, one extra ``divisor`` is added.
-    """
-    lower_bound = min_value or divisor
-    nearest_multiple = int(v + divisor / 2) // divisor * divisor
-    new_v = max(lower_bound, nearest_multiple)
-    # Make sure that round down does not go down by more than 10%.
-    return new_v + divisor if new_v < round_limit * v else new_v
--- a/comfy_extras/chainner_models/architecture/timm/weight_init.py
+++ b/comfy_extras/chainner_models/architecture/timm/weight_init.py
-import math
-import warnings
-import torch
-from torch.nn.init import _calculate_fan_in_and_fan_out
-def _no_grad_trunc_normal_(tensor, mean, std, a, b):
-    """Fill ``tensor`` in place from a normal(mean, std) truncated to ``[a, b]``.
-    Uses inverse-CDF sampling, runs under ``torch.no_grad()`` and returns
-    ``tensor``.
-    """
-    # Cut & paste from PyTorch official master until it's in a few official releases - RW
-    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
-    def norm_cdf(x):
-        # Standard normal cumulative distribution function.
-        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0
-    mean_too_low = mean < a - 2 * std
-    mean_too_high = mean > b + 2 * std
-    if mean_too_low or mean_too_high:
-        warnings.warn(
-            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
-            "The distribution of values may be incorrect.",
-            stacklevel=2,
-        )
-    with torch.no_grad():
-        # CDF values of the two cutoffs.
-        cdf_lo = norm_cdf((a - mean) / std)
-        cdf_hi = norm_cdf((b - mean) / std)
-        # Sample uniformly in [2*lo - 1, 2*hi - 1], map through the inverse
-        # error function to get a truncated standard normal, scale and shift
-        # to the requested mean/std, then clamp into [a, b].
-        tensor.uniform_(2 * cdf_lo - 1, 2 * cdf_hi - 1)
-        tensor.erfinv_()
-        tensor.mul_(std * math.sqrt(2.0)).add_(mean)
-        tensor.clamp_(min=a, max=b)
-    return tensor
-def trunc_normal_(
-    tensor: torch.Tensor, mean=0.0, std=1.0, a=-2.0, b=2.0
-) -> torch.Tensor:
-    r"""Fills the input Tensor with values drawn from a truncated
-    normal distribution. The values are effectively drawn from the
-    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
-    with values outside :math:`[a, b]` redrawn until they are within
-    the bounds. The method used for generating the random values works
-    best when :math:`a \leq \text{mean} \leq b`.
-    NOTE: this impl is similar to the PyTorch trunc_normal_, the bounds [a, b] are
-    applied while sampling the normal with mean/std applied, therefore a, b args
-    should be adjusted to match the range of mean, std args.
-    Args:
-        tensor: an n-dimensional `torch.Tensor`
-        mean: the mean of the normal distribution
-        std: the standard deviation of the normal distribution
-        a: the minimum cutoff value
-        b: the maximum cutoff value
-    Returns:
-        The same `tensor`, filled in place.
-    Examples:
-        >>> w = torch.empty(3, 5)
-        >>> nn.init.trunc_normal_(w)
-    """
-    # All of the work happens in the shared no-grad helper.
-    return _no_grad_trunc_normal_(tensor, mean, std, a, b)
-def trunc_normal_tf_(
-    tensor: torch.Tensor, mean=0.0, std=1.0, a=-2.0, b=2.0
-) -> torch.Tensor:
-    r"""Fills the input Tensor with values drawn from a truncated
-    normal distribution. The values are effectively drawn from the
-    normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
-    with values outside :math:`[a, b]` redrawn until they are within
-    the bounds. The method used for generating the random values works
-    best when :math:`a \leq \text{mean} \leq b`.
-    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
-    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
-    and the result is subsequently scaled and shifted by the mean and std args.
-    Args:
-        tensor: an n-dimensional `torch.Tensor`
-        mean: the mean of the normal distribution
-        std: the standard deviation of the normal distribution
-        a: the minimum cutoff value
-        b: the maximum cutoff value
-    Returns:
-        The same `tensor`, filled in place.
-    Examples:
-        >>> w = torch.empty(3, 5)
-        >>> nn.init.trunc_normal_(w)
-    """
-    # Sample a standard normal truncated to [a, b] first ...
-    _no_grad_trunc_normal_(tensor, 0, 1.0, a, b)
-    # ... then scale and shift, so the final range is [a*std + mean, b*std + mean].
-    with torch.no_grad():
-        tensor.mul_(std).add_(mean)
-    return tensor
-def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
-    """Initialise ``tensor`` in place with variance ``scale / fan``.
-    Args:
-        tensor: tensor to fill; its fans come from
-            ``torch.nn.init._calculate_fan_in_and_fan_out``.
-        scale: numerator of the target variance.
-        mode: ``"fan_in"``, ``"fan_out"`` or ``"fan_avg"``; selects the divisor.
-        distribution: ``"truncated_normal"``, ``"normal"`` or ``"uniform"``.
-    Raises:
-        ValueError: if ``mode`` or ``distribution`` is not one of the values above.
-    """
-    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
-    if mode == "fan_in":
-        denom = fan_in
-    elif mode == "fan_out":
-        denom = fan_out
-    elif mode == "fan_avg":
-        denom = (fan_in + fan_out) / 2
-    else:
-        # Fail clearly instead of hitting an unbound ``denom`` below.
-        raise ValueError(f"invalid mode {mode}")
-    variance = scale / denom
-    if distribution == "truncated_normal":
-        # constant is stddev of standard normal truncated to (-2, 2)
-        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
-    elif distribution == "normal":
-        # In-place init must not be recorded by autograd, otherwise it fails
-        # on leaf tensors that require grad (e.g. nn.Parameter).
-        with torch.no_grad():
-            tensor.normal_(std=math.sqrt(variance))
-    elif distribution == "uniform":
-        # A uniform on [-bound, bound] has variance bound**2 / 3.
-        bound = math.sqrt(3 * variance)
-        with torch.no_grad():
-            # pylint: disable=invalid-unary-operand-type
-            tensor.uniform_(-bound, bound)
-    else:
-        raise ValueError(f"invalid distribution {distribution}")
-def lecun_normal_(tensor):
-    """Initialise ``tensor`` in place via ``variance_scaling_`` (fan-in, truncated normal)."""
-    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")
--- a/comfy_extras/chainner_models/model_loading.py
+++ b/comfy_extras/chainner_models/model_loading.py
-import logging as logger
-from .architecture.DAT import DAT
-from .architecture.face.codeformer import CodeFormer
-from .architecture.face.gfpganv1_clean_arch import GFPGANv1Clean
-from .architecture.face.restoreformer_arch import RestoreFormer
-from .architecture.HAT import HAT
-from .architecture.LaMa import LaMa
-from .architecture.OmniSR.OmniSR import OmniSR
-from .architecture.RRDB import RRDBNet as ESRGAN
-from .architecture.SCUNet import SCUNet
-from .architecture.SPSR import SPSRNet as SPSR
-from .architecture.SRVGG import SRVGGNetCompact as RealESRGANv2
-from .architecture.SwiftSRGAN import Generator as SwiftSRGAN
-from .architecture.Swin2SR import Swin2SR
-from .architecture.SwinIR import SwinIR
-from .types import PyTorchModel
-class UnsupportedModel(Exception):
-    """Exception type declared by the model loader.
-    NOTE(review): presumably raised when a state dict matches no known
-    architecture -- confirm against the raise sites.
-    """
-    pass
-def load_state_dict(state_dict) -> PyTorchModel:
-    logger.debug(f"Loading state dict into pytorch model arch")
-    state_dict_keys = list(state_dict.keys())
-    if "params_ema" in state_dict_keys:
-        state_dict = state_dict["params_ema"]
-    elif "params-ema" in state_dict_keys:
-        state_dict = state_dict["params-ema"]
-    elif "params" in state_dict_keys:
-        state_dict = state_dict["params"]
-    state_dict_keys = list(state_dict.keys())
-    # SRVGGNet Real-ESRGAN (v2)
-    if "body.0.weight" in state_dict_keys and "body.1.weight" in state_dict_keys:
-        model = RealESRGANv2(state_dict)
-    # SPSR (ESRGAN with lots of extra layers)
-    elif "f_HR_conv1.0.weight" in state_dict:
-        model = SPSR(state_dict)
-    # Swift-SRGAN
-    elif (
-        "model" in state_dict_keys
-        and "initial.cnn.depthwise.weight" in state_dict["model"].keys()
-    ):
-        model = SwiftSRGAN(state_dict)
-    # SwinIR, Swin2SR, HAT
-    elif "layers.0.residual_group.blocks.0.norm1.weight" in state_dict_keys:
-        if (
-            "layers.0.residual_group.blocks.0.conv_block.cab.0.weight"
-            in state_dict_keys
-        ):
-            model = HAT(state_dict)
-        elif "patch_embed.proj.weight" in state_dict_keys:
-            model = Swin2SR(state_dict)
-        else:
-            model = SwinIR(state_dict)
-    # GFPGAN
-    elif (
-        "toRGB.0.weight" in state_dict_keys
-        and "stylegan_decoder.style_mlp.1.weight" in state_dict_keys
-    ):
-        model = GFPGANv1Clean(state_dict)
-    # RestoreFormer
-    elif (
-        "encoder.conv_in.weight" in state_dict_keys
-        and "encoder.down.0.block.0.norm1.weight" in state_dict_keys
-    ):
-        model = RestoreFormer(state_dict)
-    elif (
-        "encoder.blocks.0.weight" in state_dict_keys
-        and "quantize.embedding.weight" in state_dict_keys
-    ):
-        model = CodeFormer(state_dict)
-    # LaMa
-    elif (
-        "model.model.1.bn_l.running_mean" in state_dict_keys
-        or "generator.model.1.bn_l.running_mean" in state_dict_keys
-    ):
-        model = LaMa(state_dict)
-    # Omni-SR
-    elif "residual_layer.0.residual_layer.0.layer.0.fn.0.weight" in state_dict_keys:
-        model = OmniSR(state_dict)
-    # SCUNet
-    elif "m_head.0.weight" in state_dict_keys and "m_tail.0.weight" in state_dict_keys:
-        model = SCUNet(state_dict)
-    # DAT
-    elif "layers.0.blocks.2.attn.attn_mask_0" in state_dict_keys:
-        model = DAT(state_dict)
-    # Regular ESRGAN, "new-arch" ESRGAN, Real-ESRGAN v1
-    else:
-        try:
-            model = ESRGAN(state_dict)
-        except:
-            # pylint: disable=raise-missing-from
-            raise UnsupportedModel
-    return model
--- a/comfy_extras/chainner_models/types.py
+++ b/comfy_extras/chainner_models/types.py
-from typing import Union
-from .architecture.DAT import DAT
-from .architecture.face.codeformer import CodeFormer
-from .architecture.face.gfpganv1_clean_arch import GFPGANv1Clean
-from .architecture.face.restoreformer_arch import RestoreFormer
-from .architecture.HAT import HAT
-from .architecture.LaMa import LaMa
-from .architecture.OmniSR.OmniSR import OmniSR
-from .architecture.RRDB import RRDBNet as ESRGAN
-from .architecture.SCUNet import SCUNet
-from .architecture.SPSR import SPSRNet as SPSR
-from .architecture.SRVGG import SRVGGNetCompact as RealESRGANv2
-from .architecture.SwiftSRGAN import Generator as SwiftSRGAN
-from .architecture.Swin2SR import Swin2SR
-from .architecture.SwinIR import SwinIR
-PyTorchSRModels = (
-    RealESRGANv2,
-    SPSR,
-    SwiftSRGAN,
-    ESRGAN,
-    SwinIR,
-    Swin2SR,
-    HAT,
-    OmniSR,
-    SCUNet,
-    DAT,
-)
-PyTorchSRModel = Union[
-    RealESRGANv2,
-    SPSR,
-    SwiftSRGAN,
-    ESRGAN,
-    SwinIR,
-    Swin2SR,
-    HAT,
-    OmniSR,
-    SCUNet,
-    DAT,
-]
-def is_pytorch_sr_model(model: object):
-    return isinstance(model, PyTorchSRModels)
-PyTorchFaceModels = (GFPGANv1Clean, RestoreFormer, CodeFormer)
-PyTorchFaceModel = Union[GFPGANv1Clean, RestoreFormer, CodeFormer]
-def is_pytorch_face_model(model: object):
-    return isinstance(model, PyTorchFaceModels)
-PyTorchInpaintModels = (LaMa,)
-PyTorchInpaintModel = Union[LaMa]
-def is_pytorch_inpaint_model(model: object):
-    return isinstance(model, PyTorchInpaintModels)
-PyTorchModels = (*PyTorchSRModels, *PyTorchFaceModels, *PyTorchInpaintModels)
-PyTorchModel = Union[PyTorchSRModel, PyTorchFaceModel, PyTorchInpaintModel]
-def is_pytorch_model(model: object):
-    return isinstance(model, PyTorchModels)
--- a/comfy_extras/nodes_upscale_model.py
+++ b/comfy_extras/nodes_upscale_model.py
 import os
-from comfy_extras.chainner_models import model_loading
+from spandrel import ModelLoader, ImageModelDescriptor
 from comfy import model_management
 import torch
 import comfy.utils
@@ -20,7 +20,11 @@ class UpscaleModelLoader:
        sd = comfy.utils.load_torch_file(model_path, safe_load=True)
        if "module.layers.0.residual_group.blocks.0.norm1.weight" in sd:
            sd = comfy.utils.state_dict_prefix_replace(sd, {"module.":""})
-        out = model_loading.load_state_dict(sd).eval()
+        out = ModelLoader().load_from_state_dict(sd).eval()
+        if not isinstance(out, ImageModelDescriptor):
+            raise Exception("Upscale model must be a single-image model.")
        return (out, )
@@ -61,7 +65,7 @@ class ImageUpscaleWithModel:
                if tile < 128:
                    raise e
-        upscale_model.cpu()
+        upscale_model.to("cpu")
        s = torch.clamp(s.movedim(-3,-1), min=0, max=1.0)
        return (s,)

--- a/requirements.txt
+++ b/requirements.txt
@@ -11,3 +11,4 @@ scipy
 tqdm
 psutil
 kornia>=0.7.1
+spandrel==0.3.1
\ No newline at end of file