import re
import sys
from copy import deepcopy
from typing import Any, Mapping, Union

import torch
import torch.nn as nn
from einops import rearrange
from torch import Tensor
from torch.nn import Linear, Module, init

sys.path.append("/home/test/Workspace/ruixie/Open-Sora")

# from diffusion.model.nets import PixArtMSBlock, PixArtMS, PixArt
from opensora.acceleration.checkpoint import auto_grad_checkpoint
from opensora.acceleration.communications import split_forward_gather_backward
from opensora.acceleration.parallel_states import get_sequence_parallel_group
from opensora.models.layers.blocks import (
    Attention,
    CaptionEmbedder,
    MultiHeadCrossAttention,
    PatchEmbed3D,
    SeqParallelAttention,
    SeqParallelMultiHeadCrossAttention,
    T2IFinalLayer,
    TimestepEmbedder,
    approx_gelu,
    get_1d_sincos_pos_embed,
    get_2d_sincos_pos_embed,
    get_layernorm,
    t2i_modulate,
)
from opensora.models.stdit.stdit import STDiT, STDiTBlock


# The implementation of the ControlNet-Half architecture
# https://github.com/lllyasviel/ControlNet/discussions/188
class ControlT2IDitBlockHalf(Module):
    def __init__(self, base_block: STDiTBlock, block_index: int = 0) -> None:
        super().__init__()
        # Trainable copy of the frozen base block (standard ControlNet recipe).
        self.copied_block = deepcopy(base_block)
        self.block_index = block_index

        for p in self.copied_block.parameters():
            p.requires_grad_(True)
        self.copied_block.load_state_dict(base_block.state_dict())
        self.copied_block.train()

        self.hidden_size = hidden_size = base_block.hidden_size
        # Zero-initialized projections so the control branch starts as a no-op.
        if self.block_index == 0:
            self.before_proj = Linear(hidden_size, hidden_size)
            init.zeros_(self.before_proj.weight)
            init.zeros_(self.before_proj.bias)
        self.after_proj = Linear(hidden_size, hidden_size)
        init.zeros_(self.after_proj.weight)
        init.zeros_(self.after_proj.bias)

    def forward(self, x, y, t0, y_lens, c, tpe):
        if self.block_index == 0:
            # The first block: inject the control signal into the hidden states.
            c = self.before_proj(c)
            c = self.copied_block(x + c, y, t0, y_lens, tpe)
            c_skip = self.after_proj(c)
        else:
            # Take c from the previous block and produce the c for the skip connection.
            c = self.copied_block(c, y, t0, y_lens, tpe)
            c_skip = self.after_proj(c)

        return c, c_skip


# The implementation of the ControlPixArtHalf net:
# a ControlNet-Half wrapper around a frozen STDiT base model.
class ControlPixArtHalf(Module):
    # only supports the single-resolution model
    def __init__(self, base_model: STDiT, copy_blocks_num: int = 13) -> None:
        super().__init__()
        self.base_model = base_model.eval()
        self.controlnet = []
        self.copy_blocks_num = copy_blocks_num
        self.total_blocks_num = len(base_model.blocks)
        for p in self.base_model.parameters():
            p.requires_grad_(False)

        # Copy the first copy_blocks_num blocks into the trainable control branch.
        for i in range(copy_blocks_num):
            self.controlnet.append(ControlT2IDitBlockHalf(base_model.blocks[i], i))
        self.controlnet = nn.ModuleList(self.controlnet)

    def __getattr__(self, name: str) -> Union[Tensor, Module]:
        if name in ['forward', 'forward_with_dpmsolver', 'forward_with_cfg', 'forward_c', 'load_state_dict']:
            return self.__dict__[name]
        elif name in ['base_model', 'controlnet']:
            return super().__getattr__(name)
        else:
            # Delegate everything else (embedders, pos_embed, config, ...) to the base model.
            return getattr(self.base_model, name)

    def forward_c(self, c):
        # self.h, self.w = c.shape[-2] // self.patch_size, c.shape[-1] // self.patch_size
        # pos_embed = torch.from_numpy(get_2d_sincos_pos_embed(self.pos_embed.shape[-1], (self.h, self.w), lewei_scale=self.lewei_scale, base_size=self.base_size)).unsqueeze(0).to(c.device).to(self.dtype)
        if c is None:
            return c
        # Patchify the control signal and add the spatial positional embedding,
        # mirroring the embedding of x in forward().
        x = self.x_embedder(c)  # [B, N, C]
        x = rearrange(x, "B (T S) C -> B T S C", T=self.num_temporal, S=self.num_spatial)
        x = x + self.pos_embed
        x = rearrange(x, "B T S C -> B (T S) C")
        return x

    # def forward(self, x, t, c, **kwargs):
    #     return self.base_model(x, t, c=self.forward_c(c), **kwargs)
    def forward(self, x, timestep, y, mask=None, x_mask=None, c=None):
        """
        Forward pass, adapted from the original PixArtMS forward function.

        x: (N, C, T, H, W) tensor of video latents
        timestep: (N,) tensor of diffusion timesteps
        y: (N, 1, N_token, C) tensor of caption embeddings
        c: control condition in the same latent space as x (optional)
        x_mask is accepted for API compatibility but is unused here.
        """
        if c is not None:
            c = c.to(self.dtype)
            c = self.forward_c(c)

        x = x.to(self.dtype)
        timestep = timestep.to(self.dtype)
        y = y.to(self.dtype)

        # embedding
        x = self.x_embedder(x)  # [B, N, C]
        x = rearrange(x, "B (T S) C -> B T S C", T=self.num_temporal, S=self.num_spatial)
        x = x + self.pos_embed
        x = rearrange(x, "B T S C -> B (T S) C")

        # shard over the sequence dim if sequence parallelism is enabled
        if self.enable_sequence_parallelism:
            x = split_forward_gather_backward(x, get_sequence_parallel_group(), dim=1, grad_scale="down")

        t = self.t_embedder(timestep, dtype=x.dtype)  # [B, C]
        t0 = self.t_block(t)  # [B, C]
        y = self.y_embedder(y, self.training)  # [B, 1, N_token, C]

        if mask is not None:
            if mask.shape[0] != y.shape[0]:
                mask = mask.repeat(y.shape[0] // mask.shape[0], 1)
            mask = mask.squeeze(1).squeeze(1)
            y = y.squeeze(1).masked_select(mask.unsqueeze(-1) != 0).view(1, -1, x.shape[-1])
            y_lens = mask.sum(dim=1).tolist()
        else:
            y_lens = [y.shape[2]] * y.shape[0]
            y = y.squeeze(1).view(1, -1, x.shape[-1])

        # first layer: run the frozen base block only
        # y_ori = y
        tpe = self.pos_embed_temporal
        x = auto_grad_checkpoint(self.base_model.blocks[0], x, y, t0, y_lens, tpe)

        # controlled layers: update c through the trainable copies and add the
        # zero-initialized skip connection to the corresponding frozen blocks
        for index in range(1, self.copy_blocks_num + 1):
            if index == 1:
                c, c_skip = auto_grad_checkpoint(self.controlnet[index - 1], x, y, t0, y_lens, c, tpe)
            else:
                c, c_skip = auto_grad_checkpoint(self.controlnet[index - 1], x, y, t0, y_lens, c, None)
            x = auto_grad_checkpoint(self.base_model.blocks[index], x + c_skip, y, t0, y_lens, None)

        # remaining layers: frozen base blocks only
        for index in range(self.copy_blocks_num + 1, self.total_blocks_num):
            x = auto_grad_checkpoint(self.base_model.blocks[index], x, y, t0, y_lens, None)

        # final process
        x = self.final_layer(x, t)  # [B, N, C=T_p * H_p * W_p * C_out]
        x = self.unpatchify(x)  # [B, C_out, T, H, W]

        # cast to float32 for better accuracy
        x = x.to(torch.float32)
        return x

    def load_state_dict(self, state_dict: Mapping[str, Any], strict: bool = True):
        # A checkpoint saved from this wrapper already uses 'base_model.' / 'controlnet.'
        # prefixes; otherwise rewrite 'blocks.<i>...' keys to 'blocks.<i>.base_block...'
        # and load them into the base model.
        if all((k.startswith('base_model') or k.startswith('controlnet')) for k in state_dict.keys()):
            return super().load_state_dict(state_dict, strict)
        else:
            new_key = {}
            for k in state_dict.keys():
                new_key[k] = re.sub(r"(blocks\.\d+)(.*)", r"\1.base_block\2", k)
            for k, v in new_key.items():
                if k != v:
                    print(f"replace {k} with {v}")
                    state_dict[v] = state_dict.pop(k)
            return self.base_model.load_state_dict(state_dict, strict)

    def unpatchify(self, x):
        """
        Args:
            x (torch.Tensor): of shape [B, N, C]

        Return:
            x (torch.Tensor): of shape [B, C_out, T, H, W]
        """
        N_t, N_h, N_w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        T_p, H_p, W_p = self.patch_size
        x = rearrange(
            x,
            "B (N_t N_h N_w) (T_p H_p W_p C_out) -> B C_out (N_t T_p) (N_h H_p) (N_w W_p)",
            N_t=N_t,
            N_h=N_h,
            N_w=N_w,
            T_p=T_p,
            H_p=H_p,
            W_p=W_p,
            C_out=self.out_channels,
        )
        return x

    def unpatchify_old(self, x):
        c = self.out_channels
        t, h, w = [self.input_size[i] // self.patch_size[i] for i in range(3)]
        pt, ph, pw = self.patch_size

        x = x.reshape(shape=(x.shape[0], t, h, w, pt, ph, pw, c))
        x = rearrange(x, "n t h w r p q c -> n c t r h p w q")
        imgs = x.reshape(shape=(x.shape[0], c, t * pt, h * ph, w * pw))
        return imgs

    def get_spatial_pos_embed(self, grid_size=None):
        if grid_size is None:
            grid_size = self.input_size[1:]
        pos_embed = get_2d_sincos_pos_embed(
            self.hidden_size,
            (grid_size[0] // self.patch_size[1], grid_size[1] // self.patch_size[2]),
            scale=self.space_scale,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def get_temporal_pos_embed(self):
        pos_embed = get_1d_sincos_pos_embed(
            self.hidden_size,
            self.input_size[0] // self.patch_size[0],
            scale=self.time_scale,
        )
        pos_embed = torch.from_numpy(pos_embed).float().unsqueeze(0).requires_grad_(False)
        return pos_embed

    def freeze_not_temporal(self):
        for n, p in self.named_parameters():
            if "attn_temp" not in n:
                p.requires_grad = False

    def freeze_text(self):
        for n, p in self.named_parameters():
            if "cross_attn" in n:
                p.requires_grad = False

    def initialize_temporal(self):
        for block in self.blocks:
            nn.init.constant_(block.attn_temp.proj.weight, 0)
            nn.init.constant_(block.attn_temp.proj.bias, 0)

    def initialize_weights(self):
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)

        self.apply(_basic_init)

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)
        nn.init.normal_(self.t_block[1].weight, std=0.02)

        # Initialize caption embedding MLP:
        nn.init.normal_(self.y_embedder.y_proj.fc1.weight, std=0.02)
        nn.init.normal_(self.y_embedder.y_proj.fc2.weight, std=0.02)

        # Zero-out the cross-attention output projections in the STDiT blocks:
        for block in self.blocks:
            nn.init.constant_(block.cross_attn.proj.weight, 0)
            nn.init.constant_(block.cross_attn.proj.bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)
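

# ---------------------------------------------------------------------------
# Minimal usage sketch (not part of the original module): wrap a small STDiT
# with ControlPixArtHalf and run one forward pass on random tensors to check
# shapes. The STDiT constructor keywords below (input_size, patch_size,
# caption_channels, model_max_length, ...) are assumptions based on
# Open-Sora's STDiT defaults and may need adjusting for your checkout; real
# training/inference would load a pretrained STDiT checkpoint instead of
# random weights, and the attention layers may require xformers and a GPU.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    torch.manual_seed(0)

    # Tiny STDiT purely for shape checking (assumed constructor signature).
    base = STDiT(
        input_size=(4, 16, 16),   # (T, H, W) of the latent video
        in_channels=4,
        patch_size=(1, 2, 2),
        hidden_size=64,
        depth=4,
        num_heads=4,
        caption_channels=4096,
        model_max_length=120,
    )
    model = ControlPixArtHalf(base, copy_blocks_num=2)

    B = 1
    x = torch.randn(B, 4, 4, 16, 16)             # noisy video latents
    c = torch.randn(B, 4, 4, 16, 16)             # control condition in the same latent space
    timestep = torch.randint(0, 1000, (B,))
    y = torch.randn(B, 1, 120, 4096)             # caption embeddings (e.g. from T5)
    mask = torch.ones(B, 120, dtype=torch.long)  # all caption tokens valid

    out = model(x, timestep, y, mask=mask, c=c)
    print(out.shape)  # expected: [B, out_channels, T, H, W]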