Commit f91d2ea3 authored by mashun1

hunyuandit

import math
import torch
import torch.nn as nn
from einops import repeat
from timm.models.layers import to_2tuple


class PatchEmbed(nn.Module):
    """ 2D Image to Patch Embedding

    Image to Patch Embedding using Conv2d.
    A convolution based approach to patchifying a 2D image w/ embedding projection.

    Based on the impl in https://github.com/google-research/vision_transformer
    Hacked together by / Copyright 2020 Ross Wightman

    Remove the _assert function in forward function to be compatible with multi-resolution images.
    """
    def __init__(
        self,
        img_size=224,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        norm_layer=None,
        flatten=True,
        bias=True,
    ):
        super().__init__()
        if isinstance(img_size, int):
            img_size = to_2tuple(img_size)
        elif isinstance(img_size, (tuple, list)) and len(img_size) == 2:
            img_size = tuple(img_size)
        else:
            raise ValueError(f"img_size must be int or tuple/list of length 2. Got {img_size}")
        patch_size = to_2tuple(patch_size)
        self.img_size = img_size
        self.patch_size = patch_size
        self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.flatten = flatten

        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def update_image_size(self, img_size):
        self.img_size = img_size
        self.grid_size = (img_size[0] // self.patch_size[0], img_size[1] // self.patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]

    def forward(self, x):
        # B, C, H, W = x.shape
        # _assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).")
        # _assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).")
        x = self.proj(x)
        if self.flatten:
            x = x.flatten(2).transpose(1, 2)  # BCHW -> BNC
        x = self.norm(x)
        return x


def timestep_embedding(t, dim, max_period=10000, repeat_only=False):
    """
    Create sinusoidal timestep embeddings.

    :param t: a 1-D Tensor of N indices, one per batch element. These may be fractional.
    :param dim: the dimension of the output.
    :param max_period: controls the minimum frequency of the embeddings.
    :param repeat_only: if True, skip the sinusoidal projection and simply repeat t along the embedding dimension.
    :return: an (N, D) Tensor of positional embeddings.
    """
    # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
    if not repeat_only:
        half = dim // 2
        freqs = torch.exp(
            -math.log(max_period)
            * torch.arange(start=0, end=half, dtype=torch.float32)
            / half
        ).to(device=t.device)  # size: [dim/2], an exponentially decaying curve of frequencies
        args = t[:, None].float() * freqs[None]
        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
        if dim % 2:
            embedding = torch.cat(
                [embedding, torch.zeros_like(embedding[:, :1])], dim=-1
            )
    else:
        embedding = repeat(t, "b -> b d", d=dim)
    return embedding


class TimestepEmbedder(nn.Module):
    """
    Embeds scalar timesteps into vector representations.
    """
    def __init__(self, hidden_size, frequency_embedding_size=256, out_size=None):
        super().__init__()
        if out_size is None:
            out_size = hidden_size
        self.mlp = nn.Sequential(
            nn.Linear(frequency_embedding_size, hidden_size, bias=True),
            nn.SiLU(),
            nn.Linear(hidden_size, out_size, bias=True),
        )
        self.frequency_embedding_size = frequency_embedding_size

    def forward(self, t):
        t_freq = timestep_embedding(t, self.frequency_embedding_size).type(self.mlp[0].weight.dtype)
        t_emb = self.mlp(t_freq)
        return t_emb
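
A minimal smoke test, not part of the commit, sketching how PatchEmbed and TimestepEmbedder might be exercised together; the batch size, image resolution, and hidden width below are illustrative assumptions.

# Illustrative check only; assumes the classes defined above are in scope.
if __name__ == "__main__":
    patch_embed = PatchEmbed(img_size=224, patch_size=16, in_chans=3, embed_dim=768)
    t_embedder = TimestepEmbedder(hidden_size=768)

    imgs = torch.randn(2, 3, 224, 224)    # (B, C, H, W) dummy images
    t = torch.randint(0, 1000, (2,))      # one diffusion timestep per sample

    tokens = patch_embed(imgs)            # (2, 196, 768): 14x14 patches flattened to BNC
    t_emb = t_embedder(t)                 # (2, 768): sinusoidal features passed through the MLP

    print(tokens.shape, t_emb.shape)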
import torch
import torch.nn as nn


class RMSNorm(nn.Module):
    def __init__(self, dim: int, elementwise_affine=True, eps: float = 1e-6):
        """
        Initialize the RMSNorm normalization layer.

        Args:
            dim (int): The dimension of the input tensor.
            elementwise_affine (bool, optional): If True, add a learnable per-element scaling parameter. Default is True.
            eps (float, optional): A small value added to the denominator for numerical stability. Default is 1e-6.

        Attributes:
            eps (float): A small value added to the denominator for numerical stability.
            weight (nn.Parameter): Learnable scaling parameter (only present when elementwise_affine is True).
        """
        super().__init__()
        self.eps = eps
        if elementwise_affine:
            self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        """
        Apply the RMSNorm normalization to the input tensor.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The normalized tensor.
        """
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x):
        """
        Forward pass through the RMSNorm layer.

        Args:
            x (torch.Tensor): The input tensor.

        Returns:
            torch.Tensor: The output tensor after applying RMSNorm.
        """
        output = self._norm(x.float()).type_as(x)
        if hasattr(self, "weight"):
            output = output * self.weight
        return output


class GroupNorm32(nn.GroupNorm):
    def __init__(self, num_groups, num_channels, eps=1e-5, dtype=None):
        super().__init__(num_groups=num_groups, num_channels=num_channels, eps=eps, dtype=dtype)

    def forward(self, x):
        y = super().forward(x).to(x.dtype)
        return y


def normalization(channels, dtype=None):
    """
    Make a standard normalization layer.

    :param channels: number of input channels.
    :return: an nn.Module for normalization.
    """
    return GroupNorm32(num_channels=channels, num_groups=32, dtype=dtype)
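
An illustrative comparison, assumed rather than taken from the commit, of where the two normalizations apply: RMSNorm over the last dimension of token sequences, and the GroupNorm32 returned by normalization() over convolutional feature maps. All shapes are made up for the example.

# Illustrative only; relies on torch imported at the top of this file.
tokens = torch.randn(2, 196, 768)   # (B, N, C) transformer tokens
fmap = torch.randn(2, 64, 32, 32)   # (B, C, H, W) conv feature maps

rms = RMSNorm(dim=768)              # normalizes over the last dim, then scales by weight
gn = normalization(channels=64)     # GroupNorm32 with 32 groups over the channel dim

print(rms(tokens).shape)            # torch.Size([2, 196, 768])
print(gn(fmap).shape)               # torch.Size([2, 64, 32, 32])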
import torch
import torch.nn as nn
import torch.nn.functional as F


class AttentionPool(nn.Module):
    def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
        super().__init__()
        self.positional_embedding = nn.Parameter(torch.randn(spacial_dim + 1, embed_dim) / embed_dim ** 0.5)
        self.k_proj = nn.Linear(embed_dim, embed_dim)
        self.q_proj = nn.Linear(embed_dim, embed_dim)
        self.v_proj = nn.Linear(embed_dim, embed_dim)
        self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
        self.num_heads = num_heads

    def forward(self, x):
        x = x.permute(1, 0, 2)  # NLC -> LNC
        x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0)  # (L+1)NC
        x = x + self.positional_embedding[:, None, :].to(x.dtype)  # (L+1)NC
        x, _ = F.multi_head_attention_forward(
            query=x[:1], key=x, value=x,
            embed_dim_to_check=x.shape[-1],
            num_heads=self.num_heads,
            q_proj_weight=self.q_proj.weight,
            k_proj_weight=self.k_proj.weight,
            v_proj_weight=self.v_proj.weight,
            in_proj_weight=None,
            in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
            bias_k=None,
            bias_v=None,
            add_zero_attn=False,
            dropout_p=0,
            out_proj_weight=self.c_proj.weight,
            out_proj_bias=self.c_proj.bias,
            use_separate_proj_weight=True,
            training=self.training,
            need_weights=False
        )
        return x.squeeze(0)
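
A hedged usage sketch, not from the commit itself: pooling a batch of encoder hidden states into one vector per sequence. The sequence length of 77 and width of 1024 are assumptions chosen only so the positional embedding lines up.

# Illustrative only: spacial_dim must equal the input sequence length L.
pool = AttentionPool(spacial_dim=77, embed_dim=1024, num_heads=8, output_dim=1024)
states = torch.randn(4, 77, 1024)   # (N, L, C) batch of encoder hidden states
pooled = pool(states)               # (4, 1024): one attention-pooled vector per sequence
print(pooled.shape)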
import random

import numpy as np
import torch


def set_seeds(seed_list, device=None):
    if isinstance(seed_list, (tuple, list)):
        seed = sum(seed_list)
    else:
        seed = seed_list
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    return torch.Generator(device).manual_seed(seed)
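
For illustration only (the call site is not shown in this commit): the seed list is summed, Python/NumPy/torch are seeded globally, and the returned generator can be reused for reproducible noise. The seed values, device, and tensor shape are assumptions.

# Illustrative call; seeds random, numpy, and torch with 42 + 7 = 49.
gen = set_seeds([42, 7], device="cpu")
noise = torch.randn(1, 4, 128, 128, generator=gen)  # reproducible noise tied to the seed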
# Unique model identifier
modelCode=658
# Model name
modelName=HunyuanDiT
# Model description
modelDescription=A text-to-image generation model that supports Chinese-language prompts.
# Application scenarios
appScenario=inference,AIGC,media,research,education
# Framework type
frameType=pytorch
# --extra-index-url https://pypi.ngc.nvidia.com
# timm==0.9.5
diffusers==0.21.2
peft==0.10.0
protobuf==3.19.0
# torchvision==0.14.1
transformers==4.37.2
accelerate==0.29.3
loguru==0.7.2
einops==0.7.0
sentencepiece==0.1.99
# cuda-python==11.7.1
# onnxruntime==1.12.1
# onnx==1.12.0
# nvidia-pyindex==1.0.9
# onnx-graphsurgeon==0.3.27
# polygraphy==0.47.1
pandas==2.0.3
gradio==3.50.2