# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import math
import numpy as np
import torch
def transform_pos(mtx, pos, keepdim=False):
    t_mtx = torch.from_numpy(mtx).to(pos.device) if isinstance(mtx, np.ndarray) else mtx
    if pos.shape[-1] == 3:
        posw = torch.cat([pos, torch.ones([pos.shape[0], 1]).to(pos.device)], axis=1)
else:
posw = pos
if keepdim:
return torch.matmul(posw, t_mtx.t())[...]
else:
return torch.matmul(posw, t_mtx.t())[None, ...]
def get_mv_matrix(elev, azim, camera_distance, center=None):
elev = -elev
azim += 90
elev_rad = math.radians(elev)
azim_rad = math.radians(azim)
    camera_position = np.array([camera_distance * math.cos(elev_rad) * math.cos(azim_rad),
                                camera_distance * math.cos(elev_rad) * math.sin(azim_rad),
                                camera_distance * math.sin(elev_rad)])
if center is None:
center = np.array([0, 0, 0])
else:
center = np.array(center)
lookat = center - camera_position
lookat = lookat / np.linalg.norm(lookat)
up = np.array([0, 0, 1.0])
right = np.cross(lookat, up)
right = right / np.linalg.norm(right)
up = np.cross(right, lookat)
up = up / np.linalg.norm(up)
c2w = np.concatenate(
[np.stack([right, up, -lookat], axis=-1), camera_position[:, None]], axis=-1)
w2c = np.zeros((4, 4))
w2c[:3, :3] = np.transpose(c2w[:3, :3], (1, 0))
w2c[:3, 3:] = -np.matmul(np.transpose(c2w[:3, :3], (1, 0)), c2w[:3, 3:])
w2c[3, 3] = 1.0
return w2c.astype(np.float32)
def get_orthographic_projection_matrix(
left=-1, right=1, bottom=-1, top=1, near=0, far=2):
"""
计算正交投影矩阵。
参数:
left (float): 投影区域左侧边界。
right (float): 投影区域右侧边界。
bottom (float): 投影区域底部边界。
top (float): 投影区域顶部边界。
near (float): 投影区域近裁剪面距离。
far (float): 投影区域远裁剪面距离。
返回:
numpy.ndarray: 正交投影矩阵。
"""
ortho_matrix = np.eye(4, dtype=np.float32)
ortho_matrix[0, 0] = 2 / (right - left)
ortho_matrix[1, 1] = 2 / (top - bottom)
ortho_matrix[2, 2] = -2 / (far - near)
ortho_matrix[0, 3] = -(right + left) / (right - left)
ortho_matrix[1, 3] = -(top + bottom) / (top - bottom)
ortho_matrix[2, 3] = -(far + near) / (far - near)
return ortho_matrix
def get_perspective_projection_matrix(fovy, aspect_wh, near, far):
fovy_rad = math.radians(fovy)
    return np.array([[1.0 / (math.tan(fovy_rad / 2.0) * aspect_wh), 0, 0, 0],
                     [0, 1.0 / math.tan(fovy_rad / 2.0), 0, 0],
                     [0, 0, -(far + near) / (far - near), -2.0 * far * near / (far - near)],
                     [0, 0, -1, 0]]).astype(np.float32)
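

# ---------------------------------------------------------------------------
# Illustrative sketch only (not part of the original file): how the helpers
# above compose into a model-view-projection transform. The elevation/azimuth
# values and the orthographic bounds below are arbitrary example inputs.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    mv = get_mv_matrix(elev=20.0, azim=30.0, camera_distance=1.45)      # 4x4 world-to-camera
    proj = get_orthographic_projection_matrix(
        left=-0.6, right=0.6, bottom=-0.6, top=0.6, near=0.1, far=100)  # 4x4 camera-to-clip
    mvp = np.matmul(proj, mv).astype(np.float32)

    points = torch.rand(8, 3)              # dummy object-space vertices
    clip = transform_pos(mvp, points)      # [1, 8, 4] homogeneous clip-space coordinates
    print(clip.shape)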
FOR /F "tokens=*" %%i IN ('python -m pybind11 --includes') DO SET PYINCLUDES=%%i
echo %PYINCLUDES%
g++ -O3 -Wall -shared -std=c++11 -fPIC %PYINCLUDES% mesh_processor.cpp -o mesh_processor.pyd -lpython3.12
#include <vector>
#include <queue>
#include <cmath>
#include <algorithm>
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
namespace py = pybind11;
using namespace std;
std::pair<py::array_t<float>,
py::array_t<uint8_t>> meshVerticeInpaint_smooth(py::array_t<float> texture,
py::array_t<uint8_t> mask,
py::array_t<float> vtx_pos, py::array_t<float> vtx_uv,
py::array_t<int> pos_idx, py::array_t<int> uv_idx) {
auto texture_buf = texture.request();
auto mask_buf = mask.request();
auto vtx_pos_buf = vtx_pos.request();
auto vtx_uv_buf = vtx_uv.request();
auto pos_idx_buf = pos_idx.request();
auto uv_idx_buf = uv_idx.request();
int texture_height = texture_buf.shape[0];
int texture_width = texture_buf.shape[1];
int texture_channel = texture_buf.shape[2];
float* texture_ptr = static_cast<float*>(texture_buf.ptr);
uint8_t* mask_ptr = static_cast<uint8_t*>(mask_buf.ptr);
int vtx_num = vtx_pos_buf.shape[0];
float* vtx_pos_ptr = static_cast<float*>(vtx_pos_buf.ptr);
float* vtx_uv_ptr = static_cast<float*>(vtx_uv_buf.ptr);
int* pos_idx_ptr = static_cast<int*>(pos_idx_buf.ptr);
int* uv_idx_ptr = static_cast<int*>(uv_idx_buf.ptr);
vector<float> vtx_mask(vtx_num, 0.0f);
vector<vector<float>> vtx_color(vtx_num, vector<float>(texture_channel, 0.0f));
vector<int> uncolored_vtxs;
vector<vector<int>> G(vtx_num);
for (int i = 0; i < uv_idx_buf.shape[0]; ++i) {
for (int k = 0; k < 3; ++k) {
int vtx_uv_idx = uv_idx_ptr[i * 3 + k];
int vtx_idx = pos_idx_ptr[i * 3 + k];
int uv_v = round(vtx_uv_ptr[vtx_uv_idx * 2] * (texture_width - 1));
int uv_u = round((1.0 - vtx_uv_ptr[vtx_uv_idx * 2 + 1]) * (texture_height - 1));
if (mask_ptr[uv_u * texture_width + uv_v] > 0) {
vtx_mask[vtx_idx] = 1.0f;
for (int c = 0; c < texture_channel; ++c) {
vtx_color[vtx_idx][c] = texture_ptr[(uv_u * texture_width + uv_v) * texture_channel + c];
}
}else{
uncolored_vtxs.push_back(vtx_idx);
}
G[pos_idx_ptr[i * 3 + k]].push_back(pos_idx_ptr[i * 3 + (k + 1) % 3]);
}
}
int smooth_count = 2;
int last_uncolored_vtx_count = 0;
while (smooth_count>0) {
int uncolored_vtx_count = 0;
for (int vtx_idx : uncolored_vtxs) {
vector<float> sum_color(texture_channel, 0.0f);
float total_weight = 0.0f;
array<float, 3> vtx_0 = {vtx_pos_ptr[vtx_idx * 3],
vtx_pos_ptr[vtx_idx * 3 + 1], vtx_pos_ptr[vtx_idx * 3 + 2]};
for (int connected_idx : G[vtx_idx]) {
if (vtx_mask[connected_idx] > 0) {
array<float, 3> vtx1 = {vtx_pos_ptr[connected_idx * 3],
vtx_pos_ptr[connected_idx * 3 + 1], vtx_pos_ptr[connected_idx * 3 + 2]};
float dist_weight = 1.0f / max(sqrt(pow(vtx_0[0] - vtx1[0], 2) + pow(vtx_0[1] - vtx1[1], 2) + \
pow(vtx_0[2] - vtx1[2], 2)), 1E-4);
dist_weight = dist_weight * dist_weight;
for (int c = 0; c < texture_channel; ++c) {
sum_color[c] += vtx_color[connected_idx][c] * dist_weight;
}
total_weight += dist_weight;
}
}
if (total_weight > 0.0f) {
for (int c = 0; c < texture_channel; ++c) {
vtx_color[vtx_idx][c] = sum_color[c] / total_weight;
}
vtx_mask[vtx_idx] = 1.0f;
} else {
uncolored_vtx_count++;
}
}
if(last_uncolored_vtx_count==uncolored_vtx_count){
smooth_count--;
}else{
smooth_count++;
}
last_uncolored_vtx_count = uncolored_vtx_count;
}
// Create new arrays for the output
py::array_t<float> new_texture(texture_buf.size);
py::array_t<uint8_t> new_mask(mask_buf.size);
auto new_texture_buf = new_texture.request();
auto new_mask_buf = new_mask.request();
float* new_texture_ptr = static_cast<float*>(new_texture_buf.ptr);
uint8_t* new_mask_ptr = static_cast<uint8_t*>(new_mask_buf.ptr);
// Copy original texture and mask to new arrays
std::copy(texture_ptr, texture_ptr + texture_buf.size, new_texture_ptr);
std::copy(mask_ptr, mask_ptr + mask_buf.size, new_mask_ptr);
for (int face_idx = 0; face_idx < uv_idx_buf.shape[0]; ++face_idx) {
for (int k = 0; k < 3; ++k) {
int vtx_uv_idx = uv_idx_ptr[face_idx * 3 + k];
int vtx_idx = pos_idx_ptr[face_idx * 3 + k];
if (vtx_mask[vtx_idx] == 1.0f) {
int uv_v = round(vtx_uv_ptr[vtx_uv_idx * 2] * (texture_width - 1));
int uv_u = round((1.0 - vtx_uv_ptr[vtx_uv_idx * 2 + 1]) * (texture_height - 1));
for (int c = 0; c < texture_channel; ++c) {
new_texture_ptr[(uv_u * texture_width + uv_v) * texture_channel + c] = vtx_color[vtx_idx][c];
}
new_mask_ptr[uv_u * texture_width + uv_v] = 255;
}
}
}
// Reshape the new arrays to match the original texture and mask shapes
    new_texture.resize({texture_height, texture_width, texture_channel});
new_mask.resize({texture_height, texture_width});
return std::make_pair(new_texture, new_mask);
}
std::pair<py::array_t<float>, py::array_t<uint8_t>> meshVerticeInpaint(py::array_t<float> texture,
py::array_t<uint8_t> mask,
py::array_t<float> vtx_pos, py::array_t<float> vtx_uv,
py::array_t<int> pos_idx, py::array_t<int> uv_idx, const std::string& method = "smooth") {
if (method == "smooth") {
return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx);
} else {
        throw std::invalid_argument("Invalid method. Only 'smooth' is currently supported.");
}
}
PYBIND11_MODULE(mesh_processor, m) {
m.def("meshVerticeInpaint", &meshVerticeInpaint, "A function to process mesh",
py::arg("texture"), py::arg("mask"),
py::arg("vtx_pos"), py::arg("vtx_uv"),
py::arg("pos_idx"), py::arg("uv_idx"),
py::arg("method") = "smooth");
}
Metadata-Version: 2.1
Name: mesh-processor
Version: 0.0.0
Requires-Python: >=3.6
mesh_processor.cpp
setup.py
mesh_processor.egg-info/PKG-INFO
mesh_processor.egg-info/SOURCES.txt
mesh_processor.egg-info/dependency_links.txt
mesh_processor.egg-info/requires.txt
mesh_processor.egg-info/top_level.txt
import numpy as np
def meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx):
texture_height, texture_width, texture_channel = texture.shape
vtx_num = vtx_pos.shape[0]
vtx_mask = np.zeros(vtx_num, dtype=np.float32)
vtx_color = [np.zeros(texture_channel, dtype=np.float32) for _ in range(vtx_num)]
uncolored_vtxs = []
G = [[] for _ in range(vtx_num)]
for i in range(uv_idx.shape[0]):
for k in range(3):
vtx_uv_idx = uv_idx[i, k]
vtx_idx = pos_idx[i, k]
uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1)))
uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1)))
if mask[uv_u, uv_v] > 0:
vtx_mask[vtx_idx] = 1.0
vtx_color[vtx_idx] = texture[uv_u, uv_v]
else:
uncolored_vtxs.append(vtx_idx)
G[pos_idx[i, k]].append(pos_idx[i, (k + 1) % 3])
smooth_count = 2
last_uncolored_vtx_count = 0
while smooth_count > 0:
uncolored_vtx_count = 0
for vtx_idx in uncolored_vtxs:
sum_color = np.zeros(texture_channel, dtype=np.float32)
total_weight = 0.0
vtx_0 = vtx_pos[vtx_idx]
for connected_idx in G[vtx_idx]:
if vtx_mask[connected_idx] > 0:
vtx1 = vtx_pos[connected_idx]
dist = np.sqrt(np.sum((vtx_0 - vtx1) ** 2))
dist_weight = 1.0 / max(dist, 1e-4)
dist_weight *= dist_weight
sum_color += vtx_color[connected_idx] * dist_weight
total_weight += dist_weight
if total_weight > 0:
vtx_color[vtx_idx] = sum_color / total_weight
vtx_mask[vtx_idx] = 1.0
else:
uncolored_vtx_count += 1
if last_uncolored_vtx_count == uncolored_vtx_count:
smooth_count -= 1
else:
smooth_count += 1
last_uncolored_vtx_count = uncolored_vtx_count
new_texture = texture.copy()
new_mask = mask.copy()
for face_idx in range(uv_idx.shape[0]):
for k in range(3):
vtx_uv_idx = uv_idx[face_idx, k]
vtx_idx = pos_idx[face_idx, k]
if vtx_mask[vtx_idx] == 1.0:
uv_v = int(round(vtx_uv[vtx_uv_idx, 0] * (texture_width - 1)))
uv_u = int(round((1.0 - vtx_uv[vtx_uv_idx, 1]) * (texture_height - 1)))
new_texture[uv_u, uv_v] = vtx_color[vtx_idx]
new_mask[uv_u, uv_v] = 255
return new_texture, new_mask
def meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx, method="smooth"):
if method == "smooth":
return meshVerticeInpaint_smooth(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx)
else:
        raise ValueError("Invalid method. Only 'smooth' is currently supported.")
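

# ---------------------------------------------------------------------------
# Illustrative usage sketch only (not part of the original file): inpaint the
# unpainted vertices of a toy single-triangle mesh. All arrays below are
# made-up example data.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    H, W, C = 64, 64, 3
    texture = np.zeros((H, W, C), dtype=np.float32)   # partially painted UV texture
    mask = np.zeros((H, W), dtype=np.uint8)
    mask[:32, :] = 255                                 # only the top half counts as painted

    vtx_pos = np.array([[0, 0, 0], [1, 0, 0], [0, 1, 0]], dtype=np.float32)
    vtx_uv = np.array([[0.1, 0.1], [0.9, 0.1], [0.1, 0.9]], dtype=np.float32)
    pos_idx = np.array([[0, 1, 2]], dtype=np.int32)
    uv_idx = np.array([[0, 1, 2]], dtype=np.int32)

    new_texture, new_mask = meshVerticeInpaint(texture, mask, vtx_pos, vtx_uv, pos_idx, uv_idx)
    print(new_texture.shape, new_mask.dtype)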
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import cv2
import numpy as np
import torch
import torch.nn.functional as F
import trimesh
from PIL import Image
from .camera_utils import (
transform_pos,
get_mv_matrix,
get_orthographic_projection_matrix,
get_perspective_projection_matrix,
)
from .mesh_processor import meshVerticeInpaint
from .mesh_utils import load_mesh, save_mesh
def stride_from_shape(shape):
stride = [1]
for x in reversed(shape[1:]):
stride.append(stride[-1] * x)
return list(reversed(stride))
def scatter_add_nd_with_count(input, count, indices, values, weights=None):
# input: [..., C], D dimension + C channel
# count: [..., 1], D dimension
# indices: [N, D], long
# values: [N, C]
D = indices.shape[-1]
C = input.shape[-1]
size = input.shape[:-1]
stride = stride_from_shape(size)
assert len(size) == D
input = input.view(-1, C) # [HW, C]
count = count.view(-1, 1)
flatten_indices = (indices * torch.tensor(stride,
dtype=torch.long, device=indices.device)).sum(-1) # [N]
if weights is None:
weights = torch.ones_like(values[..., :1])
input.scatter_add_(0, flatten_indices.unsqueeze(1).repeat(1, C), values)
count.scatter_add_(0, flatten_indices.unsqueeze(1), weights)
return input.view(*size, C), count.view(*size, 1)
def linear_grid_put_2d(H, W, coords, values, return_count=False):
# coords: [N, 2], float in [0, 1]
# values: [N, C]
C = values.shape[-1]
indices = coords * torch.tensor(
[H - 1, W - 1], dtype=torch.float32, device=coords.device
)
indices_00 = indices.floor().long() # [N, 2]
indices_00[:, 0].clamp_(0, H - 2)
indices_00[:, 1].clamp_(0, W - 2)
indices_01 = indices_00 + torch.tensor(
[0, 1], dtype=torch.long, device=indices.device
)
indices_10 = indices_00 + torch.tensor(
[1, 0], dtype=torch.long, device=indices.device
)
indices_11 = indices_00 + torch.tensor(
[1, 1], dtype=torch.long, device=indices.device
)
h = indices[..., 0] - indices_00[..., 0].float()
w = indices[..., 1] - indices_00[..., 1].float()
w_00 = (1 - h) * (1 - w)
w_01 = (1 - h) * w
w_10 = h * (1 - w)
w_11 = h * w
result = torch.zeros(H, W, C, device=values.device,
dtype=values.dtype) # [H, W, C]
count = torch.zeros(H, W, 1, device=values.device,
dtype=values.dtype) # [H, W, 1]
weights = torch.ones_like(values[..., :1]) # [N, 1]
result, count = scatter_add_nd_with_count(
result, count, indices_00, values * w_00.unsqueeze(1), weights * w_00.unsqueeze(1))
result, count = scatter_add_nd_with_count(
result, count, indices_01, values * w_01.unsqueeze(1), weights * w_01.unsqueeze(1))
result, count = scatter_add_nd_with_count(
result, count, indices_10, values * w_10.unsqueeze(1), weights * w_10.unsqueeze(1))
result, count = scatter_add_nd_with_count(
result, count, indices_11, values * w_11.unsqueeze(1), weights * w_11.unsqueeze(1))
if return_count:
return result, count
mask = (count.squeeze(-1) > 0)
result[mask] = result[mask] / count[mask].repeat(1, C)
return result
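

# Illustrative only (not part of the original file): linear_grid_put_2d
# bilinearly splats per-point values into an H x W grid and normalises by the
# accumulated weights. The coordinates and colours below are made-up data.
def _example_bilinear_splat():
    """Toy demonstration of linear_grid_put_2d; never called by the library."""
    H, W = 4, 4
    coords = torch.tensor([[0.0, 0.0], [0.5, 0.5]])             # normalised (row, col) in [0, 1]
    values = torch.tensor([[0.0, 1.0, 0.0], [1.0, 0.0, 0.0]])   # one RGB value per point
    grid = linear_grid_put_2d(H, W, coords, values)             # [H, W, 3]
    return grid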
class MeshRender():
def __init__(
self,
camera_distance=1.45, camera_type='orth',
default_resolution=1024, texture_size=1024,
use_antialias=True, max_mip_level=None, filter_mode='linear',
bake_mode='linear', raster_mode='cr', device='cuda'):
self.device = device
self.set_default_render_resolution(default_resolution)
self.set_default_texture_resolution(texture_size)
self.camera_distance = camera_distance
self.use_antialias = use_antialias
self.max_mip_level = max_mip_level
self.filter_mode = filter_mode
self.bake_angle_thres = 75
self.bake_unreliable_kernel_size = int(
(2 / 512) * max(self.default_resolution[0], self.default_resolution[1]))
self.bake_mode = bake_mode
self.raster_mode = raster_mode
if self.raster_mode == 'cr':
import custom_rasterizer as cr
self.raster = cr
else:
            raise ValueError(f'No raster named {self.raster_mode}')
if camera_type == 'orth':
self.ortho_scale = 1.2
self.camera_proj_mat = get_orthographic_projection_matrix(
left=-self.ortho_scale * 0.5, right=self.ortho_scale * 0.5,
bottom=-self.ortho_scale * 0.5, top=self.ortho_scale * 0.5,
near=0.1, far=100
)
elif camera_type == 'perspective':
self.camera_proj_mat = get_perspective_projection_matrix(
49.13, self.default_resolution[1] / self.default_resolution[0],
0.01, 100.0
)
else:
            raise ValueError(f'No camera type {camera_type}')
def raster_rasterize(self, pos, tri, resolution, ranges=None, grad_db=True):
if self.raster_mode == 'cr':
rast_out_db = None
if pos.dim() == 2:
pos = pos.unsqueeze(0)
findices, barycentric = self.raster.rasterize(pos, tri, resolution)
rast_out = torch.cat((barycentric, findices.unsqueeze(-1)), dim=-1)
rast_out = rast_out.unsqueeze(0)
else:
            raise ValueError(f'No raster named {self.raster_mode}')
return rast_out, rast_out_db
def raster_interpolate(self, uv, rast_out, uv_idx, rast_db=None, diff_attrs=None):
if self.raster_mode == 'cr':
textd = None
barycentric = rast_out[0, ..., :-1]
findices = rast_out[0, ..., -1]
if uv.dim() == 2:
uv = uv.unsqueeze(0)
textc = self.raster.interpolate(uv, findices, barycentric, uv_idx)
else:
            raise ValueError(f'No raster named {self.raster_mode}')
return textc, textd
def raster_texture(self, tex, uv, uv_da=None, mip_level_bias=None, mip=None, filter_mode='auto',
boundary_mode='wrap', max_mip_level=None):
        if self.raster_mode == 'cr':
            raise NotImplementedError('Texture sampling is not implemented for the cr rasterizer')
        else:
            raise ValueError(f'No raster named {self.raster_mode}')
def raster_antialias(self, color, rast, pos, tri, topology_hash=None, pos_gradient_boost=1.0):
        if self.raster_mode == 'cr':
            # Antialiasing is not supported by the cr rasterizer; return the input unchanged.
            pass
        else:
            raise ValueError(f'No raster named {self.raster_mode}')
        return color
def load_mesh(
self,
mesh,
scale_factor=1.15,
auto_center=True,
):
vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data = load_mesh(mesh)
self.mesh_copy = mesh
self.set_mesh(vtx_pos, pos_idx,
vtx_uv=vtx_uv, uv_idx=uv_idx,
scale_factor=scale_factor, auto_center=auto_center
)
if texture_data is not None:
self.set_texture(texture_data)
def save_mesh(self):
texture_data = self.get_texture()
texture_data = Image.fromarray((texture_data * 255).astype(np.uint8))
return save_mesh(self.mesh_copy, texture_data)
def set_mesh(
self,
vtx_pos, pos_idx,
vtx_uv=None, uv_idx=None,
scale_factor=1.15, auto_center=True
):
self.vtx_pos = torch.from_numpy(vtx_pos).to(self.device).float()
self.pos_idx = torch.from_numpy(pos_idx).to(self.device).to(torch.int)
if (vtx_uv is not None) and (uv_idx is not None):
self.vtx_uv = torch.from_numpy(vtx_uv).to(self.device).float()
self.uv_idx = torch.from_numpy(uv_idx).to(self.device).to(torch.int)
else:
self.vtx_uv = None
self.uv_idx = None
self.vtx_pos[:, [0, 1]] = -self.vtx_pos[:, [0, 1]]
self.vtx_pos[:, [1, 2]] = self.vtx_pos[:, [2, 1]]
if (vtx_uv is not None) and (uv_idx is not None):
self.vtx_uv[:, 1] = 1.0 - self.vtx_uv[:, 1]
if auto_center:
max_bb = (self.vtx_pos - 0).max(0)[0]
min_bb = (self.vtx_pos - 0).min(0)[0]
center = (max_bb + min_bb) / 2
scale = torch.norm(self.vtx_pos - center, dim=1).max() * 2.0
self.vtx_pos = (self.vtx_pos - center) * \
(scale_factor / float(scale))
self.scale_factor = scale_factor
def set_texture(self, tex):
if isinstance(tex, np.ndarray):
tex = Image.fromarray((tex * 255).astype(np.uint8))
elif isinstance(tex, torch.Tensor):
tex = tex.cpu().numpy()
tex = Image.fromarray((tex * 255).astype(np.uint8))
tex = tex.resize(self.texture_size).convert('RGB')
tex = np.array(tex) / 255.0
self.tex = torch.from_numpy(tex).to(self.device)
self.tex = self.tex.float()
def set_default_render_resolution(self, default_resolution):
if isinstance(default_resolution, int):
default_resolution = (default_resolution, default_resolution)
self.default_resolution = default_resolution
def set_default_texture_resolution(self, texture_size):
if isinstance(texture_size, int):
texture_size = (texture_size, texture_size)
self.texture_size = texture_size
def get_mesh(self):
vtx_pos = self.vtx_pos.cpu().numpy()
pos_idx = self.pos_idx.cpu().numpy()
vtx_uv = self.vtx_uv.cpu().numpy()
uv_idx = self.uv_idx.cpu().numpy()
        # Invert the coordinate transform applied in set_mesh
vtx_pos[:, [1, 2]] = vtx_pos[:, [2, 1]]
vtx_pos[:, [0, 1]] = -vtx_pos[:, [0, 1]]
vtx_uv[:, 1] = 1.0 - vtx_uv[:, 1]
return vtx_pos, pos_idx, vtx_uv, uv_idx
def get_texture(self):
return self.tex.cpu().numpy()
def to(self, device):
self.device = device
for attr_name in dir(self):
attr_value = getattr(self, attr_name)
if isinstance(attr_value, torch.Tensor):
setattr(self, attr_name, attr_value.to(self.device))
    def color_rgb_to_srgb(self, image):
        if isinstance(image, Image.Image):
            image_rgb = torch.tensor(np.array(image) / 255.0).float().to(self.device)
        elif isinstance(image, np.ndarray):
            image_rgb = torch.tensor(image).float()
        else:
            image_rgb = image.to(self.device)
        image_srgb = torch.where(
            image_rgb <= 0.0031308,
            12.92 * image_rgb,
            1.055 * torch.pow(image_rgb, 1 / 2.4) - 0.055
        )
        if isinstance(image, Image.Image):
            image_srgb = Image.fromarray((image_srgb.cpu().numpy() * 255).astype(np.uint8))
        elif isinstance(image, np.ndarray):
            image_srgb = image_srgb.cpu().numpy()
        else:
            image_srgb = image_srgb.to(image.device)
        return image_srgb
def _render(
self,
mvp,
pos,
pos_idx,
uv,
uv_idx,
tex,
resolution,
max_mip_level,
keep_alpha,
filter_mode
):
pos_clip = transform_pos(mvp, pos)
if isinstance(resolution, (int, float)):
resolution = [resolution, resolution]
        rast_out, rast_out_db = self.raster_rasterize(
            pos_clip, pos_idx, resolution=resolution)
tex = tex.contiguous()
if filter_mode == 'linear-mipmap-linear':
texc, texd = self.raster_interpolate(
uv[None, ...], rast_out, uv_idx, rast_db=rast_out_db, diff_attrs='all')
color = self.raster_texture(
tex[None, ...], texc, texd, filter_mode='linear-mipmap-linear', max_mip_level=max_mip_level)
else:
texc, _ = self.raster_interpolate(uv[None, ...], rast_out, uv_idx)
color = self.raster_texture(tex[None, ...], texc, filter_mode=filter_mode)
visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
color = color * visible_mask # Mask out background.
if self.use_antialias:
color = self.raster_antialias(color, rast_out, pos_clip, pos_idx)
if keep_alpha:
color = torch.cat([color, visible_mask], dim=-1)
return color[0, ...]
def render(
self,
elev,
azim,
camera_distance=None,
center=None,
resolution=None,
tex=None,
keep_alpha=True,
bgcolor=None,
filter_mode=None,
return_type='th'
):
proj = self.camera_proj_mat
r_mv = get_mv_matrix(
elev=elev,
azim=azim,
camera_distance=self.camera_distance if camera_distance is None else camera_distance,
center=center)
r_mvp = np.matmul(proj, r_mv).astype(np.float32)
if tex is not None:
if isinstance(tex, Image.Image):
tex = torch.tensor(np.array(tex) / 255.0)
elif isinstance(tex, np.ndarray):
tex = torch.tensor(tex)
if tex.dim() == 2:
tex = tex.unsqueeze(-1)
tex = tex.float().to(self.device)
image = self._render(r_mvp, self.vtx_pos, self.pos_idx, self.vtx_uv, self.uv_idx,
self.tex if tex is None else tex,
self.default_resolution if resolution is None else resolution,
self.max_mip_level, True, filter_mode if filter_mode else self.filter_mode)
mask = (image[..., [-1]] == 1).float()
if bgcolor is None:
bgcolor = [0 for _ in range(image.shape[-1] - 1)]
image = image * mask + (1 - mask) * \
torch.tensor(bgcolor + [0]).to(self.device)
if keep_alpha == False:
image = image[..., :-1]
if return_type == 'np':
image = image.cpu().numpy()
elif return_type == 'pl':
image = image.squeeze(-1).cpu().numpy() * 255
image = Image.fromarray(image.astype(np.uint8))
return image
def render_normal(
self,
elev,
azim,
camera_distance=None,
center=None,
resolution=None,
bg_color=[1, 1, 1],
use_abs_coor=False,
normalize_rgb=True,
return_type='th'
):
pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center)
if resolution is None:
resolution = self.default_resolution
if isinstance(resolution, (int, float)):
resolution = [resolution, resolution]
rast_out, rast_out_db = self.raster_rasterize(
pos_clip, self.pos_idx, resolution=resolution)
if use_abs_coor:
mesh_triangles = self.vtx_pos[self.pos_idx[:, :3], :]
else:
pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4]
mesh_triangles = pos_camera[self.pos_idx[:, :3], :]
        face_normals = F.normalize(
            torch.cross(mesh_triangles[:, 1, :] - mesh_triangles[:, 0, :],
                        mesh_triangles[:, 2, :] - mesh_triangles[:, 0, :],
                        dim=-1),
            dim=-1)
vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0],
faces=self.pos_idx.cpu(),
face_normals=face_normals.cpu(), )
vertex_normals = torch.from_numpy(
vertex_normals).float().to(self.device).contiguous()
# Interpolate normal values across the rasterized pixels
normal, _ = self.raster_interpolate(
vertex_normals[None, ...], rast_out, self.pos_idx)
visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
normal = normal * visible_mask + \
torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 -
visible_mask) # Mask out background.
if normalize_rgb:
normal = (normal + 1) * 0.5
if self.use_antialias:
normal = self.raster_antialias(normal, rast_out, pos_clip, self.pos_idx)
image = normal[0, ...]
if return_type == 'np':
image = image.cpu().numpy()
elif return_type == 'pl':
image = image.cpu().numpy() * 255
image = Image.fromarray(image.astype(np.uint8))
return image
def convert_normal_map(self, image):
# blue is front, red is left, green is top
if isinstance(image, Image.Image):
image = np.array(image)
mask = (image == [255, 255, 255]).all(axis=-1)
image = (image / 255.0) * 2.0 - 1.0
image[..., [1]] = -image[..., [1]]
image[..., [1, 2]] = image[..., [2, 1]]
image[..., [0]] = -image[..., [0]]
image = (image + 1.0) * 0.5
image = (image * 255).astype(np.uint8)
image[mask] = [127, 127, 255]
return Image.fromarray(image)
def get_pos_from_mvp(self, elev, azim, camera_distance, center):
proj = self.camera_proj_mat
r_mv = get_mv_matrix(
elev=elev,
azim=azim,
camera_distance=self.camera_distance if camera_distance is None else camera_distance,
center=center)
pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True)
pos_clip = transform_pos(proj, pos_camera)
return pos_camera, pos_clip
def render_depth(
self,
elev,
azim,
camera_distance=None,
center=None,
resolution=None,
return_type='th'
):
pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center)
if resolution is None:
resolution = self.default_resolution
if isinstance(resolution, (int, float)):
resolution = [resolution, resolution]
rast_out, rast_out_db = self.raster_rasterize(
pos_clip, self.pos_idx, resolution=resolution)
pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4]
tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous()
# Interpolate depth values across the rasterized pixels
depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx)
visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
        depth_max, depth_min = depth[visible_mask > 0].max(), depth[visible_mask > 0].min()
depth = (depth - depth_min) / (depth_max - depth_min)
depth = depth * visible_mask # Mask out background.
if self.use_antialias:
depth = self.raster_antialias(depth, rast_out, pos_clip, self.pos_idx)
image = depth[0, ...]
if return_type == 'np':
image = image.cpu().numpy()
elif return_type == 'pl':
image = image.squeeze(-1).cpu().numpy() * 255
image = Image.fromarray(image.astype(np.uint8))
return image
def render_position(self, elev, azim, camera_distance=None, center=None,
resolution=None, bg_color=[1, 1, 1], return_type='th'):
pos_camera, pos_clip = self.get_pos_from_mvp(elev, azim, camera_distance, center)
if resolution is None:
resolution = self.default_resolution
if isinstance(resolution, (int, float)):
resolution = [resolution, resolution]
rast_out, rast_out_db = self.raster_rasterize(
pos_clip, self.pos_idx, resolution=resolution)
tex_position = 0.5 - self.vtx_pos[:, :3] / self.scale_factor
tex_position = tex_position.contiguous()
# Interpolate depth values across the rasterized pixels
position, _ = self.raster_interpolate(
tex_position[None, ...], rast_out, self.pos_idx)
visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)
position = position * visible_mask + \
torch.tensor(bg_color, dtype=torch.float32, device=self.device) * (1 -
visible_mask) # Mask out background.
if self.use_antialias:
position = self.raster_antialias(position, rast_out, pos_clip, self.pos_idx)
image = position[0, ...]
if return_type == 'np':
image = image.cpu().numpy()
elif return_type == 'pl':
image = image.squeeze(-1).cpu().numpy() * 255
image = Image.fromarray(image.astype(np.uint8))
return image
def render_uvpos(self, return_type='th'):
image = self.uv_feature_map(self.vtx_pos * 0.5 + 0.5)
if return_type == 'np':
image = image.cpu().numpy()
elif return_type == 'pl':
image = image.cpu().numpy() * 255
image = Image.fromarray(image.astype(np.uint8))
return image
def uv_feature_map(self, vert_feat, bg=None):
vtx_uv = self.vtx_uv * 2 - 1.0
vtx_uv = torch.cat(
[vtx_uv, torch.zeros_like(self.vtx_uv)], dim=1).unsqueeze(0)
vtx_uv[..., -1] = 1
uv_idx = self.uv_idx
rast_out, rast_out_db = self.raster_rasterize(
vtx_uv, uv_idx, resolution=self.texture_size)
feat_map, _ = self.raster_interpolate(vert_feat[None, ...], rast_out, uv_idx)
feat_map = feat_map[0, ...]
if bg is not None:
visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...]
feat_map[visible_mask == 0] = bg
return feat_map
def render_sketch_from_geometry(self, normal_image, depth_image):
normal_image_np = normal_image.cpu().numpy()
depth_image_np = depth_image.cpu().numpy()
normal_image_np = (normal_image_np * 255).astype(np.uint8)
depth_image_np = (depth_image_np * 255).astype(np.uint8)
normal_image_np = cv2.cvtColor(normal_image_np, cv2.COLOR_RGB2GRAY)
normal_edges = cv2.Canny(normal_image_np, 80, 150)
depth_edges = cv2.Canny(depth_image_np, 30, 80)
combined_edges = np.maximum(normal_edges, depth_edges)
sketch_image = torch.from_numpy(combined_edges).to(
normal_image.device).float() / 255.0
sketch_image = sketch_image.unsqueeze(-1)
return sketch_image
def render_sketch_from_depth(self, depth_image):
depth_image_np = depth_image.cpu().numpy()
depth_image_np = (depth_image_np * 255).astype(np.uint8)
depth_edges = cv2.Canny(depth_image_np, 30, 80)
combined_edges = depth_edges
sketch_image = torch.from_numpy(combined_edges).to(
depth_image.device).float() / 255.0
sketch_image = sketch_image.unsqueeze(-1)
return sketch_image
def back_project(self, image, elev, azim,
camera_distance=None, center=None, method=None):
if isinstance(image, Image.Image):
image = torch.tensor(np.array(image) / 255.0)
elif isinstance(image, np.ndarray):
image = torch.tensor(image)
if image.dim() == 2:
image = image.unsqueeze(-1)
image = image.float().to(self.device)
resolution = image.shape[:2]
channel = image.shape[-1]
texture = torch.zeros(self.texture_size + (channel,)).to(self.device)
cos_map = torch.zeros(self.texture_size + (1,)).to(self.device)
proj = self.camera_proj_mat
r_mv = get_mv_matrix(
elev=elev,
azim=azim,
camera_distance=self.camera_distance if camera_distance is None else camera_distance,
center=center)
pos_camera = transform_pos(r_mv, self.vtx_pos, keepdim=True)
pos_clip = transform_pos(proj, pos_camera)
pos_camera = pos_camera[:, :3] / pos_camera[:, 3:4]
v0 = pos_camera[self.pos_idx[:, 0], :]
v1 = pos_camera[self.pos_idx[:, 1], :]
v2 = pos_camera[self.pos_idx[:, 2], :]
face_normals = F.normalize(
torch.cross(
v1 - v0,
v2 - v0,
dim=-1),
dim=-1)
vertex_normals = trimesh.geometry.mean_vertex_normals(vertex_count=self.vtx_pos.shape[0],
faces=self.pos_idx.cpu(),
face_normals=face_normals.cpu(), )
vertex_normals = torch.from_numpy(
vertex_normals).float().to(self.device).contiguous()
tex_depth = pos_camera[:, 2].reshape(1, -1, 1).contiguous()
rast_out, rast_out_db = self.raster_rasterize(
pos_clip, self.pos_idx, resolution=resolution)
visible_mask = torch.clamp(rast_out[..., -1:], 0, 1)[0, ...]
normal, _ = self.raster_interpolate(
vertex_normals[None, ...], rast_out, self.pos_idx)
normal = normal[0, ...]
uv, _ = self.raster_interpolate(self.vtx_uv[None, ...], rast_out, self.uv_idx)
depth, _ = self.raster_interpolate(tex_depth, rast_out, self.pos_idx)
depth = depth[0, ...]
        depth_max, depth_min = depth[visible_mask > 0].max(), depth[visible_mask > 0].min()
depth_normalized = (depth - depth_min) / (depth_max - depth_min)
depth_image = depth_normalized * visible_mask # Mask out background.
sketch_image = self.render_sketch_from_depth(depth_image)
lookat = torch.tensor([[0, 0, -1]], device=self.device)
cos_image = torch.nn.functional.cosine_similarity(
lookat, normal.view(-1, 3))
cos_image = cos_image.view(normal.shape[0], normal.shape[1], 1)
cos_thres = np.cos(self.bake_angle_thres / 180 * np.pi)
cos_image[cos_image < cos_thres] = 0
# shrink
kernel_size = self.bake_unreliable_kernel_size * 2 + 1
kernel = torch.ones(
(1, 1, kernel_size, kernel_size), dtype=torch.float32).to(
sketch_image.device)
visible_mask = visible_mask.permute(2, 0, 1).unsqueeze(0).float()
visible_mask = F.conv2d(
1.0 - visible_mask,
kernel,
padding=kernel_size // 2)
        visible_mask = 1.0 - (visible_mask > 0).float()  # binarize
visible_mask = visible_mask.squeeze(0).permute(1, 2, 0)
sketch_image = sketch_image.permute(2, 0, 1).unsqueeze(0)
sketch_image = F.conv2d(sketch_image, kernel, padding=kernel_size // 2)
        sketch_image = (sketch_image > 0).float()  # binarize
sketch_image = sketch_image.squeeze(0).permute(1, 2, 0)
visible_mask = visible_mask * (sketch_image < 0.5)
cos_image[visible_mask == 0] = 0
method = self.bake_mode if method is None else method
if method == 'linear':
proj_mask = (visible_mask != 0).view(-1)
uv = uv.squeeze(0).contiguous().view(-1, 2)[proj_mask]
image = image.squeeze(0).contiguous().view(-1, channel)[proj_mask]
cos_image = cos_image.contiguous().view(-1, 1)[proj_mask]
sketch_image = sketch_image.contiguous().view(-1, 1)[proj_mask]
texture = linear_grid_put_2d(
self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], image)
cos_map = linear_grid_put_2d(
self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], cos_image)
boundary_map = linear_grid_put_2d(
self.texture_size[1], self.texture_size[0], uv[..., [1, 0]], sketch_image)
else:
            raise ValueError(f'No bake mode {method}')
return texture, cos_map, boundary_map
def bake_texture(self, colors, elevs, azims,
camera_distance=None, center=None, exp=6, weights=None):
        for i in range(len(colors)):
            if isinstance(colors[i], Image.Image):
                colors[i] = torch.tensor(np.array(colors[i]) / 255.0, device=self.device).float()
        if weights is None:
            weights = [1.0 for _ in range(len(colors))]
textures = []
cos_maps = []
for color, elev, azim, weight in zip(colors, elevs, azims, weights):
texture, cos_map, _ = self.back_project(
color, elev, azim, camera_distance, center)
cos_map = weight * (cos_map ** exp)
textures.append(texture)
cos_maps.append(cos_map)
texture_merge, trust_map_merge = self.fast_bake_texture(
textures, cos_maps)
return texture_merge, trust_map_merge
@torch.no_grad()
def fast_bake_texture(self, textures, cos_maps):
channel = textures[0].shape[-1]
texture_merge = torch.zeros(
self.texture_size + (channel,)).to(self.device)
trust_map_merge = torch.zeros(self.texture_size + (1,)).to(self.device)
for texture, cos_map in zip(textures, cos_maps):
view_sum = (cos_map > 0).sum()
painted_sum = ((cos_map > 0) * (trust_map_merge > 0)).sum()
if painted_sum / view_sum > 0.99:
continue
texture_merge += texture * cos_map
trust_map_merge += cos_map
texture_merge = texture_merge / torch.clamp(trust_map_merge, min=1E-8)
return texture_merge, trust_map_merge > 1E-8
def uv_inpaint(self, texture, mask):
if isinstance(texture, torch.Tensor):
texture_np = texture.cpu().numpy()
elif isinstance(texture, np.ndarray):
texture_np = texture
elif isinstance(texture, Image.Image):
texture_np = np.array(texture) / 255.0
vtx_pos, pos_idx, vtx_uv, uv_idx = self.get_mesh()
texture_np, mask = meshVerticeInpaint(
texture_np, mask, vtx_pos, vtx_uv, pos_idx, uv_idx)
        texture_np = cv2.inpaint(
            (texture_np * 255).astype(np.uint8), 255 - mask, 3, cv2.INPAINT_NS)
return texture_np
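

# ---------------------------------------------------------------------------
# Illustrative only (not part of the original file): a minimal sketch of how
# MeshRender is typically driven. It assumes a CUDA device, the
# custom_rasterizer extension, and a UV-mapped mesh file named 'example.glb';
# these names are placeholders, not part of the library.
# ---------------------------------------------------------------------------
def _example_render_and_bake():
    """Toy walk-through of the render / back-project / bake cycle; never called by the library."""
    renderer = MeshRender(camera_type='orth', default_resolution=512, texture_size=512)
    renderer.load_mesh(trimesh.load('example.glb', force='mesh'))

    # Geometry buffers for a single view.
    normal = renderer.render_normal(elev=0, azim=0, return_type='pl')   # PIL normal map
    depth = renderer.render_depth(elev=0, azim=0, return_type='pl')     # PIL depth map

    # Back-project one view (here: the normal map stands in for a generated
    # RGB view) into UV space and merge the per-view textures.
    texture, trust = renderer.bake_texture([normal], elevs=[0], azims=[0])
    return texture, trust, depth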
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import trimesh
def load_mesh(mesh):
vtx_pos = mesh.vertices if hasattr(mesh, 'vertices') else None
pos_idx = mesh.faces if hasattr(mesh, 'faces') else None
vtx_uv = mesh.visual.uv if hasattr(mesh.visual, 'uv') else None
uv_idx = mesh.faces if hasattr(mesh, 'faces') else None
texture_data = None
return vtx_pos, pos_idx, vtx_uv, uv_idx, texture_data
def save_mesh(mesh, texture_data):
material = trimesh.visual.texture.SimpleMaterial(image=texture_data, diffuse=(255, 255, 255))
texture_visuals = trimesh.visual.TextureVisuals(uv=mesh.visual.uv, image=texture_data, material=material)
mesh.visual = texture_visuals
return mesh
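

# ---------------------------------------------------------------------------
# Illustrative usage sketch only (not part of the original file): round-trip a
# textured mesh through the helpers above. The file names and texture size are
# placeholders, and the input mesh is assumed to carry UV coordinates.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    from PIL import Image

    mesh = trimesh.load('example_textured.glb', force='mesh')
    vtx_pos, pos_idx, vtx_uv, uv_idx, _ = load_mesh(mesh)
    print(vtx_pos.shape, pos_idx.shape)

    textured = save_mesh(mesh, Image.new('RGB', (1024, 1024), (128, 128, 128)))
    textured.export('example_out.glb')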
from setuptools import setup, Extension
import pybind11
import sys
import platform
def get_platform_specific_args():
system = platform.system().lower()
cpp_std = 'c++14' # Make configurable if needed
if sys.platform == 'win32':
compile_args = ['/O2', f'/std:{cpp_std}', '/EHsc', '/MP', '/DWIN32_LEAN_AND_MEAN', '/bigobj']
link_args = []
extra_includes = []
elif system == 'linux':
compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra', '-pthread']
link_args = ['-fPIC', '-pthread']
extra_includes = []
elif sys.platform == 'darwin':
compile_args = ['-O3', f'-std={cpp_std}', '-fPIC', '-Wall', '-Wextra',
'-stdlib=libc++', '-mmacosx-version-min=10.14']
link_args = ['-fPIC', '-stdlib=libc++', '-mmacosx-version-min=10.14', '-dynamiclib']
extra_includes = []
else:
raise RuntimeError(f"Unsupported platform: {system}")
return compile_args, link_args, extra_includes
extra_compile_args, extra_link_args, platform_includes = get_platform_specific_args()
include_dirs = [pybind11.get_include(), pybind11.get_include(user=True)]
include_dirs.extend(platform_includes)
ext_modules = [
Extension(
"mesh_processor",
["mesh_processor.cpp"],
include_dirs=include_dirs,
language='c++',
extra_compile_args=extra_compile_args,
extra_link_args=extra_link_args,
),
]
setup(
name="mesh_processor",
ext_modules=ext_modules,
install_requires=['pybind11>=2.6.0'],
python_requires='>=3.6',
)
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
from typing import Any, Callable, Dict, List, Optional, Union
import numpy
import numpy as np
import torch
import torch.distributed
import torch.utils.checkpoint
from PIL import Image
from diffusers import (
AutoencoderKL,
DiffusionPipeline,
ImagePipelineOutput
)
from diffusers.callbacks import MultiPipelineCallbacks, PipelineCallback
from diffusers.image_processor import PipelineImageInput
from diffusers.image_processor import VaeImageProcessor
from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion import StableDiffusionPipeline, retrieve_timesteps, \
rescale_noise_cfg
from diffusers.schedulers import KarrasDiffusionSchedulers
from diffusers.utils import deprecate
from einops import rearrange
from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
from .unet.modules import UNet2p5DConditionModel
def to_rgb_image(maybe_rgba: Image.Image):
if maybe_rgba.mode == 'RGB':
return maybe_rgba
elif maybe_rgba.mode == 'RGBA':
rgba = maybe_rgba
img = numpy.random.randint(127, 128, size=[rgba.size[1], rgba.size[0], 3], dtype=numpy.uint8)
img = Image.fromarray(img, 'RGB')
img.paste(rgba, mask=rgba.getchannel('A'))
return img
else:
raise ValueError("Unsupported image type.", maybe_rgba.mode)
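

# Illustrative only (not part of the original file): to_rgb_image flattens an
# RGBA input onto the near-50% gray background generated above; RGB inputs
# pass through unchanged. The image created here is made-up example data.
def _example_to_rgb_image():
    """Toy demonstration of to_rgb_image; never called by the pipeline."""
    rgba = Image.new('RGBA', (64, 64), (255, 0, 0, 128))   # half-transparent red square
    rgb = to_rgb_image(rgba)                               # 'RGB', alpha-composited over gray
    return rgb.mode, rgb.size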
class HunyuanPaintPipeline(StableDiffusionPipeline):
def __init__(
self,
vae: AutoencoderKL,
text_encoder: CLIPTextModel,
tokenizer: CLIPTokenizer,
unet: UNet2p5DConditionModel,
scheduler: KarrasDiffusionSchedulers,
feature_extractor: CLIPImageProcessor,
safety_checker=None,
use_torch_compile=False,
):
DiffusionPipeline.__init__(self)
safety_checker = None
self.register_modules(
vae=torch.compile(vae) if use_torch_compile else vae,
text_encoder=text_encoder,
tokenizer=tokenizer,
unet=unet,
scheduler=scheduler,
safety_checker=safety_checker,
feature_extractor=torch.compile(feature_extractor) if use_torch_compile else feature_extractor,
)
self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
@torch.no_grad()
def encode_images(self, images):
B = images.shape[0]
images = rearrange(images, 'b n c h w -> (b n) c h w')
dtype = next(self.vae.parameters()).dtype
images = (images - 0.5) * 2.0
posterior = self.vae.encode(images.to(dtype)).latent_dist
latents = posterior.sample() * self.vae.config.scaling_factor
latents = rearrange(latents, '(b n) c h w -> b n c h w', b=B)
return latents
@torch.no_grad()
def __call__(
self,
image: Image.Image = None,
prompt=None,
negative_prompt='watermark, ugly, deformed, noisy, blurry, low contrast',
*args,
num_images_per_prompt: Optional[int] = 1,
guidance_scale=2.0,
output_type: Optional[str] = "pil",
width=512,
height=512,
num_inference_steps=28,
return_dict=True,
**cached_condition,
):
if image is None:
raise ValueError("Inputting embeddings not supported for this pipeline. Please pass an image.")
assert not isinstance(image, torch.Tensor)
image = to_rgb_image(image)
image_vae = torch.tensor(np.array(image) / 255.0)
image_vae = image_vae.unsqueeze(0).permute(0, 3, 1, 2).unsqueeze(0)
image_vae = image_vae.to(device=self.vae.device, dtype=self.vae.dtype)
batch_size = image_vae.shape[0]
assert batch_size == 1
assert num_images_per_prompt == 1
ref_latents = self.encode_images(image_vae)
def convert_pil_list_to_tensor(images):
bg_c = [1., 1., 1.]
images_tensor = []
for batch_imgs in images:
view_imgs = []
for pil_img in batch_imgs:
img = numpy.asarray(pil_img, dtype=numpy.float32) / 255.
if img.shape[2] > 3:
alpha = img[:, :, 3:]
img = img[:, :, :3] * alpha + bg_c * (1 - alpha)
img = torch.from_numpy(img).permute(2, 0, 1).unsqueeze(0).contiguous().half().to("cuda")
view_imgs.append(img)
view_imgs = torch.cat(view_imgs, dim=0)
images_tensor.append(view_imgs.unsqueeze(0))
images_tensor = torch.cat(images_tensor, dim=0)
return images_tensor
if "normal_imgs" in cached_condition:
if isinstance(cached_condition["normal_imgs"], List):
cached_condition["normal_imgs"] = convert_pil_list_to_tensor(cached_condition["normal_imgs"])
cached_condition['normal_imgs'] = self.encode_images(cached_condition["normal_imgs"])
if "position_imgs" in cached_condition:
if isinstance(cached_condition["position_imgs"], List):
cached_condition["position_imgs"] = convert_pil_list_to_tensor(cached_condition["position_imgs"])
cached_condition["position_imgs"] = self.encode_images(cached_condition["position_imgs"])
if 'camera_info_gen' in cached_condition:
camera_info = cached_condition['camera_info_gen'] # B,N
if isinstance(camera_info, List):
camera_info = torch.tensor(camera_info)
camera_info = camera_info.to(image_vae.device).to(torch.int64)
cached_condition['camera_info_gen'] = camera_info
if 'camera_info_ref' in cached_condition:
camera_info = cached_condition['camera_info_ref'] # B,N
if isinstance(camera_info, List):
camera_info = torch.tensor(camera_info)
camera_info = camera_info.to(image_vae.device).to(torch.int64)
cached_condition['camera_info_ref'] = camera_info
cached_condition['ref_latents'] = ref_latents
if guidance_scale > 1:
negative_ref_latents = torch.zeros_like(cached_condition['ref_latents'])
cached_condition['ref_latents'] = torch.cat([negative_ref_latents, cached_condition['ref_latents']])
cached_condition['ref_scale'] = torch.as_tensor([0.0, 1.0]).to(cached_condition['ref_latents'])
if "normal_imgs" in cached_condition:
cached_condition['normal_imgs'] = torch.cat(
(cached_condition['normal_imgs'], cached_condition['normal_imgs']))
if "position_imgs" in cached_condition:
cached_condition['position_imgs'] = torch.cat(
(cached_condition['position_imgs'], cached_condition['position_imgs']))
if 'position_maps' in cached_condition:
cached_condition['position_maps'] = torch.cat(
(cached_condition['position_maps'], cached_condition['position_maps']))
if 'camera_info_gen' in cached_condition:
cached_condition['camera_info_gen'] = torch.cat(
(cached_condition['camera_info_gen'], cached_condition['camera_info_gen']))
if 'camera_info_ref' in cached_condition:
cached_condition['camera_info_ref'] = torch.cat(
(cached_condition['camera_info_ref'], cached_condition['camera_info_ref']))
prompt_embeds = self.unet.learned_text_clip_gen.repeat(num_images_per_prompt, 1, 1)
negative_prompt_embeds = torch.zeros_like(prompt_embeds)
latents: torch.Tensor = self.denoise(
None,
*args,
cross_attention_kwargs=None,
guidance_scale=guidance_scale,
num_images_per_prompt=num_images_per_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
num_inference_steps=num_inference_steps,
output_type='latent',
width=width,
height=height,
**cached_condition
).images
if not output_type == "latent":
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
else:
image = latents
image = self.image_processor.postprocess(image, output_type=output_type)
if not return_dict:
return (image,)
return ImagePipelineOutput(images=image)
def denoise(
self,
prompt: Union[str, List[str]] = None,
height: Optional[int] = None,
width: Optional[int] = None,
num_inference_steps: int = 50,
timesteps: List[int] = None,
sigmas: List[float] = None,
guidance_scale: float = 7.5,
negative_prompt: Optional[Union[str, List[str]]] = None,
num_images_per_prompt: Optional[int] = 1,
eta: float = 0.0,
generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
latents: Optional[torch.Tensor] = None,
prompt_embeds: Optional[torch.Tensor] = None,
negative_prompt_embeds: Optional[torch.Tensor] = None,
ip_adapter_image: Optional[PipelineImageInput] = None,
ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
output_type: Optional[str] = "pil",
return_dict: bool = True,
cross_attention_kwargs: Optional[Dict[str, Any]] = None,
guidance_rescale: float = 0.0,
clip_skip: Optional[int] = None,
callback_on_step_end: Optional[
Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
] = None,
callback_on_step_end_tensor_inputs: List[str] = ["latents"],
**kwargs,
):
r"""
The call function to the pipeline for generation.
Args:
prompt (`str` or `List[str]`, *optional*):
The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The height in pixels of the generated image.
width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
The width in pixels of the generated image.
num_inference_steps (`int`, *optional*, defaults to 50):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
timesteps (`List[int]`, *optional*):
Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
passed will be used. Must be in descending order.
sigmas (`List[float]`, *optional*):
Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
will be used.
guidance_scale (`float`, *optional*, defaults to 7.5):
A higher guidance scale value encourages the model to generate images closely linked to the text
`prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts to guide what to not include in image generation. If not defined, you need to
pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
generation deterministic.
latents (`torch.Tensor`, *optional*):
Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
tensor is generated by sampling using the supplied random `generator`.
prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
provided, text embeddings are generated from the `prompt` input argument.
negative_prompt_embeds (`torch.Tensor`, *optional*):
Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
provided, embeddings are computed from the `ip_adapter_image` input argument.
output_type (`str`, *optional*, defaults to `"pil"`):
The output format of the generated image. Choose between `PIL.Image` or `np.array`.
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
plain tuple.
cross_attention_kwargs (`dict`, *optional*):
A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
guidance_rescale (`float`, *optional*, defaults to 0.0):
Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
using zero terminal SNR.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
the output of the pre-final layer will be used for computing the prompt embeddings.
callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
                each denoising step during inference with the following arguments: `callback_on_step_end(self:
DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
callback_on_step_end_tensor_inputs (`List`, *optional*):
The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
`._callback_tensor_inputs` attribute of your pipeline class.
Examples:
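            A minimal sketch only (the pipeline instance `pipe` and the argument values are assumptions for
            illustration; in this repository the multiview wrapper, e.g. `Multiview_Diffusion_Net`, normally
            drives this call and supplies the extra conditioning tensors through `**kwargs`):
            >>> out = pipe(
            ...     prompt="",
            ...     num_inference_steps=30,
            ...     guidance_scale=3.0,
            ...     num_in_batch=6,  # number of views generated per object, read from **kwargs
            ...     # plus model-specific conditions forwarded to the 2.5D UNet,
            ...     # e.g. camera_info_gen, ref_latents, normal_imgs, position_imgs
            ... )
            >>> images = out.images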
Returns:
[`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
otherwise a `tuple` is returned where the first element is a list with the generated images and the
second element is a list of `bool`s indicating whether the corresponding generated image contains
"not-safe-for-work" (nsfw) content.
"""
callback = kwargs.pop("callback", None)
callback_steps = kwargs.pop("callback_steps", None)
if callback is not None:
deprecate(
"callback",
"1.0.0",
"Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
)
if callback_steps is not None:
deprecate(
"callback_steps",
"1.0.0",
"Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
)
if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
# 0. Default height and width to unet
height = height or self.unet.config.sample_size * self.vae_scale_factor
width = width or self.unet.config.sample_size * self.vae_scale_factor
# to deal with lora scaling and other possible forward hooks
# 1. Check inputs. Raise error if not correct
self.check_inputs(
prompt,
height,
width,
callback_steps,
negative_prompt,
prompt_embeds,
negative_prompt_embeds,
ip_adapter_image,
ip_adapter_image_embeds,
callback_on_step_end_tensor_inputs,
)
self._guidance_scale = guidance_scale
self._guidance_rescale = guidance_rescale
self._clip_skip = clip_skip
self._cross_attention_kwargs = cross_attention_kwargs
self._interrupt = False
# 2. Define call parameters
if prompt is not None and isinstance(prompt, str):
batch_size = 1
elif prompt is not None and isinstance(prompt, list):
batch_size = len(prompt)
else:
batch_size = prompt_embeds.shape[0]
device = self._execution_device
# 3. Encode input prompt
lora_scale = (
self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None
)
prompt_embeds, negative_prompt_embeds = self.encode_prompt(
prompt,
device,
num_images_per_prompt,
self.do_classifier_free_guidance,
negative_prompt,
prompt_embeds=prompt_embeds,
negative_prompt_embeds=negative_prompt_embeds,
lora_scale=lora_scale,
clip_skip=self.clip_skip,
)
# For classifier free guidance, we need to do two forward passes.
# Here we concatenate the unconditional and text embeddings into a single batch
# to avoid doing two forward passes
if self.do_classifier_free_guidance:
prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
image_embeds = self.prepare_ip_adapter_image_embeds(
ip_adapter_image,
ip_adapter_image_embeds,
device,
batch_size * num_images_per_prompt,
self.do_classifier_free_guidance,
)
# 4. Prepare timesteps
timesteps, num_inference_steps = retrieve_timesteps(
self.scheduler, num_inference_steps, device, timesteps, sigmas
)
assert num_images_per_prompt == 1
# 5. Prepare latent variables
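        # One latent per generated view: the effective batch is batch_size * num_in_batch (views per object).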
num_channels_latents = self.unet.config.in_channels
latents = self.prepare_latents(
batch_size * kwargs['num_in_batch'], # num_images_per_prompt,
num_channels_latents,
height,
width,
prompt_embeds.dtype,
device,
generator,
latents,
)
# 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
# 6.1 Add image embeds for IP-Adapter
added_cond_kwargs = (
{"image_embeds": image_embeds}
if (ip_adapter_image is not None or ip_adapter_image_embeds is not None)
else None
)
# 6.2 Optionally get Guidance Scale Embedding
timestep_cond = None
if self.unet.config.time_cond_proj_dim is not None:
guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt)
timestep_cond = self.get_guidance_scale_embedding(
guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
).to(device=device, dtype=latents.dtype)
# 7. Denoising loop
num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
self._num_timesteps = len(timesteps)
with self.progress_bar(total=num_inference_steps) as progress_bar:
for i, t in enumerate(timesteps):
if self.interrupt:
continue
# expand the latents if we are doing classifier free guidance
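                # Group the per-view latents into (batch, num_in_batch, ...) so the classifier-free-guidance
                # duplication keeps each object's views together; the 2.5D UNet consumes the (B, N, C, H, W) layout.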
latents = rearrange(latents, '(b n) c h w -> b n c h w', n=kwargs['num_in_batch'])
latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
latent_model_input = rearrange(latent_model_input, 'b n c h w -> (b n) c h w')
latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
                latent_model_input = rearrange(latent_model_input, '(b n) c h w -> b n c h w', n=kwargs['num_in_batch'])
# predict the noise residual
noise_pred = self.unet(
latent_model_input,
t,
encoder_hidden_states=prompt_embeds,
timestep_cond=timestep_cond,
cross_attention_kwargs=self.cross_attention_kwargs,
added_cond_kwargs=added_cond_kwargs,
return_dict=False, **kwargs
)[0]
latents = rearrange(latents, 'b n c h w -> (b n) c h w')
# perform guidance
if self.do_classifier_free_guidance:
noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
# compute the previous noisy sample x_t -> x_t-1
latents = \
self.scheduler.step(noise_pred, t, latents[:, :num_channels_latents, :, :], **extra_step_kwargs,
return_dict=False)[0]
if callback_on_step_end is not None:
callback_kwargs = {}
for k in callback_on_step_end_tensor_inputs:
callback_kwargs[k] = locals()[k]
callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
latents = callback_outputs.pop("latents", latents)
prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
# call the callback, if provided
if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
progress_bar.update()
if callback is not None and i % callback_steps == 0:
step_idx = i // getattr(self.scheduler, "order", 1)
callback(step_idx, t, latents)
if not output_type == "latent":
image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[
0
]
image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
else:
image = latents
has_nsfw_concept = None
if has_nsfw_concept is None:
do_denormalize = [True] * image.shape[0]
else:
do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
# Offload all models
self.maybe_free_model_hooks()
if not return_dict:
return (image, has_nsfw_concept)
return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import copy
import json
import os
from typing import Any, Dict, Optional
import torch
import torch.nn as nn
from diffusers.models import UNet2DConditionModel
from diffusers.models.attention_processor import Attention
from diffusers.models.transformers.transformer_2d import BasicTransformerBlock
from einops import rearrange
def _chunked_feed_forward(ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int):
# "feed_forward_chunk_size" can be used to save memory
if hidden_states.shape[chunk_dim] % chunk_size != 0:
raise ValueError(
f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
)
num_chunks = hidden_states.shape[chunk_dim] // chunk_size
ff_output = torch.cat(
[ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
dim=chunk_dim,
)
return ff_output
class Basic2p5DTransformerBlock(torch.nn.Module):
def __init__(self, transformer: BasicTransformerBlock, layer_name, use_ma=True, use_ra=True) -> None:
super().__init__()
self.transformer = transformer
self.layer_name = layer_name
self.use_ma = use_ma
self.use_ra = use_ra
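        # Attribute look-ups below (self.dim, self.num_attention_heads, ...) fall through to the wrapped
        # BasicTransformerBlock via the __getattr__ defined further down.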
# multiview attn
if self.use_ma:
self.attn_multiview = Attention(
query_dim=self.dim,
heads=self.num_attention_heads,
dim_head=self.attention_head_dim,
dropout=self.dropout,
bias=self.attention_bias,
cross_attention_dim=None,
upcast_attention=self.attn1.upcast_attention,
out_bias=True,
)
# ref attn
if self.use_ra:
self.attn_refview = Attention(
query_dim=self.dim,
heads=self.num_attention_heads,
dim_head=self.attention_head_dim,
dropout=self.dropout,
bias=self.attention_bias,
cross_attention_dim=None,
upcast_attention=self.attn1.upcast_attention,
out_bias=True,
)
def __getattr__(self, name: str):
try:
return super().__getattr__(name)
except AttributeError:
return getattr(self.transformer, name)
def forward(
self,
hidden_states: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
encoder_hidden_states: Optional[torch.Tensor] = None,
encoder_attention_mask: Optional[torch.Tensor] = None,
timestep: Optional[torch.LongTensor] = None,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
class_labels: Optional[torch.LongTensor] = None,
added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None,
) -> torch.Tensor:
# Notice that normalization is always applied before the real computation in the following blocks.
# 0. Self-Attention
batch_size = hidden_states.shape[0]
cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
num_in_batch = cross_attention_kwargs.pop('num_in_batch', 1)
mode = cross_attention_kwargs.pop('mode', None)
mva_scale = cross_attention_kwargs.pop('mva_scale', 1.0)
ref_scale = cross_attention_kwargs.pop('ref_scale', 1.0)
condition_embed_dict = cross_attention_kwargs.pop("condition_embed_dict", None)
if self.norm_type == "ada_norm":
norm_hidden_states = self.norm1(hidden_states, timestep)
elif self.norm_type == "ada_norm_zero":
norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(
hidden_states, timestep, class_labels, hidden_dtype=hidden_states.dtype
)
elif self.norm_type in ["layer_norm", "layer_norm_i2vgen"]:
norm_hidden_states = self.norm1(hidden_states)
elif self.norm_type == "ada_norm_continuous":
norm_hidden_states = self.norm1(hidden_states, added_cond_kwargs["pooled_text_emb"])
elif self.norm_type == "ada_norm_single":
shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = (
self.scale_shift_table[None] + timestep.reshape(batch_size, 6, -1)
).chunk(6, dim=1)
norm_hidden_states = self.norm1(hidden_states)
norm_hidden_states = norm_hidden_states * (1 + scale_msa) + shift_msa
else:
raise ValueError("Incorrect norm used")
if self.pos_embed is not None:
norm_hidden_states = self.pos_embed(norm_hidden_states)
# 1. Prepare GLIGEN inputs
cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
gligen_kwargs = cross_attention_kwargs.pop("gligen", None)
attn_output = self.attn1(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states if self.only_cross_attention else None,
attention_mask=attention_mask,
**cross_attention_kwargs,
)
if self.norm_type == "ada_norm_zero":
attn_output = gate_msa.unsqueeze(1) * attn_output
elif self.norm_type == "ada_norm_single":
attn_output = gate_msa * attn_output
hidden_states = attn_output + hidden_states
if hidden_states.ndim == 4:
hidden_states = hidden_states.squeeze(1)
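        # Reference / multiview attention: mode 'w' caches this block's normalized hidden states into
        # condition_embed_dict (reference branch), while mode 'r' attends to the cached reference features
        # (generation branch); num_in_batch is the number of views per object.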
# 1.2 Reference Attention
        if mode is not None and 'w' in mode:
condition_embed_dict[self.layer_name] = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c',
n=num_in_batch) # B, (N L), C
        if mode is not None and 'r' in mode and self.use_ra:
condition_embed = condition_embed_dict[self.layer_name].unsqueeze(1).repeat(1, num_in_batch, 1,
1) # B N L C
condition_embed = rearrange(condition_embed, 'b n l c -> (b n) l c')
attn_output = self.attn_refview(
norm_hidden_states,
encoder_hidden_states=condition_embed,
attention_mask=None,
**cross_attention_kwargs
)
ref_scale_timing = ref_scale
if isinstance(ref_scale, torch.Tensor):
ref_scale_timing = ref_scale.unsqueeze(1).repeat(1, num_in_batch).view(-1)
for _ in range(attn_output.ndim - 1):
ref_scale_timing = ref_scale_timing.unsqueeze(-1)
hidden_states = ref_scale_timing * attn_output + hidden_states
if hidden_states.ndim == 4:
hidden_states = hidden_states.squeeze(1)
# 1.3 Multiview Attention
if num_in_batch > 1 and self.use_ma:
            multiview_hidden_states = rearrange(norm_hidden_states, '(b n) l c -> b (n l) c', n=num_in_batch)
            attn_output = self.attn_multiview(
                multiview_hidden_states,
                encoder_hidden_states=multiview_hidden_states,
**cross_attention_kwargs
)
attn_output = rearrange(attn_output, 'b (n l) c -> (b n) l c', n=num_in_batch)
hidden_states = mva_scale * attn_output + hidden_states
if hidden_states.ndim == 4:
hidden_states = hidden_states.squeeze(1)
        # 1.4 GLIGEN Control
if gligen_kwargs is not None:
hidden_states = self.fuser(hidden_states, gligen_kwargs["objs"])
# 3. Cross-Attention
if self.attn2 is not None:
if self.norm_type == "ada_norm":
norm_hidden_states = self.norm2(hidden_states, timestep)
elif self.norm_type in ["ada_norm_zero", "layer_norm", "layer_norm_i2vgen"]:
norm_hidden_states = self.norm2(hidden_states)
elif self.norm_type == "ada_norm_single":
# For PixArt norm2 isn't applied here:
# https://github.com/PixArt-alpha/PixArt-alpha/blob/0f55e922376d8b797edd44d25d0e7464b260dcab/diffusion/model/nets/PixArtMS.py#L70C1-L76C103
norm_hidden_states = hidden_states
elif self.norm_type == "ada_norm_continuous":
norm_hidden_states = self.norm2(hidden_states, added_cond_kwargs["pooled_text_emb"])
else:
raise ValueError("Incorrect norm")
if self.pos_embed is not None and self.norm_type != "ada_norm_single":
norm_hidden_states = self.pos_embed(norm_hidden_states)
attn_output = self.attn2(
norm_hidden_states,
encoder_hidden_states=encoder_hidden_states,
attention_mask=encoder_attention_mask,
**cross_attention_kwargs,
)
hidden_states = attn_output + hidden_states
# 4. Feed-forward
# i2vgen doesn't have this norm 🤷‍♂️
if self.norm_type == "ada_norm_continuous":
norm_hidden_states = self.norm3(hidden_states, added_cond_kwargs["pooled_text_emb"])
elif not self.norm_type == "ada_norm_single":
norm_hidden_states = self.norm3(hidden_states)
if self.norm_type == "ada_norm_zero":
norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
if self.norm_type == "ada_norm_single":
norm_hidden_states = self.norm2(hidden_states)
norm_hidden_states = norm_hidden_states * (1 + scale_mlp) + shift_mlp
if self._chunk_size is not None:
# "feed_forward_chunk_size" can be used to save memory
ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
else:
ff_output = self.ff(norm_hidden_states)
if self.norm_type == "ada_norm_zero":
ff_output = gate_mlp.unsqueeze(1) * ff_output
elif self.norm_type == "ada_norm_single":
ff_output = gate_mlp * ff_output
hidden_states = ff_output + hidden_states
if hidden_states.ndim == 4:
hidden_states = hidden_states.squeeze(1)
return hidden_states
class UNet2p5DConditionModel(torch.nn.Module):
def __init__(self, unet: UNet2DConditionModel) -> None:
super().__init__()
self.unet = unet
self.use_ma = True
self.use_ra = True
self.use_camera_embedding = True
self.use_dual_stream = True
if self.use_dual_stream:
self.unet_dual = copy.deepcopy(unet)
self.init_attention(self.unet_dual)
self.init_attention(self.unet, use_ma=self.use_ma, use_ra=self.use_ra)
self.init_condition()
self.init_camera_embedding()
@staticmethod
def from_pretrained(pretrained_model_name_or_path, **kwargs):
torch_dtype = kwargs.pop('torch_dtype', torch.float32)
config_path = os.path.join(pretrained_model_name_or_path, 'config.json')
unet_ckpt_path = os.path.join(pretrained_model_name_or_path, 'diffusion_pytorch_model.bin')
with open(config_path, 'r', encoding='utf-8') as file:
config = json.load(file)
unet = UNet2DConditionModel(**config)
unet = UNet2p5DConditionModel(unet)
unet_ckpt = torch.load(unet_ckpt_path, map_location='cpu', weights_only=True)
unet.load_state_dict(unet_ckpt, strict=True)
unet = unet.to(torch_dtype)
return unet
def init_condition(self):
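        # Widen conv_in to 12 input channels: the noisy latents are concatenated channel-wise with the
        # normal-map and position-map condition latents in UNet2p5DConditionModel.forward().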
self.unet.conv_in = torch.nn.Conv2d(
12,
self.unet.conv_in.out_channels,
kernel_size=self.unet.conv_in.kernel_size,
stride=self.unet.conv_in.stride,
padding=self.unet.conv_in.padding,
dilation=self.unet.conv_in.dilation,
groups=self.unet.conv_in.groups,
bias=self.unet.conv_in.bias is not None)
self.unet.learned_text_clip_gen = nn.Parameter(torch.randn(1, 77, 1024))
self.unet.learned_text_clip_ref = nn.Parameter(torch.randn(1, 77, 1024))
def init_camera_embedding(self):
if self.use_camera_embedding:
time_embed_dim = 1280
self.max_num_ref_image = 5
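            # 12 azimuth bins for each of the three mid elevations plus 4 bins for the two polar (top/bottom)
            # views, matching the view-index mapping used by the paint pipeline; together with the reference
            # slots this sizes the camera embedding table.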
self.max_num_gen_image = 12 * 3 + 4 * 2
self.unet.class_embedding = nn.Embedding(self.max_num_ref_image + self.max_num_gen_image, time_embed_dim)
def init_attention(self, unet, use_ma=False, use_ra=False):
for down_block_i, down_block in enumerate(unet.down_blocks):
if hasattr(down_block, "has_cross_attention") and down_block.has_cross_attention:
for attn_i, attn in enumerate(down_block.attentions):
for transformer_i, transformer in enumerate(attn.transformer_blocks):
if isinstance(transformer, BasicTransformerBlock):
attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer,
f'down_{down_block_i}_{attn_i}_{transformer_i}',
use_ma, use_ra)
if hasattr(unet.mid_block, "has_cross_attention") and unet.mid_block.has_cross_attention:
for attn_i, attn in enumerate(unet.mid_block.attentions):
for transformer_i, transformer in enumerate(attn.transformer_blocks):
if isinstance(transformer, BasicTransformerBlock):
attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer,
f'mid_{attn_i}_{transformer_i}',
use_ma, use_ra)
for up_block_i, up_block in enumerate(unet.up_blocks):
if hasattr(up_block, "has_cross_attention") and up_block.has_cross_attention:
for attn_i, attn in enumerate(up_block.attentions):
for transformer_i, transformer in enumerate(attn.transformer_blocks):
if isinstance(transformer, BasicTransformerBlock):
attn.transformer_blocks[transformer_i] = Basic2p5DTransformerBlock(transformer,
f'up_{up_block_i}_{attn_i}_{transformer_i}',
use_ma, use_ra)
def __getattr__(self, name: str):
try:
return super().__getattr__(name)
except AttributeError:
return getattr(self.unet, name)
def forward(
self, sample, timestep, encoder_hidden_states,
*args, down_intrablock_additional_residuals=None,
down_block_res_samples=None, mid_block_res_sample=None,
**cached_condition,
):
B, N_gen, _, H, W = sample.shape
assert H == W
if self.use_camera_embedding:
camera_info_gen = cached_condition['camera_info_gen'] + self.max_num_ref_image
camera_info_gen = rearrange(camera_info_gen, 'b n -> (b n)')
else:
camera_info_gen = None
sample = [sample]
if 'normal_imgs' in cached_condition:
sample.append(cached_condition["normal_imgs"])
if 'position_imgs' in cached_condition:
sample.append(cached_condition["position_imgs"])
sample = torch.cat(sample, dim=2)
sample = rearrange(sample, 'b n c h w -> (b n) c h w')
encoder_hidden_states_gen = encoder_hidden_states.unsqueeze(1).repeat(1, N_gen, 1, 1)
encoder_hidden_states_gen = rearrange(encoder_hidden_states_gen, 'b n l c -> (b n) l c')
if self.use_ra:
if 'condition_embed_dict' in cached_condition:
condition_embed_dict = cached_condition['condition_embed_dict']
else:
condition_embed_dict = {}
ref_latents = cached_condition['ref_latents']
N_ref = ref_latents.shape[1]
if self.use_camera_embedding:
camera_info_ref = cached_condition['camera_info_ref']
camera_info_ref = rearrange(camera_info_ref, 'b n -> (b n)')
else:
camera_info_ref = None
ref_latents = rearrange(ref_latents, 'b n c h w -> (b n) c h w')
encoder_hidden_states_ref = self.unet.learned_text_clip_ref.unsqueeze(1).repeat(B, N_ref, 1, 1)
encoder_hidden_states_ref = rearrange(encoder_hidden_states_ref, 'b n l c -> (b n) l c')
noisy_ref_latents = ref_latents
timestep_ref = 0
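            # Dual-stream trick: run the reference branch once in 'w' (write) mode so every
            # Basic2p5DTransformerBlock caches its normalized hidden states in condition_embed_dict;
            # the generation pass below re-reads them in 'r' (read) mode for reference attention.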
if self.use_dual_stream:
unet_ref = self.unet_dual
else:
unet_ref = self.unet
unet_ref(
noisy_ref_latents, timestep_ref,
encoder_hidden_states=encoder_hidden_states_ref,
class_labels=camera_info_ref,
# **kwargs
return_dict=False,
cross_attention_kwargs={
'mode': 'w', 'num_in_batch': N_ref,
'condition_embed_dict': condition_embed_dict},
)
cached_condition['condition_embed_dict'] = condition_embed_dict
else:
condition_embed_dict = None
mva_scale = cached_condition.get('mva_scale', 1.0)
ref_scale = cached_condition.get('ref_scale', 1.0)
return self.unet(
sample, timestep,
encoder_hidden_states_gen, *args,
class_labels=camera_info_gen,
down_intrablock_additional_residuals=[
sample.to(dtype=self.unet.dtype) for sample in down_intrablock_additional_residuals
] if down_intrablock_additional_residuals is not None else None,
down_block_additional_residuals=[
sample.to(dtype=self.unet.dtype) for sample in down_block_res_samples
] if down_block_res_samples is not None else None,
mid_block_additional_residual=(
mid_block_res_sample.to(dtype=self.unet.dtype)
if mid_block_res_sample is not None else None
),
return_dict=False,
cross_attention_kwargs={
'mode': 'r', 'num_in_batch': N_gen,
'condition_embed_dict': condition_embed_dict,
'mva_scale': mva_scale,
'ref_scale': ref_scale,
},
)
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import logging
import numpy as np
import os
import torch
from PIL import Image
from .differentiable_renderer.mesh_render import MeshRender
from .utils.dehighlight_utils import Light_Shadow_Remover
from .utils.multiview_utils import Multiview_Diffusion_Net
from .utils.imagesuper_utils import Image_Super_Net
from .utils.uv_warp_utils import mesh_uv_wrap
logger = logging.getLogger(__name__)
class Hunyuan3DTexGenConfig:
def __init__(self, light_remover_ckpt_path, multiview_ckpt_path):
self.device = 'cuda'
self.light_remover_ckpt_path = light_remover_ckpt_path
self.multiview_ckpt_path = multiview_ckpt_path
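        # Six canonical views: front / right / back / left (elev 0) plus top and bottom (elev ±90),
        # with per-view blending weights used when baking the texture.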
self.candidate_camera_azims = [0, 90, 180, 270, 0, 180]
self.candidate_camera_elevs = [0, 0, 0, 0, 90, -90]
self.candidate_view_weights = [1, 0.1, 0.5, 0.1, 0.05, 0.05]
self.render_size = 2048
self.texture_size = 2048
self.bake_exp = 4
self.merge_method = 'fast'
class Hunyuan3DPaintPipeline:
@classmethod
def from_pretrained(cls, model_path):
original_model_path = model_path
if os.path.exists(model_path):
# if not os.path.exists(model_path):
# try local path
base_dir = os.environ.get('HY3DGEN_MODELS', '~/.cache/hy3dgen')
# model_path = os.path.expanduser(os.path.join(base_dir, model_path))
delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0')
multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0')
print(multiview_model_path)
if not os.path.exists(delight_model_path) or not os.path.exists(multiview_model_path):
try:
import huggingface_hub
# download from huggingface
model_path = huggingface_hub.snapshot_download(repo_id=original_model_path)
delight_model_path = os.path.join(model_path, 'hunyuan3d-delight-v2-0')
multiview_model_path = os.path.join(model_path, 'hunyuan3d-paint-v2-0')
return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path))
except ImportError:
logger.warning(
"You need to install HuggingFace Hub to load models from the hub."
)
raise RuntimeError(f"Model path {model_path} not found")
else:
return cls(Hunyuan3DTexGenConfig(delight_model_path, multiview_model_path))
raise FileNotFoundError(f"Model path {original_model_path} not found and we could not find it at huggingface")
def __init__(self, config):
self.config = config
self.models = {}
self.render = MeshRender(
default_resolution=self.config.render_size,
texture_size=self.config.texture_size)
self.load_models()
def load_models(self):
        # empty cuda cache
torch.cuda.empty_cache()
# Load model
self.models['delight_model'] = Light_Shadow_Remover(self.config)
self.models['multiview_model'] = Multiview_Diffusion_Net(self.config)
self.models['super_model'] = Image_Super_Net(self.config)
def render_normal_multiview(self, camera_elevs, camera_azims, use_abs_coor=True):
normal_maps = []
for elev, azim in zip(camera_elevs, camera_azims):
normal_map = self.render.render_normal(
elev, azim, use_abs_coor=use_abs_coor, return_type='pl')
normal_maps.append(normal_map)
return normal_maps
def render_position_multiview(self, camera_elevs, camera_azims):
position_maps = []
for elev, azim in zip(camera_elevs, camera_azims):
position_map = self.render.render_position(
elev, azim, return_type='pl')
position_maps.append(position_map)
return position_maps
def bake_from_multiview(self, views, camera_elevs,
camera_azims, view_weights, method='graphcut'):
project_textures, project_weighted_cos_maps = [], []
project_boundary_maps = []
for view, camera_elev, camera_azim, weight in zip(
views, camera_elevs, camera_azims, view_weights):
project_texture, project_cos_map, project_boundary_map = self.render.back_project(
view, camera_elev, camera_azim)
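            # Down-weight grazing-angle texels: each view contributes its back-projected colors scaled by
            # its view weight times the per-texel cosine term raised to bake_exp.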
project_cos_map = weight * (project_cos_map ** self.config.bake_exp)
project_textures.append(project_texture)
project_weighted_cos_maps.append(project_cos_map)
project_boundary_maps.append(project_boundary_map)
if method == 'fast':
texture, ori_trust_map = self.render.fast_bake_texture(
project_textures, project_weighted_cos_maps)
else:
            raise ValueError(f'no method {method}')
return texture, ori_trust_map > 1E-8
def texture_inpaint(self, texture, mask):
texture_np = self.render.uv_inpaint(texture, mask)
texture = torch.tensor(texture_np / 255).float().to(texture.device)
return texture
def recenter_image(self, image, border_ratio=0.2):
if image.mode == 'RGB':
return image
elif image.mode == 'L':
image = image.convert('RGB')
return image
alpha_channel = np.array(image)[:, :, 3]
non_zero_indices = np.argwhere(alpha_channel > 0)
if non_zero_indices.size == 0:
raise ValueError("Image is fully transparent")
min_row, min_col = non_zero_indices.min(axis=0)
max_row, max_col = non_zero_indices.max(axis=0)
cropped_image = image.crop((min_col, min_row, max_col + 1, max_row + 1))
width, height = cropped_image.size
border_width = int(width * border_ratio)
border_height = int(height * border_ratio)
new_width = width + 2 * border_width
new_height = height + 2 * border_height
square_size = max(new_width, new_height)
new_image = Image.new('RGBA', (square_size, square_size), (255, 255, 255, 0))
paste_x = (square_size - new_width) // 2 + border_width
paste_y = (square_size - new_height) // 2 + border_height
new_image.paste(cropped_image, (paste_x, paste_y))
return new_image
@torch.no_grad()
def __call__(self, mesh, image):
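        # Overall flow: remove lighting/shadows from the reference image, UV-unwrap the mesh,
        # render normal and position maps for the candidate views, run the multiview diffusion model,
        # upscale each view, bake the views into a UV texture, inpaint uncovered texels, and export the mesh.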
if isinstance(image, str):
image_prompt = Image.open(image)
else:
image_prompt = image
image_prompt = self.recenter_image(image_prompt)
image_prompt = self.models['delight_model'](image_prompt)
mesh = mesh_uv_wrap(mesh)
self.render.load_mesh(mesh)
selected_camera_elevs, selected_camera_azims, selected_view_weights = \
self.config.candidate_camera_elevs, self.config.candidate_camera_azims, self.config.candidate_view_weights
normal_maps = self.render_normal_multiview(
selected_camera_elevs, selected_camera_azims, use_abs_coor=True)
position_maps = self.render_position_multiview(
selected_camera_elevs, selected_camera_azims)
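        # Map each (azim, elev) pair to a discrete camera index for the camera embedding: azimuth is
        # quantised into 30-degree bins (rotated so azim=0 lands on bin 9); elevations -20/0/20 keep all
        # 12 bins, while the top/bottom views (elev ±90) collapse to 4 bins; the second dict adds the
        # per-elevation offset into the overall index range.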
camera_info = [(((azim // 30) + 9) % 12) // {-20: 1, 0: 1, 20: 1, -90: 3, 90: 3}[
elev] + {-20: 0, 0: 12, 20: 24, -90: 36, 90: 40}[elev] for azim, elev in
zip(selected_camera_azims, selected_camera_elevs)]
multiviews = self.models['multiview_model'](image_prompt, normal_maps + position_maps, camera_info)
for i in range(len(multiviews)):
multiviews[i] = self.models['super_model'](multiviews[i])
multiviews[i] = multiviews[i].resize(
(self.config.render_size, self.config.render_size))
texture, mask = self.bake_from_multiview(multiviews,
selected_camera_elevs, selected_camera_azims, selected_view_weights,
method=self.config.merge_method)
mask_np = (mask.squeeze(-1).cpu().numpy() * 255).astype(np.uint8)
texture = self.texture_inpaint(texture, mask_np)
self.render.set_texture(texture)
textured_mesh = self.render.save_mesh()
return textured_mesh
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the repsective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.