"templates/vscode:/vscode.git/clone" did not exist on "901507335f6ed59cad6bbbc2b5d8d9eba8a1b4e1"
Commit aad7b6c7 authored by chenzk's avatar chenzk
Browse files

v1.0

parents
Pipeline #2416 canceled with stages
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import torch
from diffusers import EulerAncestralDiscreteScheduler
from diffusers import StableDiffusionControlNetPipeline, StableDiffusionXLControlNetImg2ImgPipeline, \
    ControlNetModel, AutoencoderKL


class Img2img_Control_Ip_adapter:
    def __init__(self, device):
        controlnet = ControlNetModel.from_pretrained(
            'lllyasviel/control_v11f1p_sd15_depth',
            torch_dtype=torch.float16, variant="fp16", use_safetensors=True
        )
        pipe = StableDiffusionControlNetPipeline.from_pretrained(
            'runwayml/stable-diffusion-v1-5', controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
        )
        pipe.load_ip_adapter('h94/IP-Adapter', subfolder="models", weight_name="ip-adapter-plus_sd15.safetensors")
        pipe.set_ip_adapter_scale(0.7)
        pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
        # pipe.enable_model_cpu_offload()
        self.pipe = pipe.to(device)

    def __call__(
            self,
            prompt,
            control_image,
            ip_adapter_image,
            negative_prompt,
            height=512,
            width=512,
            num_inference_steps=20,
            guidance_scale=8.0,
            controlnet_conditioning_scale=1.0,
            output_type="pil",
            **kwargs,
    ):
        # Seeding is handled via the generator; the original code also passed
        # redundant `seed=42` and `strength=1` kwargs, which this text-to-image
        # ControlNet pipeline does not use.
        results = self.pipe(
            prompt=prompt,
            negative_prompt=negative_prompt,
            image=control_image,
            ip_adapter_image=ip_adapter_image,
            generator=torch.manual_seed(42),
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            controlnet_conditioning_scale=controlnet_conditioning_scale,
            # clip_skip=2,
            height=height,
            width=width,
            output_type=output_type,
            **kwargs,
        ).images[0]
        return results
################################################################
class HesModel:
    def __init__(self):
        controlnet_depth = ControlNetModel.from_pretrained(
            'diffusers/controlnet-depth-sdxl-1.0',
            torch_dtype=torch.float16,
            variant="fp16",
            use_safetensors=True
        )
        self.pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
            'stabilityai/stable-diffusion-xl-base-1.0',
            torch_dtype=torch.float16,
            variant="fp16",
            controlnet=controlnet_depth,
            use_safetensors=True,
        )
        # Swap in the numerically stable fp16 VAE.
        self.pipe.vae = AutoencoderKL.from_pretrained(
            'madebyollin/sdxl-vae-fp16-fix',
            torch_dtype=torch.float16
        )
        self.pipe.load_ip_adapter('h94/IP-Adapter', subfolder="sdxl_models", weight_name="ip-adapter_sdxl.safetensors")
        self.pipe.set_ip_adapter_scale(0.7)
        self.pipe.to("cuda")

    def __call__(self,
                 init_image,
                 control_image,
                 ip_adapter_image=None,
                 prompt='3D image',
                 negative_prompt='2D image',
                 seed=42,
                 strength=0.8,
                 num_inference_steps=40,
                 guidance_scale=7.5,
                 controlnet_conditioning_scale=0.5,
                 **kwargs
                 ):
        image = self.pipe(
            prompt=prompt,
            image=init_image,
            control_image=control_image,
            ip_adapter_image=ip_adapter_image,
            negative_prompt=negative_prompt,
            num_inference_steps=num_inference_steps,
            guidance_scale=guidance_scale,
            strength=strength,
            controlnet_conditioning_scale=controlnet_conditioning_scale,
            # diffusers pipelines take a torch.Generator, not a raw seed kwarg.
            generator=torch.manual_seed(seed),
            **kwargs
        ).images[0]
        return image
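

if __name__ == '__main__':
    # Usage sketch for the SD1.5 wrapper above, assuming hypothetical input
    # files 'depth.png' (a depth control map) and 'ref.png' (an IP-Adapter
    # reference image); adjust paths and prompts to your data.
    from PIL import Image

    depth_image = Image.open('depth.png').convert('RGB')
    ref_image = Image.open('ref.png').convert('RGB')

    model = Img2img_Control_Ip_adapter(device='cuda')
    result = model(
        prompt='a 3D render of a chair, white background',
        control_image=depth_image,
        ip_adapter_image=ref_image,
        negative_prompt='blurry, low quality',
    )
    result.save('controlled_render.png')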
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
class RunningStats:
    def __init__(self) -> None:
        self.count = 0
        self.sum = 0
        self.mean = 0
        self.min = None
        self.max = None

    def add_value(self, value):
        self.count += 1
        self.sum += value
        self.mean = self.sum / self.count
        if self.min is None or value < self.min:
            self.min = value
        if self.max is None or value > self.max:
            self.max = value

    def get_count(self):
        return self.count

    def get_sum(self):
        return self.sum

    def get_mean(self):
        return self.mean

    def get_min(self):
        return self.min

    def get_max(self):
        return self.max
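

if __name__ == '__main__':
    # Quick sanity check of RunningStats (illustrative only).
    stats = RunningStats()
    for v in [3.0, 1.0, 4.0, 1.0, 5.0]:
        stats.add_value(v)
    assert stats.get_count() == 5
    assert stats.get_min() == 1.0
    assert stats.get_max() == 5.0
    assert abs(stats.get_mean() - 2.8) < 1e-9  # (3 + 1 + 4 + 1 + 5) / 5 = 2.8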
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import cv2
import numpy as np
import torch
from PIL import Image
from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
class Light_Shadow_Remover:
    def __init__(self, config):
        self.device = config.device
        self.cfg_image = 1.5
        self.cfg_text = 1.0

        pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained(
            config.light_remover_ckpt_path,
            torch_dtype=torch.float16,
            safety_checker=None,
        )
        pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
        pipeline.set_progress_bar_config(disable=True)

        self.pipeline = pipeline.to(self.device, torch.float16)

    def recorrect_rgb(self, src_image, target_image, alpha_channel, scale=0.95):
        # Per-channel mean/std color transfer, restricted to pixels where the
        # alpha mask is on; falls back to the source image if the correction
        # does not bring it closer to the target.
        def flat_and_mask(bgr, a):
            mask = torch.where(a > 0.5, True, False)
            bgr_flat = bgr.reshape(-1, bgr.shape[-1])
            mask_flat = mask.reshape(-1)
            bgr_flat_masked = bgr_flat[mask_flat, :]
            return bgr_flat_masked

        src_flat = flat_and_mask(src_image, alpha_channel)
        target_flat = flat_and_mask(target_image, alpha_channel)
        corrected_bgr = torch.zeros_like(src_image)
        for i in range(3):
            src_mean, src_stddev = torch.mean(src_flat[:, i]), torch.std(src_flat[:, i])
            target_mean, target_stddev = torch.mean(target_flat[:, i]), torch.std(target_flat[:, i])
            corrected_bgr[:, :, i] = torch.clamp(
                (src_image[:, :, i] - scale * src_mean) * (target_stddev / src_stddev) + scale * target_mean, 0, 1)

        src_mse = torch.mean((src_image - target_image) ** 2)
        modify_mse = torch.mean((corrected_bgr - target_image) ** 2)
        if src_mse < modify_mse:
            corrected_bgr = torch.cat([src_image, alpha_channel], dim=-1)
        else:
            corrected_bgr = torch.cat([corrected_bgr, alpha_channel], dim=-1)

        return corrected_bgr

    @torch.no_grad()
    def __call__(self, image):
        image = image.resize((512, 512))

        if image.mode == 'RGBA':
            image_array = np.array(image)
            alpha_channel = image_array[:, :, 3]
            # Erode the alpha mask slightly so edge pixels do not leak color.
            erosion_size = 3
            kernel = np.ones((erosion_size, erosion_size), np.uint8)
            alpha_channel = cv2.erode(alpha_channel, kernel, iterations=1)
            image_array[alpha_channel == 0, :3] = 255
            image_array[:, :, 3] = alpha_channel
            image = Image.fromarray(image_array)

            image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device)
            alpha = image_tensor[:, :, 3:]
            rgb_target = image_tensor[:, :, :3]
        else:
            image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device)
            alpha = torch.ones_like(image_tensor)[:, :, :1]
            rgb_target = image_tensor[:, :, :3]

        image = image.convert('RGB')

        image = self.pipeline(
            prompt="",
            image=image,
            generator=torch.manual_seed(42),
            height=512,
            width=512,
            num_inference_steps=50,
            image_guidance_scale=self.cfg_image,
            guidance_scale=self.cfg_text,
        ).images[0]

        image_tensor = torch.tensor(np.array(image) / 255.0).to(self.device)
        rgb_src = image_tensor[:, :, :3]
        image = self.recorrect_rgb(rgb_src, rgb_target, alpha)
        # Composite onto a white background using the alpha channel.
        image = image[:, :, :3] * image[:, :, 3:] + torch.ones_like(image[:, :, :3]) * (1.0 - image[:, :, 3:])
        image = Image.fromarray((image.cpu().numpy() * 255).astype(np.uint8))

        return image
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import torch
from diffusers import StableDiffusionUpscalePipeline
class Image_Super_Net:
    def __init__(self, config):
        self.up_pipeline_x4 = StableDiffusionUpscalePipeline.from_pretrained(
            'stabilityai/stable-diffusion-x4-upscaler',
            torch_dtype=torch.float16,
        ).to(config.device)
        self.up_pipeline_x4.set_progress_bar_config(disable=True)

    def __call__(self, image, prompt=''):
        with torch.no_grad():
            upscaled_image = self.up_pipeline_x4(
                prompt=[prompt],
                image=image,
                num_inference_steps=5,
            ).images[0]
        return upscaled_image
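

if __name__ == '__main__':
    # Usage sketch; assumes a minimal stand-in config exposing only the
    # `device` attribute this class reads, and a hypothetical input file.
    from types import SimpleNamespace
    from PIL import Image

    super_net = Image_Super_Net(SimpleNamespace(device='cuda'))
    low_res = Image.open('texture_512.png').convert('RGB')
    high_res = super_net(low_res, prompt='high quality texture')
    high_res.save('texture_2048.png')  # x4 upscaler: 512 -> 2048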
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import os
import random
import numpy as np
import torch
from diffusers import DiffusionPipeline
from diffusers import EulerAncestralDiscreteScheduler
class Multiview_Diffusion_Net:
    def __init__(self, config) -> None:
        self.device = config.device
        self.view_size = 512
        multiview_ckpt_path = config.multiview_ckpt_path

        current_file_path = os.path.abspath(__file__)
        custom_pipeline_path = os.path.join(os.path.dirname(current_file_path), '..', 'hunyuanpaint')

        pipeline = DiffusionPipeline.from_pretrained(
            multiview_ckpt_path,
            custom_pipeline=custom_pipeline_path,
            torch_dtype=torch.float16)

        pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(
            pipeline.scheduler.config, timestep_spacing='trailing')
        pipeline.set_progress_bar_config(disable=True)
        self.pipeline = pipeline.to(self.device)

    def seed_everything(self, seed):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        os.environ["PL_GLOBAL_SEED"] = str(seed)

    def __call__(self, input_image, control_images, camera_info):
        self.seed_everything(0)

        input_image = input_image.resize((self.view_size, self.view_size))
        for i in range(len(control_images)):
            control_images[i] = control_images[i].resize((self.view_size, self.view_size))
            if control_images[i].mode == 'L':
                # Binarize grayscale control maps.
                control_images[i] = control_images[i].point(lambda x: 255 if x > 1 else 0, mode='1')

        kwargs = dict(generator=torch.Generator(device=self.pipeline.device).manual_seed(0))

        # The first half of control_images are normal maps, the second half position maps.
        num_view = len(control_images) // 2
        normal_image = [[control_images[i] for i in range(num_view)]]
        position_image = [[control_images[i + num_view] for i in range(num_view)]]

        camera_info_gen = [camera_info]
        camera_info_ref = [[0]]
        kwargs['width'] = self.view_size
        kwargs['height'] = self.view_size
        kwargs['num_in_batch'] = num_view
        kwargs['camera_info_gen'] = camera_info_gen
        kwargs['camera_info_ref'] = camera_info_ref
        kwargs["normal_imgs"] = normal_image
        kwargs["position_imgs"] = position_image

        mvd_image = self.pipeline(input_image, num_inference_steps=30, **kwargs).images
        return mvd_image
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import trimesh
def remesh_mesh(mesh_path, remesh_path, method='trimesh'):
    if method == 'trimesh':
        mesh_simplify_trimesh(mesh_path, remesh_path)
    else:
        # Raising a string is invalid in Python 3; raise a proper exception.
        raise NotImplementedError(f'Method {method} has not been implemented.')


def mesh_simplify_trimesh(inputpath, outputpath):
    import pymeshlab
    # Flatten the input into a single layer and round-trip through OBJ so
    # trimesh loads a plain, texture-free mesh.
    ms = pymeshlab.MeshSet()
    ms.load_new_mesh(inputpath, load_in_a_single_layer=True)
    ms.save_current_mesh(outputpath.replace('.glb', '.obj'), save_textures=False)

    mesh = trimesh.load(outputpath.replace('.glb', '.obj'), force='mesh')
    face_num = mesh.faces.shape[0]
    if face_num > 100000:
        mesh = mesh.simplify_quadric_decimation(40000)
    mesh.export(outputpath)
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import trimesh
import xatlas
def mesh_uv_wrap(mesh):
    if isinstance(mesh, trimesh.Scene):
        mesh = mesh.dump(concatenate=True)

    if len(mesh.faces) > 50000:
        raise ValueError("The mesh has more than 50,000 faces, which is not supported.")

    # xatlas generates a UV atlas: vmapping maps new vertices back to the
    # original vertex array, indices are the re-triangulated faces, and uvs
    # are per-vertex texture coordinates.
    vmapping, indices, uvs = xatlas.parametrize(mesh.vertices, mesh.faces)

    mesh.vertices = mesh.vertices[vmapping]
    mesh.faces = indices
    mesh.visual.uv = uvs

    return mesh
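

if __name__ == '__main__':
    # Usage sketch with a hypothetical input file 'mesh.glb'.
    m = trimesh.load('mesh.glb')  # may load as a Scene; mesh_uv_wrap handles it
    m = mesh_uv_wrap(m)
    print(m.visual.uv.shape)      # (num_vertices, 2) UV coordinates
    m.export('mesh_with_uv.obj')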
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import os
import random
import numpy as np
import torch
from diffusers import AutoPipelineForText2Image
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    os.environ["PL_GLOBAL_SEED"] = str(seed)


class HunyuanDiTPipeline:
    def __init__(
        self,
        model_path="Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled",
        device='cuda'
    ):
        self.device = device
        self.pipe = AutoPipelineForText2Image.from_pretrained(
            model_path,
            torch_dtype=torch.float16,
            enable_pag=True,
            pag_applied_layers=["blocks.(16|17|18|19)"]
        ).to(device)
        # Positive suffix: "white background, 3D style, best quality".
        self.pos_txt = ",白色背景,3D风格,最佳质量"
        # Negative prompt: "text, close-up, cropped, out of frame, worst quality,
        # low quality, JPEG artifacts, PGLY, duplicate, morbid, mutilated, extra
        # fingers, mutated hands, poorly drawn hands, poorly drawn face, mutation,
        # deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs,
        # cloned face, disfigured, gross proportions, malformed limbs, missing
        # arms, missing legs, extra arms, extra legs, fused fingers, too many
        # fingers, long neck".
        self.neg_txt = "文本,特写,裁剪,出框,最差质量,低质量,JPEG伪影,PGLY,重复,病态," \
                       "残缺,多余的手指,变异的手,画得不好的手,画得不好的脸,变异,畸形,模糊,脱水,糟糕的解剖学," \
                       "糟糕的比例,多余的肢体,克隆的脸,毁容,恶心的比例,畸形的肢体,缺失的手臂,缺失的腿," \
                       "额外的手臂,额外的腿,融合的手指,手指太多,长脖子"

    def compile(self):
        # Compile the HunyuanDiT transformer to accelerate inference; the first
        # call after compiling takes a long time to warm up.
        torch.set_float32_matmul_precision('high')
        self.pipe.transformer = torch.compile(self.pipe.transformer, fullgraph=True)
        # self.pipe.vae.decode = torch.compile(self.pipe.vae.decode, fullgraph=True)
        # Run one inference for a hot start (prompt: "Sailor Moon", negative: "blurry").
        generator = torch.Generator(device=self.pipe.device)
        out_img = self.pipe(
            prompt='美少女战士',
            negative_prompt='模糊',
            num_inference_steps=25,
            pag_scale=1.3,
            width=1024,
            height=1024,
            generator=generator,
            return_dict=False
        )[0][0]

    @torch.no_grad()
    def __call__(self, prompt, seed=0):
        seed_everything(seed)
        generator = torch.Generator(device=self.pipe.device)
        generator = generator.manual_seed(int(seed))
        out_img = self.pipe(
            prompt=prompt[:60] + self.pos_txt,
            negative_prompt=self.neg_txt,
            num_inference_steps=25,
            pag_scale=1.3,
            width=1024,
            height=1024,
            generator=generator,
            return_dict=False
        )[0][0]
        return out_img
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
import torch
from PIL import Image
from hy3dgen.rembg import BackgroundRemover
from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline, FaceReducer, FloaterRemover, DegenerateFaceRemover
from hy3dgen.text2image import HunyuanDiTPipeline
def image_to_3d(image_path='assets/demo.png'):
    rembg = BackgroundRemover()
    model_path = 'tencent/Hunyuan3D-2'

    image = Image.open(image_path)
    if image.mode == 'RGB':
        image = rembg(image)

    pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path)
    mesh = pipeline(image=image, num_inference_steps=30, mc_algo='mc',
                    generator=torch.manual_seed(2025))[0]
    mesh = FloaterRemover()(mesh)
    mesh = DegenerateFaceRemover()(mesh)
    mesh = FaceReducer()(mesh)
    mesh.export('mesh.glb')

    try:
        from hy3dgen.texgen import Hunyuan3DPaintPipeline
        pipeline = Hunyuan3DPaintPipeline.from_pretrained(model_path)
        mesh = pipeline(mesh, image=image)
        mesh.export('texture.glb')
    except Exception as e:
        print(e)
        print('Please try to install requirements by following README.md')


def text_to_3d(prompt='a car'):
    rembg = BackgroundRemover()
    t2i = HunyuanDiTPipeline('Tencent-Hunyuan/HunyuanDiT-v1.1-Diffusers-Distilled')
    model_path = 'tencent/Hunyuan3D-2'
    i23d = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(model_path)

    image = t2i(prompt)
    image = rembg(image)
    mesh = i23d(image, num_inference_steps=30, mc_algo='mc')[0]
    mesh = FloaterRemover()(mesh)
    mesh = DegenerateFaceRemover()(mesh)
    mesh = FaceReducer()(mesh)
    mesh.export('t2i_demo.glb')


def image_to_3d_fast(image_path='assets/demo.png'):
    rembg = BackgroundRemover()
    model_path = 'tencent/Hunyuan3D-2'

    image = Image.open(image_path)
    if image.mode == 'RGB':
        image = rembg(image)

    pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained(
        model_path,
        subfolder='hunyuan3d-dit-v2-0-fast',
        variant='fp16'
    )
    mesh = pipeline(image=image, num_inference_steps=30, mc_algo='mc',
                    generator=torch.manual_seed(2025))[0]
    mesh = FloaterRemover()(mesh)
    mesh = DegenerateFaceRemover()(mesh)
    mesh = FaceReducer()(mesh)
    mesh.export('mesh.glb')


if __name__ == '__main__':
    image_to_3d_fast()
    # image_to_3d()
    # text_to_3d()
# Model code
modelCode=1421
# Model name
modelName=Hunyuan3D-2_pytorch
# Model description
modelDescription=Tencent's Hunyuan3D-2 generates high-fidelity 3D models with high-resolution texture maps.
# Application scenarios
appScenario=inference,3D generation,anime,broadcast media,film and television,manufacturing,medical,home furnishing,education
# Framework type
frameType=pytorch
ninja
pybind11
diffusers
einops
opencv-python
numpy
torch
transformers
torchvision
#taming-transformers-rom1504
#ConfigArgParse
#ipdb
omegaconf
scikit-image
rembg
onnxruntime
numba==0.58.0
numpy==1.25.0
transformers==4.49.0
#sentencepiece
tqdm
# Mesh Processing
trimesh
pymeshlab
pygltflib
xatlas
#kornia
#facexlib
# Training
accelerate
#pytorch_lightning
#scikit-learn
#scikit-image
# Demo only
gradio
fastapi
uvicorn
rembg
onnxruntime
#gevent
#geventhttpclient
# Open Source Model Licensed under the Apache License Version 2.0
# and Other Licenses of the Third-Party Components therein:
# The below Model in this distribution may have been modified by THL A29 Limited
# ("Tencent Modifications"). All Tencent Modifications are Copyright (C) 2024 THL A29 Limited.
# Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
# The below software and/or models in this distribution may have been
# modified by THL A29 Limited ("Tencent Modifications").
# All Tencent Modifications are Copyright (C) THL A29 Limited.
# Hunyuan 3D is licensed under the TENCENT HUNYUAN NON-COMMERCIAL LICENSE AGREEMENT
# except for the third-party components listed below.
# Hunyuan 3D does not impose any additional limitations beyond what is outlined
# in the respective licenses of these third-party components.
# Users must comply with all terms and conditions of original licenses of these third-party
# components and must ensure that the usage of the third party components adheres to
# all relevant laws and regulations.
# For avoidance of doubts, Hunyuan 3D means the large language models and
# their software and algorithms, including trained model weights, parameters (including
# optimizer states), machine-learning model code, inference-enabling code, training-enabling code,
# fine-tuning enabling code and other elements of the foregoing made publicly available
# by Tencent in accordance with TENCENT HUNYUAN COMMUNITY LICENSE AGREEMENT.
from setuptools import setup, find_packages
setup(
    name="hy3dgen",
    version="2.0.0",
    packages=find_packages(),
)
---
license: openrail++
tags:
- stable-diffusion
inference: false
---
# Stable Diffusion x4 upscaler model card
This model card focuses on the model associated with the Stable Diffusion Upscaler, available [here](https://github.com/Stability-AI/stablediffusion).
This model is trained for 1.25M steps on a 10M subset of LAION containing images `>2048x2048`. The model was trained on crops of size `512x512` and is a text-guided [latent upscaling diffusion model](https://arxiv.org/abs/2112.10752).
In addition to the textual input, it receives a `noise_level` as an input parameter, which can be used to add noise to the low-resolution input according to a [predefined diffusion schedule](configs/stable-diffusion/x4-upscaling.yaml).
![Image](https://github.com/Stability-AI/stablediffusion/raw/main/assets/stable-samples/upscaling/merged-dog.png)
- Use it with the [`stablediffusion`](https://github.com/Stability-AI/stablediffusion) repository: download the `x4-upscaler-ema.ckpt` [here](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler/resolve/main/x4-upscaler-ema.ckpt).
- Use it with 🧨 [`diffusers`](https://huggingface.co/stabilityai/stable-diffusion-x4-upscaler#examples)
## Model Details
- **Developed by:** Robin Rombach, Patrick Esser
- **Model type:** Diffusion-based text-to-image generation model
- **Language(s):** English
- **License:** [CreativeML Open RAIL++-M License](https://huggingface.co/stabilityai/stable-diffusion-2/blob/main/LICENSE-MODEL)
- **Model Description:** This is a model that can be used to generate and modify images based on text prompts. It is a [Latent Diffusion Model](https://arxiv.org/abs/2112.10752) that uses a fixed, pretrained text encoder ([OpenCLIP-ViT/H](https://github.com/mlfoundations/open_clip)).
- **Resources for more information:** [GitHub Repository](https://github.com/Stability-AI/).
- **Cite as:**
```bibtex
@InProceedings{Rombach_2022_CVPR,
    author    = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn},
    title     = {High-Resolution Image Synthesis With Latent Diffusion Models},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2022},
    pages     = {10684-10695}
}
```
## Examples
Using the [🤗's Diffusers library](https://github.com/huggingface/diffusers) to run Stable Diffusion 2 in a simple and efficient manner.
```bash
pip install diffusers transformers accelerate scipy safetensors
```
```python
import requests
from PIL import Image
from io import BytesIO
from diffusers import StableDiffusionUpscalePipeline
import torch
# load model and scheduler
model_id = "stabilityai/stable-diffusion-x4-upscaler"
pipeline = StableDiffusionUpscalePipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipeline = pipeline.to("cuda")
# let's download an image
url = "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd2-upscale/low_res_cat.png"
response = requests.get(url)
low_res_img = Image.open(BytesIO(response.content)).convert("RGB")
low_res_img = low_res_img.resize((128, 128))
prompt = "a white cat"
upscaled_image = pipeline(prompt=prompt, image=low_res_img).images[0]
upscaled_image.save("upsampled_cat.png")
```
**Notes**:
- Despite not being a dependency, we highly recommend installing [xformers](https://github.com/facebookresearch/xformers) for memory-efficient attention (better performance).
- If you have low GPU RAM available, add `pipeline.enable_attention_slicing()` after sending the pipeline to `cuda` for less VRAM usage (at the cost of speed).
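Since the card highlights the `noise_level` input, here is a short sketch of setting it explicitly, continuing the example above (`20` is the pipeline's default; higher values add more noise to the low-resolution input):

```python
# Continuing from the example above: noise_level controls how much noise is
# added to the low-res input before upscaling (the pipeline's default is 20).
upscaled_image = pipeline(prompt=prompt, image=low_res_img, noise_level=20).images[0]
upscaled_image.save("upsampled_cat_nl20.png")
```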
# Uses
## Direct Use
The model is intended for research purposes only. Possible research areas and tasks include
- Safe deployment of models which have the potential to generate harmful content.
- Probing and understanding the limitations and biases of generative models.
- Generation of artworks and use in design and other artistic processes.
- Applications in educational or creative tools.
- Research on generative models.
Excluded uses are described below.
### Misuse, Malicious Use, and Out-of-Scope Use
_Note: This section is originally taken from the [DALLE-MINI model card](https://huggingface.co/dalle-mini/dalle-mini), was used for Stable Diffusion v1, but applies in the same way to Stable Diffusion v2_.
The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes.
#### Out-of-Scope Use
The model was not trained to be factual or true representations of people or events, and therefore using the model to generate such content is out-of-scope for the abilities of this model.
#### Misuse and Malicious Use
Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to:
- Generating demeaning, dehumanizing, or otherwise harmful representations of people or their environments, cultures, religions, etc.
- Intentionally promoting or propagating discriminatory content or harmful stereotypes.
- Impersonating individuals without their consent.
- Sexual content without consent of the people who might see it.
- Mis- and disinformation
- Representations of egregious violence and gore
- Sharing of copyrighted or licensed material in violation of its terms of use.
- Sharing content that is an alteration of copyrighted or licensed material in violation of its terms of use.
## Limitations and Bias
### Limitations
- The model does not achieve perfect photorealism
- The model cannot render legible text
- The model does not perform well on more difficult tasks which involve compositionality, such as rendering an image corresponding to “A red cube on top of a blue sphere”
- Faces and people in general may not be generated properly.
- The model was trained mainly with English captions and will not work as well in other languages.
- The autoencoding part of the model is lossy
- The model was trained on a subset of the large-scale dataset
[LAION-5B](https://laion.ai/blog/laion-5b/), which contains adult, violent and sexual content. To partially mitigate this, we have filtered the dataset using LAION's NSFW detector (see Training section).
### Bias
While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
Stable Diffusion v2 was primarily trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/),
which consists of images that are limited to English descriptions.
Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for.
This affects the overall output of the model, as white and western cultures are often set as the default. Further, the
ability of the model to generate content with non-English prompts is significantly worse than with English-language prompts.
Stable Diffusion v2 mirrors and exacerbates biases to such a degree that viewer discretion must be advised irrespective of the input or its intent.
## Training
**Training Data**
The model developers used the following dataset for training the model:
- LAION-5B and subsets (details below). The training data is further filtered using LAION's NSFW detector, with a "p_unsafe" score of 0.1 (conservative). For more details, please refer to LAION-5B's [NeurIPS 2022](https://openreview.net/forum?id=M3Y74vmsMcY) paper and reviewer discussions on the topic.
**Training Procedure**
Stable Diffusion v2 is a latent diffusion model which combines an autoencoder with a diffusion model that is trained in the latent space of the autoencoder. During training,
- Images are encoded through an encoder, which turns images into latent representations. The autoencoder uses a relative downsampling factor of 8 and maps images of shape H x W x 3 to latents of shape H/f x W/f x 4
- Text prompts are encoded through the OpenCLIP-ViT/H text-encoder.
- The output of the text encoder is fed into the UNet backbone of the latent diffusion model via cross-attention.
- The loss is a reconstruction objective between the noise that was added to the latent and the prediction made by the UNet. We also use the so-called _v-objective_, see https://arxiv.org/abs/2202.00512.
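
For reference, the _v-objective_ in the last bullet trains the UNet to predict a "velocity" target rather than the raw noise. With diffusion schedule coefficients $\alpha_t$ and $\sigma_t$, the target for a clean latent $x$ and added noise $\epsilon$ is

$$v \equiv \alpha_t \, \epsilon - \sigma_t \, x,$$

and the loss is the mean squared error between the UNet output and $v$ (see the linked paper for the derivation).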
We currently provide the following checkpoints:
- `512-base-ema.ckpt`: 550k steps at resolution `256x256` on a subset of [LAION-5B](https://laion.ai/blog/laion-5b/) filtered for explicit pornographic material, using the [LAION-NSFW classifier](https://github.com/LAION-AI/CLIP-based-NSFW-Detector) with `punsafe=0.1` and an [aesthetic score](https://github.com/christophschuhmann/improved-aesthetic-predictor) >= `4.5`.
850k steps at resolution `512x512` on the same dataset with resolution `>= 512x512`.
- `768-v-ema.ckpt`: Resumed from `512-base-ema.ckpt` and trained for 150k steps using a [v-objective](https://arxiv.org/abs/2202.00512) on the same dataset. Resumed for another 140k steps on a `768x768` subset of our dataset.
- `512-depth-ema.ckpt`: Resumed from `512-base-ema.ckpt` and finetuned for 200k steps. Added an extra input channel to process the (relative) depth prediction produced by [MiDaS](https://github.com/isl-org/MiDaS) (`dpt_hybrid`) which is used as an additional conditioning.
The additional input channels of the U-Net which process this extra information were zero-initialized.
- `512-inpainting-ema.ckpt`: Resumed from `512-base-ema.ckpt` and trained for another 200k steps. Follows the mask-generation strategy presented in [LAMA](https://github.com/saic-mdal/lama) which, in combination with the latent VAE representations of the masked image, are used as an additional conditioning.
The additional input channels of the U-Net which process this extra information were zero-initialized. The same strategy was used to train the [1.5-inpainting checkpoint](https://github.com/saic-mdal/lama).
- `x4-upscaling-ema.ckpt`: Trained for 1.25M steps on a 10M subset of LAION containing images `>2048x2048`. The model was trained on crops of size `512x512` and is a text-guided [latent upscaling diffusion model](https://arxiv.org/abs/2112.10752).
In addition to the textual input, it receives a `noise_level` as an input parameter, which can be used to add noise to the low-resolution input according to a [predefined diffusion schedule](configs/stable-diffusion/x4-upscaling.yaml).
- **Hardware:** 32 x 8 x A100 GPUs
- **Optimizer:** AdamW
- **Gradient Accumulations:** 1
- **Batch:** 32 x 8 x 2 x 4 = 2048
- **Learning rate:** warmup to 0.0001 for 10,000 steps and then kept constant
## Evaluation Results
Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
5.0, 6.0, 7.0, 8.0) and 50 DDIM sampling steps show the relative improvements of the checkpoints:
![pareto](model-variants.jpg)
Evaluated using 50 DDIM steps and 10000 random prompts from the COCO2017 validation set, evaluated at 512x512 resolution. Not optimized for FID scores.
## Environmental Impact
**Stable Diffusion v1** **Estimated Emissions**
Based on that information, we estimate the following CO2 emissions using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). The hardware, runtime, cloud provider, and compute region were utilized to estimate the carbon impact.
- **Hardware Type:** A100 PCIe 40GB
- **Hours used:** 200000
- **Cloud Provider:** AWS
- **Compute Region:** US-east
- **Carbon Emitted (Power consumption x Time x Carbon produced based on location of power grid):** 15000 kg CO2 eq.
## Citation
```bibtex
@InProceedings{Rombach_2022_CVPR,
    author    = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn},
    title     = {High-Resolution Image Synthesis With Latent Diffusion Models},
    booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month     = {June},
    year      = {2022},
    pages     = {10684-10695}
}
```
*This model card was written by: Robin Rombach, Patrick Esser and David Ha and is based on the [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion/blob/main/Stable_Diffusion_v1_Model_Card.md) and [DALL-E Mini model card](https://huggingface.co/dalle-mini/dalle-mini).*
---
library_name: hunyuan3d-2.0
license: other
license_name: tencent-hunyuan-community
license_link: https://huggingface.co/tencent/Hunyuan3D-2/blob/main/LICENSE.txt
language:
- en
- zh
tags:
- image-to-3d
- text-to-3d
pipeline_tag: image-to-3d
---
<p align="center">
<img src="./assets/images/teaser.jpg">
</p>
<div align="center">
<a href=https://3d.hunyuan.tencent.com target="_blank"><img src=https://img.shields.io/badge/Hunyuan3D-black.svg?logo=homepage height=22px></a>
<a href=https://huggingface.co/spaces/tencent/Hunyuan3D-2 target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Demo-276cb4.svg height=22px></a>
<a href=https://huggingface.co/tencent/Hunyuan3D-2 target="_blank"><img src=https://img.shields.io/badge/%F0%9F%A4%97%20Models-d96902.svg height=22px></a>
<a href=https://3d-models.hunyuan.tencent.com/ target="_blank"><img src= https://img.shields.io/badge/Page-bb8a2e.svg?logo=github height=22px></a>
<a href=https://discord.gg/GuaWYwzKbX target="_blank"><img src= https://img.shields.io/badge/Discord-white.svg?logo=discord height=22px></a>
<a href=https://github.com/Tencent/Hunyuan3D-2/blob/main/assets/report/Tencent_Hunyuan3D_2_0.pdf target="_blank"><img src=https://img.shields.io/badge/Report-b5212f.svg?logo=arxiv height=22px></a>
</div>
[//]: # ( <a href=# target="_blank"><img src=https://img.shields.io/badge/Report-b5212f.svg?logo=arxiv height=22px></a>)
[//]: # ( <a href=# target="_blank"><img src= https://img.shields.io/badge/Colab-8f2628.svg?logo=googlecolab height=22px></a>)
[//]: # ( <a href="#"><img alt="PyPI - Downloads" src="https://img.shields.io/pypi/v/mulankit?logo=pypi" height=22px></a>)
<br>
<p align="center">
“ Living out everyone’s imagination on creating and manipulating 3D assets.”
</p>
This repository contains the models of the paper [Hunyuan3D 2.0: Scaling Diffusion Models for High Resolution Textured 3D Assets Generation](https://huggingface.co/papers/2501.12202).
For code and more details on how to use it, refer to the [Github repository](https://github.com/Tencent/Hunyuan3D-2).
## 🔥 News
- Jan 21, 2025: 💬 Release [Hunyuan3D 2.0](https://huggingface.co/spaces/tencent/Hunyuan3D-2). Please give it a try!
## **Abstract**
We present Hunyuan3D 2.0, an advanced large-scale 3D synthesis system for generating high-resolution textured 3D assets.
This system includes two foundation components: a large-scale shape generation model - Hunyuan3D-DiT, and a large-scale
texture synthesis model - Hunyuan3D-Paint.
The shape generative model, built on a scalable flow-based diffusion transformer, aims to create geometry that properly
aligns with a given condition image, laying a solid foundation for downstream applications.
The texture synthesis model, benefiting from strong geometric and diffusion priors, produces high-resolution and vibrant
texture maps for either generated or hand-crafted meshes.
Furthermore, we build Hunyuan3D-Studio - a versatile, user-friendly production platform that simplifies the re-creation
process of 3D assets. It allows both professional and amateur users to manipulate or even animate their meshes
efficiently.
We systematically evaluate our models, showing that Hunyuan3D 2.0 outperforms previous state-of-the-art models,
including both open-source and closed-source models, in geometry detail, condition alignment, and texture quality.
<p align="center">
<img src="assets/images/system.jpg">
</p>
## ☯️ **Hunyuan3D 2.0**
### Architecture
Hunyuan3D 2.0 features a two-stage generation pipeline, starting with the creation of a bare mesh, followed by the
synthesis of a texture map for that mesh. This strategy is effective for decoupling the difficulties of shape and
texture generation and also provides flexibility for texturing either generated or handcrafted meshes.
<p align="left">
<img src="assets/images/arch.jpg">
</p>
### Performance
We have evaluated Hunyuan3D 2.0 against other open-source as well as closed-source 3D generation methods.
The numerical results indicate that Hunyuan3D 2.0 surpasses all baselines in the quality of generated textured 3D assets
and in its condition-following ability.
| Model | CMMD(⬇) | FID_CLIP(⬇) | FID(⬇) | CLIP-score(⬆) |
|-------------------------|-----------|-------------|-------------|---------------|
| Top Open-source Model1 | 3.591 | 54.639 | 289.287 | 0.787 |
| Top Close-source Model1 | 3.600 | 55.866 | 305.922 | 0.779 |
| Top Close-source Model2 | 3.368 | 49.744 | 294.628 | 0.806 |
| Top Close-source Model3 | 3.218 | 51.574 | 295.691 | 0.799 |
| Hunyuan3D 2.0 | **3.193** | **49.165** | **282.429** | **0.809** |
Generation results of Hunyuan3D 2.0:
<p align="left">
<img src="assets/images/e2e-1.gif" height=300>
<img src="assets/images/e2e-2.gif" height=300>
</p>
### Pretrained Models
| Model | Date | Huggingface |
|----------------------|------------|--------------------------------------------------------|
| Hunyuan3D-DiT-v2-0 | 2025-01-21 | [Download](https://huggingface.co/tencent/Hunyuan3D-2) |
| Hunyuan3D-Paint-v2-0 | 2025-01-21 | [Download](https://huggingface.co/tencent/Hunyuan3D-2) |
| Hunyuan3D-Delight-v2-0 | 2025-01-21 | [Download](https://huggingface.co/tencent/Hunyuan3D-2/tree/main/hunyuan3d-delight-v2-0) |
## 🤗 Get Started with Hunyuan3D 2.0
You can follow the steps below to use Hunyuan3D 2.0, either via code or the Gradio app.
### Install Requirements
Please install Pytorch via the [official](https://pytorch.org/) site. Then install the other requirements via
```bash
pip install -r requirements.txt
# for texture
cd hy3dgen/texgen/custom_rasterizer
python3 setup.py install
cd ../../..
cd hy3dgen/texgen/differentiable_renderer
bash compile_mesh_painter.sh  # on Linux; on Windows, run: python3 setup.py install
```
### API Usage
We designed a diffusers-like API for our shape generation model, Hunyuan3D-DiT, and our texture synthesis
model, Hunyuan3D-Paint.
You can access **Hunyuan3D-DiT** via:
```python
from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline
pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained('tencent/Hunyuan3D-2')
mesh = pipeline(image='assets/demo.png')[0]
```
The output mesh is a [trimesh object](https://trimesh.org/trimesh.html), which you can save to a GLB/OBJ (or other
format) file.
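For example:

```python
# The pipeline returns a trimesh object; the file extension picks the format.
mesh.export('demo.glb')  # or 'demo.obj'
```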
For **Hunyuan3D-Paint**, do the following:
```python
from hy3dgen.texgen import Hunyuan3DPaintPipeline
from hy3dgen.shapegen import Hunyuan3DDiTFlowMatchingPipeline
# let's generate a mesh first
pipeline = Hunyuan3DDiTFlowMatchingPipeline.from_pretrained('tencent/Hunyuan3D-2')
mesh = pipeline(image='assets/demo.png')[0]
pipeline = Hunyuan3DPaintPipeline.from_pretrained('tencent/Hunyuan3D-2')
mesh = pipeline(mesh, image='assets/demo.png')
```
Please visit [minimal_demo.py](minimal_demo.py) for more advanced usage, such as **text to 3D** and **texture generation
for handcrafted meshes**.
### Gradio App
You can also host a [Gradio](https://www.gradio.app/) app on your own computer via:
```bash
pip3 install gradio==3.39.0
python3 gradio_app.py
```
Don't forget to visit [Hunyuan3D](https://3d.hunyuan.tencent.com) for quick use if you don't want to host it yourself.
## 📑 Open-Source Plan
- [x] Inference Code
- [x] Model Checkpoints
- [x] Technical Report
- [ ] ComfyUI
- [ ] TensorRT Version
## 🔗 BibTeX
If you found this repository helpful, please cite our report:
```bibtex
@misc{hunyuan3d22025tencent,
title={Hunyuan3D 2.0: Scaling Diffusion Models for High Resolution Textured 3D Assets Generation},
author={Tencent Hunyuan3D Team},
year={2025},
eprint={2501.12202},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{yang2024tencent,
title={Tencent Hunyuan3D-1.0: A Unified Framework for Text-to-3D and Image-to-3D Generation},
author={Tencent Hunyuan3D Team},
year={2024},
eprint={2411.02293},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
```
## Community Resources
Thanks to the contributions of community members, here are some great extensions of Hunyuan3D 2.0:
- [ComfyUI-Hunyuan3DWrapper](https://github.com/kijai/ComfyUI-Hunyuan3DWrapper)
- [Hunyuan3D-2-for-windows](https://github.com/sdbds/Hunyuan3D-2-for-windows)
- [📦 A bundle for running on Windows | 整合包](https://github.com/YanWenKun/Comfy3D-WinPortable/releases/tag/r8-hunyuan3d2)
## Acknowledgements
We would like to thank the contributors to
the [DINOv2](https://github.com/facebookresearch/dinov2), [Stable Diffusion](https://github.com/Stability-AI/stablediffusion), [FLUX](https://github.com/black-forest-labs/flux), [diffusers](https://github.com/huggingface/diffusers)
and [HuggingFace](https://huggingface.co) repositories, for their open research and exploration.
## Star History
<a href="https://star-history.com/#Tencent/Hunyuan3D-2&Date">
<picture>
<source media="(prefers-color-scheme: dark)" srcset="https://api.star-history.com/svg?repos=Tencent/Hunyuan3D-2&type=Date&theme=dark" />
<source media="(prefers-color-scheme: light)" srcset="https://api.star-history.com/svg?repos=Tencent/Hunyuan3D-2&type=Date" />
<img alt="Star History Chart" src="https://api.star-history.com/svg?repos=Tencent/Hunyuan3D-2&type=Date" />
</picture>
</a>