Commit f05e915f authored by weishb's avatar weishb
Browse files

首次提交

parent 297bf637
{
"models": {
"denoiser": {
"name": "ElasticSLatFlowModel",
"args": {
"resolution": 32,
"in_channels": 32,
"out_channels": 32,
"model_channels": 1536,
"cond_channels": 1024,
"num_blocks": 30,
"num_heads": 12,
"mlp_ratio": 5.3334,
"pe_mode": "rope",
"share_mod": true,
"initialization": "scaled",
"qk_rms_norm": true,
"qk_rms_norm_cross": true
}
}
},
"dataset": {
"name": "ImageConditionedSLatShape",
"args": {
"resolution": 512,
"image_size": 512,
"min_aesthetic_score": 4.5,
"max_tokens": 8192,
"normalization": {
"mean": [
0.781296, 0.018091, -0.495192, -0.558457, 1.060530, 0.093252, 1.518149, -0.933218,
-0.732996, 2.604095, -0.118341, -2.143904, 0.495076, -2.179512, -2.130751, -0.996944,
0.261421, -2.217463, 1.260067, -0.150213, 3.790713, 1.481266, -1.046058, -1.523667,
-0.059621, 2.220780, 1.621212, 0.877230, 0.567247, -3.175944, -3.186688, 1.578665
],
"std": [
5.972266, 4.706852, 5.445010, 5.209927, 5.320220, 4.547237, 5.020802, 5.444004,
5.226681, 5.683095, 4.831436, 5.286469, 5.652043, 5.367606, 5.525084, 4.730578,
4.805265, 5.124013, 5.530808, 5.619001, 5.103930, 5.417670, 5.269677, 5.547194,
5.634698, 5.235274, 6.110351, 5.511298, 6.237273, 4.879207, 5.347008, 5.405691
]
},
"pretrained_slat_dec": "microsoft/TRELLIS.2-4B/ckpts/shape_dec_next_dc_f16c32_fp16"
}
},
"trainer": {
"name": "ImageConditionedSparseFlowMatchingCFGTrainer",
"args": {
"max_steps": 1000000,
"batch_size_per_gpu": 8,
"batch_split": 2,
"optimizer": {
"name": "AdamW",
"args": {
"lr": 1e-4,
"weight_decay": 0.01,
"betas": [0.9, 0.95],
"eps": 1e-8
}
},
"ema_rate": [
0.9999
],
"mix_precision_mode": "amp",
"mix_precision_dtype": "bfloat16",
"elastic": {
"name": "LinearMemoryController",
"args": {
"target_ratio": 0.75,
"max_mem_ratio_start": 0.5
}
},
"grad_clip": {
"name": "AdaptiveGradClipper",
"args": {
"max_norm": 1.0,
"clip_percentile": 95
}
},
"i_log": 500,
"i_sample": 10000,
"i_save": 10000,
"p_uncond": 0.1,
"t_schedule": {
"name": "uniform",
"args": {}
},
"sigma_min": 1e-5,
"image_cond_model": {
"name": "DinoV3FeatureExtractor",
"args": {
"model_name": "facebook/dinov3-vitl16-pretrain-lvd1689m",
"image_size": 512
}
}
}
}
}
\ No newline at end of file
{
"models": {
"denoiser": {
"name": "ElasticSLatFlowModel",
"args": {
"resolution": 64,
"in_channels": 32,
"out_channels": 32,
"model_channels": 1536,
"cond_channels": 1024,
"num_blocks": 30,
"num_heads": 12,
"mlp_ratio": 5.3334,
"pe_mode": "rope",
"share_mod": true,
"initialization": "scaled",
"qk_rms_norm": true,
"qk_rms_norm_cross": true
}
}
},
"dataset": {
"name": "ImageConditionedSLatShape",
"args": {
"resolution": 1024,
"image_size": 1024,
"min_aesthetic_score": 4.5,
"max_tokens": 32768,
"normalization": {
"mean": [
0.781296, 0.018091, -0.495192, -0.558457, 1.060530, 0.093252, 1.518149, -0.933218,
-0.732996, 2.604095, -0.118341, -2.143904, 0.495076, -2.179512, -2.130751, -0.996944,
0.261421, -2.217463, 1.260067, -0.150213, 3.790713, 1.481266, -1.046058, -1.523667,
-0.059621, 2.220780, 1.621212, 0.877230, 0.567247, -3.175944, -3.186688, 1.578665
],
"std": [
5.972266, 4.706852, 5.445010, 5.209927, 5.320220, 4.547237, 5.020802, 5.444004,
5.226681, 5.683095, 4.831436, 5.286469, 5.652043, 5.367606, 5.525084, 4.730578,
4.805265, 5.124013, 5.530808, 5.619001, 5.103930, 5.417670, 5.269677, 5.547194,
5.634698, 5.235274, 6.110351, 5.511298, 6.237273, 4.879207, 5.347008, 5.405691
]
},
"pretrained_slat_dec": "microsoft/TRELLIS.2-4B/ckpts/shape_dec_next_dc_f16c32_fp16"
}
},
"trainer": {
"name": "ImageConditionedSparseFlowMatchingCFGTrainer",
"args": {
"max_steps": 1000000,
"batch_size_per_gpu": 2,
"batch_split": 1,
"optimizer": {
"name": "AdamW",
"args": {
"lr": 2e-5,
"weight_decay": 0.01,
"betas": [0.9, 0.95],
"eps": 1e-8
}
},
"ema_rate": [
0.9999
],
"mix_precision_mode": "amp",
"mix_precision_dtype": "bfloat16",
"elastic": {
"name": "LinearMemoryController",
"args": {
"target_ratio": 0.75,
"max_mem_ratio_start": 0.25
}
},
"grad_clip": {
"name": "AdaptiveGradClipper",
"args": {
"max_norm": 1.0,
"clip_percentile": 95
}
},
"finetune_ckpt": {
"denoiser": "PATH_TO_512_CKPT"
},
"i_log": 500,
"i_sample": 1000,
"i_save": 1000,
"p_uncond": 0.1,
"t_schedule": {
"name": "uniform",
"args": {}
},
"sigma_min": 1e-5,
"image_cond_model": {
"name": "DinoV3FeatureExtractor",
"args": {
"model_name": "facebook/dinov3-vitl16-pretrain-lvd1689m",
"image_size": 1024
}
}
}
}
}
\ No newline at end of file
{
"models": {
"denoiser": {
"name": "ElasticSLatFlowModel",
"args": {
"resolution": 32,
"in_channels": 64,
"out_channels": 32,
"model_channels": 1536,
"cond_channels": 1024,
"num_blocks": 30,
"num_heads": 12,
"mlp_ratio": 5.3334,
"pe_mode": "rope",
"share_mod": true,
"initialization": "scaled",
"qk_rms_norm": true,
"qk_rms_norm_cross": true
}
}
},
"dataset": {
"name": "ImageConditionedSLatPbr",
"args": {
"resolution": 512,
"image_size": 512,
"min_aesthetic_score": 4.5,
"max_tokens": 8192,
"pbr_slat_normalization": {
"mean": [
3.501659, 2.212398, 2.226094, 0.251093, -0.026248, -0.687364, 0.439898, -0.928075,
0.029398, -0.339596, -0.869527, 1.038479, -0.972385, 0.126042, -1.129303, 0.455149,
-1.209521, 2.069067, 0.544735, 2.569128, -0.323407, 2.293000, -1.925608, -1.217717,
1.213905, 0.971588, -0.023631, 0.106750, 2.021786, 0.250524, -0.662387, -0.768862
],
"std": [
2.665652, 2.743913, 2.765121, 2.595319, 3.037293, 2.291316, 2.144656, 2.911822,
2.969419, 2.501689, 2.154811, 3.163343, 2.621215, 2.381943, 3.186697, 3.021588,
2.295916, 3.234985, 3.233086, 2.260140, 2.874801, 2.810596, 3.292720, 2.674999,
2.680878, 2.372054, 2.451546, 2.353556, 2.995195, 2.379849, 2.786195, 2.775190
]
},
"shape_slat_normalization": {
"mean": [
0.781296, 0.018091, -0.495192, -0.558457, 1.060530, 0.093252, 1.518149, -0.933218,
-0.732996, 2.604095, -0.118341, -2.143904, 0.495076, -2.179512, -2.130751, -0.996944,
0.261421, -2.217463, 1.260067, -0.150213, 3.790713, 1.481266, -1.046058, -1.523667,
-0.059621, 2.220780, 1.621212, 0.877230, 0.567247, -3.175944, -3.186688, 1.578665
],
"std": [
5.972266, 4.706852, 5.445010, 5.209927, 5.320220, 4.547237, 5.020802, 5.444004,
5.226681, 5.683095, 4.831436, 5.286469, 5.652043, 5.367606, 5.525084, 4.730578,
4.805265, 5.124013, 5.530808, 5.619001, 5.103930, 5.417670, 5.269677, 5.547194,
5.634698, 5.235274, 6.110351, 5.511298, 6.237273, 4.879207, 5.347008, 5.405691
]
},
"attrs": [
"base_color",
"metallic",
"roughness",
"alpha"
],
"pretrained_pbr_slat_dec": "microsoft/TRELLIS.2-4B/ckpts/tex_dec_next_dc_f16c32_fp16",
"pretrained_shape_slat_dec": "microsoft/TRELLIS.2-4B/ckpts/shape_dec_next_dc_f16c32_fp16"
}
},
"trainer": {
"name": "ImageConditionedSparseFlowMatchingCFGTrainer",
"args": {
"max_steps": 1000000,
"batch_size_per_gpu": 8,
"batch_split": 2,
"optimizer": {
"name": "AdamW",
"args": {
"lr": 1e-4,
"weight_decay": 0.01,
"betas": [0.9, 0.95],
"eps": 1e-8
}
},
"ema_rate": [
0.9999
],
"mix_precision_mode": "amp",
"mix_precision_dtype": "bfloat16",
"elastic": {
"name": "LinearMemoryController",
"args": {
"target_ratio": 0.75,
"max_mem_ratio_start": 0.5
}
},
"grad_clip": {
"name": "AdaptiveGradClipper",
"args": {
"max_norm": 1.0,
"clip_percentile": 95
}
},
"i_log": 500,
"i_sample": 10000,
"i_save": 10000,
"p_uncond": 0.1,
"t_schedule": {
"name": "uniform",
"args": {}
},
"sigma_min": 1e-5,
"image_cond_model": {
"name": "DinoV3FeatureExtractor",
"args": {
"model_name": "facebook/dinov3-vitl16-pretrain-lvd1689m",
"image_size": 512
}
}
}
}
}
\ No newline at end of file
{
"models": {
"denoiser": {
"name": "ElasticSLatFlowModel",
"args": {
"resolution": 32,
"in_channels": 64,
"out_channels": 32,
"model_channels": 1536,
"cond_channels": 1024,
"num_blocks": 30,
"num_heads": 12,
"mlp_ratio": 5.3334,
"pe_mode": "rope",
"share_mod": true,
"initialization": "scaled",
"qk_rms_norm": true,
"qk_rms_norm_cross": true
}
}
},
"dataset": {
"name": "ImageConditionedSLatPbr",
"args": {
"resolution": 1024,
"image_size": 1024,
"min_aesthetic_score": 4.5,
"max_tokens": 32768,
"full_pbr": true,
"pbr_slat_normalization": {
"mean": [
3.501659, 2.212398, 2.226094, 0.251093, -0.026248, -0.687364, 0.439898, -0.928075,
0.029398, -0.339596, -0.869527, 1.038479, -0.972385, 0.126042, -1.129303, 0.455149,
-1.209521, 2.069067, 0.544735, 2.569128, -0.323407, 2.293000, -1.925608, -1.217717,
1.213905, 0.971588, -0.023631, 0.106750, 2.021786, 0.250524, -0.662387, -0.768862
],
"std": [
2.665652, 2.743913, 2.765121, 2.595319, 3.037293, 2.291316, 2.144656, 2.911822,
2.969419, 2.501689, 2.154811, 3.163343, 2.621215, 2.381943, 3.186697, 3.021588,
2.295916, 3.234985, 3.233086, 2.260140, 2.874801, 2.810596, 3.292720, 2.674999,
2.680878, 2.372054, 2.451546, 2.353556, 2.995195, 2.379849, 2.786195, 2.775190
]
},
"shape_slat_normalization": {
"mean": [
0.781296, 0.018091, -0.495192, -0.558457, 1.060530, 0.093252, 1.518149, -0.933218,
-0.732996, 2.604095, -0.118341, -2.143904, 0.495076, -2.179512, -2.130751, -0.996944,
0.261421, -2.217463, 1.260067, -0.150213, 3.790713, 1.481266, -1.046058, -1.523667,
-0.059621, 2.220780, 1.621212, 0.877230, 0.567247, -3.175944, -3.186688, 1.578665
],
"std": [
5.972266, 4.706852, 5.445010, 5.209927, 5.320220, 4.547237, 5.020802, 5.444004,
5.226681, 5.683095, 4.831436, 5.286469, 5.652043, 5.367606, 5.525084, 4.730578,
4.805265, 5.124013, 5.530808, 5.619001, 5.103930, 5.417670, 5.269677, 5.547194,
5.634698, 5.235274, 6.110351, 5.511298, 6.237273, 4.879207, 5.347008, 5.405691
]
},
"attrs": [
"base_color",
"metallic",
"roughness",
"alpha"
],
"pretrained_pbr_slat_dec": "microsoft/TRELLIS.2-4B/ckpts/tex_dec_next_dc_f16c32_fp16",
"pretrained_shape_slat_dec": "microsoft/TRELLIS.2-4B/ckpts/shape_dec_next_dc_f16c32_fp16"
}
},
"trainer": {
"name": "ImageConditionedSparseFlowMatchingCFGTrainer",
"args": {
"max_steps": 1000000,
"batch_size_per_gpu": 2,
"batch_split": 1,
"optimizer": {
"name": "AdamW",
"args": {
"lr": 2e-5,
"weight_decay": 0.01,
"betas": [0.9, 0.95],
"eps": 1e-8
}
},
"ema_rate": [
0.9999
],
"mix_precision_mode": "amp",
"mix_precision_dtype": "bfloat16",
"elastic": {
"name": "LinearMemoryController",
"args": {
"target_ratio": 0.75,
"max_mem_ratio_start": 0.25
}
},
"grad_clip": {
"name": "AdaptiveGradClipper",
"args": {
"max_norm": 1.0,
"clip_percentile": 95
}
},
"i_log": 500,
"i_sample": 1000,
"i_save": 1000,
"p_uncond": 0.1,
"t_schedule": {
"name": "uniform",
"args": {}
},
"sigma_min": 1e-5,
"image_cond_model": {
"name": "DinoV3FeatureExtractor",
"args": {
"model_name": "facebook/dinov3-vitl16-pretrain-lvd1689m",
"image_size": 1024
}
}
}
}
}
\ No newline at end of file
{
"models": {
"denoiser": {
"name": "SparseStructureFlowModel",
"args": {
"resolution": 16,
"in_channels": 8,
"out_channels": 8,
"model_channels": 1536,
"cond_channels": 1024,
"num_blocks": 30,
"num_heads": 12,
"mlp_ratio": 5.3334,
"pe_mode": "rope",
"share_mod": true,
"initialization": "scaled",
"qk_rms_norm": true,
"qk_rms_norm_cross": true
}
}
},
"dataset": {
"name": "ImageConditionedSparseStructureLatent",
"args": {
"min_aesthetic_score": 4.5,
"image_size": 512,
"pretrained_ss_dec": "microsoft/TRELLIS-image-large/ckpts/ss_dec_conv3d_16l8_fp16"
}
},
"trainer": {
"name": "ImageConditionedFlowMatchingCFGTrainer",
"args": {
"max_steps": 1000000,
"batch_size_per_gpu": 8,
"batch_split": 4,
"optimizer": {
"name": "AdamW",
"args": {
"lr": 1e-4,
"weight_decay": 0.01,
"betas": [0.9, 0.95],
"eps": 1e-8
}
},
"ema_rate": [
0.9999
],
"mix_precision_mode": "amp",
"mix_precision_dtype": "bfloat16",
"grad_clip": {
"name": "AdaptiveGradClipper",
"args": {
"max_norm": 1.0,
"clip_percentile": 95
}
},
"i_log": 500,
"i_sample": 10000,
"i_save": 10000,
"p_uncond": 0.1,
"t_schedule": {
"name": "logitNormal",
"args": {
"mean": 1.0,
"std": 1.0
}
},
"sigma_min": 1e-5,
"image_cond_model": {
"name": "DinoV3FeatureExtractor",
"args": {
"model_name": "facebook/dinov3-vitl16-pretrain-lvd1689m",
"image_size": 512
}
}
}
}
}
\ No newline at end of file
{
"models": {
"encoder": {
"name": "FlexiDualGridVaeEncoder",
"args": {
"model_channels": [64, 128, 256, 512, 1024],
"latent_channels": 32,
"num_blocks": [0, 4, 8, 16, 4],
"block_type": [
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d"
],
"down_block_type": [
"SparseResBlockS2C3d",
"SparseResBlockS2C3d",
"SparseResBlockS2C3d",
"SparseResBlockS2C3d"
],
"block_args": [
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": false
},
{
"use_checkpoint": false
},
{
"use_checkpoint": false
}
],
"use_fp16": true
}
},
"decoder": {
"name": "FlexiDualGridVaeDecoder",
"args": {
"resolution": 256,
"model_channels": [1024, 512, 256, 128, 64],
"latent_channels": 32,
"num_blocks": [4, 16, 8, 4, 0],
"block_type": [
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d"
],
"up_block_type": [
"SparseResBlockC2S3d",
"SparseResBlockC2S3d",
"SparseResBlockC2S3d",
"SparseResBlockC2S3d"
],
"block_args": [
{
"use_checkpoint": false
},
{
"use_checkpoint": false
},
{
"use_checkpoint": false
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
}
],
"use_fp16": true
}
}
},
"dataset": {
"name": "FlexiDualGridDataset",
"args": {
"resolution": 256,
"max_active_voxels": 1000000,
"max_num_faces": 1000000,
"min_aesthetic_score": 4.5
}
},
"trainer": {
"name": "ShapeVaeTrainer",
"args": {
"max_steps": 1000000,
"batch_size_per_gpu": 8,
"batch_split": 2,
"optimizer": {
"name": "AdamW",
"args": {
"lr": 1e-4,
"weight_decay": 0.0
}
},
"ema_rate": [
0.9999
],
"fp16_mode": "inflat_all",
"fp16_scale_growth": 0.001,
"grad_clip": {
"name": "AdaptiveGradClipper",
"args": {
"max_norm": 1.0,
"clip_percentile": 95
}
},
"i_log": 500,
"i_sample": 10000,
"i_save": 10000,
"lambda_subdiv": 0.1,
"lambda_intersected": 0.1,
"lambda_vertice": 1e-2,
"lambda_mask": 1,
"lambda_depth": 10,
"lambda_normal": 1,
"lambda_kl": 1e-6,
"lambda_ssim": 0.2,
"lambda_lpips": 0.2,
"camera_randomization_config": {
"radius_range": [2, 100]
}
}
}
}
\ No newline at end of file
{
"models": {
"encoder": {
"name": "FlexiDualGridVaeEncoder",
"args": {
"model_channels": [64, 128, 256, 512, 1024],
"latent_channels": 32,
"num_blocks": [0, 4, 8, 16, 4],
"block_type": [
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d"
],
"down_block_type": [
"SparseResBlockS2C3d",
"SparseResBlockS2C3d",
"SparseResBlockS2C3d",
"SparseResBlockS2C3d"
],
"block_args": [
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
}
],
"use_fp16": true
}
},
"decoder": {
"name": "FlexiDualGridVaeDecoder",
"args": {
"resolution": 512,
"model_channels": [1024, 512, 256, 128, 64],
"latent_channels": 32,
"num_blocks": [4, 16, 8, 4, 0],
"block_type": [
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d"
],
"up_block_type": [
"SparseResBlockC2S3d",
"SparseResBlockC2S3d",
"SparseResBlockC2S3d",
"SparseResBlockC2S3d"
],
"block_args": [
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
}
],
"use_fp16": true
}
}
},
"dataset": {
"name": "FlexiDualGridDataset",
"args": {
"resolution": 512,
"max_active_voxels": 1000000,
"max_num_faces": 1000000,
"min_aesthetic_score": 4.5
}
},
"trainer": {
"name": "ShapeVaeTrainer",
"args": {
"max_steps": 1000000,
"batch_size_per_gpu": 4,
"batch_split": 2,
"optimizer": {
"name": "AdamW",
"args": {
"lr": 1e-5,
"weight_decay": 0.0
}
},
"ema_rate": [
0.9999
],
"fp16_mode": "inflat_all",
"fp16_scale_growth": 0.001,
"grad_clip": {
"name": "AdaptiveGradClipper",
"args": {
"max_norm": 1.0,
"clip_percentile": 95
}
},
"finetune_ckpt": {
"encoder": "PATH_TO_ENCODER_CKPT",
"decoder": "PATH_TO_DECODER_CKPT"
},
"snapshot_batch_size": 1,
"i_log": 500,
"i_sample": 10000,
"i_save": 10000,
"lambda_subdiv": 0.1,
"lambda_intersected": 0.1,
"lambda_vertice": 1e-2,
"lambda_mask": 1,
"lambda_depth": 10,
"lambda_normal": 1,
"lambda_kl": 1e-6,
"lambda_ssim": 0.2,
"lambda_lpips": 0.2,
"render_resolution": 1024,
"camera_randomization_config": {
"radius_range": [2, 100]
}
}
}
}
\ No newline at end of file
{
"models": {
"encoder": {
"name": "SparseUnetVaeEncoder",
"args": {
"in_channels": 6,
"model_channels": [64, 128, 256, 512, 1024],
"latent_channels": 32,
"num_blocks": [0, 4, 8, 16, 4],
"block_type": [
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d"
],
"down_block_type": [
"SparseResBlockS2C3d",
"SparseResBlockS2C3d",
"SparseResBlockS2C3d",
"SparseResBlockS2C3d"
],
"block_args": [
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": false
},
{
"use_checkpoint": false
},
{
"use_checkpoint": false
}
],
"use_fp16": true
}
},
"decoder": {
"name": "SparseUnetVaeDecoder",
"args": {
"out_channels": 6,
"model_channels": [1024, 512, 256, 128, 64],
"latent_channels": 32,
"num_blocks": [4, 16, 8, 4, 0],
"block_type": [
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d"
],
"up_block_type": [
"SparseResBlockC2S3d",
"SparseResBlockC2S3d",
"SparseResBlockC2S3d",
"SparseResBlockC2S3d"
],
"block_args": [
{
"use_checkpoint": false
},
{
"use_checkpoint": false
},
{
"use_checkpoint": false
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
}
],
"use_fp16": true,
"pred_subdiv": false
}
}
},
"dataset": {
"name": "SparseVoxelPbrDataset",
"args": {
"resolution": 256,
"min_aesthetic_score": 4.5,
"max_active_voxels": 1000000,
"max_num_faces": 1000000,
"with_mesh": false,
"attrs": [
"base_color",
"metallic",
"roughness",
"alpha"
]
}
},
"trainer": {
"name": "PbrVaeTrainer",
"args": {
"max_steps": 1000000,
"batch_size_per_gpu": 8,
"batch_split": 1,
"optimizer": {
"name": "AdamW",
"args": {
"lr": 1e-4,
"weight_decay": 0.0
}
},
"ema_rate": [
0.9999
],
"fp16_mode": "inflat_all",
"fp16_scale_growth": 0.001,
"grad_clip": {
"name": "AdaptiveGradClipper",
"args": {
"max_norm": 1.0,
"clip_percentile": 95
}
},
"i_log": 500,
"i_sample": 10000,
"i_save": 10000,
"lambda_kl": 1e-6,
"loss_type": "l1",
"lambda_render": 0.0
}
}
}
\ No newline at end of file
{
"models": {
"encoder": {
"name": "SparseUnetVaeEncoder",
"args": {
"in_channels": 6,
"model_channels": [64, 128, 256, 512, 1024],
"latent_channels": 32,
"num_blocks": [0, 4, 8, 16, 4],
"block_type": [
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d"
],
"down_block_type": [
"SparseResBlockS2C3d",
"SparseResBlockS2C3d",
"SparseResBlockS2C3d",
"SparseResBlockS2C3d"
],
"block_args": [
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
}
],
"use_fp16": true
}
},
"decoder": {
"name": "SparseUnetVaeDecoder",
"args": {
"out_channels": 6,
"model_channels": [1024, 512, 256, 128, 64],
"latent_channels": 32,
"num_blocks": [4, 16, 8, 4, 0],
"block_type": [
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d",
"SparseConvNeXtBlock3d"
],
"up_block_type": [
"SparseResBlockC2S3d",
"SparseResBlockC2S3d",
"SparseResBlockC2S3d",
"SparseResBlockC2S3d"
],
"block_args": [
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
},
{
"use_checkpoint": true
}
],
"use_fp16": true,
"pred_subdiv": false
}
}
},
"dataset": {
"name": "SparseVoxelPbrDataset",
"args": {
"resolution": 512,
"min_aesthetic_score": 4.5,
"max_active_voxels": 1000000,
"max_num_faces": 1000000,
"attrs": [
"base_color",
"metallic",
"roughness",
"alpha"
]
}
},
"trainer": {
"name": "PbrVaeTrainer",
"args": {
"max_steps": 1000000,
"batch_size_per_gpu": 4,
"batch_split": 2,
"optimizer": {
"name": "AdamW",
"args": {
"lr": 1e-5,
"weight_decay": 0.0
}
},
"ema_rate": [
0.9999
],
"fp16_mode": "inflat_all",
"fp16_scale_growth": 0.001,
"grad_clip": {
"name": "AdaptiveGradClipper",
"args": {
"max_norm": 1.0,
"clip_percentile": 95
}
},
"finetune_ckpt": {
"encoder": "PATH_TO_ENCODER_CKPT",
"decoder": "PATH_TO_DECODER_CKPT"
},
"snapshot_batch_size": 1,
"render_resolution": 512,
"i_log": 500,
"i_sample": 10000,
"i_save": 10000,
"lambda_kl": 1e-6,
"lambda_render": 1.0,
"loss_type": "l1",
"lambda_ssim": 0.2,
"lambda_lpips": 0.2,
"camera_randomization_config": {
"radius_range": [2, 100]
}
}
}
}
\ No newline at end of file
# Dataset Preparation Toolkit
This toolkit provides a comprehensive pipeline for preparing 3D datasets, including downloading, processing, voxelizing, and latent encoding for SC-VAE and Flow Model training.
### Step 1: Install Dependencies
Initialize the environment and install necessary dependencies:
```bash
. ./data_toolkit/setup.sh
```
### Step 2: Initialize Metadata
Before processing, load the dataset metadata.
```bash
python data_toolkit/build_metadata.py <SUBSET> --root <ROOT> [--source <SOURCE>]
```
**Arguments:**
- `SUBSET`: Target dataset subset. Options: `ObjaverseXL`, `ABO`, `HSSD`, `TexVerse` (Training sets); `SketchfabPicked`, `Toys4k` (Test sets).
- `ROOT`: Root directory to save the data.
- `SOURCE`: Data source (Required if `SUBSET` is `ObjaverseXL`). Options: `sketchfab`, `github`.
**Example:**
Load metadata for `ObjaverseXL` (sketchfab) and save to `datasets/ObjaverseXL_sketchfab`:
```bash
python data_toolkit/build_metadata.py ObjaverseXL --source sketchfab --root datasets/ObjaverseXL_sketchfab
```
### Step 3: Download Data
Download the 3D assets to the local storage.
```bash
python data_toolkit/download.py <SUBSET> --root <ROOT> [--rank <RANK> --world_size <WORLD_SIZE>]
```
**Arguments:**
- `RANK` / `WORLD_SIZE`: Parameters for multi-node distributed downloading.
**Example:**
To download the `ObjaverseXL` subset:
> **Note:** The example below sets a large `WORLD_SIZE` (160,000) for demonstration purposes, meaning only a tiny fraction of the dataset will be downloaded by this single process.
```bash
python data_toolkit/download.py ObjaverseXL --root datasets/ObjaverseXL_sketchfab --world_size 160000
```
*Attention: Some datasets may require an interactive Hugging Face login or manual steps. Please follow any on-screen instructions.*
**Update Metadata:**
After downloading, update the metadata registry:
```bash
python data_toolkit/build_metadata.py ObjaverseXL --root datasets/ObjaverseXL_sketchfab
```
### Step 4: Process Mesh and PBR Textures
Standardize 3D assets by dumping mesh and PBR textures.
*Note: This process utilizes the CPU.*
```bash
# Dump Meshes
python data_toolkit/dump_mesh.py <SUBSET> --root <ROOT> [--rank <RANK> --world_size <WORLD_SIZE>]
# Dump PBR Textures
python data_toolkit/dump_pbr.py <SUBSET> --root <ROOT> [--rank <RANK> --world_size <WORLD_SIZE>]
# Get statistics of the assets
python data_toolkit/asset_stats.py --root <ROOT> [--rank <RANK> --world_size <WORLD_SIZE>]
```
**Example:**
```bash
python data_toolkit/dump_mesh.py ObjaverseXL --root datasets/ObjaverseXL_sketchfab
python data_toolkit/dump_pbr.py ObjaverseXL --root datasets/ObjaverseXL_sketchfab
python data_toolkit/asset_stats.py --root datasets/ObjaverseXL_sketchfab
```
**Update Metadata:**
```bash
python data_toolkit/build_metadata.py ObjaverseXL --root datasets/ObjaverseXL_sketchfab
```
### Step 5: Convert to O-Voxels
Convert the processed meshes and textures into O-Voxels format.
*Note: This process utilizes the CPU.*
```bash
python data_toolkit/dual_grid.py <SUBSET> --root <ROOT> [--rank <RANK> --world_size <WORLD_SIZE>] [--resolution <RESOLUTION>]
python data_toolkit/voxelize_pbr.py <SUBSET> --root <ROOT> [--rank <RANK> --world_size <WORLD_SIZE>] [--resolution <RESOLUTION>]
```
**Arguments:**
- `RESOLUTION`: Target resolutions for O-Voxels, comma-separated (e.g., `256,512,1024`). Default is `256`.
**Example:**
Convert `ObjaverseXL` to resolutions 256, 512, and 1024:
```bash
python data_toolkit/dual_grid.py ObjaverseXL --root datasets/ObjaverseXL_sketchfab --resolution 256,512,1024
python data_toolkit/voxelize_pbr.py ObjaverseXL --root datasets/ObjaverseXL_sketchfab --resolution 256,512,1024
```
### At this point, the dataset is ready for SC-VAE Training
### Step 6: Encode Latents
Encode sparse structures into latents to train the first-stage generator.
```bash
# 1. Encode Shape Latents
python data_toolkit/encode_shape_latent.py --root <ROOT> [--rank <RANK> --world_size <WORLD_SIZE>] [--resolution <RESOLUTION>]
# 2. Encode PBR Latents
python data_toolkit/encode_pbr_latent.py --root <ROOT> [--rank <RANK> --world_size <WORLD_SIZE>] [--resolution <RESOLUTION>]
# 3. Update Metadata (Required before next step)
python data_toolkit/build_metadata.py <SUBSET> --root <ROOT>
# 4. Encode Sparse Structure (SS) Latents
python data_toolkit/encode_ss_latent.py --root <ROOT> --shape_latent_name <SHAPE_LATENT_NAME> [--rank <RANK> --world_size <WORLD_SIZE>] [--resolution <SS_RESOLUTION>]
```
**Arguments:**
- `RESOLUTION`: Input O-Voxel resolution. Default is `1024`.
- `SS_RESOLUTION`: Resolution for sparse structures. Default is `64`.
- `SHAPE_LATENT_NAME`: The specific version name of the shape latent.
**Example:**
```bash
python data_toolkit/encode_shape_latent.py --root datasets/ObjaverseXL_sketchfab --resolution 512
python data_toolkit/encode_pbr_latent.py --root datasets/ObjaverseXL_sketchfab --resolution 512
python data_toolkit/encode_shape_latent.py --root datasets/ObjaverseXL_sketchfab --resolution 1024
python data_toolkit/encode_pbr_latent.py --root datasets/ObjaverseXL_sketchfab --resolution 1024
# Update metadata
python data_toolkit/build_metadata.py ObjaverseXL --root datasets/ObjaverseXL_sketchfab
# Encode SS Latents
python data_toolkit/encode_ss_latent.py --root datasets/ObjaverseXL_sketchfab --shape_latent_name shape_enc_next_dc_f16c32_fp16_1024 --resolution 64
# Final Metadata Update
python data_toolkit/build_metadata.py ObjaverseXL --root datasets/ObjaverseXL_sketchfab
```
### Step 7: Render Image Conditions
Render multi-view images to train the image-conditioned generator.
*Note: This process may utilize the CPU.*
```bash
python data_toolkit/render_cond.py <SUBSET> --root <ROOT> [--num_views <NUM_VIEWS>] [--rank <RANK> --world_size <WORLD_SIZE>]
```
**Arguments:**
- `NUM_VIEWS`: Number of views to render per asset. Default is `16`.
**Example:**
```bash
python data_toolkit/render_cond.py ObjaverseXL --root datasets/ObjaverseXL_sketchfab
```
**Final Metadata Update:**
```bash
python data_toolkit/build_metadata.py ObjaverseXL --root datasets/ObjaverseXL_sketchfab
```
\ No newline at end of file
import os
import argparse
import pickle
from tqdm import tqdm
import pandas as pd
from easydict import EasyDict as edict
from concurrent.futures import ThreadPoolExecutor
# (record column, material key) pairs, in the column order written to the CSV.
TEXTURE_COLUMNS = (
    ('num_basecolor_tex', 'baseColorTexture'),
    ('num_metallic_tex', 'metallicTexture'),
    ('num_roughness_tex', 'roughnessTexture'),
    ('num_alpha_tex', 'alphaTexture'),
)


def count_geometry(dump):
    """Count faces and vertices over all non-empty objects of a dump.

    Args:
        dump: Mapping with an ``'objects'`` list; each object holds
            ``'vertices'`` and ``'faces'`` arrays (``.size`` / ``.shape``).

    Returns:
        Tuple ``(num_faces, num_vertices)``. An object whose vertex or face
        array is empty contributes to neither count.
    """
    num_faces = 0
    num_vertices = 0
    for obj in dump['objects']:
        if obj['vertices'].size == 0 or obj['faces'].size == 0:
            continue
        num_faces += obj['faces'].shape[0]
        num_vertices += obj['vertices'].shape[0]
    return num_faces, num_vertices


def count_pbr_textures(dump):
    """Count, per texture slot, how many materials have that slot set.

    Args:
        dump: Mapping with a ``'materials'`` list; each material must carry
            every key listed in ``TEXTURE_COLUMNS`` (``None`` means unset).

    Returns:
        Dict mapping each ``num_*_tex`` column to its count.
    """
    counts = {column: 0 for column, _ in TEXTURE_COLUMNS}
    for mat in dump['materials']:
        for column, key in TEXTURE_COLUMNS:
            if mat[key] is not None:
                counts[column] += 1
    return counts


def build_record(metadatum, mesh_dump_root, pbr_dump_root):
    """Load the dump of one asset and return its statistics record.

    The PBR dump is used when the asset is flagged ``pbr_dumped``; otherwise
    the mesh dump is used and no texture counts are reported.

    Args:
        metadatum: Metadata row as a dict; must contain ``'sha256'``. A
            missing or NaN ``'pbr_dumped'`` entry is treated as "not dumped".
        mesh_dump_root: Directory containing ``mesh_dumps/<sha256>.pickle``.
        pbr_dump_root: Directory containing ``pbr_dumps/<sha256>.pickle``.

    Returns:
        Dict with ``sha256``, ``num_faces``, ``num_vertices`` and, for PBR
        dumps, the ``num_*_tex`` counts.
    """
    sha256 = metadatum['sha256']
    # `.get` instead of indexing: the column does not exist when no PBR dump
    # metadata has been merged in. `== True` also rejects NaN.
    has_pbr = metadatum.get('pbr_dumped') == True  # noqa: E712
    if has_pbr:
        dump_path = os.path.join(pbr_dump_root, 'pbr_dumps', f'{sha256}.pickle')
    else:
        dump_path = os.path.join(mesh_dump_root, 'mesh_dumps', f'{sha256}.pickle')
    # NOTE(review): pickle.load executes arbitrary code; only point this at
    # dumps produced by the toolkit itself, never at untrusted files.
    with open(dump_path, 'rb') as f:
        dump = pickle.load(f)
    num_faces, num_vertices = count_geometry(dump)
    record = {
        'sha256': sha256,
        'num_faces': num_faces,
        'num_vertices': num_vertices,
    }
    if has_pbr:
        record.update(count_pbr_textures(dump))
    return record


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--root', type=str, required=True,
                        help='Directory to save the metadata')
    parser.add_argument('--mesh_dump_root', type=str, default=None,
                        help='Directory to save the mesh dumps')
    parser.add_argument('--pbr_dump_root', type=str, default=None,
                        help='Directory to save the pbr dumps')
    parser.add_argument('--instances', type=str, default=None,
                        help='Instances to process')
    parser.add_argument('--rank', type=int, default=0)
    parser.add_argument('--world_size', type=int, default=1)
    parser.add_argument('--max_workers', type=int, default=0)
    opt = parser.parse_args()
    opt = edict(vars(opt))
    # Dump roots default to the metadata root.
    opt.mesh_dump_root = opt.mesh_dump_root or opt.root
    opt.pbr_dump_root = opt.pbr_dump_root or opt.root

    os.makedirs(os.path.join(opt.root, 'asset_stats', 'new_records'), exist_ok=True)

    # get file list
    if not os.path.exists(os.path.join(opt.root, 'metadata.csv')):
        raise ValueError('metadata.csv not found')
    metadata = pd.read_csv(os.path.join(opt.root, 'metadata.csv')).set_index('sha256')
    # Merge in the per-stage metadata files that exist; values already present
    # in the main metadata take precedence (combine_first).
    for stage_root, stage_dir in (
        (opt.root, 'asset_stats'),
        (opt.mesh_dump_root, 'mesh_dumps'),
        (opt.pbr_dump_root, 'pbr_dumps'),
    ):
        stage_csv = os.path.join(stage_root, stage_dir, 'metadata.csv')
        if os.path.exists(stage_csv):
            metadata = metadata.combine_first(pd.read_csv(stage_csv).set_index('sha256'))
    metadata = metadata.reset_index()

    if opt.instances is None:
        # Skip assets that already have statistics.
        if 'num_faces' in metadata.columns:
            metadata = metadata[metadata['num_faces'].isnull()]
        # Keep assets with at least one dump. Either flag column may be absent
        # when that dump stage has not produced metadata yet.
        dumped = pd.Series(False, index=metadata.index)
        for column in ('mesh_dumped', 'pbr_dumped'):
            if column in metadata.columns:
                dumped = dumped | (metadata[column] == True)  # noqa: E712
        metadata = metadata[dumped]
    else:
        # --instances is either a file with one sha256 per line or a
        # comma-separated list.
        if os.path.exists(opt.instances):
            with open(opt.instances, 'r') as f:
                instances = f.read().splitlines()
        else:
            instances = opt.instances.split(',')
        metadata = metadata[metadata['sha256'].isin(instances)]

    # Contiguous shard of the work list for this rank.
    start = len(metadata) * opt.rank // opt.world_size
    end = len(metadata) * (opt.rank + 1) // opt.world_size
    metadata = metadata[start:end]

    print(f'Processing {len(metadata)} objects...')

    # process objects
    records = []
    with ThreadPoolExecutor(max_workers=opt.max_workers or os.cpu_count()) as executor, \
            tqdm(total=len(metadata), desc='Processing objects') as pbar:

        def worker(metadatum):
            # Best effort: a failing asset is reported and skipped.
            try:
                records.append(build_record(metadatum, opt.mesh_dump_root, opt.pbr_dump_root))
            except Exception as e:
                print(f"Error processing {metadatum.get('sha256')}: {e}")
            finally:
                pbar.update()

        for metadatum in metadata.to_dict('records'):
            executor.submit(worker, metadatum)
        executor.shutdown(wait=True)

    # save records
    records = pd.DataFrame.from_records(records)
    records.to_csv(os.path.join(opt.root, 'asset_stats', 'new_records', f'part_{opt.rank}.csv'), index=False)
import argparse, sys, os, math, io
from typing import *
import bpy
import bmesh
from mathutils import Vector, Matrix
import numpy as np
import pickle
"""=============== BLENDER ==============="""
# Maps a lower-case file extension to the Blender operator that imports it.
# The OBJ/STL/PLY entries pick the `import_scene` / `import_mesh` operator
# before Blender 4 and the `wm.*_import` operator from Blender 4 on.
# USD files (.usd / .usda) go through `bpy.ops.wm.usd_import`: Blender has no
# `import_scene.usd` / `import_scene.usda` operators, so those entries could
# never be called successfully.
IMPORT_FUNCTIONS: Dict[str, Callable] = {
    "obj": bpy.ops.import_scene.obj if bpy.app.version[0] < 4 else bpy.ops.wm.obj_import,
    "glb": bpy.ops.import_scene.gltf,
    "gltf": bpy.ops.import_scene.gltf,
    "usd": bpy.ops.wm.usd_import,
    "fbx": bpy.ops.import_scene.fbx,
    "stl": bpy.ops.import_mesh.stl if bpy.app.version[0] < 4 else bpy.ops.wm.stl_import,
    "usda": bpy.ops.wm.usd_import,
    "dae": bpy.ops.wm.collada_import,
    "ply": bpy.ops.import_mesh.ply if bpy.app.version[0] < 4 else bpy.ops.wm.ply_import,
    "abc": bpy.ops.wm.alembic_import,
    "blend": bpy.ops.wm.append,
}
def init_scene() -> None:
    """Resets the scene to a clean state.

    Removes every object, material, texture and image datablock from
    ``bpy.data``.

    Returns:
        None
    """
    # Each collection is copied into a list first: removing datablocks from a
    # bpy collection while iterating over it can skip entries.

    # delete everything
    for obj in list(bpy.data.objects):
        bpy.data.objects.remove(obj, do_unlink=True)

    # delete all the materials
    for material in list(bpy.data.materials):
        bpy.data.materials.remove(material, do_unlink=True)

    # delete all the textures
    for texture in list(bpy.data.textures):
        bpy.data.textures.remove(texture, do_unlink=True)

    # delete all the images
    for image in list(bpy.data.images):
        bpy.data.images.remove(image, do_unlink=True)
def load_object(object_path: str) -> None:
    """Loads a model with a supported file extension into the scene.

    Args:
        object_path (str): Path to the model file.

    Raises:
        ValueError: If the file extension is not supported.

    Returns:
        None
    """
    file_extension = object_path.split(".")[-1].lower()

    if file_extension == "usdz":
        # install usdz io package (expected next to this script)
        dirname = os.path.dirname(os.path.realpath(__file__))
        usdz_package = os.path.join(dirname, "io_scene_usdz.zip")
        bpy.ops.preferences.addon_install(filepath=usdz_package)
        # enable it
        addon_name = "io_scene_usdz"
        bpy.ops.preferences.addon_enable(module=addon_name)
        # import the usdz
        from io_scene_usdz.import_usdz import import_usdz
        # No module-level `context` exists in this file; use bpy.context.
        import_usdz(bpy.context, filepath=object_path, materials=True, animations=True)
        return None

    # load from existing import functions
    import_function = IMPORT_FUNCTIONS.get(file_extension)
    if import_function is None:
        # Report unknown extensions as the documented ValueError rather than
        # letting a bare KeyError escape from the lookup.
        raise ValueError(f"Unsupported file type: {object_path}")

    print(f"Loading object from {object_path}")
    if file_extension == "blend":
        import_function(directory=object_path, link=False)
    elif file_extension in {"glb", "gltf"}:
        import_function(filepath=object_path, merge_vertices=True, import_shading='NORMALS', bone_heuristic='TEMPERANCE')
    else:
        import_function(filepath=object_path)
def delete_invisible_objects() -> None:
    """Removes hidden objects and hidden collections from the scene.

    An object counts as hidden when it is hidden in the viewport or in
    renders. Its hide flags are cleared first and then it is removed.
    Collections hidden in the viewport are removed afterwards.

    Returns:
        None
    """
    hidden_objects = [
        candidate
        for candidate in bpy.context.scene.objects
        if candidate.hide_viewport or candidate.hide_render
    ]
    # Clear the hide flags on every hidden object before removing any of them.
    for hidden in hidden_objects:
        hidden.hide_viewport = hidden.hide_render = hidden.hide_select = False
    for hidden in hidden_objects:
        bpy.data.objects.remove(hidden, do_unlink=True)

    # Collect first, remove second, so the collection list is not mutated
    # while it is being scanned.
    hidden_collections = [coll for coll in bpy.data.collections if coll.hide_viewport]
    for coll in hidden_collections:
        bpy.data.collections.remove(coll)
def scene_bbox() -> Tuple[Vector, Vector]:
    """Computes the world-space bounding box of all mesh objects in the scene.

    Taken from Shap-E rendering script
    (https://github.com/openai/shap-e/blob/main/shap_e/rendering/blender/blender_script.py#L68-L82)

    Returns:
        Tuple[Vector, Vector]: The minimum and maximum coordinates of the bounding box.

    Raises:
        RuntimeError: If the scene contains no mesh objects.
    """
    mesh_objects = [
        candidate
        for candidate in bpy.context.scene.objects.values()
        if isinstance(candidate.data, bpy.types.Mesh)
    ]
    if not mesh_objects:
        raise RuntimeError("no objects in scene to compute bounding box for")

    lower = [math.inf] * 3
    upper = [-math.inf] * 3
    for mesh_obj in mesh_objects:
        to_world = mesh_obj.matrix_world
        for local_corner in mesh_obj.bound_box:
            # bound_box corners are transformed by the object's world matrix.
            world_corner = to_world @ Vector(local_corner)
            for axis, value in enumerate(world_corner):
                lower[axis] = min(lower[axis], value)
                upper[axis] = max(upper[axis], value)
    return Vector(lower), Vector(upper)
def normalize_scene() -> Tuple[float, Vector]:
    """Rescales and recenters the scene so it fits a unit cube at the origin.

    Mostly taken from the Point-E / Shap-E rendering script
    (https://github.com/openai/point-e/blob/main/point_e/evals/scripts/blender_script.py#L97-L112),
    but fix for multiple root objects: (see bug report here:
    https://github.com/openai/shap-e/pull/60).

    Returns:
        Tuple[float, Vector]: The scale factor and the offset applied to the scene.
    """
    top_level = [node for node in bpy.context.scene.objects.values() if not node.parent]
    if len(top_level) <= 1:
        anchor = top_level[0]
    else:
        # Several roots: hang them all under one new empty so a single
        # transform moves the whole scene.
        anchor = bpy.data.objects.new("ParentEmpty", None)
        bpy.context.scene.collection.objects.link(anchor)
        for root in top_level:
            root.parent = anchor

    lower, upper = scene_bbox()
    extent = upper - lower
    scale = 1 / max(extent)
    anchor.scale = anchor.scale * scale

    # Apply scale to matrix_world before measuring the box again.
    bpy.context.view_layer.update()

    lower, upper = scene_bbox()
    offset = -(lower + upper) / 2
    anchor.matrix_world.translation += offset
    return scale, offset
def main(arg):
    """Loads a model, normalizes it and pickles its triangulated meshes.

    Args:
        arg: Parsed command-line namespace. ``arg.object`` is the model path
            and ``arg.output_path`` is the pickle file to write.

    The pickle holds ``{'objects': [...]}`` with one entry per mesh object,
    each carrying world-space ``vertices`` (float32) and triangle ``faces``
    (int32 vertex indices).
    """
    # Initialize context. For .blend inputs nothing is loaded here; only the
    # hidden objects of the current scene are pruned (presumably Blender was
    # started with that file already open -- confirm against the launcher).
    if arg.object.endswith(".blend"):
        delete_invisible_objects()
    else:
        init_scene()
        load_object(arg.object)
    print('[INFO] Scene initialized.')

    # Normalize scene
    normalize_scene()
    print('[INFO] Scene normalized.')

    # Start dumping
    depsgraph = bpy.context.evaluated_depsgraph_get()
    scene = bpy.context.scene
    output = {
        'objects': [],
    }

    # Dumping meshes
    for obj in scene.objects:
        if obj.type != 'MESH':
            continue
        eval_obj = obj.evaluated_get(depsgraph)
        eval_mesh = eval_obj.to_mesh()
        try:
            bm = bmesh.new()
            try:
                bm.from_mesh(eval_mesh)
                bm.transform(obj.matrix_world)
                bmesh.ops.triangulate(bm, faces=bm.faces)
                bm.to_mesh(eval_mesh)
            finally:
                bm.free()
            pack = {
                "vertices": np.array([
                    v.co[:] for v in eval_mesh.vertices
                ], dtype=np.float32),  # (N, 3)
                "faces": np.array([
                    [eval_mesh.loops[i].vertex_index for i in poly.loop_indices]
                    for poly in eval_mesh.polygons
                ], dtype=np.int32),  # (F, 3)
            }
        finally:
            # Release the temporary mesh created by to_mesh(); the numpy
            # arrays above are already independent copies.
            eval_obj.to_mesh_clear()
        output['objects'].append(pack)

    # Save output. A bare filename has an empty dirname, which makedirs rejects.
    output_dir = os.path.dirname(arg.output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(arg.output_path, 'wb') as f:
        pickle.dump(output, f)
    print('[INFO] Output saved to {}.'.format(arg.output_path))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Renders given obj file by rotation a camera around it.')
    # main() calls arg.object.endswith(...), so a missing --object must be
    # rejected here instead of failing later with an AttributeError on None.
    parser.add_argument('--object', type=str, required=True, help='Path to the 3D model file to be rendered.')
    # main() opens output_path as a file, so callers should pass a file path;
    # the default is kept unchanged for compatibility.
    parser.add_argument('--output_path', type=str, default='/tmp', help='The path the output will be dumped to.')
    # Only the arguments after the "--" separator are parsed. Without the
    # separator, parse nothing so argparse prints a usage error.
    argv = sys.argv[sys.argv.index("--") + 1:] if "--" in sys.argv else []
    args = parser.parse_args(argv)
    main(args)
\ No newline at end of file
import argparse, sys, os, math, io
from typing import *
import bpy
import bmesh
from mathutils import Vector, Matrix
import numpy as np
from PIL import Image
import pickle
"""=============== BLENDER ==============="""
# Maps a lower-case file extension to the Blender operator that imports it.
# Operators are only resolved when called, so an entry that does not exist in
# the running Blender version fails at import time, not when this dict is built.
IMPORT_FUNCTIONS: Dict[str, Callable] = {
    # OBJ/STL/PLY importers are chosen by Blender major version.
    "obj": bpy.ops.import_scene.obj if bpy.app.version[0] < 4 else bpy.ops.wm.obj_import,
    "glb": bpy.ops.import_scene.gltf,
    "gltf": bpy.ops.import_scene.gltf,
    # USD files (binary and ASCII) go through the single USD import operator.
    "usd": bpy.ops.wm.usd_import,
    "fbx": bpy.ops.import_scene.fbx,
    "stl": bpy.ops.import_mesh.stl if bpy.app.version[0] < 4 else bpy.ops.wm.stl_import,
    "usda": bpy.ops.wm.usd_import,
    "dae": bpy.ops.wm.collada_import,
    "ply": bpy.ops.import_mesh.ply if bpy.app.version[0] < 4 else bpy.ops.wm.ply_import,
    "abc": bpy.ops.wm.alembic_import,
    "blend": bpy.ops.wm.append,
}
def init_scene() -> None:
    """Resets the scene to a clean state.

    Removes every object, material, texture and image datablock, in that
    order.

    Returns:
        None
    """
    datablock_collections = (
        bpy.data.objects,
        bpy.data.materials,
        bpy.data.textures,
        bpy.data.images,
    )
    for collection in datablock_collections:
        # Iterate over a snapshot: removing entries from the collection that
        # is currently being iterated mutates it under the iterator.
        for datablock in list(collection):
            collection.remove(datablock, do_unlink=True)
def load_object(object_path: str) -> None:
    """Loads a model with a supported file extension into the scene.

    Args:
        object_path (str): Path to the model file.

    Raises:
        ValueError: If the file extension is not supported.

    Returns:
        None
    """
    file_extension = object_path.split(".")[-1].lower()

    if file_extension == "usdz":
        # install usdz io package (expected next to this script)
        dirname = os.path.dirname(os.path.realpath(__file__))
        usdz_package = os.path.join(dirname, "io_scene_usdz.zip")
        bpy.ops.preferences.addon_install(filepath=usdz_package)
        # enable it
        addon_name = "io_scene_usdz"
        bpy.ops.preferences.addon_enable(module=addon_name)
        # import the usdz
        from io_scene_usdz.import_usdz import import_usdz
        # No module-level `context` exists in this file; use bpy.context.
        import_usdz(bpy.context, filepath=object_path, materials=True, animations=True)
        return None

    # load from existing import functions
    import_function = IMPORT_FUNCTIONS.get(file_extension)
    if import_function is None:
        # Report unknown extensions as the documented ValueError rather than
        # letting a bare KeyError escape from the lookup.
        raise ValueError(f"Unsupported file type: {object_path}")

    print(f"Loading object from {object_path}")
    if file_extension == "blend":
        import_function(directory=object_path, link=False)
    elif file_extension in {"glb", "gltf"}:
        import_function(filepath=object_path, merge_vertices=True, import_shading='NORMALS', bone_heuristic='TEMPERANCE')
    else:
        import_function(filepath=object_path)
def delete_invisible_objects() -> None:
    """Removes hidden objects and hidden collections from the scene.

    An object counts as hidden when it is hidden in the viewport or in
    renders. Its hide flags are cleared first and then it is removed.
    Collections hidden in the viewport are removed afterwards.

    Returns:
        None
    """
    hidden_objects = [
        candidate
        for candidate in bpy.context.scene.objects
        if candidate.hide_viewport or candidate.hide_render
    ]
    # Clear the hide flags on every hidden object before removing any of them.
    for hidden in hidden_objects:
        hidden.hide_viewport = hidden.hide_render = hidden.hide_select = False
    for hidden in hidden_objects:
        bpy.data.objects.remove(hidden, do_unlink=True)

    # Collect first, remove second, so the collection list is not mutated
    # while it is being scanned.
    hidden_collections = [coll for coll in bpy.data.collections if coll.hide_viewport]
    for coll in hidden_collections:
        bpy.data.collections.remove(coll)
def scene_bbox() -> Tuple[Vector, Vector]:
    """Computes the world-space bounding box of all mesh objects in the scene.

    Taken from Shap-E rendering script
    (https://github.com/openai/shap-e/blob/main/shap_e/rendering/blender/blender_script.py#L68-L82)

    Returns:
        Tuple[Vector, Vector]: The minimum and maximum coordinates of the bounding box.

    Raises:
        RuntimeError: If the scene contains no mesh objects.
    """
    mesh_objects = [
        candidate
        for candidate in bpy.context.scene.objects.values()
        if isinstance(candidate.data, bpy.types.Mesh)
    ]
    if not mesh_objects:
        raise RuntimeError("no objects in scene to compute bounding box for")

    lower = [math.inf] * 3
    upper = [-math.inf] * 3
    for mesh_obj in mesh_objects:
        to_world = mesh_obj.matrix_world
        for local_corner in mesh_obj.bound_box:
            # bound_box corners are transformed by the object's world matrix.
            world_corner = to_world @ Vector(local_corner)
            for axis, value in enumerate(world_corner):
                lower[axis] = min(lower[axis], value)
                upper[axis] = max(upper[axis], value)
    return Vector(lower), Vector(upper)
def normalize_scene() -> Tuple[float, Vector]:
    """Rescales and recenters the scene so it fits a unit cube at the origin.

    Mostly taken from the Point-E / Shap-E rendering script
    (https://github.com/openai/point-e/blob/main/point_e/evals/scripts/blender_script.py#L97-L112),
    but fix for multiple root objects: (see bug report here:
    https://github.com/openai/shap-e/pull/60).

    Returns:
        Tuple[float, Vector]: The scale factor and the offset applied to the scene.
    """
    top_level = [node for node in bpy.context.scene.objects.values() if not node.parent]
    if len(top_level) <= 1:
        anchor = top_level[0]
    else:
        # Several roots: hang them all under one new empty so a single
        # transform moves the whole scene.
        anchor = bpy.data.objects.new("ParentEmpty", None)
        bpy.context.scene.collection.objects.link(anchor)
        for root in top_level:
            root.parent = anchor

    lower, upper = scene_bbox()
    extent = upper - lower
    scale = 1 / max(extent)
    anchor.scale = anchor.scale * scale

    # Apply scale to matrix_world before measuring the box again.
    bpy.context.view_layer.update()

    lower, upper = scene_bbox()
    offset = -(lower + upper) / 2
    anchor.matrix_world.translation += offset
    return scale, offset
# =============== NODE TREE PARSING ===============
def extract_image(tex_node, channels):
    """Encodes selected channels of a texture node's image as PNG bytes.

    Args:
        tex_node: Image texture node. Its ``image``, ``interpolation`` and
            ``extension`` attributes are read.
        channels: A single channel index (gives a single-channel image) or a
            list of channel indices (gives a multi-channel image).

    Returns:
        dict: ``'image'`` (PNG-encoded bytes), ``'interpolation'`` and
        ``'extension'`` (both copied from the node).

    Raises:
        ValueError: If the selected data is neither single-channel nor has
            3 or 4 channels.
    """
    image = tex_node.image
    pixels = np.array(image.pixels[:])
    # image.size is indexed as (width, height); rows come first in the array.
    # NOTE(review): rows are kept in the order image.pixels provides them, with
    # no vertical flip applied -- confirm consumers expect that orientation.
    data = pixels.reshape(image.size[1], image.size[0], -1)
    data = data[..., channels]
    if data.dtype != np.uint8:
        data = np.clip(data, 0.0, 1.0)
        # Round to the nearest level. Plain truncation maps a product that
        # lands just below an integer (e.g. 254.99999 from float round-off)
        # to the next lower level.
        data = np.rint(data * 255).astype(np.uint8)

    if len(data.shape) == 2:  # Single channel
        pil_image = Image.fromarray(data, mode='L')
    elif data.shape[2] == 3:
        pil_image = Image.fromarray(data, mode='RGB')
    elif data.shape[2] == 4:
        pil_image = Image.fromarray(data, mode='RGBA')
    else:
        raise ValueError("Unsupported channel shape for image")

    buffer = io.BytesIO()
    pil_image.save(buffer, format='PNG')
    png_bytes = buffer.getvalue()
    return {
        'image': png_bytes,
        'interpolation': tex_node.interpolation,
        'extension': tex_node.extension,
    }
def try_extract_image(link, expected_channel='RGB'):
    """
    Extracts the image feeding a node link, for one expected channel layout.

    Accepted upstream shapes (anything else fails an assertion):
        - RGB:      TEX_IMAGE (Color output) ->
        - R, G, B:  TEX_IMAGE (Color output) -> SEPARATE_COLOR in RGB mode ->
        - A:        TEX_IMAGE (Alpha output) ->

    Args:
        link: Node link whose source side is inspected.
        expected_channel: One of 'RGB', 'R', 'G', 'B', 'A'.

    Returns:
        The dict produced by extract_image for the selected channel(s).
    """
    assert expected_channel in ['RGB', 'R', 'G', 'B', 'A'], "Unsupported channel"

    if expected_channel == 'RGB':
        assert link.from_node.type == 'TEX_IMAGE', "Material is not supported"
        assert link.from_socket.name == 'Color', "Material is not supported"
        return extract_image(link.from_node, [0, 1, 2])

    if expected_channel == 'A':
        assert link.from_node.type == 'TEX_IMAGE', "Material is not supported"
        assert link.from_socket.name == 'Alpha', "Material is not supported"
        return extract_image(link.from_node, 3)

    # Single colour channel: the link must come out of a Separate Color node
    # that is itself fed by an image texture's Color output.
    output_name, channel_index = {
        'R': ('Red', 0),
        'G': ('Green', 1),
        'B': ('Blue', 2),
    }[expected_channel]
    assert link.from_node.type == 'SEPARATE_COLOR' and link.from_node.mode == 'RGB', \
        f"Material is not supported, {link.from_node.type}, {link.from_node.mode}"
    assert link.from_socket.name == output_name, "Material is not supported"
    separator = link.from_node
    color_input = separator.inputs[0]
    assert color_input.is_linked and color_input.links[0].from_node.type == 'TEX_IMAGE', \
        "Material is not supported"
    upstream = color_input.links[0]
    assert upstream.from_socket.name == 'Color', "Material is not supported"
    return extract_image(upstream.from_node, channel_index)
def try_extract_factor(link, mode='color'):
    """
    Peels an optional constant multiplier off a node link.

    Recognised sub trees:
        - color:   ANY -> MIX (RGBA, MULTIPLY, Factor fixed at 1.0) ->
        - scalar:  ANY -> MATH (MULTIPLY, constant on the second input) ->

    Args:
        link: Node link whose source node is inspected.
        mode: 'color' or 'scalar'.

    Returns:
        (factor, link): the constant and the link feeding the multiply node.
        When no multiply node is present the factor is neutral
        ([1.0, 1.0, 1.0] or 1.0) and the given link is returned unchanged.
    """
    assert mode in ['color','scalar'], "Unsupported mode"
    source = link.from_node

    if mode == 'color':
        if source.type != 'MIX':
            return ([1.0, 1.0, 1.0], link)
        assert source.data_type == 'RGBA' and source.blend_type == 'MULTIPLY', f"Material is not supported, {source.data_type}, {source.blend_type}"
        factor_socket = source.inputs['Factor']
        assert not factor_socket.is_linked and factor_socket.default_value == 1.0, \
            "Material is not supported"
        # NOTE(review): sockets are looked up by the names 'A' and 'B'. If the
        # Mix node exposes several sockets with those names (float / vector /
        # color variants), confirm the lookup resolves to the color pair.
        socket_a = source.inputs['A']
        socket_b = source.inputs['B']
        # Exactly one of A/B must be linked; the other supplies the constant.
        if socket_a.is_linked:
            assert not socket_b.is_linked, "Material is not supported"
            constant_socket, linked_socket = socket_b, socket_a
        else:
            assert socket_b.is_linked, "Material is not supported"
            constant_socket, linked_socket = socket_a, socket_b
        return (list(constant_socket.default_value)[:3], linked_socket.links[0])

    if mode =='scalar':
        if source.type != 'MATH':
            return (1.0, link)
        assert source.operation == 'MULTIPLY', "Material is not supported"
        assert source.inputs[0].is_linked, "Material is not supported"
        assert not source.inputs[1].is_linked, "Material is not supported"
        multiplier = source.inputs[1].default_value
        return (multiplier, source.inputs[0].links[0])
def try_extract_image_with_factor(link, expected_channel='RGB'):
    """
    Extracts a (factor, image) pair from the sub tree behind a node link.

    The optional multiply node is peeled off first: as a color factor for
    'RGB', as a scalar factor for every other channel. The image is then read
    from the link that feeds it.
    """
    factor_mode = 'color' if expected_channel == 'RGB' else 'scalar'
    factor, source_link = try_extract_factor(link, factor_mode)
    return (factor, try_extract_image(source_link, expected_channel))
def _dump_material(mat):
    """Converts one node material into a glTF-style PBR parameter dict.

    Reads the Base Color, Alpha, Metallic and Roughness inputs of the node
    named 'Principled BSDF'. Unsupported node layouts raise AssertionError,
    either here or in the try_extract_* helpers.
    """
    pack = {
        "baseColorFactor": [1.0, 1.0, 1.0],
        "alphaFactor": 1.0,
        "metallicFactor": 1.0,
        "roughnessFactor": 1.0,
        "alphaMode": "OPAQUE",
        "alphaCutoff": 0.5,
        "baseColorTexture": None,
        "alphaTexture": None,
        "metallicTexture": None,
        "roughnessTexture": None,
    }
    principled_node = mat.node_tree.nodes.get('Principled BSDF')
    assert principled_node is not None, "Material is not supported"

    # Handle base color
    base_color = principled_node.inputs['Base Color']
    if not base_color.is_linked:
        pack["baseColorFactor"] = list(base_color.default_value)
    else:
        link = base_color.links[0]
        if link.from_node.type == 'RGB':
            pack["baseColorFactor"] = list(link.from_node.outputs[0].default_value)
        else:
            factor, image = try_extract_image_with_factor(link, 'RGB')
            pack["baseColorFactor"] = factor
            pack["baseColorTexture"] = image

    # Handle alpha
    alpha = principled_node.inputs['Alpha']
    if not alpha.is_linked:
        pack["alphaFactor"] = alpha.default_value
        if pack["alphaFactor"] < 1.0:
            pack["alphaMode"] = "BLEND"
    else:
        link = alpha.links[0]
        node = link.from_node
        if node.type == 'VALUE':
            pack["alphaFactor"] = node.outputs[0].default_value
            if pack["alphaFactor"] < 1.0:
                pack["alphaMode"] = "BLEND"
        else:
            # A textured alpha is BLEND unless a recognised thresholding
            # sub tree (ROUND, or 1 - LESS_THAN) marks it as MASK.
            pack["alphaMode"] = "BLEND"
            if node.type == 'MATH':
                if node.operation == 'ROUND':
                    assert node.inputs[0].is_linked, "Material is not supported"
                    pack["alphaMode"] = "MASK"
                    link = node.inputs[0].links[0]
                elif node.operation == 'SUBTRACT':
                    assert node.inputs[0].default_value == 1.0 and \
                        node.inputs[1].is_linked and \
                        node.inputs[1].links[0].from_node.type == 'MATH' and \
                        node.inputs[1].links[0].from_node.operation == 'LESS_THAN', \
                        "Material is not supported"
                    less_than = node.inputs[1].links[0].from_node
                    assert less_than.inputs[0].is_linked, "Material is not supported"
                    pack["alphaMode"] = "MASK"
                    pack["alphaCutoff"] = less_than.inputs[1].default_value
                    link = less_than.inputs[0].links[0]
            factor, image = try_extract_image_with_factor(link, 'A')
            pack["alphaFactor"] = factor
            pack["alphaTexture"] = image

    # Handle metallic (read from the blue channel) and roughness (green channel)
    for input_name, key, channel in (
        ('Metallic', 'metallic', 'B'),
        ('Roughness', 'roughness', 'G'),
    ):
        socket = principled_node.inputs[input_name]
        if not socket.is_linked:
            pack[key + "Factor"] = socket.default_value
            continue
        link = socket.links[0]
        node = link.from_node
        if node.type == 'VALUE':
            pack[key + "Factor"] = node.outputs[0].default_value
        else:
            factor, image = try_extract_image_with_factor(link, channel)
            pack[key + "Factor"] = factor
            pack[key + "Texture"] = image
    return pack


def _dump_mesh_object(obj, depsgraph):
    """Dumps one mesh object's evaluated, triangulated, world-space geometry.

    Returns a dict with 'vertices', 'faces', 'normals', 'uvs' (None when the
    mesh has no active UV layer) and 'mat_ids' (index into bpy.data.materials
    per face, -1 when the face has no material).
    """
    pack = {
        "vertices": None,
        "faces": None,
        "uvs": None,
        # Kept as a None placeholder exactly as before; the per-face material
        # ids are stored under "mat_ids" below.
        "matIDs": None,
    }
    eval_obj = obj.evaluated_get(depsgraph)
    eval_mesh = eval_obj.to_mesh()
    try:
        bm = bmesh.new()
        try:
            bm.from_mesh(eval_mesh)
            bm.transform(obj.matrix_world)
            bmesh.ops.triangulate(bm, faces=bm.faces)
            bm.to_mesh(eval_mesh)
        finally:
            bm.free()

        polygons = eval_mesh.polygons
        pack["vertices"] = np.array([
            v.co[:] for v in eval_mesh.vertices
        ], dtype=np.float32)  # (N, 3)
        pack["faces"] = np.array([
            [eval_mesh.loops[i].vertex_index for i in poly.loop_indices]
            for poly in polygons
        ], dtype=np.int32)  # (F, 3)
        pack["normals"] = np.array([
            [eval_mesh.loops[i].normal for i in poly.loop_indices]
            for poly in polygons
        ], dtype=np.float32)  # (F, 3, 3)
        if eval_mesh.uv_layers.active is not None:
            uv_data = eval_mesh.uv_layers.active.data
            pack["uvs"] = np.array([
                [uv_data[i].uv for i in poly.loop_indices]
                for poly in polygons
            ], dtype=np.float32)  # (F, 3, 2)

        # Resolve each material slot once instead of once per polygon.
        slot_ids = [
            bpy.data.materials.find(slot.name) if slot.material is not None else -1
            for slot in obj.material_slots
        ]
        pack["mat_ids"] = np.array([
            slot_ids[poly.material_index] if slot_ids else -1
            for poly in polygons
        ], dtype=np.int32)
    finally:
        # Release the temporary mesh created by to_mesh(); the numpy arrays
        # above are already independent copies.
        eval_obj.to_mesh_clear()
    return pack


def main(arg):
    """Loads a model, normalizes it and pickles its materials and meshes.

    Args:
        arg: Parsed command-line namespace. ``arg.object`` is the model path
            and ``arg.output_path`` is the pickle file to write.

    Raises:
        RuntimeError: If a material uses an unsupported node layout. The
            material's node names are first written to
            ``<output_path>_error.txt``.
    """
    # Initialize context. For .blend inputs nothing is loaded here; only the
    # hidden objects of the current scene are pruned (presumably Blender was
    # started with that file already open -- confirm against the launcher).
    if arg.object.endswith(".blend"):
        delete_invisible_objects()
    else:
        init_scene()
        load_object(arg.object)
    print('[INFO] Scene initialized.')

    # Normalize scene
    normalize_scene()
    print('[INFO] Scene normalized.')

    # Create the output directory up front so the error report can also be
    # written. A bare filename has an empty dirname, which makedirs rejects.
    output_dir = os.path.dirname(arg.output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Start dumping
    depsgraph = bpy.context.evaluated_depsgraph_get()
    scene = bpy.context.scene
    output = {
        'materials': [],
        'objects': [],
    }

    # Dumping materials. Entries are appended in bpy.data.materials order, so
    # the 'mat_ids' of the meshes index directly into this list.
    for mat in bpy.data.materials:
        assert mat.use_nodes, "Material is not supported"
        try:
            output['materials'].append(_dump_material(mat))
        except Exception as e:
            with open(arg.output_path + '_error.txt', 'w') as f:
                f.write(str([[n.name] for n in mat.node_tree.nodes]))
            raise RuntimeError("Material is not supported") from e

    # Dumping meshes
    for obj in scene.objects:
        if obj.type != 'MESH':
            continue
        output['objects'].append(_dump_mesh_object(obj, depsgraph))

    # Save output
    with open(arg.output_path, 'wb') as f:
        pickle.dump(output, f)
    print('[INFO] Output saved to {}.'.format(arg.output_path))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Renders given obj file by rotation a camera around it.')
    # main() calls arg.object.endswith(...), so a missing --object must be
    # rejected here instead of failing later with an AttributeError on None.
    parser.add_argument('--object', type=str, required=True, help='Path to the 3D model file to be rendered.')
    # main() opens output_path as a file, so callers should pass a file path;
    # the default is kept unchanged for compatibility.
    parser.add_argument('--output_path', type=str, default='/tmp', help='The path the output will be dumped to.')
    # Only the arguments after the "--" separator are parsed. Without the
    # separator, parse nothing so argparse prints a usage error.
    argv = sys.argv[sys.argv.index("--") + 1:] if "--" in sys.argv else []
    args = parser.parse_args(argv)
    main(args)
\ No newline at end of file
import subprocess
import sys
import ensurepip
# Bootstrap pip into the running interpreter, then use that same interpreter
# to install Pillow. A failing pip run raises CalledProcessError.
ensurepip.bootstrap()
pillow_install_command = [sys.executable, "-m", "pip", "install", "Pillow"]
subprocess.check_call(pillow_install_command)
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment