Commit 106580f9 authored by chenych

First commit
_base_ = [
'./_base_/default_runtime.py',
'./_base_/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = None
use_gt_bbox = True
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[192, 256], # [48, 64]
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=use_gt_bbox,
det_bbox_thr=0.0,
bbox_file='datasets/coco_pose/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
sigma = [1.5, 3]
aug_idx = 0
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
# dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0),
# dict(
# type='TopDownHalfBodyTransform',
# num_joints_half_body=8,
# prob_half_body=0.3),
# dict(
# type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
# dict(type='ToTensor'),
# dict(
# type='NormalizeTensor',
# mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTargetCustom',
sigma=sigma,
# the following are custom args
use_gt_bbox=use_gt_bbox,
dir_name='train_256x192_aug{}'.format(aug_idx),
target_path='datasets/coco_pose/data_pair',
),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
# dict(type='TopDownRandomFlip', flip_prob=1), # for flip test
dict(type='TopDownAffine'),
# dict(type='ToTensor'),
# dict(
# type='NormalizeTensor',
# mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTargetCustom',
sigma=sigma,
# the following are custom args
use_gt_bbox=use_gt_bbox,
dir_name='val_256x192',
target_path='datasets/coco_pose/data_pair',
),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'datasets/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
load_data_only=True, # custom arg
train=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
# import newly registered module
custom_imports = dict(
imports=[
'model.top_down',
'data.topdown_coco_dataset',
'data.pipelines.top_down_transform',
],
allow_failed_imports=False)
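# Data-generation config: with model=None and load_data_only=True this file only
# drives the dataloader; TopDownGenerateTargetCustom saves the encoded image/label
# pairs to datasets/coco_pose/data_pair/train_256x192_aug0/ (and val_256x192/) as a
# side effect of iterating the dataset, so no backbone or checkpoint is needed.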
_base_ = [
'./_base_/default_runtime.py',
'./_base_/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = None
use_gt_bbox = False
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[192, 256], # [48, 64]
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=use_gt_bbox,
det_bbox_thr=0.0,
bbox_file='datasets/coco_pose/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
sigma = [1.5, 3] # 2
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
# dict(type='ToTensor'),
# dict(
# type='NormalizeTensor',
# mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTargetCustom',
sigma=sigma,
# the following are custom args
use_gt_bbox=use_gt_bbox,
dir_name='train_256x192_aug0',
target_path='datasets/coco_pose/data_pair',
),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
# dict(type='TopDownRandomFlip', flip_prob=1), # for flip test
dict(type='TopDownAffine'),
# dict(type='ToTensor'),
# dict(
# type='NormalizeTensor',
# mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTargetCustom',
sigma=sigma,
# the following are custom args
use_gt_bbox=use_gt_bbox,
dir_name='test_256x192',
target_path='datasets/coco_pose/data_pair',
),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'datasets/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
load_data_only=True, # custom arg
train=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
# import newly registered module
custom_imports = dict(
imports=[
'model.top_down',
'data.topdown_coco_dataset',
'data.pipelines.top_down_transform',
],
allow_failed_imports=False)
_base_ = [
'./_base_/default_runtime.py',
'./_base_/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# model settings
model = None
use_gt_bbox = False
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[192, 256], # [48, 64]
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=use_gt_bbox,
det_bbox_thr=0.0,
bbox_file='datasets/coco_pose/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
sigma = [1.5, 3] # 2
train_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomShiftBboxCenter', shift_factor=0.16, prob=0.3),
dict(type='TopDownRandomFlip', flip_prob=0.5),
dict(
type='TopDownHalfBodyTransform',
num_joints_half_body=8,
prob_half_body=0.3),
dict(
type='TopDownGetRandomScaleRotation', rot_factor=40, scale_factor=0.5),
dict(type='TopDownAffine'),
# dict(type='ToTensor'),
# dict(
# type='NormalizeTensor',
# mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTargetCustom',
sigma=sigma,
# the following are custom args
use_gt_bbox=use_gt_bbox,
dir_name='train_256x192_aug0',
target_path='datasets/coco_pose/data_pair',
),
dict(
type='Collect',
keys=['img', 'target', 'target_weight'],
meta_keys=[
'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale',
'rotation', 'bbox_score', 'flip_pairs'
]),
]
val_pipeline = [
dict(type='LoadImageFromFile'),
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(type='TopDownRandomFlip', flip_prob=1), # for flip test
dict(type='TopDownAffine'),
# dict(type='ToTensor'),
# dict(
# type='NormalizeTensor',
# mean=[0.485, 0.456, 0.406],
# std=[0.229, 0.224, 0.225]),
dict(
type='TopDownGenerateTargetCustom',
sigma=sigma,
# the following are custom args
use_gt_bbox=use_gt_bbox,
dir_name='test_256x192_flip',
target_path='datasets/coco_pose/data_pair',
),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'datasets/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
load_data_only=True, # custom arg
train=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_train2017.json',
img_prefix=f'{data_root}/train2017/',
data_cfg=data_cfg,
pipeline=train_pipeline,
dataset_info={{_base_.dataset_info}}),
val=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
img_prefix=f'{data_root}/val2017/',
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
# import newly registered module
custom_imports = dict(
imports=[
'model.top_down',
'data.topdown_coco_dataset',
'data.pipelines.top_down_transform',
],
allow_failed_imports=False)
import os
job_name = "painter_vit_large"
ckpt_file = "painter_vit_large.pth"
prompt = "000000000165_box0"
image_dir = 'models_inference/{}/coco_pose_inference_{}_{}/'.format(job_name, ckpt_file, prompt)
if not image_dir[-1] == "/":
image_dir = image_dir + '/'
print(image_dir)
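# Note: image_dir must match the dst_dir written by the Painter inference script
# later in this commit, i.e. models_inference/<job_name>/coco_pose_inference_<ckpt_file>_<prompt>/;
# the job_name, ckpt_file and prompt values above are illustrative defaults.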
_base_ = [
'./_base_/default_runtime.py',
'./_base_/coco.py'
]
evaluation = dict(interval=10, metric='mAP', save_best='AP')
optimizer = dict(
type='Adam',
lr=5e-4,
)
optimizer_config = dict(grad_clip=None)
# learning policy
lr_config = dict(
policy='step',
warmup='linear',
warmup_iters=500,
warmup_ratio=0.001,
step=[170, 200])
total_epochs = 210
channel_cfg = dict(
num_output_channels=17,
dataset_joints=17,
dataset_channel=[
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
],
inference_channel=[
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
])
# fake model settings
model = dict(
type='TopDownCustom',
pretrained=None,
backbone=dict(
type='HRNet',
in_channels=3,
extra=dict(
stage1=dict(
num_modules=1,
num_branches=1,
block='BOTTLENECK',
num_blocks=(4, ),
num_channels=(64, )),
stage2=dict(
num_modules=1,
num_branches=2,
block='BASIC',
num_blocks=(4, 4),
num_channels=(32, 64)),
stage3=dict(
num_modules=4,
num_branches=3,
block='BASIC',
num_blocks=(4, 4, 4),
num_channels=(32, 64, 128)),
stage4=dict(
num_modules=3,
num_branches=4,
block='BASIC',
num_blocks=(4, 4, 4, 4),
num_channels=(32, 64, 128, 256))),
),
keypoint_head=dict(
type='TopdownHeatmapSimpleHead',
in_channels=32,
out_channels=channel_cfg['num_output_channels'],
num_deconv_layers=0,
extra=dict(final_conv_kernel=1, ),
loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)),
train_cfg=dict(),
test_cfg=dict(
flip_test=True,
post_process='default',
shift_heatmap=True,
modulate_kernel=17))
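# Note: the HRNet definition above is only a placeholder so that build_posenet()
# can construct a TopDownCustom instance; in pseudo_test mode no checkpoint is
# loaded and forward_pseudo_test decodes the painted RGB predictions directly.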
data_cfg = dict(
image_size=[192, 256],
heatmap_size=[192, 256],
# heatmap_size=[48, 64],
# image_size=[640, 320], # w, h
# heatmap_size=[640, 320],
num_output_channels=channel_cfg['num_output_channels'],
num_joints=channel_cfg['dataset_joints'],
dataset_channel=channel_cfg['dataset_channel'],
inference_channel=channel_cfg['inference_channel'],
soft_nms=False,
nms_thr=1.0,
oks_thr=0.9,
vis_thr=0.2,
use_gt_bbox=False,
imagename_with_boxid=True, # custom
det_bbox_thr=0.0,
bbox_file='datasets/coco_pose/person_detection_results/'
'COCO_val2017_detections_AP_H_56_person.json',
)
# sigma = [1.5, 3] # 2
sigma = 3 # use the hyper-parameter of the R channel, which encodes the heatmap
val_pipeline = [
dict(type='LoadImageFromFile'), # load custom images according to filename and box_id, using topdown_coco_dataset
dict(type='TopDownGetBboxCenterScale', padding=1.25),
dict(
type='Collect',
keys=['img'],
meta_keys=[
'image_file', 'center', 'scale', 'rotation', 'bbox_score',
'flip_pairs'
]),
]
test_pipeline = val_pipeline
data_root = 'datasets/coco'
data = dict(
samples_per_gpu=32,
workers_per_gpu=8,
val_dataloader=dict(samples_per_gpu=32),
test_dataloader=dict(samples_per_gpu=32),
pseudo_test=True, # custom arg
val=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
# img_prefix=f'{data_root}/val2017/',
img_prefix=image_dir,
data_cfg=data_cfg,
pipeline=val_pipeline,
dataset_info={{_base_.dataset_info}}),
test=dict(
type='TopDownCocoDatasetCustom',
ann_file=f'{data_root}/annotations/person_keypoints_val2017.json',
# img_prefix=f'{data_root}/val2017/',
img_prefix=image_dir,
data_cfg=data_cfg,
pipeline=test_pipeline,
dataset_info={{_base_.dataset_info}}),
)
# import newly registered module
custom_imports = dict(
imports=[
'model.top_down',
'data.topdown_coco_dataset',
'data.pipelines.top_down_transform',
],
allow_failed_imports=False)
import os
import random
import warnings
import cv2
import numpy as np
from PIL import Image
def define_colors_gb_mean_sep(num_locations=17):
num_sep_per_channel = int(num_locations ** (1 / 2)) + 1 # 5
separation_per_channel = 256 // num_sep_per_channel # 51
color_dict = {}
# R = G = B = 0
# B += separation_per_channel # offset for the first loop
for location in range(num_locations):
num_seq_g = location // num_sep_per_channel
num_seq_b = location % num_sep_per_channel
assert (num_seq_g <= num_sep_per_channel) and (num_seq_b <= num_sep_per_channel)
G = 255 - num_seq_g * separation_per_channel
B = 255 - num_seq_b * separation_per_channel
assert (G < 256) and (B < 256)
assert (G >= 0) and (B >= 0)
assert (G, B) not in color_dict.values()
color_dict[location] = (G, B)
# print(location, (num_seq_g, num_seq_b), (G, B))
# colors = [v for k, v in color_dict.items()]
# min values in gb: [51, 51]
return color_dict
color_dict = define_colors_gb_mean_sep()
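# Resulting palette for the default 17 keypoints (num_sep_per_channel=5,
# separation_per_channel=51), shown here as (G, B) pairs for reference:
#   color_dict[0]  == (255, 255)
#   color_dict[1]  == (255, 204)
#   color_dict[4]  == (255, 51)
#   color_dict[5]  == (204, 255)
#   color_dict[16] == (102, 204)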
def encode_target_to_image(target, target_weight, target_dir, metas):
if len(target.shape) == 3:
return encode_rgb_target_to_image(
target_kernel=target, target_class=target,
target_weight_kernel=target_weight, target_weight_class=target_weight,
target_dir=target_dir, metas=metas,
)
assert len(target.shape) == 4
return encode_rgb_target_to_image(
target_kernel=target[1], target_class=target[0],
target_weight_kernel=target_weight[1], target_weight_class=target_weight[0],
target_dir=target_dir, metas=metas,
)
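# Note: with sigma=[1.5, 3] in the configs above, target has shape (2, 17, H, W);
# index 0 (the sharper sigma=1.5 map) defines the per-keypoint class area painted
# into the G/B channels, while index 1 (sigma=3) provides the soft kernel painted
# into the R channel.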
def check_input(target_weight, target, metas):
if not ((target_weight.reshape(17, 1, 1) * target) == target).all():
print("useful target_weight!")
target = target_weight.reshape(17, 1, 1) * target
# make sure the invisible part is weighted zero, and thus not shown in target
if not (target_weight[np.sum(metas['joints_3d_visible'], axis=1) == 0] == 0).all():
print(metas['image_file'], "may have joints_3d_visible problems!")
def encode_rgb_target_to_image(target_kernel, target_class, target_weight_kernel, target_weight_class, target_dir, metas):
"""
Args:
target: ndarray (17, 256, 192)
target_weight: ndarray (17, 1)
metas: dict
Returns:
an RGB image, R encodes heatmap, GB encodes class
"""
check_input(target_weight_kernel, target_kernel, metas)
check_input(target_weight_class, target_class, metas)
# 1. handle kernel in R channel
# get max value for collision area
sum_kernel = target_kernel.max(0) # (256, 192)
max_kernel_indices = target_kernel.argmax(0) # (256, 192)
R = sum_kernel[:, :, None] * 255. # (256, 192, 1)
# 2. handle class in BG channels
K, H, W = target_class.shape
keypoint_areas_class = []
for keypoint_idx in range(K):
mask = target_class[keypoint_idx] != 0
keypoint_areas_class.append(mask)
keypoint_areas_class = np.stack(keypoint_areas_class) # (17, 256, 192)
num_pos_per_location_class = keypoint_areas_class.sum(0) # (256, 192)
collision_area_class = num_pos_per_location_class > 1 # (256, 192)
GB_MultiChannel = np.zeros((17, 256, 192, 2))
for keypoint_idx in range(K):
color = color_dict[keypoint_idx]
class_mask = keypoint_areas_class[keypoint_idx]
GB_MultiChannel[keypoint_idx][class_mask] = color
GB = GB_MultiChannel.sum(0) # (256, 192, 2)
if np.sum(collision_area_class) != 0:
for keypoint_idx in range(K):
color = color_dict[keypoint_idx]
# argmax assigns background pixels to keypoint 0 as well, but they are filtered out by collision_area_class below
max_area_this_keypoint = max_kernel_indices == keypoint_idx
area_of_interest = max_area_this_keypoint * collision_area_class
if not (area_of_interest == 0).all():
GB[area_of_interest] = color
# 3. get images / labels and save
image_label = np.concatenate([R, GB], axis=-1).astype(np.uint8) # (256, 192, 3)
image_label = Image.fromarray(image_label)
image = metas['img']
image = Image.fromarray(image)
box_idx = metas['bbox_id']
_, filename = os.path.dirname(metas['image_file']), os.path.basename(metas['image_file'])
image_path = os.path.join(target_dir, filename.replace(".jpg", "_box{}_image.png".format(box_idx)))
label_path = os.path.join(target_dir, filename.replace(".jpg", "_box{}_label.png".format(box_idx)))
# if os.path.exists(image_path):
# print(image_path, "exist! return!")
# return
image.save(image_path)
image_label.save(label_path)
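# Minimal usage sketch (assumptions: shapes follow the docstring above and the
# metas keys mirror what TopDownGenerateTargetCustom passes via `results`;
# the dummy file name and /tmp output dir are illustrative only):
if __name__ == "__main__":
    _target = np.zeros((17, 256, 192), dtype=np.float32)
    _target_weight = np.ones((17, 1), dtype=np.float32)
    _metas = dict(
        img=np.zeros((256, 192, 3), dtype=np.uint8),
        bbox_id=0,
        image_file='/tmp/dummy.jpg',
        joints_3d_visible=np.ones((17, 3), dtype=np.float32))
    # writes dummy_box0_image.png and dummy_box0_label.png to /tmp
    encode_target_to_image(_target, _target_weight, target_dir='/tmp', metas=_metas)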
# Copyright (c) OpenMMLab. All rights reserved.
import warnings
import os
from PIL import Image
import cv2
import numpy as np
from mmpose.core.bbox import bbox_xywh2cs
from mmpose.core.post_processing import (affine_transform, fliplr_joints,
get_affine_transform, get_warp_matrix,
warp_affine_joints)
from mmpose.datasets.builder import PIPELINES
from mmpose.datasets.pipelines import TopDownGenerateTarget
from .custom_transform import encode_target_to_image
@PIPELINES.register_module()
class TopDownGenerateTargetCustom(TopDownGenerateTarget):
"""Generate the target heatmap.
Required key: 'joints_3d', 'joints_3d_visible', 'ann_info'.
Modified keys: 'target' and 'target_weight'.
Args:
sigma: Sigma of heatmap gaussian for 'MSRA' approach.
kernel: Kernel of heatmap gaussian for 'Megvii' approach.
encoding (str): Approach to generate target heatmaps.
Currently supported approaches: 'MSRA', 'Megvii', 'UDP'.
Default:'MSRA'
unbiased_encoding (bool): Option to use unbiased
encoding methods.
Paper ref: Zhang et al. Distribution-Aware Coordinate
Representation for Human Pose Estimation (CVPR 2020).
keypoint_pose_distance: Keypoint pose distance for UDP.
Paper ref: Huang et al. The Devil is in the Details: Delving into
Unbiased Data Processing for Human Pose Estimation (CVPR 2020).
target_type (str): supported targets: 'GaussianHeatmap',
'CombinedTarget'. Default:'GaussianHeatmap'
CombinedTarget: The combination of classification target
(response map) and regression target (offset map).
Paper ref: Huang et al. The Devil is in the Details: Delving into
Unbiased Data Processing for Human Pose Estimation (CVPR 2020).
"""
def __init__(self,
sigma=2,
kernel=(11, 11),
valid_radius_factor=0.0546875,
target_type='GaussianHeatmap',
encoding='MSRA',
unbiased_encoding=False,
# the following are custom args
target_path=None,
dir_name=None,
use_gt_bbox=True):
super().__init__(
sigma=sigma,
kernel=kernel,
valid_radius_factor=valid_radius_factor,
target_type=target_type,
encoding=encoding,
unbiased_encoding=unbiased_encoding)
self.target_path = target_path
self.dir_name = dir_name
self.use_gt_bbox = use_gt_bbox
target_dir = os.path.join(self.target_path, self.dir_name)
if not os.path.exists(target_dir):
os.makedirs(target_dir)
def __call__(self, results):
"""Generate the target heatmap."""
joints_3d = results['joints_3d']
joints_3d_visible = results['joints_3d_visible']
assert self.encoding in ['MSRA', 'Megvii', 'UDP']
if self.encoding == 'MSRA':
if isinstance(self.sigma, list):
num_sigmas = len(self.sigma)
cfg = results['ann_info']
num_joints = cfg['num_joints']
heatmap_size = cfg['heatmap_size']
target = np.empty(
(0, num_joints, heatmap_size[1], heatmap_size[0]),
dtype=np.float32)
target_weight = np.empty((0, num_joints, 1), dtype=np.float32)
for i in range(num_sigmas):
target_i, target_weight_i = self._msra_generate_target(
cfg, joints_3d, joints_3d_visible, self.sigma[i])
target = np.concatenate([target, target_i[None]], axis=0)
target_weight = np.concatenate(
[target_weight, target_weight_i[None]], axis=0)
else:
target, target_weight = self._msra_generate_target(
results['ann_info'], joints_3d, joints_3d_visible,
self.sigma)
elif self.encoding == 'Megvii':
if isinstance(self.kernel, list):
num_kernels = len(self.kernel)
cfg = results['ann_info']
num_joints = cfg['num_joints']
W, H = cfg['heatmap_size']
target = np.empty((0, num_joints, H, W), dtype=np.float32)
target_weight = np.empty((0, num_joints, 1), dtype=np.float32)
for i in range(num_kernels):
target_i, target_weight_i = self._megvii_generate_target(
cfg, joints_3d, joints_3d_visible, self.kernel[i])
target = np.concatenate([target, target_i[None]], axis=0)
target_weight = np.concatenate(
[target_weight, target_weight_i[None]], axis=0)
else:
target, target_weight = self._megvii_generate_target(
results['ann_info'], joints_3d, joints_3d_visible,
self.kernel)
elif self.encoding == 'UDP':
if self.target_type.lower() == 'CombinedTarget'.lower():
factors = self.valid_radius_factor
channel_factor = 3
elif self.target_type.lower() == 'GaussianHeatmap'.lower():
factors = self.sigma
channel_factor = 1
else:
raise ValueError('target_type should be either '
"'GaussianHeatmap' or 'CombinedTarget'")
if isinstance(factors, list):
num_factors = len(factors)
cfg = results['ann_info']
num_joints = cfg['num_joints']
W, H = cfg['heatmap_size']
target = np.empty((0, channel_factor * num_joints, H, W),
dtype=np.float32)
target_weight = np.empty((0, num_joints, 1), dtype=np.float32)
for i in range(num_factors):
target_i, target_weight_i = self._udp_generate_target(
cfg, joints_3d, joints_3d_visible, factors[i],
self.target_type)
target = np.concatenate([target, target_i[None]], axis=0)
target_weight = np.concatenate(
[target_weight, target_weight_i[None]], axis=0)
else:
target, target_weight = self._udp_generate_target(
results['ann_info'], joints_3d, joints_3d_visible, factors,
self.target_type)
else:
raise ValueError(
f'Encoding approach {self.encoding} is not supported!')
results['target'] = target
results['target_weight'] = target_weight
target_dir = os.path.join(self.target_path, self.dir_name)
if not self.use_gt_bbox:
box_idx = results['bbox_id']
image = results['img']
image = Image.fromarray(image)
_, filename = os.path.dirname(results['image_file']), os.path.basename(results['image_file'])
image_path = os.path.join(target_dir,
filename.replace(".jpg", "_box{}_image.png".format(box_idx)))
if os.path.exists(image_path):
print(image_path, "already exists, returning!")
return results
image.save(image_path)
else:
# filter all black target
if (target.sum((1, 2)) == 0).all():
return results
# encode target to image (save is also done inside)
encode_target_to_image(target, target_weight, target_dir=target_dir, metas=results)
return results
# Copyright (c) OpenMMLab. All rights reserved.
import os
import os.path as osp
import tempfile
import warnings
from collections import OrderedDict, defaultdict
import json_tricks as json
import numpy as np
from mmcv import Config, deprecated_api_warning
from xtcocotools.cocoeval import COCOeval
from mmpose.core.post_processing import oks_nms, soft_oks_nms
from mmpose.datasets.builder import DATASETS
# from mmpose.datasets.datasets.base import Kpt2dSviewRgbImgTopDownDataset
from mmpose.datasets.datasets.top_down import TopDownCocoDataset
@DATASETS.register_module()
class TopDownCocoDatasetCustom(TopDownCocoDataset):
"""CocoDataset dataset for top-down pose estimation.
"Microsoft COCO: Common Objects in Context", ECCV'2014.
More details can be found in the `paper
<https://arxiv.org/abs/1405.0312>`__ .
The dataset loads raw features and applies the specified transforms
to return a dict containing the image tensors and other information.
COCO keypoint indexes::
0: 'nose',
1: 'left_eye',
2: 'right_eye',
3: 'left_ear',
4: 'right_ear',
5: 'left_shoulder',
6: 'right_shoulder',
7: 'left_elbow',
8: 'right_elbow',
9: 'left_wrist',
10: 'right_wrist',
11: 'left_hip',
12: 'right_hip',
13: 'left_knee',
14: 'right_knee',
15: 'left_ankle',
16: 'right_ankle'
Args:
ann_file (str): Path to the annotation file.
img_prefix (str): Path to a directory where images are held.
Default: None.
data_cfg (dict): config
pipeline (list[dict | callable]): A sequence of data transforms.
dataset_info (DatasetInfo): A class containing all dataset info.
test_mode (bool): Store True when building test or
validation dataset. Default: False.
"""
def __init__(self,
ann_file,
img_prefix,
data_cfg,
pipeline,
dataset_info=None,
test_mode=False):
super().__init__(
ann_file,
img_prefix,
data_cfg,
pipeline,
dataset_info=dataset_info,
test_mode=test_mode)
self.imagename_with_boxid = data_cfg.get('imagename_with_boxid', False)
def _load_coco_keypoint_annotation_kernel(self, img_id):
"""load annotation from COCOAPI.
Note:
bbox:[x1, y1, w, h]
Args:
img_id: coco image id
Returns:
dict: db entry
"""
img_ann = self.coco.loadImgs(img_id)[0]
width = img_ann['width']
height = img_ann['height']
num_joints = self.ann_info['num_joints']
ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False)
objs = self.coco.loadAnns(ann_ids)
# sanitize bboxes
valid_objs = []
for obj in objs:
if 'bbox' not in obj:
continue
x, y, w, h = obj['bbox']
x1 = max(0, x)
y1 = max(0, y)
x2 = min(width - 1, x1 + max(0, w))
y2 = min(height - 1, y1 + max(0, h))
if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1:
obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1]
valid_objs.append(obj)
objs = valid_objs
bbox_id = 0
rec = []
for obj in objs:
if 'keypoints' not in obj:
continue
if max(obj['keypoints']) == 0:
continue
if 'num_keypoints' in obj and obj['num_keypoints'] == 0:
continue
joints_3d = np.zeros((num_joints, 3), dtype=np.float32)
joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32)
keypoints = np.array(obj['keypoints']).reshape(-1, 3)
joints_3d[:, :2] = keypoints[:, :2]
joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3])
image_file = osp.join(self.img_prefix, self.id2name[img_id])
if self.imagename_with_boxid:
# gt bbox label example: 000000342971_box0_image.png
image_file = image_file.replace(".jpg", "_box{}_image.png".format(bbox_id))
rec.append({
'image_file': image_file,
'bbox': obj['clean_bbox'][:4],
'rotation': 0,
'joints_3d': joints_3d,
'joints_3d_visible': joints_3d_visible,
'dataset': self.dataset_name,
'bbox_score': 1,
'bbox_id': bbox_id
})
bbox_id = bbox_id + 1
return rec
def _load_coco_person_detection_results(self):
"""Load coco person detection results."""
num_joints = self.ann_info['num_joints']
all_boxes = None
with open(self.bbox_file, 'r') as f:
all_boxes = json.load(f)
if not all_boxes:
raise ValueError('=> Load %s fail!' % self.bbox_file)
print(f'=> Total boxes: {len(all_boxes)}')
kpt_db = []
bbox_id = 0
for det_res in all_boxes:
if det_res['category_id'] != 1:
continue
image_file = osp.join(self.img_prefix,
self.id2name[det_res['image_id']])
box = det_res['bbox']
score = det_res['score']
if score < self.det_bbox_thr:
continue
joints_3d = np.zeros((num_joints, 3), dtype=np.float32)
joints_3d_visible = np.ones((num_joints, 3), dtype=np.float32)
if self.imagename_with_boxid:
image_file = image_file.replace(".jpg", "_box{}_image.png".format(bbox_id))
kpt_db.append({
'image_file': image_file,
'rotation': 0,
'bbox': box[:4],
'bbox_score': score,
'dataset': self.dataset_name,
'joints_3d': joints_3d,
'joints_3d_visible': joints_3d_visible,
'bbox_id': bbox_id
})
bbox_id = bbox_id + 1
print(f'=> Total boxes after filter '
f'low score@{self.det_bbox_thr}: {bbox_id}')
return kpt_db
@deprecated_api_warning(name_dict=dict(outputs='results'))
def evaluate(self, results, res_folder=None, metric='mAP', **kwargs):
"""Evaluate coco keypoint results. The pose prediction results will be
saved in ``${res_folder}/result_keypoints.json``.
Note:
- batch_size: N
- num_keypoints: K
- heatmap height: H
- heatmap width: W
Args:
results (list[dict]): Testing results containing the following
items:
- preds (np.ndarray[N,K,3]): The first two dimensions are \
coordinates, score is the third dimension of the array.
- boxes (np.ndarray[N,6]): [center[0], center[1], scale[0], \
scale[1],area, score]
- image_paths (list[str]): For example, ['data/coco/val2017\
/000000393226.jpg']
- heatmap (np.ndarray[N, K, H, W]): model output heatmap
- bbox_id (list(int)).
res_folder (str, optional): The folder to save the testing
results. If not specified, a temp folder will be created.
Default: None.
metric (str | list[str]): Metric to be performed. Defaults: 'mAP'.
Returns:
dict: Evaluation results for evaluation metric.
"""
metrics = metric if isinstance(metric, list) else [metric]
allowed_metrics = ['mAP']
for metric in metrics:
if metric not in allowed_metrics:
raise KeyError(f'metric {metric} is not supported')
if res_folder is not None:
tmp_folder = None
res_file = osp.join(res_folder, 'result_keypoints.json')
else:
tmp_folder = tempfile.TemporaryDirectory()
res_file = osp.join(tmp_folder.name, 'result_keypoints.json')
kpts = defaultdict(list)
for result in results:
preds = result['preds']
boxes = result['boxes']
image_paths = result['image_paths']
if self.imagename_with_boxid:
for idx, img_path in enumerate(image_paths):
image_dir, file_name = os.path.dirname(img_path), os.path.basename(img_path)
file_name = file_name.split("_")[0] + ".jpg"
img_path = os.path.join(image_dir, file_name)
image_paths[idx] = img_path
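# e.g. ".../000000397133_box2_image.png" maps back to ".../000000397133.jpg"
# so that the name2id lookup below resolves the original COCO file name.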
bbox_ids = result['bbox_ids']
batch_size = len(image_paths)
for i in range(batch_size):
image_id = self.name2id[image_paths[i][len(self.img_prefix):]]
kpts[image_id].append({
'keypoints': preds[i],
'center': boxes[i][0:2],
'scale': boxes[i][2:4],
'area': boxes[i][4],
'score': boxes[i][5],
'image_id': image_id,
'bbox_id': bbox_ids[i]
})
kpts = self._sort_and_unique_bboxes(kpts)
# rescoring and oks nms
num_joints = self.ann_info['num_joints']
vis_thr = self.vis_thr
oks_thr = self.oks_thr
valid_kpts = []
for image_id in kpts.keys():
img_kpts = kpts[image_id]
for n_p in img_kpts:
box_score = n_p['score']
if kwargs.get('rle_score', False):
pose_score = n_p['keypoints'][:, 2]
n_p['score'] = float(box_score + np.mean(pose_score) +
np.max(pose_score))
else:
kpt_score = 0
valid_num = 0
for n_jt in range(0, num_joints):
t_s = n_p['keypoints'][n_jt][2]
if t_s > vis_thr:
kpt_score = kpt_score + t_s
valid_num = valid_num + 1
if valid_num != 0:
kpt_score = kpt_score / valid_num
# rescoring
n_p['score'] = kpt_score * box_score
if self.use_nms:
nms = soft_oks_nms if self.soft_nms else oks_nms
keep = nms(img_kpts, oks_thr, sigmas=self.sigmas)
valid_kpts.append([img_kpts[_keep] for _keep in keep])
else:
valid_kpts.append(img_kpts)
self._write_coco_keypoint_results(valid_kpts, res_file)
# do evaluation only if the ground truth keypoint annotations exist
if 'annotations' in self.coco.dataset:
info_str = self._do_python_keypoint_eval(res_file)
name_value = OrderedDict(info_str)
if tmp_folder is not None:
tmp_folder.cleanup()
else:
warnings.warn(f'Due to the absence of ground truth keypoint '
f'annotations, the quantitative evaluation can not '
f'be conducted. The prediction results have been '
f'saved at: {osp.abspath(res_file)}')
name_value = {}
return name_value
# --------------------------------------------------------
# Images Speak in Images: A Generalist Painter for In-Context Visual Learning (https://arxiv.org/abs/2212.02499)
# Github source: https://github.com/baaivision/Painter
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Xinlong Wang, Wen Wang
# Based on MAE, BEiT, detectron2, Mask2Former, bts, mmcv, mmdetection, mmpose, MIRNet, MPRNet, and Uformer codebases
# --------------------------------------------------------
import os
import glob
import json
import tqdm
import argparse
def get_args_parser():
parser = argparse.ArgumentParser('COCO pose estimation preparation', add_help=False)
parser.add_argument('--split', type=str, help='dataset split',
choices=['train', 'val'], required=True)
parser.add_argument('--output_dir', type=str, help='path to output dir',
default='datasets/coco_pose')
return parser.parse_args()
if __name__ == "__main__":
args = get_args_parser()
split = args.split
if split == "train":
aug_list = [
"_aug0", "_aug1", "_aug2", "_aug3", "_aug4",
"_aug5", "_aug6", "_aug7", "_aug8", "_aug9",
"_aug10", "_aug11", "_aug12", "_aug13", "_aug14",
"_aug15", "_aug16", "_aug17", "_aug18", "_aug19",
]
elif split == "val":
aug_list = ["", "_flip"]
else:
raise NotImplementedError
save_path = os.path.join(args.output_dir, "coco_pose_256x192_{}.json".format(split))
print(save_path)
output_dict = []
for aug_idx in aug_list:
image_dir = "datasets/coco_pose/data_pair/{}_256x192{}".format(split, aug_idx)
print(aug_idx, image_dir)
image_path_list = glob.glob(os.path.join(image_dir, '*image.png'))
for image_path in tqdm.tqdm(image_path_list):
label_path = image_path.replace("image.png", "label.png")
assert label_path != image_path
assert os.path.isfile(image_path)
if not os.path.isfile(label_path):
print("ignoring {}".format(label_path))
continue
pair_dict = {}
pair_dict["image_path"] = image_path.replace('datasets/', '')
pair_dict["target_path"] = label_path.replace('datasets/', '')
pair_dict["type"] = "coco_image2pose"
output_dict.append(pair_dict)
json.dump(output_dict, open(save_path, 'w'))
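# Each entry in the resulting coco_pose_256x192_<split>.json looks like
# (the file name shown is illustrative):
#   {"image_path": "coco_pose/data_pair/train_256x192_aug0/000000000036_box0_image.png",
#    "target_path": "coco_pose/data_pair/train_256x192_aug0/000000000036_box0_label.png",
#    "type": "coco_image2pose"}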
# Copyright (c) OpenMMLab. All rights reserved.
import os
import warnings
import mmcv
import numpy as np
from PIL import Image
import torch
from mmcv.image import imwrite
from mmcv.utils.misc import deprecated_api_warning
from mmcv.visualization.image import imshow
from mmpose.core import imshow_bboxes, imshow_keypoints
from mmpose.models import builder
from mmpose.models.builder import POSENETS
# from .base import BasePose
from mmpose.models.detectors import TopDown
try:
from mmcv.runner import auto_fp16
except ImportError:
warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0. '
'Please install mmcv>=1.1.4')
from mmpose.core import auto_fp16
from mmpose.core.post_processing import flip_back
from data.pipelines.custom_transform import define_colors_gb_mean_sep
color_dict = define_colors_gb_mean_sep()
color_list = [v for k, v in color_dict.items()]
color_list.append((0, 0))
@POSENETS.register_module()
class TopDownCustom(TopDown):
"""Top-down pose detectors.
Args:
backbone (dict): Backbone modules to extract feature.
keypoint_head (dict): Keypoint head to process feature.
train_cfg (dict): Config for training. Default: None.
test_cfg (dict): Config for testing. Default: None.
pretrained (str): Path to the pretrained models.
loss_pose (None): Deprecated arguments. Please use
`loss_keypoint` for heads instead.
"""
colors = torch.tensor(color_list, dtype=torch.float32, device="cuda")
def __init__(self,
backbone,
neck=None,
keypoint_head=None,
train_cfg=None,
test_cfg=None,
pretrained=None,
loss_pose=None):
super().__init__(
backbone=backbone,
neck=neck,
keypoint_head=keypoint_head,
train_cfg=train_cfg,
test_cfg=test_cfg,
pretrained=pretrained,
loss_pose=loss_pose)
@auto_fp16(apply_to=('img', ))
def forward(self,
img,
target=None,
target_weight=None,
img_metas=None,
return_loss=True,
return_heatmap=False,
pseudo_test=False,
**kwargs):
"""Calls either forward_train or forward_test depending on whether
return_loss=True. Note this setting will change the expected inputs.
When `return_loss=True`, img and img_meta are single-nested (i.e.
Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta
should be double nested (i.e. List[Tensor], List[List[dict]]), with
the outer list indicating test time augmentations.
Note:
- batch_size: N
- num_keypoints: K
- num_img_channel: C (Default: 3)
- img height: imgH
- img width: imgW
- heatmaps height: H
- heatmaps width: W
Args:
img (torch.Tensor[NxCximgHximgW]): Input images.
target (torch.Tensor[NxKxHxW]): Target heatmaps.
target_weight (torch.Tensor[NxKx1]): Weights across
different joint types.
img_metas (list(dict)): Information about data augmentation
By default this includes:
- "image_file": path to the image file
- "center": center of the bbox
- "scale": scale of the bbox
- "rotation": rotation of the bbox
- "bbox_score": score of bbox
return_loss (bool): Option to return loss. `return_loss=True`
for training, `return_loss=False` for validation & test.
return_heatmap (bool) : Option to return heatmap.
Returns:
dict|tuple: if `return_loss` is true, then return losses. \
Otherwise, return predicted poses, boxes, image paths \
and heatmaps.
"""
if pseudo_test:
return self.forward_pseudo_test(
img, img_metas, return_heatmap=return_heatmap, **kwargs)
if return_loss:
return self.forward_train(img, target, target_weight, img_metas,
**kwargs)
return self.forward_test(
img, img_metas, return_heatmap=return_heatmap, **kwargs)
def forward_test(self, img, img_metas, return_heatmap=False, **kwargs):
"""Defines the computation performed at every call when testing."""
assert img.size(0) == len(img_metas)
batch_size, _, img_height, img_width = img.shape
if batch_size > 1:
assert 'bbox_id' in img_metas[0]
result = {}
features = self.backbone(img)
if self.with_neck:
features = self.neck(features)
if self.with_keypoint:
output_heatmap = self.keypoint_head.inference_model(
features, flip_pairs=None)
if self.test_cfg.get('flip_test', True):
img_flipped = img.flip(3) # (b, c, h, w)
features_flipped = self.backbone(img_flipped)
if self.with_neck:
features_flipped = self.neck(features_flipped)
if self.with_keypoint:
output_flipped_heatmap = self.keypoint_head.inference_model(
features_flipped, img_metas[0]['flip_pairs'])
output_heatmap = (output_heatmap + output_flipped_heatmap)
if self.test_cfg.get('regression_flip_shift', False):
output_heatmap[..., 0] -= 1.0 / img_width
output_heatmap = output_heatmap / 2
if self.with_keypoint:
keypoint_result = self.keypoint_head.decode(
img_metas, output_heatmap, img_size=[img_width, img_height])
result.update(keypoint_result)
if not return_heatmap:
output_heatmap = None
result['output_heatmap'] = output_heatmap
return result
def forward_pseudo_test(self, img, img_metas, return_heatmap=False, **kwargs):
"""Defines the computation performed at every call when testing."""
assert img.size(0) == len(img_metas)
batch_size, _, img_height, img_width = img.shape
if batch_size > 1:
assert 'bbox_id' in img_metas[0]
result = {}
output_heatmap = self.decode_images_to_heatmaps_minmax(
images=img, resize=False,
)
# add support for flip test
if self.test_cfg.get('flip_test', True):
image_flip_list = []
for batch_idx in range(img.shape[0]):
flip_image_dir = os.path.dirname(img_metas[batch_idx]['image_file']) + "_flip"
flip_image_name = os.path.basename(img_metas[batch_idx]['image_file'])
flip_image_path = os.path.join(flip_image_dir, flip_image_name)
image = np.array(Image.open(flip_image_path))
image_tensor = torch.from_numpy(image).to(img.device)
image_flip_list.append(image_tensor)
img_flipped = torch.stack(image_flip_list) # (b, h, w, 3)
if self.with_keypoint:
# output_flipped_heatmap = self.keypoint_head.inference_model(
# features_flipped, img_metas[0]['flip_pairs'])
output = self.decode_images_to_heatmaps_minmax(
images=img_flipped, resize=False,
)
flip_pairs = img_metas[0]['flip_pairs']
assert flip_pairs is not None
output_flipped_heatmap = flip_back(
output,
flip_pairs,
target_type=self.keypoint_head.target_type)
# feature is not aligned, shift flipped heatmap for higher accuracy
if self.test_cfg.get('shift_heatmap', False):
output_flipped_heatmap[:, :, :, 1:] = output_flipped_heatmap[:, :, :, :-1]
output_heatmap = (output_heatmap + output_flipped_heatmap)
if self.test_cfg.get('regression_flip_shift', False):
output_heatmap[..., 0] -= 1.0 / img_width
output_heatmap = output_heatmap / 2
if self.with_keypoint:
keypoint_result = self.keypoint_head.decode(
img_metas, output_heatmap, img_size=[img_width, img_height])
result.update(keypoint_result)
if not return_heatmap:
output_heatmap = None
result['output_heatmap'] = output_heatmap
return result
def decode_images_to_heatmaps_minmax(self, images, resize=False):
"""
Args:
images: (bs, 256, 192, 3)
resize: whether to resize to (64, 48)
Returns:
heatmaps: (bs, 17, h, w)
"""
assert images.shape[-1] == 3
batch_size, image_height, image_width, _ = images.shape
images = images.float()
# classify each pixel using GB
GB = images[..., 1:].view(batch_size, 1, image_height, image_width, 2) # (bs, 1, 256, 192, 2)
colors = self.colors  # class palette defined on TopDownCustom (17 keypoint colors + background)
num_classes = colors.shape[0]
colors = colors.view(1, -1, 1, 1, 2)
dist = torch.abs(GB - colors).sum(-1) # (bs, 18, 256, 192)
dist, indices = torch.min(dist, dim=1) # (bs, 256, 192)
keypoint_mask_list = []
for idx in range(num_classes):
mask = indices == idx # (bs, 256, 192)
keypoint_mask_list.append(mask)
R = images[..., 0] # (bs, 256, 192)
heatmap_list = []
for idx in range(num_classes):
if idx == 17:
continue
mask = keypoint_mask_list[idx]
heatmap = mask * R
heatmap_list.append(heatmap.unsqueeze(1))
heatmaps = torch.cat(heatmap_list, dim=1)
if resize:
raise NotImplementedError
return heatmaps.cpu().numpy() / 255.
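# Sketch of how the decoder is used in forward_pseudo_test above (assumption:
# `img` arrives as uint8 HxWx3 because the pseudo-test pipeline applies neither
# ToTensor nor NormalizeTensor):
#   heatmaps = self.decode_images_to_heatmaps_minmax(images=img, resize=False)
#   # heatmaps: (bs, 17, 256, 192) floats in [0, 1]; channel k is the R channel
#   # masked to pixels whose (G, B) value is nearest to color_dict[k]; the 18th
#   # palette entry (0, 0) acts as the background class and is skipped.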
# --------------------------------------------------------
# Images Speak in Images: A Generalist Painter for In-Context Visual Learning (https://arxiv.org/abs/2212.02499)
# Github source: https://github.com/baaivision/Painter
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Xinlong Wang, Wen Wang
# Based on MAE, BEiT, detectron2, Mask2Former, bts, mmcv, mmdetection, mmpose, MIRNet, MPRNet, and Uformer codebases
# --------------------------------------------------------
import sys
import os
import warnings
import requests
import argparse
import torch
import torch.nn.functional as F
import numpy as np
import glob
import tqdm
import matplotlib.pyplot as plt
from PIL import Image
import torch.distributed as dist
from torch.utils.data import DataLoader, DistributedSampler
sys.path.append('.')
import models_painter
from util.ddp_utils import DatasetTest
from util import ddp_utils
imagenet_mean = np.array([0.485, 0.456, 0.406])
imagenet_std = np.array([0.229, 0.224, 0.225])
def get_args_parser():
parser = argparse.ArgumentParser('COCO Pose Estimation', add_help=False)
parser.add_argument('--ckpt_path', type=str, help='path to ckpt', default='')
parser.add_argument('--model', type=str, help='dir to ckpt',
default='painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1')
parser.add_argument('--prompt', type=str, help='prompt image in train set',
default='000000000165_box0')
parser.add_argument('--input_size', type=int, default=448)
parser.add_argument('--flip_test', action='store_true', help='whether to use flipped images for testing')
# distributed training parameters
parser.add_argument('--world_size', default=1, type=int,
help='number of distributed processes')
parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
return parser.parse_args()
def prepare_model(chkpt_dir, arch, args=None):
# build model
model = getattr(models_painter, arch)()
model.to("cuda")
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
model_without_ddp = model.module
# load model
checkpoint = torch.load(chkpt_dir, map_location='cpu')
msg = model_without_ddp.load_state_dict(checkpoint['model'], strict=False)
print(msg)
return model
def run_one_image(img, tgt, size, model, out_path, device):
x = torch.tensor(img)
x = x.unsqueeze(dim=0)
x = torch.einsum('nhwc->nchw', x)
tgt = torch.tensor(tgt)
tgt = tgt.unsqueeze(dim=0)
tgt = torch.einsum('nhwc->nchw', tgt)
bool_masked_pos = torch.zeros(model.module.patch_embed.num_patches)
bool_masked_pos[model.module.patch_embed.num_patches//2:] = 1
bool_masked_pos = bool_masked_pos.unsqueeze(dim=0)
valid = torch.ones_like(tgt)
loss, y, mask = model(x.float().to(device), tgt.float().to(device), bool_masked_pos.to(device), valid.float().to(device))
y = model.module.unpatchify(y)
y = torch.einsum('nchw->nhwc', y).detach().cpu()
output = y[0, y.shape[1]//2:, :, :]
output = torch.clip((output * imagenet_std + imagenet_mean) * 255, 0, 255)
output = F.interpolate(output[None, ...].permute(0, 3, 1, 2), size=[size[1], size[0]], mode='nearest').permute(0, 2, 3, 1)[0]
output = output.int()
output = Image.fromarray(output.numpy().astype(np.uint8))
output.save(out_path)
if __name__ == '__main__':
dataset_dir = "datasets/"
args = get_args_parser()
args = ddp_utils.init_distributed_mode(args)
device = torch.device("cuda")
ckpt_path = args.ckpt_path
model = args.model
prompt = args.prompt
input_size = args.input_size
path_splits = ckpt_path.split('/')
ckpt_dir, ckpt_file = path_splits[-2], path_splits[-1]
dst_dir = os.path.join('models_inference', ckpt_dir.split('/')[-1],
"coco_pose_inference_{}_{}".format(ckpt_file, os.path.basename(prompt).split(".")[0]))
if args.flip_test:
dst_dir = dst_dir + "_flip"
if ddp_utils.get_rank() == 0:
if not os.path.exists(dst_dir):
os.makedirs(dst_dir)
print("output_dir: {}".format(dst_dir))
model_painter = prepare_model(ckpt_path, model, args)
print('Model loaded.')
img_src_dir = dataset_dir + "coco_pose/data_pair/test_256x192"
if args.flip_test:
img_src_dir += "_flip"
dataset_val = DatasetTest(img_src_dir, input_size, ext_list=('*.png',))
sampler_val = DistributedSampler(dataset_val, shuffle=False)
data_loader_val = DataLoader(dataset_val, batch_size=1, sampler=sampler_val,
drop_last=False, collate_fn=ddp_utils.collate_fn, num_workers=2)
img2_path = dataset_dir + "coco_pose/data_pair/train_256x192_aug0/{}_image.png".format(prompt)
tgt2_path = dataset_dir + "coco_pose/data_pair/train_256x192_aug0/{}_label.png".format(prompt)
# load the shared prompt image pair
img2 = Image.open(img2_path).convert("RGB")
img2 = img2.resize((input_size, input_size))
img2 = np.array(img2) / 255.
tgt2 = Image.open(tgt2_path)
tgt2 = tgt2.resize((input_size, input_size))
tgt2 = np.array(tgt2) / 255.
model_painter.eval()
for data in tqdm.tqdm(data_loader_val):
""" Load an image """
assert len(data) == 1
img, img_path, size = data[0]
img_name = os.path.basename(img_path)
out_path = os.path.join(dst_dir, img_name.replace('.jpg', '.png'))
img = np.concatenate((img2, img), axis=0)
assert img.shape == (input_size * 2, input_size, 3)
# normalize by ImageNet mean and std
img = img - imagenet_mean
img = img / imagenet_std
tgt = tgt2 # tgt is not available
tgt = np.concatenate((tgt2, tgt), axis=0)
assert tgt.shape == (input_size * 2, input_size, 3)
# normalize by ImageNet mean and std
tgt = tgt - imagenet_mean
tgt = tgt / imagenet_std
# make random mask reproducible (comment out to make it change)
torch.manual_seed(2)
run_one_image(img, tgt, size, model_painter, out_path, device)
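# Example invocation (the script path and checkpoint path are assumptions;
# adjust them to wherever this file and the weights live in the repo):
#   python -m torch.distributed.launch --nproc_per_node=1 painter_inference_pose.py \
#       --ckpt_path models/painter_vit_large/painter_vit_large.pth \
#       --prompt 000000000165_box0 --input_size 448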
#!/usr/bin/env bash
# Copyright (c) OpenMMLab. All rights reserved.
CONFIG=$1
CHECKPOINT=$2
GPUS=$3
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch \
--nnodes=$NNODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_ADDR \
--nproc_per_node=$GPUS \
--master_port=$PORT \
$(dirname "$0")/test.py \
$CONFIG \
$CHECKPOINT \
--launcher pytorch \
${@:4}
#!/usr/bin/env bash
# Copyright (c) OpenMMLab. All rights reserved.
CONFIG=$1
GPUS=$2
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch \
--nnodes=$NNODES \
--node_rank=$NODE_RANK \
--master_addr=$MASTER_ADDR \
--nproc_per_node=$GPUS \
--master_port=$PORT \
$(dirname "$0")/train.py \
$CONFIG \
--launcher pytorch ${@:3}
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import os.path as osp
import sys
sys.path.insert(0, "./")
import tqdm
import warnings
import mmcv
import torch
from mmcv import Config, DictAction
from mmcv.cnn import fuse_conv_bn
from mmcv.parallel import MMDataParallel, MMDistributedDataParallel
from mmcv.runner import get_dist_info, init_dist, load_checkpoint
from mmpose.apis import multi_gpu_test
from apis.test import single_gpu_test
from mmpose.datasets import build_dataloader, build_dataset
from mmpose.models import build_posenet
from mmpose.utils import setup_multi_processes
try:
from mmcv.runner import wrap_fp16_model
except ImportError:
warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0. '
'Please install mmcv>=1.1.4')
from mmpose.core import wrap_fp16_model
def parse_args():
parser = argparse.ArgumentParser(description='mmpose test model')
parser.add_argument('config', help='test config file path')
parser.add_argument('checkpoint', help='checkpoint file')
parser.add_argument('--out', help='output result file')
parser.add_argument(
'--work-dir', help='the dir to save evaluation results')
parser.add_argument(
'--fuse-conv-bn',
action='store_true',
help='Whether to fuse conv and bn, this will slightly increase '
'the inference speed')
parser.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed testing)')
parser.add_argument(
'--eval',
default=None,
nargs='+',
help='evaluation metric, which depends on the dataset,'
' e.g., "mAP" for MSCOCO')
parser.add_argument(
'--gpu-collect',
action='store_true',
help='whether to use gpu to collect results')
parser.add_argument('--tmpdir', help='tmp dir for writing some results')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
def merge_configs(cfg1, cfg2):
# Merge cfg2 into cfg1
# Overwrite cfg1 if repeated, ignore if value is None.
cfg1 = {} if cfg1 is None else cfg1.copy()
cfg2 = {} if cfg2 is None else cfg2
for k, v in cfg2.items():
if v:
cfg1[k] = v
return cfg1
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# set multi-process settings
setup_multi_processes(cfg)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
cfg.data.test.test_mode = True
# work_dir is determined in this priority: CLI > segment in file > filename
if args.work_dir is not None:
# update configs according to CLI args if args.work_dir is not None
cfg.work_dir = args.work_dir
elif cfg.get('work_dir', None) is None:
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# build the dataloader
dataset = build_dataset(cfg.data.test, dict(test_mode=True))
# step 1: give default values and override (if exist) from cfg.data
loader_cfg = {
**dict(seed=cfg.get('seed'), drop_last=False, dist=distributed),
**({} if torch.__version__ != 'parrots' else dict(
prefetch_num=2,
pin_memory=False,
)),
**dict((k, cfg.data[k]) for k in [
'seed',
'prefetch_num',
'pin_memory',
'persistent_workers',
] if k in cfg.data)
}
# step2: cfg.data.test_dataloader has higher priority
test_loader_cfg = {
**loader_cfg,
**dict(shuffle=False, drop_last=False),
**dict(workers_per_gpu=cfg.data.get('workers_per_gpu', 1)),
**dict(samples_per_gpu=cfg.data.get('samples_per_gpu', 1)),
**cfg.data.get('test_dataloader', {})
}
data_loader = build_dataloader(dataset, **test_loader_cfg)
load_data_only = cfg.data.get('load_data_only', False)
if load_data_only:
for _ in tqdm.tqdm(data_loader):
pass
print("dataset enumerated, exit!")
sys.exit()
# build the model and load checkpoint
model = build_posenet(cfg.model)
fp16_cfg = cfg.get('fp16', None)
if fp16_cfg is not None:
wrap_fp16_model(model)
# load_checkpoint(model, args.checkpoint, map_location='cpu')
if args.fuse_conv_bn:
model = fuse_conv_bn(model)
pseudo_test = cfg.data.get('pseudo_test', False)
assert pseudo_test
# only support single gpu test
model = MMDataParallel(model, device_ids=[args.gpu_id])
outputs = single_gpu_test(model, data_loader, pseudo_test=True)
rank, _ = get_dist_info()
eval_config = cfg.get('evaluation', {})
eval_config = merge_configs(eval_config, dict(metric=args.eval))
if rank == 0:
if args.out:
print(f'\nwriting results to {args.out}')
mmcv.dump(outputs, args.out)
results = dataset.evaluate(outputs, cfg.work_dir, **eval_config)
for k, v in sorted(results.items()):
print(f'{k}: {v}')
if __name__ == '__main__':
main()
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import copy
import os
import os.path as osp
import time
import warnings
import mmcv
import torch
import torch.distributed as dist
from mmcv import Config, DictAction
from mmcv.runner import get_dist_info, init_dist, set_random_seed
from mmcv.utils import get_git_hash
from mmpose import __version__
from mmpose.apis import init_random_seed
from apis.train import train_model
from mmpose.datasets import build_dataset
from mmpose.models import build_posenet
from mmpose.utils import collect_env, get_root_logger, setup_multi_processes
def parse_args():
parser = argparse.ArgumentParser(description='Train a pose model')
parser.add_argument('config', help='train config file path')
parser.add_argument('--work-dir', help='the dir to save logs and models')
parser.add_argument(
'--resume-from', help='the checkpoint file to resume from')
parser.add_argument(
'--no-validate',
action='store_true',
help='whether not to evaluate the checkpoint during training')
group_gpus = parser.add_mutually_exclusive_group()
group_gpus.add_argument(
'--gpus',
type=int,
help='(Deprecated, please use --gpu-id) number of gpus to use '
'(only applicable to non-distributed training)')
group_gpus.add_argument(
'--gpu-ids',
type=int,
nargs='+',
help='(Deprecated, please use --gpu-id) ids of gpus to use '
'(only applicable to non-distributed training)')
group_gpus.add_argument(
'--gpu-id',
type=int,
default=0,
help='id of gpu to use '
'(only applicable to non-distributed training)')
parser.add_argument('--seed', type=int, default=None, help='random seed')
parser.add_argument(
'--diff_seed',
action='store_true',
help='Whether or not set different seeds for different ranks')
parser.add_argument(
'--deterministic',
action='store_true',
help='whether to set deterministic options for CUDNN backend.')
parser.add_argument(
'--cfg-options',
nargs='+',
action=DictAction,
default={},
help='override some settings in the used config, the key-value pair '
'in xxx=yyy format will be merged into config file. For example, '
"'--cfg-options model.backbone.depth=18 model.backbone.with_cp=True'")
parser.add_argument(
'--launcher',
choices=['none', 'pytorch', 'slurm', 'mpi'],
default='none',
help='job launcher')
parser.add_argument('--local_rank', type=int, default=0)
parser.add_argument(
'--autoscale-lr',
action='store_true',
help='automatically scale lr with the number of gpus')
args = parser.parse_args()
if 'LOCAL_RANK' not in os.environ:
os.environ['LOCAL_RANK'] = str(args.local_rank)
return args
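# Example invocation (hypothetical config/work-dir names, assuming this script is saved as train.py):
#   python train.py configs/coco_256x192.py --work-dir work_dirs/coco_256x192 --gpu-id 0 --seed 0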
def main():
args = parse_args()
cfg = Config.fromfile(args.config)
if args.cfg_options is not None:
cfg.merge_from_dict(args.cfg_options)
# set multi-process settings
setup_multi_processes(cfg)
# set cudnn_benchmark
if cfg.get('cudnn_benchmark', False):
torch.backends.cudnn.benchmark = True
# work_dir is determined in this priority: CLI > segment in file > filename
if args.work_dir is not None:
# update configs according to CLI args if args.work_dir is not None
cfg.work_dir = args.work_dir
elif cfg.get('work_dir', None) is None:
# use config filename as default work_dir if cfg.work_dir is None
cfg.work_dir = osp.join('./work_dirs',
osp.splitext(osp.basename(args.config))[0])
if args.resume_from is not None:
cfg.resume_from = args.resume_from
if args.gpus is not None:
cfg.gpu_ids = range(1)
warnings.warn('`--gpus` is deprecated because we only support '
'single GPU mode in non-distributed training. '
'Use `gpus=1` now.')
if args.gpu_ids is not None:
cfg.gpu_ids = args.gpu_ids[0:1]
warnings.warn('`--gpu-ids` is deprecated, please use `--gpu-id`. '
'Because we only support single GPU mode in '
'non-distributed training. Use the first GPU '
'in `gpu_ids` now.')
if args.gpus is None and args.gpu_ids is None:
cfg.gpu_ids = [args.gpu_id]
if args.autoscale_lr:
# apply the linear scaling rule (https://arxiv.org/abs/1706.02677)
cfg.optimizer['lr'] = cfg.optimizer['lr'] * len(cfg.gpu_ids) / 8
# init distributed env first, since logger depends on the dist info.
if args.launcher == 'none':
distributed = False
if len(cfg.gpu_ids) > 1:
warnings.warn(
f'We treat {cfg.gpu_ids} as gpu-ids, and reset to '
f'{cfg.gpu_ids[0:1]} as gpu-ids to avoid potential error in '
'non-distribute training time.')
cfg.gpu_ids = cfg.gpu_ids[0:1]
else:
distributed = True
init_dist(args.launcher, **cfg.dist_params)
# re-set gpu_ids with distributed training mode
_, world_size = get_dist_info()
cfg.gpu_ids = range(world_size)
# create work_dir
mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))
# init the logger before other steps
timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime())
log_file = osp.join(cfg.work_dir, f'{timestamp}.log')
logger = get_root_logger(log_file=log_file, log_level=cfg.log_level)
# init the meta dict to record some important information such as
# environment info and seed, which will be logged
meta = dict()
# log env info
env_info_dict = collect_env()
env_info = '\n'.join([(f'{k}: {v}') for k, v in env_info_dict.items()])
dash_line = '-' * 60 + '\n'
logger.info('Environment info:\n' + dash_line + env_info + '\n' +
dash_line)
meta['env_info'] = env_info
# log some basic info
logger.info(f'Distributed training: {distributed}')
logger.info(f'Config:\n{cfg.pretty_text}')
# set random seeds
seed = init_random_seed(args.seed)
seed = seed + dist.get_rank() if args.diff_seed else seed
logger.info(f'Set random seed to {seed}, '
f'deterministic: {args.deterministic}')
set_random_seed(seed, deterministic=args.deterministic)
cfg.seed = seed
meta['seed'] = seed
# model = build_posenet(cfg.model)
model = None
datasets = [build_dataset(cfg.data.train)]
if len(cfg.workflow) == 2:
val_dataset = copy.deepcopy(cfg.data.val)
val_dataset.pipeline = cfg.data.train.pipeline
datasets.append(build_dataset(val_dataset))
if cfg.checkpoint_config is not None:
# save mmpose version, config file content
# checkpoints as meta data
cfg.checkpoint_config.meta = dict(
mmpose_version=__version__ + get_git_hash(digits=7),
config=cfg.pretty_text,
)
train_model(
model,
datasets,
cfg,
distributed=distributed,
validate=(not args.no_validate),
timestamp=timestamp,
meta=meta)
if __name__ == '__main__':
main()
#!/bin/bash
set -x
JOB_NAME="painter_vit_large"
CKPT_FILE="painter_vit_large.pth"
PROMPT="study_room_0005b/rgb_00094"
MODEL="painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1"
CKPT_PATH="models/${JOB_NAME}/${CKPT_FILE}"
DST_DIR="models_inference/${JOB_NAME}/nyuv2_depth_inference_${CKPT_FILE}_${PROMPT}"
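# DST_DIR is expected to mirror the output directory that painter_inference_depth.py
# builds internally: models_inference/<job>/nyuv2_depth_inference_<ckpt>_<prompt>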
# inference
python eval/nyuv2_depth/painter_inference_depth.py \
--ckpt_path ${CKPT_PATH} --model ${MODEL} --prompt ${PROMPT}
python eval/nyuv2_depth/eval_with_pngs.py \
--pred_path ${DST_DIR} \
--gt_path datasets/nyu_depth_v2/official_splits/test/ \
--dataset nyu --min_depth_eval 1e-3 --max_depth_eval 10 --eigen_crop
# Copyright (C) 2019 Jin Han Lee
#
# This file is a part of BTS.
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
from __future__ import absolute_import, division, print_function
import os
import argparse
import fnmatch
import cv2
import numpy as np
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '1'
def convert_arg_line_to_args(arg_line):
for arg in arg_line.split():
if not arg.strip():
continue
yield arg
parser = argparse.ArgumentParser(description='BTS TensorFlow implementation.', fromfile_prefix_chars='@')
parser.convert_arg_line_to_args = convert_arg_line_to_args
parser.add_argument('--pred_path', type=str, help='path to the prediction results in png', required=True)
parser.add_argument('--gt_path', type=str, help='root path to the groundtruth data', required=False)
parser.add_argument('--dataset', type=str, help='dataset to test on, nyu or kitti', default='nyu')
parser.add_argument('--eigen_crop', help='if set, crops according to Eigen NIPS14', action='store_true')
parser.add_argument('--garg_crop', help='if set, crops according to Garg ECCV16', action='store_true')
parser.add_argument('--min_depth_eval', type=float, help='minimum depth for evaluation', default=1e-3)
parser.add_argument('--max_depth_eval', type=float, help='maximum depth for evaluation', default=80)
parser.add_argument('--do_kb_crop', help='if set, crop input images as kitti benchmark images', action='store_true')
args = parser.parse_args()
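# Example invocation (see the shell script above; paths are illustrative):
#   python eval/nyuv2_depth/eval_with_pngs.py --pred_path <pred_dir> \
#       --gt_path datasets/nyu_depth_v2/official_splits/test/ \
#       --dataset nyu --min_depth_eval 1e-3 --max_depth_eval 10 --eigen_crop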
def compute_errors(gt, pred):
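# Standard monocular-depth metrics: threshold accuracies (d1/d2/d3), RMSE,
# log-RMSE, absolute/squared relative error, scale-invariant log error (SILog)
# and mean log10 error.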
thresh = np.maximum((gt / pred), (pred / gt))
d1 = (thresh < 1.25).mean()
d2 = (thresh < 1.25 ** 2).mean()
d3 = (thresh < 1.25 ** 3).mean()
rmse = (gt - pred) ** 2
rmse = np.sqrt(rmse.mean())
rmse_log = (np.log(gt) - np.log(pred)) ** 2
rmse_log = np.sqrt(rmse_log.mean())
abs_rel = np.mean(np.abs(gt - pred) / gt)
sq_rel = np.mean(((gt - pred)**2) / gt)
err = np.log(pred) - np.log(gt)
silog = np.sqrt(np.mean(err ** 2) - np.mean(err) ** 2) * 100
err = np.abs(np.log10(pred) - np.log10(gt))
log10 = np.mean(err)
return silog, log10, abs_rel, sq_rel, rmse, rmse_log, d1, d2, d3
def test():
global gt_depths, missing_ids, pred_filenames
gt_depths = []
missing_ids = set()
pred_filenames = []
for root, dirnames, filenames in os.walk(args.pred_path):
for pred_filename in fnmatch.filter(filenames, '*.png'):
if 'cmap' in pred_filename or 'gt' in pred_filename:
continue
dirname = root.replace(args.pred_path, '')
pred_filenames.append(os.path.join(dirname, pred_filename))
num_test_samples = len(pred_filenames)
pred_depths = []
for i in range(num_test_samples):
pred_depth_path = os.path.join(args.pred_path, pred_filenames[i])
pred_depth = cv2.imread(pred_depth_path, -1)
if pred_depth is None:
print('Missing: %s ' % pred_depth_path)
missing_ids.add(i)
continue
if args.dataset == 'nyu':
pred_depth = pred_depth.astype(np.float32) / 1000.0
else:
pred_depth = pred_depth.astype(np.float32) / 256.0
pred_depths.append(pred_depth)
print('Raw png files reading done')
print('Evaluating {} files'.format(len(pred_depths)))
if args.dataset == 'kitti':
for t_id in range(num_test_samples):
file_dir = pred_filenames[t_id].split('.')[0]
filename = file_dir.split('_')[-1]
directory = file_dir.replace('_' + filename, '')
gt_depth_path = os.path.join(args.gt_path, directory, 'proj_depth/groundtruth/image_02', filename + '.png')
depth = cv2.imread(gt_depth_path, -1)
if depth is None:
print('Missing: %s ' % gt_depth_path)
missing_ids.add(t_id)
continue
depth = depth.astype(np.float32) / 256.0
gt_depths.append(depth)
elif args.dataset == 'nyu':
for t_id in range(num_test_samples):
file_dir = pred_filenames[t_id].split('.')[0]
filename = file_dir.split('_')[-1]
directory = file_dir.replace('_rgb_'+file_dir.split('_')[-1], '')
gt_depth_path = os.path.join(args.gt_path, directory, 'sync_depth_' + filename + '.png')
depth = cv2.imread(gt_depth_path, -1)
if depth is None:
print('Missing: %s ' % gt_depth_path)
missing_ids.add(t_id)
continue
depth = depth.astype(np.float32) / 1000.0
gt_depths.append(depth)
print('GT files reading done')
print('{} GT files missing'.format(len(missing_ids)))
print('Computing errors')
eval(pred_depths)
print('Done.')
def eval(pred_depths):
num_samples = len(pred_depths)
pred_depths_valid = []
i = 0
for t_id in range(num_samples):
if t_id in missing_ids:
continue
pred_depths_valid.append(pred_depths[t_id])
num_samples = num_samples - len(missing_ids)
silog = np.zeros(num_samples, np.float32)
log10 = np.zeros(num_samples, np.float32)
rms = np.zeros(num_samples, np.float32)
log_rms = np.zeros(num_samples, np.float32)
abs_rel = np.zeros(num_samples, np.float32)
sq_rel = np.zeros(num_samples, np.float32)
d1 = np.zeros(num_samples, np.float32)
d2 = np.zeros(num_samples, np.float32)
d3 = np.zeros(num_samples, np.float32)
for i in range(num_samples):
gt_depth = gt_depths[i]
pred_depth = pred_depths_valid[i]
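# Clamp predictions to the evaluation depth range and zero out invalid
# (inf/NaN) ground-truth pixels before building the validity mask.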
pred_depth[pred_depth < args.min_depth_eval] = args.min_depth_eval
pred_depth[pred_depth > args.max_depth_eval] = args.max_depth_eval
pred_depth[np.isinf(pred_depth)] = args.max_depth_eval
gt_depth[np.isinf(gt_depth)] = 0
gt_depth[np.isnan(gt_depth)] = 0
valid_mask = np.logical_and(gt_depth > args.min_depth_eval, gt_depth < args.max_depth_eval)
if args.do_kb_crop:
height, width = gt_depth.shape
top_margin = int(height - 352)
left_margin = int((width - 1216) / 2)
pred_depth_uncropped = np.zeros((height, width), dtype=np.float32)
pred_depth_uncropped[top_margin:top_margin + 352, left_margin:left_margin + 1216] = pred_depth
pred_depth = pred_depth_uncropped
if args.garg_crop or args.eigen_crop:
gt_height, gt_width = gt_depth.shape
eval_mask = np.zeros(valid_mask.shape)
if args.garg_crop:
eval_mask[int(0.40810811 * gt_height):int(0.99189189 * gt_height), int(0.03594771 * gt_width):int(0.96405229 * gt_width)] = 1
elif args.eigen_crop:
if args.dataset == 'kitti':
eval_mask[int(0.3324324 * gt_height):int(0.91351351 * gt_height), int(0.0359477 * gt_width):int(0.96405229 * gt_width)] = 1
else:
eval_mask[45:471, 41:601] = 1
valid_mask = np.logical_and(valid_mask, eval_mask)
silog[i], log10[i], abs_rel[i], sq_rel[i], rms[i], log_rms[i], d1[i], d2[i], d3[i] = compute_errors(gt_depth[valid_mask], pred_depth[valid_mask])
print("{:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}, {:>7}".format(
'd1', 'd2', 'd3', 'AbsRel', 'SqRel', 'RMSE', 'RMSElog', 'SILog', 'log10'))
print("{:7.3f}, {:7.3f}, {:7.3f}, {:7.3f}, {:7.3f}, {:7.3f}, {:7.3f}, {:7.3f}, {:7.3f}".format(
d1.mean(), d2.mean(), d3.mean(),
abs_rel.mean(), sq_rel.mean(), rms.mean(), log_rms.mean(), silog.mean(), log10.mean()))
return silog, log10, abs_rel, sq_rel, rms, log_rms, d1, d2, d3
def main():
test()
if __name__ == '__main__':
main()
# --------------------------------------------------------
# Images Speak in Images: A Generalist Painter for In-Context Visual Learning (https://arxiv.org/abs/2212.02499)
# Github source: https://github.com/baaivision/Painter
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Xinlong Wang, Wen Wang
# Based on MAE, BEiT, detectron2, Mask2Former, bts, mmcv, mmdetection, mmpose, MIRNet, MPRNet, and Uformer codebases
# --------------------------------------------------------
import sys
import os
import argparse
import torch
import torch.nn.functional as F
import numpy as np
import glob
import tqdm
import matplotlib.pyplot as plt
from PIL import Image
sys.path.append('.')
import models_painter
imagenet_mean = np.array([0.485, 0.456, 0.406])
imagenet_std = np.array([0.229, 0.224, 0.225])
def show_image(image, title=''):
# image is [H, W, 3]
assert image.shape[2] == 3
plt.imshow(torch.clip((image * imagenet_std + imagenet_mean) * 255, 0, 255).int())
plt.title(title, fontsize=16)
plt.axis('off')
return
def prepare_model(chkpt_dir, arch='painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1'):
# build model
model = getattr(models_painter, arch)()
# load model
checkpoint = torch.load(chkpt_dir, map_location='cuda:0')
msg = model.load_state_dict(checkpoint['model'], strict=False)
print(msg)
model.eval()
return model
def run_one_image(img, tgt, size, model, out_path, device):
x = torch.tensor(img)
x = x.unsqueeze(dim=0)
x = torch.einsum('nhwc->nchw', x)
tgt = torch.tensor(tgt)
tgt = tgt.unsqueeze(dim=0)
tgt = torch.einsum('nhwc->nchw', tgt)
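# Mask the bottom half of the patch grid: the prompt pair occupies the top half
# of the concatenated canvas, and the model in-paints the masked target half.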
bool_masked_pos = torch.zeros(model.patch_embed.num_patches)
bool_masked_pos[model.patch_embed.num_patches//2:] = 1
bool_masked_pos = bool_masked_pos.unsqueeze(dim=0)
valid = torch.ones_like(tgt)
loss, y, mask = model(x.float().to(device), tgt.float().to(device), bool_masked_pos.to(device), valid.float().to(device))
y = model.unpatchify(y)
y = torch.einsum('nchw->nhwc', y).detach().cpu()
output = y[0, y.shape[1]//2:, :, :]
output = torch.clip((output * imagenet_std + imagenet_mean) * 10000, 0, 10000)
output = F.interpolate(output[None, ...].permute(0, 3, 1, 2), size=[size[1], size[0]], mode='bilinear').permute(0, 2, 3, 1)[0]
output = output.mean(-1).int()
output = Image.fromarray(output.numpy())
output.save(out_path)
def get_args_parser():
parser = argparse.ArgumentParser('NYU Depth V2', add_help=False)
parser.add_argument('--ckpt_path', type=str, help='path to ckpt',
default='')
parser.add_argument('--model', type=str, help='dir to ckpt',
default='painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1')
parser.add_argument('--prompt', type=str, help='prompt image in train set',
default='study_room_0005b/rgb_00094')
parser.add_argument('--input_size', type=int, default=448)
return parser.parse_args()
if __name__ == '__main__':
args = get_args_parser()
ckpt_path = args.ckpt_path
path_splits = ckpt_path.split('/')
ckpt_dir, ckpt_file = path_splits[-2], path_splits[-1]
model_painter = prepare_model(ckpt_path, args.model)
print('Model loaded.')
device = torch.device("cuda")
model_painter.to(device)
dst_dir = os.path.join('models_inference', ckpt_dir,
"nyuv2_depth_inference_{}_{}/".format(ckpt_file, args.prompt))
print(dst_dir)
if not os.path.exists(dst_dir):
os.makedirs(dst_dir)
img_src_dir = "datasets/nyu_depth_v2/official_splits/test/"
img_path_list = glob.glob(img_src_dir + "/*/rgb*g")
img2_path = "datasets/nyu_depth_v2/sync/{}.jpg".format(args.prompt)
tgt_path = "datasets/nyu_depth_v2/sync/{}.png".format(args.prompt.replace('rgb', 'sync_depth'))
tgt2_path = tgt_path
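# A single RGB/depth pair from the train split serves as the in-context prompt
# for every test image; the prompt depth map also acts as a placeholder target
# for the query half, whose depth is unknown.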
res, hres = args.input_size, args.input_size
for img_path in tqdm.tqdm(img_path_list):
room_name = img_path.split("/")[-2]
img_name = img_path.split("/")[-1].split(".")[0]
out_path = dst_dir + "/" + room_name + "_" + img_name + ".png"
img = Image.open(img_path).convert("RGB")
size = img.size
img = img.resize((res, hres))
img = np.array(img) / 255.
img2 = Image.open(img2_path).convert("RGB")
img2 = img2.resize((res, hres))
img2 = np.array(img2) / 255.
img = np.concatenate((img2, img), axis=0)
assert img.shape == (2 * res, res, 3)
# normalize by ImageNet mean and std
img = img - imagenet_mean
img = img / imagenet_std
tgt = Image.open(tgt_path)
tgt = np.array(tgt) / 10000.
tgt = tgt * 255
tgt = Image.fromarray(tgt).convert("RGB")
tgt = tgt.resize((res, hres))
tgt = np.array(tgt) / 255.
tgt2 = Image.open(tgt2_path)
tgt2 = np.array(tgt2) / 10000.
tgt2 = tgt2 * 255
tgt2 = Image.fromarray(tgt2).convert("RGB")
tgt2 = tgt2.resize((res, hres))
tgt2 = np.array(tgt2) / 255.
tgt = np.concatenate((tgt2, tgt), axis=0)
assert tgt.shape == (2 * res, res, 3)
# normalize by ImageNet mean and std
tgt = tgt - imagenet_mean
tgt = tgt / imagenet_std
torch.manual_seed(2)
run_one_image(img, tgt, size, model_painter, out_path, device)
close all; clear all;
denoised = load('/MATLAB Drive/painter/sidd/Idenoised.mat');
gt = load('/MATLAB Drive/painter/sidd/ValidationGtBlocksSrgb.mat');
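% Average PSNR/SSIM between the Painter-denoised SIDD validation blocks and the
% ground-truth blocks (40 images x 32 patches each).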
denoised = denoised.Idenoised;
gt = gt.ValidationGtBlocksSrgb;
gt = im2single(gt);
total_psnr = 0;
total_ssim = 0;
for i = 1:40
for k = 1:32
denoised_patch = squeeze(denoised(i,k,:,:,:));
gt_patch = squeeze(gt(i,k,:,:,:));
ssim_val = ssim(denoised_patch, gt_patch);
psnr_val = psnr(denoised_patch, gt_patch);
total_ssim = total_ssim + ssim_val;
total_psnr = total_psnr + psnr_val;
end
end
qm_psnr = total_psnr / (40*32);
qm_ssim = total_ssim / (40*32);
fprintf('PSNR: %f SSIM: %f\n', qm_psnr, qm_ssim);
# --------------------------------------------------------
# Images Speak in Images: A Generalist Painter for In-Context Visual Learning (https://arxiv.org/abs/2212.02499)
# Github source: https://github.com/baaivision/Painter
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Xinlong Wang, Wen Wang
# Based on MAE, BEiT, detectron2, Mask2Former, bts, mmcv, mmdetection, mmpose, MIRNet, MPRNet, and Uformer codebases
# --------------------------------------------------------
import sys
import os
import warnings
import requests
import argparse
import torch
import torch.nn.functional as F
import numpy as np
import glob
import tqdm
import matplotlib.pyplot as plt
import cv2
from PIL import Image
import scipy.io as sio
sys.path.append('.')
import models_painter
from skimage.metrics import peak_signal_noise_ratio as psnr_loss
from skimage.metrics import structural_similarity as ssim_loss
imagenet_mean = np.array([0.485, 0.456, 0.406])
imagenet_std = np.array([0.229, 0.224, 0.225])
def get_args_parser():
parser = argparse.ArgumentParser('SIDD denoising', add_help=False)
parser.add_argument('--ckpt_path', type=str, help='path to ckpt',
default='')
parser.add_argument('--model', type=str, help='dir to ckpt',
default='painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1')
parser.add_argument('--prompt', type=str, help='prompt image in train set',
default='9_9')
parser.add_argument('--input_size', type=int, default=448)
parser.add_argument('--save', action='store_true', help='save predictions',
default=False)
return parser.parse_args()
def prepare_model(chkpt_dir, arch='painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1'):
# build model
model = getattr(models_painter, arch)()
# load model
checkpoint = torch.load(chkpt_dir, map_location='cuda:0')
msg = model.load_state_dict(checkpoint['model'], strict=False)
print(msg)
return model
def run_one_image(img, tgt, size, model, out_path, device):
x = torch.tensor(img)
x = x.unsqueeze(dim=0)
x = torch.einsum('nhwc->nchw', x)
tgt = torch.tensor(tgt)
tgt = tgt.unsqueeze(dim=0)
tgt = torch.einsum('nhwc->nchw', tgt)
bool_masked_pos = torch.zeros(model.patch_embed.num_patches)
bool_masked_pos[model.patch_embed.num_patches//2:] = 1
bool_masked_pos = bool_masked_pos.unsqueeze(dim=0)
valid = torch.ones_like(tgt)
loss, y, mask = model(x.float().to(device), tgt.float().to(device), bool_masked_pos.to(device), valid.float().to(device))
y = model.unpatchify(y)
y = torch.einsum('nchw->nhwc', y).detach().cpu()
output = y[0, y.shape[1]//2:, :, :]
output = output * imagenet_std + imagenet_mean
output = F.interpolate(
output[None, ...].permute(0, 3, 1, 2), size=[size[1], size[0]], mode='bicubic').permute(0, 2, 3, 1)[0]
return output.numpy()
if __name__ == '__main__':
args = get_args_parser()
ckpt_path = args.ckpt_path
model = args.model
prompt = args.prompt
input_size = args.input_size
path_splits = ckpt_path.split('/')
ckpt_dir, ckpt_file = path_splits[-2], path_splits[-1]
dst_dir = os.path.join('models_inference', ckpt_dir.split('/')[-1],
"sidd_inference_{}_{}".format(ckpt_file, os.path.basename(prompt).split(".")[0]))
if not os.path.exists(dst_dir):
os.makedirs(dst_dir)
print("output_dir: {}".format(dst_dir))
model_painter = prepare_model(ckpt_path, model)
print('Model loaded.')
device = torch.device("cuda")
model_painter.to(device)
img_src_dir = "datasets/denoise/val/"
filepath = os.path.join(img_src_dir, 'ValidationNoisyBlocksSrgb.mat')
img = sio.loadmat(filepath)
Inoisy = np.float32(np.array(img['ValidationNoisyBlocksSrgb'])) # (40, 32, 256, 256, 3)
Inoisy /= 255.
img2_path = "datasets/denoise/train/input/{}.png".format(prompt)
tgt2_path = "datasets/denoise/train/groundtruth/{}.png".format(prompt)
# load the shared prompt image pair
img2 = Image.open(img2_path).convert("RGB")
img2 = img2.resize((input_size, input_size))
img2 = np.array(img2) / 255.
tgt2 = Image.open(tgt2_path)
tgt2 = tgt2.resize((input_size, input_size))
tgt2 = np.array(tgt2) / 255.
model_painter.eval()
restored = np.zeros_like(Inoisy)
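# Denoise each of the 40 x 32 noisy 256x256 patches independently, conditioning
# every forward pass on the same clean/noisy prompt pair loaded above.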
for img_idx in tqdm.tqdm(range(40)):
for patch_idx in range(32):
""" Load an image """
img_org = Inoisy[img_idx, patch_idx, :, :, :]
img = cv2.resize(img_org, (input_size, input_size))
# img = img_org.resize((input_size, input_size))
img = np.concatenate((img2, img), axis=0)
assert img.shape == (input_size * 2, input_size, 3)
# normalize by ImageNet mean and std
img = img - imagenet_mean
img = img / imagenet_std
tgt = tgt2 # tgt is not available
tgt = np.concatenate((tgt2, tgt), axis=0)
assert tgt.shape == (input_size * 2, input_size, 3)
# normalize by ImageNet mean and std
tgt = tgt - imagenet_mean
tgt = tgt / imagenet_std
# make random mask reproducible (comment out to make it change)
torch.manual_seed(2)
output = run_one_image(img, tgt, size=(256, 256), model=model_painter, out_path=None, device=device)
rgb_restored = output
rgb_restored = np.clip(rgb_restored, 0, 1)
restored[img_idx, patch_idx, :, :, :] = rgb_restored
# optionally save images
if args.save:
out_path = os.path.join(dst_dir, '%04d_%02d.png' % (img_idx + 1, patch_idx + 1))
output = rgb_restored * 255
output = Image.fromarray(output.astype(np.uint8))
output.save(out_path)
# save denoised data
sio.savemat(os.path.join(dst_dir, 'Idenoised.mat'), {"Idenoised": restored, })
print(os.path.join(dst_dir, 'Idenoised.mat'))
# --------------------------------------------------------
# Images Speak in Images: A Generalist Painter for In-Context Visual Learning (https://arxiv.org/abs/2212.02499)
# Github source: https://github.com/baaivision/Painter
# Copyright (c) 2022 Beijing Academy of Artificial Intelligence (BAAI)
# Licensed under The MIT License [see LICENSE for details]
# By Xinlong Wang, Wen Wang
# Based on MAE, BEiT, detectron2, Mask2Former, bts, mmcv, mmdetection, mmpose, MIRNet, MPRNet, and Uformer codebases
# --------------------------------------------------------
import argparse
import datetime
import json
import numpy as np
import os
import time
from pathlib import Path
import torch
import torch.backends.cudnn as cudnn
from torch.utils.tensorboard import SummaryWriter
import timm
assert timm.__version__ == "0.3.2" # version check
import util.lr_decay as lrd
import util.misc as misc
from util.misc import get_parameter_groups
from util.misc import NativeScalerWithGradNormCount as NativeScaler
from util.pos_embed import interpolate_pos_embed
import models_painter
from engine_train import train_one_epoch, evaluate_pt
from data.pairdataset import PairDataset
import data.pair_transforms as pair_transforms
from util.masking_generator import MaskingGenerator
from data.sampler import DistributedSamplerWrapper
try:
import wandb
has_wandb = True
except ImportError:
has_wandb = False
def get_args_parser():
parser = argparse.ArgumentParser('Painter pre-training', add_help=False)
parser.add_argument('--batch_size', default=2, type=int,
help='Batch size per GPU (effective batch size is batch_size * accum_iter * # gpus)')
parser.add_argument('--epochs', default=15, type=int)
parser.add_argument('--accum_iter', default=16, type=int,
help='Accumulate gradient iterations (for increasing the effective batch size under memory constraints)')
# Model parameters
parser.add_argument('--model', default='painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1', type=str, metavar='MODEL',
help='Name of model to train')
parser.add_argument('--input_size', default=224, type=int, nargs='+',
help='images input size')
parser.add_argument('--mask_ratio', default=0.5, type=float,
help='Masking ratio (percentage of removed patches).')
parser.add_argument('--norm_pix_loss', action='store_true',
help='Use (per-patch) normalized pixels as targets for computing loss')
parser.set_defaults(norm_pix_loss=False)
parser.add_argument('--num_mask_patches', default=784, type=int,
help='number of visual tokens/patches to be masked')
parser.add_argument('--max_mask_patches_per_block', type=int, default=None)
parser.add_argument('--min_mask_patches_per_block', type=int, default=16)
parser.add_argument('--stop_grad_patch_embed', action='store_true',
help='stop-grad after first conv, or patch embedding')
parser.set_defaults(stop_grad_patch_embed=False)
parser.add_argument('--finetune', default='',
help='finetune from checkpoint')
parser.add_argument('--drop_path', default=0., type=float,
help='Drop path rate (default: 0.)')
parser.add_argument('--min_random_scale', default=0.3, type=float,
help='Minimal random scale for RandomResizedCrop (default: 0.3)')
parser.add_argument('--last_norm_instance', action='store_true', default=False,
help='use instance norm to normalize each channel map before the decoder layer')
parser.add_argument('--half_mask_ratio', default=0.1, type=float,
help='ratio of using half mask during training (default: 0.1)')
parser.add_argument('--use_checkpoint', action='store_true', default=False,
help='use checkpoint to save GPU memory')
# Optimizer parameters
parser.add_argument('--weight_decay', type=float, default=0.1,
help='weight decay (default: 0.1)')
parser.add_argument('--lr', type=float, default=None, metavar='LR',
help='learning rate (absolute lr)')
parser.add_argument('--blr', type=float, default=1e-3, metavar='LR',
help='base learning rate: absolute_lr = base_lr * total_batch_size / 256')
parser.add_argument('--min_lr', type=float, default=0., metavar='LR',
help='lower lr bound for cyclic schedulers that hit 0')
parser.add_argument('--warmup_epochs', type=int, default=40, metavar='N',
help='epochs to warmup LR')
parser.add_argument('--save_freq', type=int, default=100,
help='checkpoint saving frequency (in epochs)')
parser.add_argument('--clip_grad', type=float, default=3.0, metavar='NORM',
help='Clip gradient norm (default: 3.0)')
parser.add_argument('--opt_eps', default=1e-8, type=float, metavar='EPSILON',
help='Optimizer Epsilon (default: 1e-8)')
parser.add_argument('--opt_betas', default=[0.9, 0.999], type=float, nargs='+', metavar='BETA',
help='Optimizer Betas (default: 0.9 0.999)')
parser.add_argument('--layer_decay', type=float, default=1.0, metavar='LRD',
help='Learning rate layer decay')
# Dataset parameters
parser.add_argument('--data_path', default='/datasets01/imagenet_full_size/061417/', type=str,
help='dataset path')
#parser.add_argument('--json_path', default='./', type=str,
parser.add_argument('--json_path', default='./', nargs='+', type=str,
help='json path')
parser.add_argument('--val_json_path', default='./', nargs='+', type=str,
help='validation json path')
parser.add_argument('--output_dir', default='./output_dir',
help='path where to save, empty for no saving')
parser.add_argument('--log_dir', default='./output_dir',
help='path where to tensorboard log')
parser.add_argument('--device', default='cuda',
help='device to use for training / testing')
parser.add_argument('--seed', default=0, type=int)
parser.add_argument('--resume', default='',
help='resume from checkpoint')
parser.add_argument('--auto_resume', action='store_true')
parser.set_defaults(auto_resume=False)
parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
help='start epoch')
parser.add_argument('--num_workers', default=10, type=int)
parser.add_argument('--pin_mem', action='store_true',
help='Pin CPU memory in DataLoader for more efficient (sometimes) transfer to GPU.')
parser.add_argument('--no_pin_mem', action='store_false', dest='pin_mem')
parser.set_defaults(pin_mem=True)
parser.add_argument('--use_two_pairs', action='store_true',
help='concatenate two pairs of images')
parser.set_defaults(use_two_pairs=True)
# distributed training parameters
parser.add_argument('--world_size', default=1, type=int,
help='number of distributed processes')
parser.add_argument('--local_rank', default=-1, type=int)
parser.add_argument('--dist_on_itp', action='store_true')
parser.add_argument('--dist_url', default='env://',
help='url used to set up distributed training')
parser.add_argument('--enable_deepspeed',
action='store_true', default=False)
parser.add_argument('--zero_stage', default=0, type=int,
help='ZeRO optimizer stage (default: 0)')
# misc
parser.add_argument('--log_wandb', action='store_true', default=False,
help='log training and validation metrics to wandb')
known_args, _ = parser.parse_known_args()
if known_args.enable_deepspeed:
try:
import deepspeed
from deepspeed import DeepSpeedConfig
parser = deepspeed.add_config_arguments(parser)
ds_init = deepspeed.initialize
except:
print("Please 'pip install deepspeed==0.4.0'")
exit(0)
else:
ds_init = None
return parser.parse_args(), ds_init
def main(args, ds_init):
misc.init_distributed_mode(args)
if ds_init is not None:
misc.create_ds_config(args)
print('job dir: {}'.format(os.path.dirname(os.path.realpath(__file__))))
print("{}".format(args).replace(', ', ',\n'))
device = torch.device(args.device)
# fix the seed for reproducibility
seed = args.seed + misc.get_rank()
torch.manual_seed(seed)
np.random.seed(seed)
cudnn.benchmark = True
# define the model
model = models_painter.__dict__[args.model]()
if args.finetune:
checkpoint = torch.load(args.finetune, map_location='cpu')
print("Load pre-trained checkpoint from: %s" % args.finetune)
checkpoint_model = checkpoint['model']
state_dict = model.state_dict()
rm_key_list = ['decoder_embed.weight', 'decoder_embed.bias', 'mask_token']
if args.last_norm_instance:
rm_key_list.extend(['norm.weight', 'norm.bias'])
for k in rm_key_list:
if k in checkpoint_model and checkpoint_model[k].shape != state_dict[k].shape:
print(f"Removing key {k} from pretrained checkpoint")
del checkpoint_model[k]
# interpolate patch embedding
if "patch32" in args.model:
patch_weight = checkpoint['model']['patch_embed.proj.weight']
new_patch_weight = torch.nn.functional.interpolate(patch_weight, size=(32, 32), mode='bicubic', align_corners=False)
checkpoint['model']['patch_embed.proj.weight'] = new_patch_weight
# interpolate position embedding
if "painter" not in args.model:
interpolate_pos_embed(model, checkpoint_model)
# load pre-trained model
msg = model.load_state_dict(checkpoint_model, strict=False)
print(msg)
patch_size = model.patch_size
print("Patch size = %s" % str(patch_size))
args.window_size = (args.input_size[0] // patch_size, args.input_size[1] // patch_size)
args.patch_size = patch_size
# simple augmentation
transform_train = pair_transforms.Compose([
pair_transforms.RandomResizedCrop(args.input_size[1], scale=(args.min_random_scale, 1.0), interpolation=3), # 3 is bicubic
pair_transforms.RandomApply([
pair_transforms.ColorJitter(0.4, 0.4, 0.2, 0.1)
], p=0.8),
pair_transforms.RandomHorizontalFlip(),
pair_transforms.ToTensor(),
pair_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
transform_train2 = pair_transforms.Compose([
pair_transforms.RandomResizedCrop(args.input_size[1], scale=(0.9999, 1.0), interpolation=3), # 3 is bicubic
pair_transforms.ToTensor(),
pair_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
transform_train3 = pair_transforms.Compose([
pair_transforms.RandomResizedCrop(args.input_size[1], scale=(0.9999, 1.0), interpolation=3), # 3 is bicubic
pair_transforms.ToTensor(),
pair_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
transform_train_seccrop = pair_transforms.Compose([
pair_transforms.RandomResizedCrop(args.input_size, scale=(args.min_random_scale, 1.0), ratio=(0.3, 0.7), interpolation=3), # 3 is bicubic
])
transform_val = pair_transforms.Compose([
pair_transforms.RandomResizedCrop(args.input_size[1], scale=(0.9999, 1.0), interpolation=3), # 3 is bicubic
pair_transforms.ToTensor(),
pair_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
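# Random masking over the patch grid, presumably block-wise in the BEiT style;
# the number and size of masked blocks are controlled by --num_mask_patches and
# --max/min_mask_patches_per_block above.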
masked_position_generator = MaskingGenerator(
args.window_size, num_masking_patches=args.num_mask_patches,
max_num_patches=args.max_mask_patches_per_block,
min_num_patches=args.min_mask_patches_per_block,
)
dataset_train = PairDataset(args.data_path, args.json_path, transform=transform_train, transform2=transform_train2, transform3=transform_train3, transform_seccrop=transform_train_seccrop, masked_position_generator=masked_position_generator, use_two_pairs=args.use_two_pairs, half_mask_ratio=args.half_mask_ratio)
dataset_val = PairDataset(args.data_path, args.val_json_path, transform=transform_val, transform2=None, transform3=None, masked_position_generator=masked_position_generator, use_two_pairs=args.use_two_pairs, half_mask_ratio=1.0)
print(dataset_train)
print(dataset_val)
if True: # args.distributed:
num_tasks = misc.get_world_size()
global_rank = misc.get_rank()
num_samples_train = len(dataset_train)
weights_train = dataset_train.weights
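# Per-sample weights from the pair dataset drive a WeightedRandomSampler, which is
# then wrapped so each distributed rank draws its own shard of the weighted sample.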
sampler_train = torch.utils.data.WeightedRandomSampler(weights_train, num_samples_train, replacement=True)
sampler_train = DistributedSamplerWrapper(sampler_train, num_replicas=num_tasks, rank=global_rank, shuffle=True)
print("Sampler_train = %s" % str(sampler_train))
sampler_val = torch.utils.data.DistributedSampler(
dataset_val, num_replicas=num_tasks, rank=global_rank, shuffle=False)
else:
sampler_train = torch.utils.data.RandomSampler(dataset_train)
if global_rank == 0 and args.log_dir is not None:
os.makedirs(args.log_dir, exist_ok=True)
log_writer = SummaryWriter(log_dir=args.log_dir)
else:
log_writer = None
if global_rank == 0 and args.log_wandb:
experiment = args.log_dir.split('/')[-2]
if args.resume == '':
wandb.init(project="Painter", name=experiment, config=args)
else:
wandb.init(project="Painter", name=experiment, config=args, resume=True)
data_loader_train = torch.utils.data.DataLoader(
dataset_train, sampler=sampler_train,
batch_size=args.batch_size,
num_workers=args.num_workers,
pin_memory=args.pin_mem,
drop_last=True,
)
data_loader_val = torch.utils.data.DataLoader(
dataset_val, sampler=sampler_val,
batch_size=int(1.5 * args.batch_size),
num_workers=args.num_workers,
pin_memory=args.pin_mem,
drop_last=False,
)
model.to(device)
model_without_ddp = model
print("Model = %s" % str(model_without_ddp))
eff_batch_size = args.batch_size * args.accum_iter * misc.get_world_size()
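# Linear LR scaling rule: absolute lr = base lr (--blr) * effective batch size / 256.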
if args.lr is None: # only base_lr is specified
args.lr = args.blr * eff_batch_size / 256
print("base lr: %.2e" % (args.lr * 256 / eff_batch_size))
print("actual lr: %.2e" % args.lr)
print("accumulate grad iterations: %d" % args.accum_iter)
print("effective batch size: %d" % eff_batch_size)
if args.enable_deepspeed:
loss_scaler = None
optimizer_params = get_parameter_groups(
model, args.weight_decay, model.no_weight_decay()
)
model, optimizer, _, _ = ds_init(
args=args, model=model, model_parameters=optimizer_params,
dist_init_required=not args.distributed,
)
print("model.gradient_accumulation_steps() = %d" %
model.gradient_accumulation_steps())
assert model.gradient_accumulation_steps() == args.accum_iter
else:
if args.distributed:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
model_without_ddp = model.module
# following timm: set wd as 0 for bias and norm layers
param_groups = lrd.param_groups_lrd(model_without_ddp, args.weight_decay,
no_weight_decay_list=model_without_ddp.no_weight_decay(),
layer_decay=args.layer_decay
)
optimizer = torch.optim.AdamW(param_groups, lr=args.lr, betas=args.opt_betas)
print(optimizer)
loss_scaler = NativeScaler()
misc.auto_load_model(
args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer, loss_scaler=loss_scaler)
print(f"Start training for {args.epochs} epochs")
start_time = time.time()
for epoch in range(args.start_epoch, args.epochs):
if args.distributed:
data_loader_train.sampler.set_epoch(epoch)
train_stats = train_one_epoch(
model, data_loader_train,
optimizer, device, epoch, loss_scaler,
log_writer=log_writer,
global_rank=global_rank,
args=args
)
if args.output_dir and (epoch % args.save_freq == 0 or epoch + 1 == args.epochs):
misc.save_model(
args=args, model=model, model_without_ddp=model_without_ddp, optimizer=optimizer,
loss_scaler=loss_scaler, epoch=epoch)
test_stats = evaluate_pt(data_loader_val, model, device, epoch=epoch, global_rank=global_rank, args=args)
print(f"Val loss of the network on the {len(dataset_val)} test images: {test_stats['loss']:.3f}")
log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
**{f'test_{k}': v for k, v in test_stats.items()},
'epoch': epoch,}
if args.output_dir and misc.is_main_process():
if log_writer is not None:
log_writer.flush()
with open(os.path.join(args.output_dir, "log.txt"), mode="a", encoding="utf-8") as f:
f.write(json.dumps(log_stats) + "\n")
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))
if global_rank == 0 and args.log_wandb:
wandb.finish()
if __name__ == '__main__':
args, ds_init = get_args_parser()
if args.output_dir:
Path(args.output_dir).mkdir(parents=True, exist_ok=True)
main(args, ds_init)