#!/bin/bash
export HIP_VISIBLE_DEVICES=0,1,2,3 # set to the GPU IDs and number of GPUs used for training
export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1
DATA_PATH=/home/datasets
name=painter_vit_large
python -m torch.distributed.launch --nproc_per_node=4 \
--use_env main_train.py \
--batch_size 2 \
--accum_iter 16 \
--model painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1 \
--num_mask_patches 784 \
--max_mask_patches_per_block 392 \
--epochs 15 \
--warmup_epochs 1 \
--lr 1e-3 \
--clip_grad 3 \
--layer_decay 0.8 \
--drop_path 0.1 \
--input_size 896 448 \
--save_freq 1 \
--data_path $DATA_PATH/ \
--json_path \
$DATA_PATH/nyu_depth_v2/nyuv2_sync_image_depth.json \
$DATA_PATH/ade20k/ade20k_training_image_semantic.json \
$DATA_PATH/coco/pano_ca_inst/coco_train_image_panoptic_inst.json \
$DATA_PATH/coco/pano_sem_seg/coco_train2017_image_panoptic_sem_seg.json \
$DATA_PATH/coco_pose/coco_pose_256x192_train.json \
$DATA_PATH/denoise/denoise_ssid_train.json \
$DATA_PATH/derain/derain_train.json \
$DATA_PATH/light_enhance/enhance_lol_train.json \
--val_json_path \
$DATA_PATH/nyu_depth_v2/nyuv2_test_image_depth.json \
$DATA_PATH/ade20k/ade20k_validation_image_semantic.json \
$DATA_PATH/coco/pano_ca_inst/coco_val_image_panoptic_inst.json \
$DATA_PATH/coco/pano_sem_seg/coco_val2017_image_panoptic_sem_seg.json \
$DATA_PATH/coco_pose/coco_pose_256x192_val.json \
$DATA_PATH/denoise/denoise_ssid_val.json \
$DATA_PATH/derain/derain_test_rain100h.json \
$DATA_PATH/light_enhance/enhance_lol_val.json \
--output_dir models/$name \
--log_dir models/$name/logs \
--finetune path/to/mae_pretrain_vit_large.pth \
# --log_wandb \
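# Note: effective global batch size = batch_size x accum_iter x nproc_per_node
#       = 2 x 16 x 4 = 128. torch.distributed.launch is deprecated in recent
#       PyTorch releases; torchrun is the drop-in replacement.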
#!/bin/bash
export HSA_FORCE_FINE_GRAIN_PCIE=1
export USE_MIOPEN_BATCHNORM=1
DATA_PATH=/home/datasets
name=painter_vit_large
python -m torch.distributed.launch --nproc_per_node=8 \
--nnodes=${WORLD_SIZE} --node_rank=$RANK \
--master_addr=$MASTER_ADDR --master_port=12358 \
--use_env main_train.py \
--batch_size 2 \
--accum_iter 16 \
--model painter_vit_large_patch16_input896x448_win_dec64_8glb_sl1 \
--num_mask_patches 784 \
--max_mask_patches_per_block 392 \
--epochs 15 \
--warmup_epochs 1 \
--lr 1e-3 \
--clip_grad 3 \
--layer_decay 0.8 \
--drop_path 0.1 \
--input_size 896 448 \
--save_freq 1 \
--data_path $DATA_PATH/ \
--json_path \
$DATA_PATH/nyu_depth_v2/nyuv2_sync_image_depth.json \
$DATA_PATH/ade20k/ade20k_training_image_semantic.json \
$DATA_PATH/coco/pano_ca_inst/coco_train_image_panoptic_inst.json \
$DATA_PATH/coco/pano_sem_seg/coco_train2017_image_panoptic_sem_seg.json \
$DATA_PATH/coco_pose/coco_pose_256x192_train.json \
$DATA_PATH/denoise/denoise_ssid_train.json \
$DATA_PATH/derain/derain_train.json \
$DATA_PATH/light_enhance/enhance_lol_train.json \
--val_json_path \
$DATA_PATH/nyu_depth_v2/nyuv2_test_image_depth.json \
$DATA_PATH/ade20k/ade20k_validation_image_semantic.json \
$DATA_PATH/coco/pano_ca_inst/coco_val_image_panoptic_inst.json \
$DATA_PATH/coco/pano_sem_seg/coco_val2017_image_panoptic_sem_seg.json \
$DATA_PATH/coco_pose/coco_pose_256x192_val.json \
$DATA_PATH/denoise/denoise_ssid_val.json \
$DATA_PATH/derain/derain_test_rain100h.json \
$DATA_PATH/light_enhance/enhance_lol_val.json \
--output_dir models/$name \
--log_dir models/$name/logs \
--finetune path/to/mae_pretrain_vit_large.pth \
# --log_wandb \
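# Note: each node contributes batch_size x accum_iter x nproc_per_node
#       = 2 x 16 x 8 = 256 to the effective batch, so the global effective
#       batch size is 256 x number of nodes (WORLD_SIZE here counts nodes).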
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import math
import torch
from torchvision import transforms
from torchvision.transforms import functional as F
class RandomResizedCrop(transforms.RandomResizedCrop):
"""
RandomResizedCrop for matching TF/TPU implementation: no for-loop is used.
This may lead to results different with torchvision's version.
Following BYOL's TF code:
https://github.com/deepmind/deepmind-research/blob/master/byol/utils/dataset.py#L206
"""
@staticmethod
def get_params(img, scale, ratio):
        try:
            width, height = F.get_image_size(img)  # torchvision >= 0.10
        except AttributeError:  # older torchvision only has the private helper
            width, height = F._get_image_size(img)
area = height * width
target_area = area * torch.empty(1).uniform_(scale[0], scale[1]).item()
log_ratio = torch.log(torch.tensor(ratio))
aspect_ratio = torch.exp(
torch.empty(1).uniform_(log_ratio[0], log_ratio[1])
).item()
w = int(round(math.sqrt(target_area * aspect_ratio)))
h = int(round(math.sqrt(target_area / aspect_ratio)))
w = min(w, width)
h = min(h, height)
i = torch.randint(0, height - h + 1, size=(1,)).item()
j = torch.randint(0, width - w + 1, size=(1,)).item()
return i, j, h, w
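

# Minimal usage sketch, assuming a 224-pixel crop and a dummy PIL image: the
# subclass is a drop-in replacement for torchvision's RandomResizedCrop and
# composes like any other transform.
if __name__ == "__main__":
    from PIL import Image

    transform = transforms.Compose([
        RandomResizedCrop(224, scale=(0.2, 1.0)),
        transforms.ToTensor(),
    ])
    img = Image.new("RGB", (640, 480))
    print(transform(img).shape)  # torch.Size([3, 224, 224])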
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# DeiT: https://github.com/facebookresearch/deit
# --------------------------------------------------------
import os
import PIL
from torchvision import datasets, transforms
from timm.data import create_transform
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD
def build_dataset(is_train, args):
transform = build_transform(is_train, args)
root = os.path.join(args.data_path, 'train' if is_train else 'val')
dataset = datasets.ImageFolder(root, transform=transform)
print(dataset)
return dataset
def build_transform(is_train, args):
mean = IMAGENET_DEFAULT_MEAN
std = IMAGENET_DEFAULT_STD
# train transform
if is_train:
# this should always dispatch to transforms_imagenet_train
transform = create_transform(
input_size=args.input_size,
is_training=True,
color_jitter=args.color_jitter,
auto_augment=args.aa,
interpolation='bicubic',
re_prob=args.reprob,
re_mode=args.remode,
re_count=args.recount,
mean=mean,
std=std,
)
return transform
# eval transform
t = []
if args.input_size <= 224:
crop_pct = 224 / 256
else:
crop_pct = 1.0
size = int(args.input_size / crop_pct)
t.append(
transforms.Resize(size, interpolation=PIL.Image.BICUBIC), # to maintain same ratio w.r.t. 224 images
)
t.append(transforms.CenterCrop(args.input_size))
t.append(transforms.ToTensor())
t.append(transforms.Normalize(mean, std))
return transforms.Compose(t)
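

# Minimal usage sketch: the eval branch of build_transform only reads
# args.input_size, so a SimpleNamespace with an assumed value is enough here
# (the train branch additionally needs color_jitter, aa, reprob, remode and
# recount).
if __name__ == "__main__":
    from types import SimpleNamespace

    args = SimpleNamespace(input_size=224)
    eval_transform = build_transform(is_train=False, args=args)
    print(eval_transform)  # Resize(256) -> CenterCrop(224) -> ToTensor -> Normalize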
import os
import glob
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset
import torch.distributed as dist
class DatasetTest(Dataset):
"""
define dataset for ddp
"""
def __init__(self, img_src_dir, input_size, ext_list=('*.png', '*.jpg'), ):
super(DatasetTest, self).__init__()
self.img_src_dir = img_src_dir
self.input_size = input_size
img_path_list = []
for ext in ext_list:
img_path_tmp = glob.glob(os.path.join(img_src_dir, ext))
img_path_list.extend(img_path_tmp)
self.img_path_list = img_path_list
def __len__(self):
return len(self.img_path_list)
    def __getitem__(self, index):
        img_path = self.img_path_list[index]
        img = Image.open(img_path).convert("RGB")
        size_org = img.size  # (width, height) before resizing
        img = img.resize((self.input_size, self.input_size))
        img = np.array(img) / 255.  # HWC float array in [0, 1]
        return img, img_path, size_org
def collate_fn(batch):
    # identity collate: keep (img, img_path, size_org) tuples as a plain
    # list, since the original image sizes differ across the batch
    return batch
# batch = list(zip(*batch))
# return tuple(batch)
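
# Minimal usage sketch (directory, input size and worker count are assumed
# placeholders): under DDP each rank reads a disjoint shard of the images via
# DistributedSampler, and the identity collate_fn keeps the
# (img, img_path, size_org) tuples un-stacked.
def build_test_loader(img_src_dir, input_size=448, batch_size=1):
    dataset = DatasetTest(img_src_dir, input_size)
    sampler = (torch.utils.data.DistributedSampler(dataset, shuffle=False)
               if dist.is_available() and dist.is_initialized() else None)
    return torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, sampler=sampler,
        collate_fn=collate_fn, num_workers=2,
    )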
def setup_for_distributed(is_master):
"""
This function disables printing when not in master process
"""
import builtins as __builtin__
builtin_print = __builtin__.print
def print(*args, **kwargs):
force = kwargs.pop('force', False)
if is_master or force:
builtin_print(*args, **kwargs)
__builtin__.print = print
def is_dist_avail_and_initialized():
if not dist.is_available():
return False
if not dist.is_initialized():
return False
return True
def get_world_size():
if not is_dist_avail_and_initialized():
return 1
return dist.get_world_size()
def get_rank():
if not is_dist_avail_and_initialized():
return 0
return dist.get_rank()
def is_main_process():
return get_rank() == 0
def init_distributed_mode(args):
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ and 'LOCAL_RANK' in os.environ:
args.rank = int(os.environ["RANK"])
args.world_size = int(os.environ['WORLD_SIZE'])
args.gpu = int(os.environ['LOCAL_RANK'])
    elif 'SLURM_PROCID' in os.environ:
        args.rank = int(os.environ['SLURM_PROCID'])
        args.gpu = args.rank % torch.cuda.device_count()
        # SLURM does not export WORLD_SIZE; derive it from the task count so
        # init_process_group below receives a valid world_size (assumes one
        # process per SLURM task).
        args.world_size = int(os.environ['SLURM_NTASKS'])
else:
print('Not using distributed mode')
args.distributed = False
return args
args.distributed = True
torch.cuda.set_device(args.gpu)
args.dist_backend = 'nccl'
print('| distributed init (rank {}): {}'.format(
args.rank, args.dist_url), flush=True)
torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
world_size=args.world_size, rank=args.rank)
torch.distributed.barrier()
setup_for_distributed(args.rank == 0)
return args
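
# Minimal usage sketch: the launcher (torch.distributed.launch / torchrun)
# exports RANK, WORLD_SIZE and LOCAL_RANK, which init_distributed_mode reads;
# without a launcher it falls back to non-distributed mode. The --dist_url
# default below mirrors the conventional env:// rendezvous.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--dist_url", default="env://")
    args = parser.parse_args()
    args = init_distributed_mode(args)
    print(f"rank {get_rank()} / world size {get_world_size()}, "
          f"main process: {is_main_process()}")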