"vscode:/vscode.git/clone" did not exist on "862d70fceededcbe3170abf2a3eaaf4df3c14575"
Commit 3ee88850 authored by Hang Zhang's avatar Hang Zhang Committed by Facebook GitHub Bot
Browse files

exploring deformable attention in transformer

Summary:
Pull Request resolved: https://github.com/facebookresearch/d2go/pull/105

exploring deformable attention in transformer

Reviewed By: bichenwu09

Differential Revision: D29093714

fbshipit-source-id: dd691754d9e439661e2eddecb3a1e7cefc8fe568
parent 666e605e
......@@ -5,7 +5,7 @@ import torch.utils.data
import torchvision
from .coco import build as build_coco
from .ade import build as build_ade
def get_coco_api_from_dataset(dataset):
for _ in range(10):
......@@ -19,9 +19,13 @@ def get_coco_api_from_dataset(dataset):
def build_dataset(image_set, args):
if args.dataset_file == 'coco':
return build_coco(image_set, args)
if args.dataset_file == 'coco_panoptic':
dataset = build_coco(image_set, args)
elif args.dataset_file == 'coco_panoptic':
# to avoid making panopticapi required for coco
from .coco_panoptic import build as build_coco_panoptic
return build_coco_panoptic(image_set, args)
dataset = build_coco_panoptic(image_set, args)
elif args.dataset_file == 'ade':
dataset = build_ade(image_set, args)
else:
raise ValueError(f'dataset {args.dataset_file} not supported')
return dataset
import os
import sys
import numpy as np
import random
import math
from PIL import Image, ImageOps, ImageFilter
import skimage.morphology as morp
import torch
import torch.utils.data as data
import torchvision
import torchvision.transforms as transform
from detectron2.utils.file_io import PathManager
from .coco import make_coco_transforms
class ADE20KParsing(torchvision.datasets.VisionDataset):
def __init__(self, root, split, transforms=None):
super(ADE20KParsing, self).__init__(
root)
# assert exists and prepare dataset automatically
assert PathManager.exists(root), "Please setup the dataset"
self.images, self.masks = _get_ade20k_pairs(root, split)
assert (len(self.images) == len(self.masks))
if len(self.images) == 0:
raise(RuntimeError("Found 0 images in subfolders of: \
" + root + "\n"))
self._transforms = transforms
def _mask_transform(self, mask):
target = np.array(mask).astype('int64') - 1
return target
def __getitem__(self, index):
with PathManager.open(self.images[index], "rb") as f:
img = Image.open(f).convert('RGB')
with PathManager.open(self.masks[index], "rb") as f:
mask = Image.open(f).convert('P')
w, h = img.size
## generating bbox and masks
# get different classes
mask = self._mask_transform(mask)
classes = np.unique(mask)
if -1 in classes:
classes = classes[1:]
segmasks = mask == classes[:,None,None]
# find connected component
detr_masks = []
labels = []
for i in range(len(classes)):
mask = segmasks[i]
mclass = classes[i]
connected, nslice = morp.label(mask, connectivity=2, background=0, return_num=True)
for j in range(1, nslice + 1):
detr_masks.append(connected==j)
labels.append(mclass)
target = {}
target['image_id'] = torch.tensor(int(os.path.basename(self.images[index])[10:-4]))
if len(detr_masks) > 0:
target['masks'] = torch.as_tensor(np.stack(detr_masks, axis=0), dtype=torch.uint8)
target['boxes'] = masks_to_boxes(target['masks'])
else:
target['masks'] = torch.as_tensor(detr_masks, dtype=torch.uint8)
target['boxes'] = target['masks']
target['labels'] = torch.tensor(labels)
target['orig_size'] = torch.as_tensor([int(h), int(w)])
target['size'] = torch.as_tensor([int(h), int(w)])
if self._transforms is not None:
img, target = self._transforms(img, target)
return img, target
def __len__(self):
return len(self.images)
@property
def pred_offset(self):
return 1
def masks_to_boxes(masks):
"""Compute the bounding boxes around the provided masks
The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions.
Returns a [N, 4] tensors, with the boxes in xyxy format
"""
if masks.numel() == 0:
return torch.zeros((0, 4), device=masks.device)
h, w = masks.shape[-2:]
y = torch.arange(0, h, dtype=torch.float)
x = torch.arange(0, w, dtype=torch.float)
y, x = torch.meshgrid(y, x)
x_mask = (masks * x.unsqueeze(0))
x_max = x_mask.flatten(1).max(-1)[0]
x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
y_mask = (masks * y.unsqueeze(0))
y_max = y_mask.flatten(1).max(-1)[0]
y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0]
return torch.stack([x_min, y_min, x_max, y_max], 1)
def _get_ade20k_pairs(folder, split='train'):
def get_path_pairs(img_folder, mask_folder):
img_paths = []
mask_paths = []
print("Before listing", img_folder)
filenames = PathManager.ls(img_folder)
for filename in filenames:
print("found: ", filename)
basename, _ = os.path.splitext(filename)
if filename.endswith(".jpg"):
imgpath = os.path.join(img_folder, filename)
maskname = basename + '.png'
maskpath = os.path.join(mask_folder, maskname)
img_paths.append(imgpath)
mask_paths.append(maskpath)
#if PathManager.isfile(maskpath):
#else:
# print('cannot find the mask:', maskpath)
return img_paths, mask_paths
if split == 'train':
img_folder = os.path.join(folder, 'images/training')
mask_folder = os.path.join(folder, 'annotations/training')
img_paths, mask_paths = get_path_pairs(img_folder, mask_folder)
print('len(img_paths):', len(img_paths))
assert len(img_paths) == 20210
elif split == 'val':
img_folder = os.path.join(folder, 'images/validation')
mask_folder = os.path.join(folder, 'annotations/validation')
img_paths, mask_paths = get_path_pairs(img_folder, mask_folder)
assert len(img_paths) == 2000
else:
assert split == 'trainval'
train_img_folder = os.path.join(folder, 'images/training')
train_mask_folder = os.path.join(folder, 'annotations/training')
val_img_folder = os.path.join(folder, 'images/validation')
val_mask_folder = os.path.join(folder, 'annotations/validation')
train_img_paths, train_mask_paths = get_path_pairs(train_img_folder, train_mask_folder)
val_img_paths, val_mask_paths = get_path_pairs(val_img_folder, val_mask_folder)
img_paths = train_img_paths + val_img_paths
mask_paths = train_mask_paths + val_mask_paths
assert len(img_paths) == 22210
return img_paths, mask_paths
def build(image_set, args):
dataset = ADE20KParsing(args.ade_path, image_set, transforms=make_coco_transforms(image_set))
return dataset
......@@ -6,22 +6,32 @@ COCO dataset which returns image_id for evaluation.
Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py
"""
import os
from pathlib import Path
from PIL import Image
import torch
import torch.utils.data
import torchvision
from pycocotools import mask as coco_mask
from detectron2.utils.file_io import PathManager
import detr.datasets.transforms as T
class CocoDetection(torchvision.datasets.CocoDetection):
def __init__(self, img_folder, ann_file, transforms, return_masks):
ann_file = PathManager.get_local_path(ann_file)
super(CocoDetection, self).__init__(img_folder, ann_file)
self._transforms = transforms
self.prepare = ConvertCocoPolysToMask(return_masks)
def _load_image(self, id: int) -> Image.Image:
path = self.coco.loadImgs(id)[0]["file_name"]
with PathManager.open(os.path.join(self.root, path), "rb") as f:
image = Image.open(f).convert("RGB")
return image
def __getitem__(self, idx):
img, target = super(CocoDetection, self).__getitem__(idx)
image_id = self.ids[idx]
......@@ -147,6 +157,13 @@ def make_coco_transforms(image_set):
def build(image_set, args):
if "manifold" in args.coco_path:
root = args.coco_path
PATHS = {
"train": (os.path.join(root, "coco_train2017"), "manifold://fair_vision_data/tree/detectron2/json_dataset_annotations/coco/instances_train2017.json"),
"val": (os.path.join(root, "coco_val2017"), "manifold://fair_vision_data/tree/detectron2/json_dataset_annotations/coco/instances_val2017.json"),
}
else:
root = Path(args.coco_path)
assert root.exists(), f'provided COCO path {root} does not exist'
mode = 'instances'
......
......@@ -18,7 +18,7 @@ from pycocotools.cocoeval import COCOeval
from pycocotools.coco import COCO
import pycocotools.mask as mask_util
from util.misc import all_gather
from detr.util.misc import all_gather
class CocoEvaluator(object):
......
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import os
import json
from pathlib import Path
......@@ -9,14 +10,15 @@ import torch
from PIL import Image
from panopticapi.utils import rgb2id
from util.box_ops import masks_to_boxes
from detr.util.box_ops import masks_to_boxes
from .coco import make_coco_transforms
from detectron2.utils.file_io import PathManager
class CocoPanoptic:
def __init__(self, img_folder, ann_folder, ann_file, transforms=None, return_masks=True):
with open(ann_file, 'r') as f:
with PathManager.open(ann_file, 'r') as f:
self.coco = json.load(f)
# sort 'images' field so that they are aligned with 'annotations'
......@@ -35,13 +37,15 @@ class CocoPanoptic:
def __getitem__(self, idx):
ann_info = self.coco['annotations'][idx] if "annotations" in self.coco else self.coco['images'][idx]
img_path = Path(self.img_folder) / ann_info['file_name'].replace('.png', '.jpg')
ann_path = Path(self.ann_folder) / ann_info['file_name']
img_path = os.path.join(self.img_folder, ann_info['file_name'].replace('.png', '.jpg'))
ann_path = os.path.join(self.ann_folder, ann_info['file_name'])
img = Image.open(img_path).convert('RGB')
with PathManager.open(img_path, "rb") as f:
img = Image.open(f).convert('RGB')
w, h = img.size
if "segments_info" in ann_info:
masks = np.asarray(Image.open(ann_path), dtype=np.uint32)
with PathManager.open(ann_path, "rb") as f:
masks = np.asarray(Image.open(f), dtype=np.uint32)
masks = rgb2id(masks)
ids = np.array([ann['id'] for ann in ann_info['segments_info']])
......@@ -80,6 +84,15 @@ class CocoPanoptic:
def build(image_set, args):
if "manifold" in args.coco_path:
root = args.coco_path
PATHS = {
"train": (os.path.join(root, "coco_train2017"), "manifold://fair_vision_data/tree/detectron2/json_dataset_annotations/coco/panoptic_train2017.json"),
"val": (os.path.join(root, "coco_val2017"), "manifold://fair_vision_data/tree/detectron2/json_dataset_annotations/coco/panoptic_val2017.json"),
}
img_folder_path, ann_file = PATHS[image_set]
ann_folder = os.path.join(root, f"coco_panoptic_{image_set}2017")
else:
img_folder_root = Path(args.coco_path)
ann_folder_root = Path(args.coco_panoptic_path)
assert img_folder_root.exists(), f'provided COCO path {img_folder_root} does not exist'
......@@ -89,9 +102,9 @@ def build(image_set, args):
"train": ("train2017", Path("annotations") / f'{mode}_train2017.json'),
"val": ("val2017", Path("annotations") / f'{mode}_val2017.json'),
}
img_folder, ann_file = PATHS[image_set]
img_folder_path = img_folder_root / img_folder
ann_folder = ann_folder_root / f'{mode}_{img_folder}'
ann_file = ann_folder_root / ann_file
......
......@@ -4,7 +4,8 @@
import json
import os
import util.misc as utils
import detr.util.misc as utils
from detectron2.utils.file_io import PathManager
try:
from panopticapi.evaluation import pq_compute
......@@ -17,14 +18,14 @@ class PanopticEvaluator(object):
self.gt_json = ann_file
self.gt_folder = ann_folder
if utils.is_main_process():
if not os.path.exists(output_dir):
os.mkdir(output_dir)
if not PathManager.exists(output_dir):
PathManager.mkdir(output_dir)
self.output_dir = output_dir
self.predictions = []
def update(self, predictions):
for p in predictions:
with open(os.path.join(self.output_dir, p["file_name"]), "wb") as f:
with PathManager.open(os.path.join(self.output_dir, p["file_name"]), "wb") as f:
f.write(p.pop("png_string"))
self.predictions += predictions
......@@ -40,7 +41,7 @@ class PanopticEvaluator(object):
if utils.is_main_process():
json_data = {"annotations": self.predictions}
predictions_json = os.path.join(self.output_dir, "predictions.json")
with open(predictions_json, "w") as f:
with PathManager.open(predictions_json, "w") as f:
f.write(json.dumps(json_data))
return pq_compute(self.gt_json, predictions_json, gt_folder=self.gt_folder, pred_folder=self.output_dir)
return None
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Train and eval functions used in main.py
"""
import math
import os
import sys
from typing import Iterable
import torch
import detr.util.misc as utils
from detr.datasets.coco_eval import CocoEvaluator
from detr.datasets.panoptic_eval import PanopticEvaluator
def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
data_loader: Iterable, optimizer: torch.optim.Optimizer,
device: torch.device, epoch: int, max_norm: float = 0):
model.train()
criterion.train()
metric_logger = utils.MetricLogger(delimiter=" ")
metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}'))
header = 'Epoch: [{}]'.format(epoch)
print_freq = 10
for samples, targets in metric_logger.log_every(data_loader, print_freq, header):
samples = samples.to(device)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
outputs = model(samples)
loss_dict = criterion(outputs, targets)
weight_dict = criterion.weight_dict
losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict)
# reduce losses over all GPUs for logging purposes
loss_dict_reduced = utils.reduce_dict(loss_dict)
loss_dict_reduced_unscaled = {f'{k}_unscaled': v
for k, v in loss_dict_reduced.items()}
loss_dict_reduced_scaled = {k: v * weight_dict[k]
for k, v in loss_dict_reduced.items() if k in weight_dict}
losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())
loss_value = losses_reduced_scaled.item()
if not math.isfinite(loss_value):
print("Loss is {}, stopping training".format(loss_value))
print(loss_dict_reduced)
sys.exit(1)
optimizer.zero_grad()
losses.backward()
if max_norm > 0:
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
optimizer.step()
metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled)
metric_logger.update(class_error=loss_dict_reduced['class_error'])
metric_logger.update(lr=optimizer.param_groups[0]["lr"])
# gather the stats from all processes
metric_logger.synchronize_between_processes()
print("Averaged stats:", metric_logger)
return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
@torch.no_grad()
def evaluate(model, criterion, postprocessors, data_loader, base_ds, device, output_dir):
model.eval()
criterion.eval()
metric_logger = utils.MetricLogger(delimiter=" ")
metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}'))
header = 'Test:'
iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys())
coco_evaluator = CocoEvaluator(base_ds, iou_types)
# coco_evaluator.coco_eval[iou_types[0]].params.iouThrs = [0, 0.1, 0.5, 0.75]
panoptic_evaluator = None
if 'panoptic' in postprocessors.keys():
panoptic_evaluator = PanopticEvaluator(
data_loader.dataset.ann_file,
data_loader.dataset.ann_folder,
output_dir=os.path.join(output_dir, "panoptic_eval"),
)
for samples, targets in metric_logger.log_every(data_loader, 10, header):
samples = samples.to(device)
targets = [{k: v.to(device) for k, v in t.items()} for t in targets]
outputs = model(samples)
loss_dict = criterion(outputs, targets)
weight_dict = criterion.weight_dict
# reduce losses over all GPUs for logging purposes
loss_dict_reduced = utils.reduce_dict(loss_dict)
loss_dict_reduced_scaled = {k: v * weight_dict[k]
for k, v in loss_dict_reduced.items() if k in weight_dict}
loss_dict_reduced_unscaled = {f'{k}_unscaled': v
for k, v in loss_dict_reduced.items()}
metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()),
**loss_dict_reduced_scaled,
**loss_dict_reduced_unscaled)
metric_logger.update(class_error=loss_dict_reduced['class_error'])
orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0)
results = postprocessors['bbox'](outputs, orig_target_sizes)
if 'segm' in postprocessors.keys():
target_sizes = torch.stack([t["size"] for t in targets], dim=0)
results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes)
res = {target['image_id'].item(): output for target, output in zip(targets, results)}
if coco_evaluator is not None:
coco_evaluator.update(res)
if panoptic_evaluator is not None:
res_pano = postprocessors["panoptic"](outputs, target_sizes, orig_target_sizes)
for i, target in enumerate(targets):
image_id = target["image_id"].item()
file_name = f"{image_id:012d}.png"
res_pano[i]["image_id"] = image_id
res_pano[i]["file_name"] = file_name
panoptic_evaluator.update(res_pano)
# gather the stats from all processes
metric_logger.synchronize_between_processes()
print("Averaged stats:", metric_logger)
if coco_evaluator is not None:
coco_evaluator.synchronize_between_processes()
if panoptic_evaluator is not None:
panoptic_evaluator.synchronize_between_processes()
# accumulate predictions from all images
if coco_evaluator is not None:
coco_evaluator.accumulate()
coco_evaluator.summarize()
panoptic_res = None
if panoptic_evaluator is not None:
panoptic_res = panoptic_evaluator.summarize()
stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()}
if coco_evaluator is not None:
if 'bbox' in postprocessors.keys():
stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist()
if 'segm' in postprocessors.keys():
stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist()
if panoptic_res is not None:
stats['PQ_all'] = panoptic_res["All"]
stats['PQ_th'] = panoptic_res["Things"]
stats['PQ_st'] = panoptic_res["Stuff"]
return stats, coco_evaluator
......@@ -135,7 +135,7 @@ class Joiner(nn.Sequential):
def build_backbone(args):
position_embedding = build_position_encoding(args)
train_backbone = args.lr_backbone > 0
return_interm_layers = args.masks or (args.num_feature_levels > 1)
return_interm_layers = args.masks
backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation)
model = Joiner(backbone, position_embedding)
return model
......@@ -457,7 +457,8 @@ def interpolate(input, size=None, scale_factor=None, mode="nearest", align_corne
This will eventually be supported natively by PyTorch, and this
class can go away.
"""
if float(torchvision.__version__[:3]) < 0.7:
#if float(torchvision.__version__[:3]) < 0.7:
if LooseVersion(torchvision.__version__) < LooseVersion("0.7.0"):
if input.numel() > 0:
return torch.nn.functional.interpolate(
input, size, scale_factor, mode, align_corners
......
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import os
import logging
import argparse
import datetime
import json
import random
import time
from datetime import timedelta
from pathlib import Path
import numpy as np
import torch
from torch.utils.data import DataLoader, DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from detr import datasets
import detr.util.misc as utils
from detr.datasets import build_dataset, get_coco_api_from_dataset
from detr.engine import evaluate, train_one_epoch
from detr.models import build_model
from detectron2.utils.file_io import PathManager
from detectron2.engine.launch import _find_free_port
DEFAULT_TIMEOUT = timedelta(minutes=30)
def get_args_parser():
parser = argparse.ArgumentParser('Set transformer detector', add_help=False)
parser.add_argument('--lr', default=1e-4, type=float)
parser.add_argument('--lr_backbone', default=1e-5, type=float)
parser.add_argument('--batch_size', default=2, type=int)
parser.add_argument('--weight_decay', default=1e-4, type=float)
parser.add_argument('--epochs', default=300, type=int)
parser.add_argument('--lr_drop', default=200, type=int)
parser.add_argument('--clip_max_norm', default=0.1, type=float,
help='gradient clipping max norm')
# Model parameters
parser.add_argument('--frozen_weights', type=str, default=None,
help="Path to the pretrained model. If set, only the mask head will be trained")
# * Backbone
parser.add_argument('--backbone', default='resnet50', type=str,
help="Name of the convolutional backbone to use")
parser.add_argument('--dilation', action='store_true',
help="If true, we replace stride with dilation in the last convolutional block (DC5)")
parser.add_argument('--position_embedding', default='sine', type=str, choices=('sine', 'learned'),
help="Type of positional embedding to use on top of the image features")
# * Transformer
parser.add_argument('--enc_layers', default=6, type=int,
help="Number of encoding layers in the transformer")
parser.add_argument('--dec_layers', default=6, type=int,
help="Number of decoding layers in the transformer")
parser.add_argument('--dim_feedforward', default=2048, type=int,
help="Intermediate size of the feedforward layers in the transformer blocks")
parser.add_argument('--hidden_dim', default=256, type=int,
help="Size of the embeddings (dimension of the transformer)")
parser.add_argument('--dropout', default=0.1, type=float,
help="Dropout applied in the transformer")
parser.add_argument('--nheads', default=8, type=int,
help="Number of attention heads inside the transformer's attentions")
parser.add_argument('--num_queries', default=100, type=int,
help="Number of query slots")
parser.add_argument('--pre_norm', action='store_true')
# * Segmentation
parser.add_argument('--masks', action='store_true',
help="Train segmentation head if the flag is provided")
# Loss
parser.add_argument('--no_aux_loss', dest='aux_loss', action='store_false',
help="Disables auxiliary decoding losses (loss at each layer)")
# * Matcher
parser.add_argument('--set_cost_class', default=1, type=float,
help="Class coefficient in the matching cost")
parser.add_argument('--set_cost_bbox', default=5, type=float,
help="L1 box coefficient in the matching cost")
parser.add_argument('--set_cost_giou', default=2, type=float,
help="giou box coefficient in the matching cost")
# * Loss coefficients
parser.add_argument('--mask_loss_coef', default=1, type=float)
parser.add_argument('--dice_loss_coef', default=1, type=float)
parser.add_argument('--bbox_loss_coef', default=5, type=float)
parser.add_argument('--giou_loss_coef', default=2, type=float)
parser.add_argument('--eos_coef', default=0.1, type=float,
help="Relative classification weight of the no-object class")
# dataset parameters
parser.add_argument('--dataset_file', default='coco')
parser.add_argument('--ade_path', type=str, default='manifold://winvision/tree/detectron2/ADEChallengeData2016/')
parser.add_argument('--coco_path', type=str, default='manifold://fair_vision_data/tree/')
parser.add_argument('--coco_panoptic_path', type=str, default='manifold://fair_vision_data/tree/')
parser.add_argument('--remove_difficult', action='store_true')
parser.add_argument('--output-dir', default='',
help='path where to save, empty for no saving')
parser.add_argument('--device', default='cuda',
help='device to use for training / testing')
parser.add_argument('--seed', default=42, type=int)
parser.add_argument('--resume', default='', help='resume from checkpoint')
parser.add_argument('--start_epoch', default=0, type=int, metavar='N',
help='start epoch')
parser.add_argument('--eval', action='store_true')
parser.add_argument('--num_workers', default=2, type=int)
# distributed training parameters
parser.add_argument("--num-gpus", type=int, default=8, help="number of gpus *per machine*")
parser.add_argument("--num-machines", type=int, default=1, help="total number of machines")
parser.add_argument(
"--machine-rank", type=int, default=0, help="the rank of this machine (unique per machine)")
parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training')
return parser
def main(args):
#utils.init_distributed_mode(args)
if args.frozen_weights is not None:
assert args.masks, "Frozen training is meant for segmentation only"
print(args)
device = torch.device(args.device)
# fix the seed for reproducibility
seed = args.seed + utils.get_rank()
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
model, criterion, postprocessors = build_model(args)
model.to(device)
model_without_ddp = model
if args.distributed:
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
model_without_ddp = model.module
n_parameters = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('number of params:', n_parameters)
param_dicts = [
{"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" not in n and p.requires_grad]},
{
"params": [p for n, p in model_without_ddp.named_parameters() if "backbone" in n and p.requires_grad],
"lr": args.lr_backbone,
},
]
optimizer = torch.optim.AdamW(param_dicts, lr=args.lr,
weight_decay=args.weight_decay)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, args.lr_drop)
dataset_train = build_dataset(image_set='train', args=args)
dataset_val = build_dataset(image_set='val', args=args)
if args.distributed:
sampler_train = DistributedSampler(dataset_train)
sampler_val = DistributedSampler(dataset_val, shuffle=False)
else:
sampler_train = torch.utils.data.RandomSampler(dataset_train)
sampler_val = torch.utils.data.SequentialSampler(dataset_val)
batch_sampler_train = torch.utils.data.BatchSampler(
sampler_train, args.batch_size, drop_last=True)
data_loader_train = DataLoader(dataset_train, batch_sampler=batch_sampler_train,
collate_fn=utils.collate_fn, num_workers=args.num_workers)
data_loader_val = DataLoader(dataset_val, args.batch_size, sampler=sampler_val,
drop_last=False, collate_fn=utils.collate_fn, num_workers=args.num_workers)
if args.dataset_file == "coco_panoptic":
# We also evaluate AP during panoptic training, on original coco DS
coco_val = datasets.coco.build("val", args)
base_ds = get_coco_api_from_dataset(coco_val)
else:
base_ds = get_coco_api_from_dataset(dataset_val)
if args.frozen_weights is not None:
checkpoint = torch.load(args.frozen_weights, map_location='cpu')
model_without_ddp.detr.load_state_dict(checkpoint['model'])
if args.resume:
if args.resume.startswith('https'):
checkpoint = torch.hub.load_state_dict_from_url(
args.resume, map_location='cpu', check_hash=True)
else:
checkpoint = torch.load(args.resume, map_location='cpu')
model_without_ddp.load_state_dict(checkpoint['model'])
if not args.eval and 'optimizer' in checkpoint and 'lr_scheduler' in checkpoint and 'epoch' in checkpoint:
optimizer.load_state_dict(checkpoint['optimizer'])
lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
args.start_epoch = checkpoint['epoch'] + 1
if args.eval:
test_stats, coco_evaluator = evaluate(model, criterion, postprocessors,
data_loader_val, base_ds, device, args.output_dir)
if args.output_dir:
with PathManager.open(os.path.join(args.output_dir, "eval.pth"), "wb") as f:
utils.save_on_master(coco_evaluator.coco_eval["bbox"].eval, f)
return
print("Start training")
start_time = time.time()
for epoch in range(args.start_epoch, args.epochs):
if args.distributed:
sampler_train.set_epoch(epoch)
train_stats = train_one_epoch(
model, criterion, data_loader_train, optimizer, device, epoch,
args.clip_max_norm)
lr_scheduler.step()
if args.output_dir:
checkpoint_paths = [] #os.path.join(args.output_dir, 'checkpoint.pth')]
# extra checkpoint before LR drop and every 10 epochs
if (epoch + 1) % args.lr_drop == 0 or (epoch + 1) % 10 == 0:
checkpoint_paths.append(os.path.join(args.output_dir, f'checkpoint{epoch:04}.pth'))
for checkpoint_path in checkpoint_paths:
with PathManager.open(checkpoint_path, "wb") as f:
if args.gpu == 0 and args.machine_rank == 0:
utils.save_on_master({
'model': model_without_ddp.state_dict(),
'optimizer': optimizer.state_dict(),
'lr_scheduler': lr_scheduler.state_dict(),
'epoch': epoch,
'args': args,
}, f)
test_stats, coco_evaluator = evaluate(
model, criterion, postprocessors, data_loader_val, base_ds, device, args.output_dir
)
log_stats = {**{f'train_{k}': v for k, v in train_stats.items()},
**{f'test_{k}': v for k, v in test_stats.items()},
'epoch': epoch,
'n_parameters': n_parameters}
if args.output_dir and utils.is_main_process():
with PathManager.open(os.path.join(args.output_dir, "log.txt"), "w") as f:
f.write(json.dumps(log_stats) + "\n")
# for evaluation logs
if coco_evaluator is not None:
PathManager.mkdirs(os.path.join(args.output_dir, 'eval'))
if "bbox" in coco_evaluator.coco_eval:
filenames = ['latest.pth']
if epoch % 50 == 0:
filenames.append(f'{epoch:03}.pth')
for name in filenames:
with PathManager.open(os.path.join(args.output_dir, "eval", name), "wb") as f:
torch.save(coco_evaluator.coco_eval["bbox"].eval,
f)
total_time = time.time() - start_time
total_time_str = str(datetime.timedelta(seconds=int(total_time)))
print('Training time {}'.format(total_time_str))
def launch(
main_func,
num_gpus_per_machine,
num_machines=1,
machine_rank=0,
dist_url=None,
args=(),
timeout=DEFAULT_TIMEOUT,
):
"""
Launch multi-gpu or distributed training.
This function must be called on all machines involved in the training.
It will spawn child processes (defined by ``num_gpus_per_machine``) on each machine.
Args:
main_func: a function that will be called by `main_func(*args)`
num_gpus_per_machine (int): number of GPUs per machine
num_machines (int): the total number of machines
machine_rank (int): the rank of this machine
dist_url (str): url to connect to for distributed jobs, including protocol
e.g. "tcp://127.0.0.1:8686".
Can be set to "auto" to automatically select a free port on localhost
timeout (timedelta): timeout of the distributed workers
args (tuple): arguments passed to main_func
"""
world_size = num_machines * num_gpus_per_machine
args[0].distributed = world_size > 1
if args[0].distributed:
# https://github.com/pytorch/pytorch/pull/14391
# TODO prctl in spawned processes
if dist_url == "auto":
assert num_machines == 1, "dist_url=auto not supported in multi-machine jobs."
port = _find_free_port()
dist_url = f"tcp://127.0.0.1:{port}"
if num_machines > 1 and dist_url.startswith("file://"):
logger = logging.getLogger(__name__)
logger.warning(
"file:// is not a reliable init_method in multi-machine jobs. Prefer tcp://"
)
mp.spawn(
_distributed_worker,
nprocs=num_gpus_per_machine,
args=(
main_func,
world_size,
num_gpus_per_machine,
machine_rank,
dist_url,
args,
timeout,
),
daemon=False,
)
else:
main_func(*args)
def synchronize():
"""
Helper function to synchronize (barrier) among all processes when
using distributed training
"""
if not dist.is_available():
return
if not dist.is_initialized():
return
world_size = dist.get_world_size()
if world_size == 1:
return
dist.barrier()
def _distributed_worker(
local_rank,
main_func,
world_size,
num_gpus_per_machine,
machine_rank,
dist_url,
args,
timeout=DEFAULT_TIMEOUT,
):
assert torch.cuda.is_available(), "cuda is not available. Please check your installation."
global_rank = machine_rank * num_gpus_per_machine + local_rank
try:
dist.init_process_group(
backend="NCCL",
init_method=dist_url,
world_size=world_size,
rank=global_rank,
timeout=timeout,
)
except Exception as e:
logger = logging.getLogger(__name__)
logger.error("Process group URL: {}".format(dist_url))
raise e
# synchronize is needed here to prevent a possible timeout after calling init_process_group
# See: https://github.com/facebookresearch/maskrcnn-benchmark/issues/172
synchronize()
assert num_gpus_per_machine <= torch.cuda.device_count()
torch.cuda.set_device(local_rank)
args[0].gpu = local_rank
# Setup the local process group (which contains ranks within the same machine)
#assert comm._LOCAL_PROCESS_GROUP is None
#num_machines = world_size // num_gpus_per_machine
#for i in range(num_machines):
# ranks_on_i = list(range(i * num_gpus_per_machine, (i + 1) * num_gpus_per_machine))
# pg = dist.new_group(ranks_on_i)
# if i == machine_rank:
# comm._LOCAL_PROCESS_GROUP = pg
main_func(*args)
if __name__ == '__main__':
parser = argparse.ArgumentParser('DETR training and evaluation script', parents=[get_args_parser()])
args = parser.parse_args()
if args.output_dir:
PathManager.mkdirs(args.output_dir)
print("Command Line Args:", args)
launch(
main,
args.num_gpus,
num_machines=args.num_machines,
machine_rank=args.machine_rank,
dist_url=args.dist_url,
args=(args,),
)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment