Commit f23248c0 authored by facebook-github-bot

Initial commit

fbshipit-source-id: f4a8ba78691d8cf46e003ef0bd2e95f170932778
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import torch
import unittest
from d2go.optimizer import build_optimizer_mapper
import d2go.runner.default_runner as default_runner
class TestArch(torch.nn.Module):
def __init__(self):
super().__init__()
self.conv = torch.nn.Conv2d(3, 4, kernel_size=3, stride=1, padding=1)
self.bn = torch.nn.BatchNorm2d(4)
self.relu = torch.nn.ReLU(inplace=True)
self.avgpool = torch.nn.AdaptiveAvgPool2d((1, 1))
def forward(self, x):
ret = self.conv(x)
ret = self.bn(ret)
ret = self.relu(ret)
ret = self.avgpool(ret)
return ret
def _test_each_optimizer(cfg):
model = TestArch()
optimizer = build_optimizer_mapper(cfg, model)
    # standard training loop: zero the gradients at each step
    for _ in range(10):
        optimizer.zero_grad()
        x = torch.rand(1, 3, 24, 24)
        y = model(x)
        loss = y.mean()
        loss.backward()
        optimizer.step()
class TestOptimizer(unittest.TestCase):
    def test_all_optimizers(self):
runner = default_runner.Detectron2GoRunner()
cfg = runner.get_default_cfg()
multipliers = [None, [{'conv': 0.1}]]
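        # each multiplier dict presumably maps a module-name prefix to an lr
        # multiplier, e.g. {'conv': 0.1} scales the lr of TestArch.conv params by 0.1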
for optimizer_name in ["SGD", "AdamW"]:
for mult in multipliers:
cfg.SOLVER.OPTIMIZER = optimizer_name
cfg.SOLVER.MULTIPLIERS = mult
_test_each_optimizer(cfg)
def test_full_model_grad_clipping(self):
runner = default_runner.Detectron2GoRunner()
cfg = runner.get_default_cfg()
for optimizer_name in ["SGD", "AdamW"]:
cfg.SOLVER.CLIP_GRADIENTS.CLIP_VALUE = 0.2
cfg.SOLVER.CLIP_GRADIENTS.ENABLED = True
cfg.SOLVER.CLIP_GRADIENTS.CLIP_TYPE = "full_model"
cfg.SOLVER.OPTIMIZER = optimizer_name
_test_each_optimizer(cfg)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import unittest
import numpy as np
import torch
from detectron2.structures import Boxes
from d2go.tests import rcnn_helper as rh
class TestRCNNHelper(unittest.TestCase):
def test_get_instances_from_image(self):
boxes = Boxes(torch.Tensor([[50, 40, 100, 80], [150, 60, 200, 120]]))
gt_kpts = torch.Tensor([75, 60, 1.0] * 21 + [175, 90, 1.0] * 21).reshape(
2, 21, 3
)
batched_inputs = rh.get_batched_inputs(2, boxes=boxes)
instances = rh.get_detected_instances_from_image(batched_inputs)
self.assertEqual(len(instances), 2)
self.assertArrayEqual(instances[0].pred_boxes.tensor, boxes.tensor)
self.assertArrayEqual(instances[0].pred_keypoints, gt_kpts)
def test_get_instances_from_image_scale_image(self):
H, W = 398, 224
all_boxes = Boxes(torch.Tensor([[50, 40, 100, 80], [150, 60, 200, 120]]))
image = rh.get_batched_inputs(1, (H, W), (H, W), all_boxes)[0]["image"]
boxes = rh.get_detected_instances_from_image([{"image": image}])[0].pred_boxes
self.assertArrayEqual(boxes.tensor, all_boxes.tensor)
# scale image by 0.5
scale_image = torch.nn.functional.interpolate(
torch.unsqueeze(image, 0),
scale_factor=(0.5, 0.5),
mode="bilinear",
align_corners=False,
recompute_scale_factor=False,
)[0]
sub_boxes = rh.get_detected_instances_from_image([{"image": scale_image}])[
0
].pred_boxes
self.assertArrayEqual(sub_boxes.tensor, [[25, 20, 50, 40], [75, 30, 100, 60]])
# scale image by 0.75
scale_image = torch.nn.functional.interpolate(
torch.unsqueeze(image, 0),
scale_factor=(0.75, 0.75),
mode="bilinear",
align_corners=False,
recompute_scale_factor=False,
)[0]
sub_boxes = rh.get_detected_instances_from_image([{"image": scale_image}])[
0
].pred_boxes
        # exact scaled boxes are [[37.5, 30, 75, 60], [112.5, 45, 150, 90]];
        # fractional coordinates get truncated
        self.assertArrayEqual(sub_boxes.tensor, [[37, 30, 75, 60], [112, 45, 150, 90]])
def test_mock_rcnn_inference(self):
image_size = (1920, 1080)
resize_size = (398, 224)
scale_xy = (1080.0 / 224, 1920.0 / 398)
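        # per-axis (x, y) factors mapping the resized image (W=224, H=398) back
        # to the original (W=1080, H=1920); the sizes above are given as (H, W)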
gt_boxes = Boxes(torch.Tensor([[50, 40, 100, 80], [150, 60, 200, 120]]))
gt_kpts = torch.Tensor([75, 60, 1.0] * 21 + [175, 90, 1.0] * 21).reshape(
2, 21, 3
)
# create inputs
batched_inputs = rh.get_batched_inputs(2, image_size, resize_size, gt_boxes)
# create model
model = rh.MockRCNNInference(image_size, resize_size)
# run without post processing
det_instances = model(batched_inputs, None, do_postprocess=False)
self.assertArrayAllClose(
det_instances[0].pred_boxes.tensor,
gt_boxes.tensor,
atol=1e-4,
)
self.assertArrayAllClose(
det_instances[0].pred_keypoints,
gt_kpts,
atol=1e-4,
)
# run with post processing
det_instances = model(batched_inputs, None, do_postprocess=True)
gt_boxes_scaled = gt_boxes.clone()
gt_boxes_scaled.scale(*scale_xy)
gt_kpts_scaled = torch.Tensor(
[75 * scale_xy[0], 60 * scale_xy[1], 1.0] * 21
+ [175 * scale_xy[0], 90 * scale_xy[1], 1.0] * 21
).reshape(2, 21, 3)
self.assertArrayAllClose(
det_instances[0]["instances"].pred_boxes.tensor,
gt_boxes_scaled.tensor,
atol=1e-4,
)
self.assertArrayAllClose(
det_instances[0]["instances"].pred_keypoints,
gt_kpts_scaled,
atol=1e-4,
)
def assertArrayEqual(self, a1, a2):
self.assertTrue(np.array_equal(a1, a2))
def assertArrayAllClose(self, a1, a2, rtol=1.0e-5, atol=1.0e-8):
self.assertTrue(np.allclose(a1, a2, rtol=rtol, atol=atol))
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import unittest
import torch
from detectron2.modeling import build_anchor_generator, build_backbone
from detectron2.modeling.proposal_generator import rpn
from d2go.runner import GeneralizedRCNNRunner
logger = logging.getLogger(__name__)
# overwrite configs if specified; otherwise the default config is used
RPN_CFGS = {}
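# an entry maps an RPN head name to a config file, e.g. (hypothetical path):
# RPN_CFGS = {"StandardRPNHead": "configs/custom_rpn.yaml"}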
class TestRPNHeads(unittest.TestCase):
def test_build_rpn_heads(self):
""" Make sure rpn heads run """
self.assertGreater(len(rpn.RPN_HEAD_REGISTRY._obj_map), 0)
for name, builder in rpn.RPN_HEAD_REGISTRY._obj_map.items():
logger.info("Testing {}...".format(name))
cfg = GeneralizedRCNNRunner().get_default_cfg()
if name in RPN_CFGS:
cfg.merge_from_file(RPN_CFGS[name])
backbone = build_backbone(cfg)
backbone_shape = backbone.output_shape()
rpn_input_shape = [backbone_shape[x] for x in cfg.MODEL.RPN.IN_FEATURES]
rpn_head = builder(cfg, rpn_input_shape)
in_channels = list(backbone_shape.values())[0].channels
num_anchors = build_anchor_generator(cfg, rpn_input_shape).num_cell_anchors[
0
]
N, C_in, H, W = 2, in_channels, 24, 32
input = torch.rand([N, C_in, H, W], dtype=torch.float32)
LAYERS = len(cfg.MODEL.RPN.IN_FEATURES)
out = rpn_head([input] * LAYERS)
self.assertEqual(len(out), 2)
logits, bbox_reg = out
for idx in range(LAYERS):
self.assertEqual(
logits[idx].shape,
torch.Size(
[input.shape[0], num_anchors, input.shape[2], input.shape[3]]
),
)
self.assertEqual(
bbox_reg[idx].shape,
torch.Size(
[
logits[idx].shape[0],
num_anchors * 4,
logits[idx].shape[2],
logits[idx].shape[3],
]
),
)
def test_build_rpn_heads_with_rotated_anchor_generator(self):
""" Make sure rpn heads work with rotated anchor generator"""
self.assertGreater(len(rpn.RPN_HEAD_REGISTRY._obj_map), 0)
for name, builder in rpn.RPN_HEAD_REGISTRY._obj_map.items():
logger.info("Testing {}...".format(name))
cfg = GeneralizedRCNNRunner().get_default_cfg()
if name in RPN_CFGS:
cfg.merge_from_file(RPN_CFGS[name])
cfg.MODEL.ANCHOR_GENERATOR.NAME = "RotatedAnchorGenerator"
backbone = build_backbone(cfg)
backbone_shape = backbone.output_shape()
rpn_input_shape = [backbone_shape[x] for x in cfg.MODEL.RPN.IN_FEATURES]
rpn_head = builder(cfg, rpn_input_shape)
in_channels = list(backbone_shape.values())[0].channels
anchor_generator = build_anchor_generator(cfg, rpn_input_shape)
num_anchors = anchor_generator.num_cell_anchors[0]
box_dim = anchor_generator.box_dim
N, C_in, H, W = 2, in_channels, 24, 32
input = torch.rand([N, C_in, H, W], dtype=torch.float32)
LAYERS = len(cfg.MODEL.RPN.IN_FEATURES)
out = rpn_head([input] * LAYERS)
self.assertEqual(len(out), 2)
logits, bbox_reg = out
for idx in range(LAYERS):
self.assertEqual(
logits[idx].shape,
torch.Size(
[input.shape[0], num_anchors, input.shape[2], input.shape[3]]
),
)
self.assertEqual(
bbox_reg[idx].shape,
torch.Size(
[
logits[idx].shape[0],
num_anchors * box_dim,
logits[idx].shape[2],
logits[idx].shape[3],
]
),
)
if __name__ == "__main__":
unittest.main()
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import copy
import os
import tempfile
import unittest
import d2go.runner.default_runner as default_runner
import torch
from detectron2.evaluation import COCOEvaluator, RotatedCOCOEvaluator
from detectron2.modeling import META_ARCH_REGISTRY
from detectron2.structures import Boxes, ImageList, Instances
from mobile_cv.arch.quantization.qconfig import (
updateable_symmetric_moving_avg_minmax_config,
)
from torch.nn.parallel import DistributedDataParallel
from d2go.tests import helper
from d2go.tests.data_loader_helper import create_local_dataset
@META_ARCH_REGISTRY.register()
class MetaArchForTest(torch.nn.Module):
def __init__(self, cfg):
super().__init__()
self.conv = torch.nn.Conv2d(3, 4, kernel_size=3, stride=1, padding=1)
self.bn = torch.nn.BatchNorm2d(4)
self.relu = torch.nn.ReLU(inplace=True)
self.avgpool = torch.nn.AdaptiveAvgPool2d((1, 1))
@property
def device(self):
return self.conv.weight.device
def forward(self, inputs):
if not self.training:
return self.inference(inputs)
images = [x["image"] for x in inputs]
images = ImageList.from_tensors(images, 1)
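        # the second argument is size_divisibility; 1 means no extra padding alignment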
ret = self.conv(images.tensor)
ret = self.bn(ret)
ret = self.relu(ret)
ret = self.avgpool(ret)
return {"loss": ret.norm()}
def inference(self, inputs):
instance = Instances((10, 10))
instance.pred_boxes = Boxes(torch.tensor([[2.5, 2.5, 7.5, 7.5]]))
instance.scores = torch.tensor([0.9])
instance.pred_classes = torch.tensor([1], dtype=torch.int32)
ret = [{"instances": instance}]
return ret
@META_ARCH_REGISTRY.register()
class MetaArchForTestSingleValue(torch.nn.Module):
def __init__(self, cfg):
super().__init__()
self.scale_weight = torch.nn.Parameter(torch.Tensor([1.0]))
@property
def device(self):
return self.scale_weight.device
def forward(self, inputs):
if not self.training:
return self.inference(inputs)
ret = {"loss": self.scale_weight.norm() * 10.0}
print(self.scale_weight)
print(ret)
return ret
def inference(self, inputs):
instance = Instances((10, 10))
instance.pred_boxes = Boxes(
torch.tensor([[2.5, 2.5, 7.5, 7.5]], device=self.device) * self.scale_weight
)
instance.scores = torch.tensor([0.9])
instance.pred_classes = torch.tensor([1], dtype=torch.int32)
ret = [{"instances": instance}]
return ret
def _get_cfg(runner, output_dir, dataset_name):
cfg = runner.get_default_cfg()
cfg.MODEL.DEVICE = "cpu"
cfg.MODEL.META_ARCHITECTURE = "MetaArchForTest"
cfg.DATASETS.TRAIN = (dataset_name,)
cfg.DATASETS.TEST = (dataset_name,)
cfg.INPUT.MIN_SIZE_TRAIN = (10,)
cfg.INPUT.MIN_SIZE_TEST = (10,)
cfg.SOLVER.MAX_ITER = 5
cfg.SOLVER.STEPS = []
cfg.SOLVER.WARMUP_ITERS = 1
cfg.SOLVER.CHECKPOINT_PERIOD = 1
cfg.SOLVER.IMS_PER_BATCH = 2
cfg.OUTPUT_DIR = output_dir
return cfg
class TestDefaultRunner(unittest.TestCase):
def test_d2go_runner_build_model(self):
with tempfile.TemporaryDirectory() as tmp_dir:
ds_name = create_local_dataset(tmp_dir, 5, 10, 10)
runner = default_runner.Detectron2GoRunner()
cfg = _get_cfg(runner, tmp_dir, ds_name)
model = runner.build_model(cfg)
dl = runner.build_detection_train_loader(cfg)
batch = next(iter(dl))
output = model(batch)
self.assertIsInstance(output, dict)
model.eval()
output = model(batch)
self.assertIsInstance(output, list)
default_runner._close_all_tbx_writers()
def test_d2go_runner_train(self):
with tempfile.TemporaryDirectory() as tmp_dir:
ds_name = create_local_dataset(tmp_dir, 5, 10, 10)
runner = default_runner.Detectron2GoRunner()
cfg = _get_cfg(runner, tmp_dir, ds_name)
model = runner.build_model(cfg)
runner.do_train(cfg, model, resume=True)
final_model_path = os.path.join(tmp_dir, "model_final.pth")
self.assertTrue(os.path.isfile(final_model_path))
default_runner._close_all_tbx_writers()
def test_d2go_runner_test(self):
with tempfile.TemporaryDirectory() as tmp_dir:
ds_name = create_local_dataset(tmp_dir, 5, 10, 10)
runner = default_runner.Detectron2GoRunner()
cfg = _get_cfg(runner, tmp_dir, ds_name)
model = runner.build_model(cfg)
results = runner.do_test(cfg, model)
self.assertEqual(results["default"][ds_name]["bbox"]["AP"], 10.0)
default_runner._close_all_tbx_writers()
def test_d2go_build_evaluator(self):
for rotated, evaluator in [
(True, RotatedCOCOEvaluator),
(False, COCOEvaluator),
]:
with tempfile.TemporaryDirectory() as tmp_dir:
ds_name = create_local_dataset(tmp_dir, 5, 10, 10, is_rotated=rotated)
runner = default_runner.Detectron2GoRunner()
cfg = _get_cfg(runner, tmp_dir, ds_name)
ds_evaluators = runner.get_evaluator(cfg, ds_name, tmp_dir)
                self.assertIsInstance(ds_evaluators._evaluators[0], evaluator)
@helper.enable_ddp_env
def test_d2go_runner_ema(self):
with tempfile.TemporaryDirectory() as tmp_dir:
ds_name = create_local_dataset(tmp_dir, 5, 10, 10)
runner = default_runner.Detectron2GoRunner()
cfg = _get_cfg(runner, tmp_dir, ds_name)
cfg.MODEL.META_ARCHITECTURE = "MetaArchForTestSingleValue"
cfg.MODEL_EMA.ENABLED = True
cfg.MODEL_EMA.DECAY = 0.9
def _run_train(cfg):
cfg = copy.deepcopy(cfg)
model = runner.build_model(cfg)
model = DistributedDataParallel(model, broadcast_buffers=False)
runner.do_train(cfg, model, True)
final_model_path = os.path.join(tmp_dir, "model_final.pth")
trained_weights = torch.load(final_model_path)
self.assertIn("ema_state", trained_weights)
default_runner._close_all_tbx_writers()
return final_model_path, model.module.ema_state
def _run_test(cfg, final_path, gt_ema):
cfg = copy.deepcopy(cfg)
cfg.MODEL.WEIGHTS = final_path
model = runner.build_model(cfg, eval_only=True)
self.assertGreater(len(model.ema_state.state), 0)
self.assertEqual(len(model.ema_state.state), len(gt_ema.state))
self.assertTrue(
_compare_state_dict(
model.ema_state.state_dict(), gt_ema.state_dict()
)
)
results = runner.do_test(cfg, model)
self.assertEqual(results["default"][ds_name]["bbox"]["AP"], 3.0)
self.assertEqual(results["ema"][ds_name]["bbox"]["AP"], 9.0)
default_runner._close_all_tbx_writers()
def _run_build_model_with_ema_weight(cfg, final_path, gt_ema):
cfg = copy.deepcopy(cfg)
cfg.MODEL.WEIGHTS = final_path
cfg.MODEL_EMA.USE_EMA_WEIGHTS_FOR_EVAL_ONLY = True
model = runner.build_model(cfg, eval_only=True)
self.assertTrue(
_compare_state_dict(model.state_dict(), gt_ema.state_dict())
)
final_model_path, gt_ema = _run_train(cfg)
_run_test(cfg, final_model_path, gt_ema)
_run_build_model_with_ema_weight(cfg, final_model_path, gt_ema)
def test_d2go_runner_train_qat_hook_update_stat(self):
"""Check that the qat hook is used and updates stats"""
@META_ARCH_REGISTRY.register()
class MetaArchForTestQAT(MetaArchForTest):
def prepare_for_quant(self, cfg):
"""Set the qconfig to updateable observers"""
self.qconfig = updateable_symmetric_moving_avg_minmax_config
return self
def setup(tmp_dir):
ds_name = create_local_dataset(tmp_dir, 5, 10, 10)
runner = default_runner.Detectron2GoRunner()
cfg = _get_cfg(runner, tmp_dir, ds_name)
cfg.merge_from_list(
(
["MODEL.META_ARCHITECTURE", "MetaArchForTestQAT"]
+ ["QUANTIZATION.QAT.ENABLED", "True"]
+ ["QUANTIZATION.QAT.START_ITER", "0"]
+ ["QUANTIZATION.QAT.ENABLE_OBSERVER_ITER", "0"]
)
)
return runner, cfg
# check observers have not changed their minmax vals (stats changed)
with tempfile.TemporaryDirectory() as tmp_dir:
runner, cfg = setup(tmp_dir)
model = runner.build_model(cfg)
runner.do_train(cfg, model, resume=True)
observer = model.conv.activation_post_process.activation_post_process
self.assertEqual(observer.min_val, torch.tensor(float("inf")))
self.assertEqual(observer.max_val, torch.tensor(float("-inf")))
self.assertNotEqual(observer.max_stat, torch.tensor(float("inf")))
# check observer does not change if period is > max_iter
with tempfile.TemporaryDirectory() as tmp_dir:
runner, cfg = setup(tmp_dir)
cfg.merge_from_list(
(
["QUANTIZATION.QAT.UPDATE_OBSERVER_STATS_PERIODICALLY", "True"]
+ ["QUANTIZATION.QAT.UPDATE_OBSERVER_STATS_PERIOD", "10"]
)
)
model = runner.build_model(cfg)
runner.do_train(cfg, model, resume=True)
observer = model.conv.activation_post_process.activation_post_process
self.assertEqual(observer.min_val, torch.tensor(float("inf")))
self.assertEqual(observer.max_val, torch.tensor(float("-inf")))
self.assertNotEqual(observer.max_stat, torch.tensor(float("inf")))
# check observer changes if period < max_iter
with tempfile.TemporaryDirectory() as tmp_dir:
runner, cfg = setup(tmp_dir)
cfg.merge_from_list(
(
["QUANTIZATION.QAT.UPDATE_OBSERVER_STATS_PERIODICALLY", "True"]
+ ["QUANTIZATION.QAT.UPDATE_OBSERVER_STATS_PERIOD", "1"]
)
)
model = runner.build_model(cfg)
runner.do_train(cfg, model, resume=True)
observer = model.conv.activation_post_process.activation_post_process
self.assertNotEqual(observer.min_val, torch.tensor(float("inf")))
self.assertNotEqual(observer.max_val, torch.tensor(float("-inf")))
self.assertNotEqual(observer.max_stat, torch.tensor(float("inf")))
default_runner._close_all_tbx_writers()
def test_d2go_runner_train_qat(self):
"""Make sure QAT runs"""
@META_ARCH_REGISTRY.register()
class MetaArchForTestQAT1(torch.nn.Module):
def __init__(self, cfg):
super().__init__()
self.conv = torch.nn.Conv2d(3, 4, kernel_size=3, stride=1, padding=1)
@property
def device(self):
return self.conv.weight.device
def forward(self, inputs):
images = [x["image"] for x in inputs]
images = ImageList.from_tensors(images, 1)
ret = self.conv(images.tensor)
losses = {"loss": ret.norm()}
# run the same conv again
ret1 = self.conv(images.tensor)
losses["ret1"] = ret1.norm()
return losses
def setup(tmp_dir, backend):
ds_name = create_local_dataset(tmp_dir, 5, 10, 10)
runner = default_runner.Detectron2GoRunner()
cfg = _get_cfg(runner, tmp_dir, ds_name)
cfg.merge_from_list(
(
["MODEL.META_ARCHITECTURE", "MetaArchForTestQAT1"]
+ ["QUANTIZATION.QAT.ENABLED", "True"]
+ ["QUANTIZATION.QAT.START_ITER", "0"]
+ ["QUANTIZATION.QAT.ENABLE_OBSERVER_ITER", "0"]
+ ["QUANTIZATION.BACKEND", backend]
)
)
return runner, cfg
for backend in ["fbgemm", "qnnpack"]:
with tempfile.TemporaryDirectory() as tmp_dir:
runner, cfg = setup(tmp_dir, backend=backend)
model = runner.build_model(cfg)
runner.do_train(cfg, model, resume=True)
default_runner._close_all_tbx_writers()
def _compare_state_dict(sd1, sd2, abs_error=1e-3):
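    # returns True iff both state dicts have identical keys, float32 tensors
    # match within abs_error, and all other entries match exactly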
if len(sd1) != len(sd2):
return False
if set(sd1.keys()) != set(sd2.keys()):
return False
for name in sd1:
if sd1[name].dtype == torch.float32:
if torch.abs((sd1[name] - sd2[name])).max() > abs_error:
return False
elif (sd1[name] != sd2[name]).any():
return False
return True
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import os
import tempfile
import unittest
from copy import deepcopy
from typing import Dict
import d2go.runner.default_runner as default_runner
import pytorch_lightning as pl # type: ignore
import torch
from d2go.config import CfgNode
from d2go.runner.lightning_task import GeneralizedRCNNTask
from detectron2.utils.events import EventStorage
from torch import Tensor
from d2go.tests import meta_arch_helper as mah
OSSRUN = os.getenv("OSSRUN") == "1"
class TestLightningTask(unittest.TestCase):
def _get_cfg(self, tmp_dir: str) -> CfgNode:
runner = default_runner.Detectron2GoRunner()
cfg = mah.create_detection_cfg(runner, tmp_dir)
cfg.TEST.EVAL_PERIOD = cfg.SOLVER.MAX_ITER
return cfg
def _compare_state_dict(
self, state1: Dict[str, Tensor], state2: Dict[str, Tensor]
) -> bool:
if state1.keys() != state2.keys():
return False
for k in state1:
if not torch.allclose(state1[k], state2[k]):
return False
return True
@unittest.skipIf(OSSRUN, "not supported yet")
def test_load_from_checkpoint(self) -> None:
with tempfile.TemporaryDirectory() as tmp_dir:
task = GeneralizedRCNNTask(self._get_cfg(tmp_dir))
from stl.lightning.callbacks.model_checkpoint import ModelCheckpoint
checkpoint_callback = ModelCheckpoint(
directory=task.cfg.OUTPUT_DIR, has_user_data=False
)
params = {
"max_steps": 1,
"limit_train_batches": 1,
"num_sanity_val_steps": 0,
"checkpoint_callback": checkpoint_callback,
}
trainer = pl.Trainer(**params)
with EventStorage() as storage:
task.storage = storage
trainer.fit(task)
ckpt_path = os.path.join(tmp_dir, "test.ckpt")
trainer.save_checkpoint(ckpt_path)
self.assertTrue(os.path.exists(ckpt_path))
# load model weights from checkpoint
task2 = GeneralizedRCNNTask.load_from_checkpoint(ckpt_path)
self.assertTrue(
self._compare_state_dict(
task.model.state_dict(), task2.model.state_dict()
)
)
def test_train_ema(self):
with tempfile.TemporaryDirectory() as tmp_dir:
cfg = self._get_cfg(tmp_dir)
cfg.MODEL_EMA.ENABLED = True
cfg.MODEL_EMA.DECAY = 0.7
task = GeneralizedRCNNTask(cfg)
init_state = deepcopy(task.model.state_dict())
trainer = pl.Trainer(
max_steps=1,
limit_train_batches=1,
num_sanity_val_steps=0,
)
with EventStorage() as storage:
task.storage = storage
trainer.fit(task)
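            # expected EMA after a single training step:
            # ema = decay * init + (1 - decay) * trained, with decay = 0.7 as set above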
for k, v in task.model.state_dict().items():
init_state[k].copy_(init_state[k] * 0.7 + 0.3 * v)
self.assertTrue(
self._compare_state_dict(init_state, task.ema_state.state_dict())
)
@unittest.skipIf(OSSRUN, "not supported yet")
def test_load_ema_weights(self):
with tempfile.TemporaryDirectory() as tmp_dir:
cfg = self._get_cfg(tmp_dir)
cfg.MODEL_EMA.ENABLED = True
task = GeneralizedRCNNTask(cfg)
from stl.lightning.callbacks.model_checkpoint import ModelCheckpoint
checkpoint_callback = ModelCheckpoint(
directory=task.cfg.OUTPUT_DIR, save_last=True
)
trainer = pl.Trainer(
max_steps=1,
limit_train_batches=1,
num_sanity_val_steps=0,
callbacks=[checkpoint_callback],
)
with EventStorage() as storage:
task.storage = storage
trainer.fit(task)
# load EMA weights from checkpoint
task2 = GeneralizedRCNNTask.load_from_checkpoint(os.path.join(tmp_dir, "last.ckpt"))
self.assertTrue(self._compare_state_dict(task.ema_state.state_dict(), task2.ema_state.state_dict()))
# apply EMA weights to model
task2.ema_state.apply_to(task2.model)
self.assertTrue(self._compare_state_dict(task.ema_state.state_dict(), task2.model.state_dict()))
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import unittest
from d2go.initializer import (
REGISTER_D2_DATASETS_TIME,
REGISTER_TIME,
SETUP_ENV_TIME,
)
class TestStartupTime(unittest.TestCase):
@unittest.skipIf(True, "Will exceed threshold")
def test_setup_env_time(self):
self.assertLess(sum(SETUP_ENV_TIME), 5.0)
def test_register_d2_datasets_time(self):
self.assertLess(sum(REGISTER_D2_DATASETS_TIME), 3.0)
@unittest.skipIf(True, "Will exceed threshold")
def test_register_time(self):
        # NOTE: _register should be done quickly; currently it takes about 0.2s
self.assertLess(sum(REGISTER_TIME), 1.0)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import tempfile
import unittest
import glob
import d2go.utils.abnormal_checker as ac
import torch
class Model(torch.nn.Module):
def forward(self, x):
return {"loss": x}
class TestUtilsAbnormalChecker(unittest.TestCase):
def test_utils_abnormal_checker(self):
counter = 0
def _writer(all_data):
nonlocal counter
counter += 1
checker = ac.AbnormalLossChecker(-1, writers=[_writer])
losses = [5, 4, 3, 10, 9, 2, 5, 4]
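        # the jumps at indices 3 (3 -> 10) and 6 (2 -> 5) presumably trip the
        # checker, so the writer should fire exactly twice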
for loss in losses:
checker.check_step({"loss": loss})
self.assertEqual(counter, 2)
def test_utils_abnormal_checker_wrapper(self):
model = Model()
with tempfile.TemporaryDirectory() as tmp_dir:
checker = ac.AbnormalLossChecker(
-1, writers=[ac.FileWriter(tmp_dir)]
)
cmodel = ac.AbnormalLossCheckerWrapper(model, checker)
losses = [5, 4, 3, 10, 9, 2, 5, 4]
for loss in losses:
cur = cmodel(loss)
cur_gt = model(loss)
self.assertEqual(cur, cur_gt)
log_files = glob.glob(f"{tmp_dir}/*.pth")
self.assertEqual(len(log_files), 2)
GT_INVALID_INDICES = [3, 6]
logged_indices = []
for cur_log_file in log_files:
cur_log = torch.load(cur_log_file, map_location="cpu")
self.assertIsInstance(cur_log, dict)
self.assertIn("data", cur_log)
logged_indices.append(cur_log["step"])
self.assertSetEqual(set(logged_indices), set(GT_INVALID_INDICES))
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import unittest
from pathlib import Path
from d2go.utils.validation_monitor import fetch_checkpoints_till_final
from fvcore.common.file_io import PathManager
from mobile_cv.common.misc.file_utils import make_temp_directory
from pytorch_lightning.callbacks.model_checkpoint import ModelCheckpoint
def create_file(filename):
with PathManager.open(filename, "w") as _:
pass
class TestValidationMonitor(unittest.TestCase):
def test_fetch_checkpoints_local(self):
with make_temp_directory("test") as output_dir:
output_dir = Path(output_dir)
for i in range(5):
create_file(output_dir / f"model_{i}.pth")
create_file(output_dir / "model_final.pth")
checkpoints = list(fetch_checkpoints_till_final(output_dir))
            self.assertEqual(len(checkpoints), 6)
def test_fetch_lightning_checkpoints_local(self):
with make_temp_directory("test") as output_dir:
output_dir = Path(output_dir)
ext = ModelCheckpoint.FILE_EXTENSION
for i in range(5):
create_file(output_dir / f"step={i}{ext}")
create_file(output_dir / f"model_final{ext}")
create_file(output_dir / f"{ModelCheckpoint.CHECKPOINT_NAME_LAST}{ext}")
checkpoints = list(fetch_checkpoints_till_final(output_dir))
self.assertEqual(len(checkpoints), 6)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Binary to evaluate predictor-based model (consist of models in deployable format such
torchscript, caffe2, etc.) using Detectron2Go system (dataloading, evaluation, etc).
"""
import torch
import logging
from d2go.distributed import launch
from d2go.setup import (
basic_argument_parser,
caffe2_global_init,
post_mortem_if_fail_for_main,
prepare_for_launch,
setup_after_launch,
)
from d2go.utils.misc import print_metrics_table
from mobile_cv.common.misc.py import post_mortem_if_fail
from mobile_cv.predictor.api import create_predictor
logger = logging.getLogger("d2go.tools.caffe2_evaluator")
def main(
cfg,
output_dir,
runner,
# binary specific optional arguments
predictor_path,
num_threads=None,
caffe2_engine=None,
caffe2_logging_print_net_summary=0,
):
torch.backends.quantized.engine = cfg.QUANTIZATION.BACKEND
print("run with quantized engine: ", torch.backends.quantized.engine)
setup_after_launch(cfg, output_dir, runner)
caffe2_global_init(caffe2_logging_print_net_summary, num_threads)
predictor = create_predictor(predictor_path)
metrics = runner.do_test(cfg, predictor)
print_metrics_table(metrics)
return {
"accuracy": metrics,
"metrics": metrics,
}
@post_mortem_if_fail()
def run_with_cmdline_args(args):
cfg, output_dir, runner = prepare_for_launch(args)
launch(
post_mortem_if_fail_for_main(main),
args.num_processes,
num_machines=args.num_machines,
machine_rank=args.machine_rank,
dist_url=args.dist_url,
backend="GLOO",
always_spawn=False,
args=(
cfg,
output_dir,
runner,
# binary specific optional arguments
args.predictor_path,
args.num_threads,
args.caffe2_engine,
args.caffe2_logging_print_net_summary,
),
)
if __name__ == "__main__":
parser = basic_argument_parser()
parser.add_argument(
"--predictor-path",
type=str,
help="Path (a directory) to the exported model that will be evaluated",
)
# === performance config ===========================================================
parser.add_argument(
"--num-threads",
type=int,
default=None,
help="Number of omp/mkl threads (per process) to use in Caffe2's GlobalInit",
)
parser.add_argument(
"--caffe2-engine",
type=str,
default=None,
help="If set, engine of all ops will be set by this value",
)
parser.add_argument(
"--caffe2_logging_print_net_summary",
type=int,
default=0,
help="Control the --caffe2_logging_print_net_summary in GlobalInit",
)
run_with_cmdline_args(parser.parse_args())
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Binary to convert pytorch detectron2go model to a predictor, which contains model(s) in
deployable format (such as torchscript, caffe2, ...)
"""
import copy
import logging
import typing
import mobile_cv.lut.lib.pt.flops_utils as flops_utils
from d2go.config import temp_defrost
from d2go.export.api import convert_and_export_predictor
from d2go.setup import (
basic_argument_parser,
prepare_for_launch,
setup_after_launch,
)
from mobile_cv.common.misc.py import post_mortem_if_fail
logger = logging.getLogger("d2go.tools.export")
def main(
cfg,
output_dir,
runner,
# binary specific optional arguments
predictor_types: typing.List[str],
compare_accuracy: bool = False,
skip_if_fail: bool = False,
):
cfg = copy.deepcopy(cfg)
setup_after_launch(cfg, output_dir, runner)
with temp_defrost(cfg):
cfg.merge_from_list(["MODEL.DEVICE", "cpu"])
model = runner.build_model(cfg, eval_only=True)
    # NOTE: the train dataset is used to avoid leakage, since the data might be
    # used for running calibration during quantization; the test loader is used
    # to make sure inference behaviour is followed (augmentation is not applied).
    dataset = cfg.DATASETS.TRAIN[0]
    data_loader = runner.build_detection_test_loader(cfg, dataset)
logger.info("Running the pytorch model and print FLOPS ...")
first_batch = next(iter(data_loader))
input_args = (first_batch,)
flops_utils.print_model_flops(model, input_args)
predictor_paths: typing.Dict[str, str] = {}
for typ in predictor_types:
# convert_and_export_predictor might alter the model, copy before calling it
pytorch_model = copy.deepcopy(model)
try:
predictor_path = convert_and_export_predictor(
cfg, pytorch_model, typ, output_dir, data_loader
)
logger.info(f"Predictor type {typ} has been exported to {predictor_path}")
predictor_paths[typ] = predictor_path
except Exception as e:
logger.warning(f"Export {typ} predictor failed: {e}")
if not skip_if_fail:
raise e
ret = {"predictor_paths": predictor_paths, "accuracy_comparison": {}}
if compare_accuracy:
raise NotImplementedError()
# NOTE: dict for metrics of all exported models (and original pytorch model)
# ret["accuracy_comparison"] = accuracy_comparison
return ret
@post_mortem_if_fail()
def run_with_cmdline_args(args):
cfg, output_dir, runner = prepare_for_launch(args)
return main(
cfg,
output_dir,
runner,
# binary specific optional arguments
predictor_types=args.predictor_types,
compare_accuracy=args.compare_accuracy,
skip_if_fail=args.skip_if_fail,
)
def get_parser():
parser = basic_argument_parser(distributed=False)
parser.add_argument(
"--predictor-types",
type=str,
nargs="+",
help="List of strings specify the types of predictors to export",
)
parser.add_argument(
"--compare-accuracy",
action="store_true",
help="If true, all exported models and the original pytorch model will be"
" evaluted on cfg.DATASETS.TEST",
)
parser.add_argument(
"--skip-if-fail",
action="store_true",
default=False,
help="If set, suppress the exception for failed exporting and continue to"
" export the next type of model",
)
return parser
def cli():
run_with_cmdline_args(get_parser().parse_args())
if __name__ == "__main__":
cli()
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
import logging
import os
from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Type
import pytorch_lightning as pl # type: ignore
from d2go.config import CfgNode, temp_defrost
from d2go.runner import get_class
from d2go.runner.lightning_task import GeneralizedRCNNTask
from d2go.setup import basic_argument_parser
from d2go.utils.misc import dump_trained_model_configs
from detectron2.utils.events import EventStorage
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.callbacks import LearningRateMonitor
from pytorch_lightning.loggers import TensorBoardLogger
from stl.lightning.callbacks.model_checkpoint import ModelCheckpoint
from stl.lightning.callbacks.quantization import QuantizationAwareTraining
from stl.lightning.io.filesystem import get_filesystem
from stl.lightning.loggers import ManifoldTensorBoardLogger
from stl.lightning.utilities.manifold import manifold_uri_to_bucket_and_path
from torch.distributed import get_rank
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("detectron2go.lightning.train_net")
FINAL_MODEL_CKPT = f"model_final{ModelCheckpoint.FILE_EXTENSION}"
@dataclass
class TrainOutput:
output_dir: str
accuracy: Optional[Dict[str, Any]] = None
tensorboard_log_dir: Optional[str] = None
model_configs: Optional[Dict[str, str]] = None
def get_tb_logger(output_dir: str) -> TensorBoardLogger:
"""Stores tensorboard outputs in output_dir."""
if output_dir.startswith("manifold://"):
bucket, path = manifold_uri_to_bucket_and_path(output_dir)
return ManifoldTensorBoardLogger(manifold_bucket=bucket, manifold_path=path)
return TensorBoardLogger(save_dir=output_dir)
def maybe_override_output_dir(cfg: CfgNode, output_dir: Optional[str]) -> None:
if output_dir is not None and output_dir != cfg.OUTPUT_DIR:
cfg.OUTPUT_DIR = output_dir
logger.warning(
f"Override cfg.OUTPUT_DIR ({cfg.OUTPUT_DIR}) to be the same as "
f"output_dir {output_dir}"
)
def _get_trainer_callbacks(cfg: CfgNode) -> List[Callback]:
"""Gets the trainer callbacks based on the given D2Go Config.
Args:
cfg: The normalized ConfigNode for this D2Go Task.
Returns:
        A list of configured Callbacks to be used by the Lightning Trainer.
"""
callbacks: List[Callback] = [
LearningRateMonitor(logging_interval="step"),
ModelCheckpoint(
directory=cfg.OUTPUT_DIR,
has_user_data=False,
save_top_k=-1,
every_n_epochs=-1,
every_n_steps=cfg.SOLVER.CHECKPOINT_PERIOD,
file_name_template="{step}",
save_last=True,
),
]
if cfg.QUANTIZATION.QAT.ENABLED:
qat = cfg.QUANTIZATION.QAT
callbacks.append(
QuantizationAwareTraining(
qconfig_dicts={
submodule: None for submodule in cfg.QUANTIZATION.MODULES
}
if cfg.QUANTIZATION.MODULES
else None,
start_step=qat.START_ITER,
enable_observer=(qat.ENABLE_OBSERVER_ITER, qat.DISABLE_OBSERVER_ITER),
freeze_bn_step=qat.FREEZE_BN_ITER,
)
)
return callbacks
def main(
cfg: CfgNode,
output_dir: Optional[str] = None,
task_cls: Type[GeneralizedRCNNTask] = GeneralizedRCNNTask,
eval_only: bool = False,
num_machines: int = 1,
num_gpus: int = 0,
num_processes: int = 1,
accelerator: Optional[str] = "ddp",
) -> TrainOutput:
"""Main function for launching a training with lightning trainer
Args:
        cfg: D2go config node
        output_dir: When given, overrides cfg.OUTPUT_DIR
        task_cls: Lightning task class used to construct the task
        eval_only: True if running evaluation only.
        num_machines: Number of nodes used for distributed training
        num_gpus: Number of GPUs to train on each node
        num_processes: Number of processes on each node.
            NOTE: Automatically set to the number of GPUs when using DDP.
            Set a value greater than 1 to mimic distributed training on CPUs.
        accelerator: Backend for distributed training. Only "ddp" and
            "ddp_cpu" are supported.
    """
assert (
num_processes == 1 or num_gpus == 0
), "Only set num_processes > 1 when training on CPUs"
maybe_override_output_dir(cfg, output_dir)
if cfg.MODEL.WEIGHTS:
# only load model weights from checkpoint
task = task_cls.load_from_checkpoint(cfg.MODEL.WEIGHTS, cfg=cfg)
logger.info(f"Load model weights from checkpoint: {cfg.MODEL.WEIGHTS}.")
else:
task = task_cls(cfg)
tb_logger = get_tb_logger(cfg.OUTPUT_DIR)
trainer_params = {
# training loop is bounded by max steps, use a large max_epochs to make
# sure max_steps is met first
"max_epochs": 10 ** 8,
"max_steps": cfg.SOLVER.MAX_ITER,
"val_check_interval": cfg.TEST.EVAL_PERIOD
if cfg.TEST.EVAL_PERIOD > 0
else cfg.SOLVER.MAX_ITER,
"num_nodes": num_machines,
"gpus": num_gpus,
"num_processes": num_processes,
"accelerator": accelerator,
"callbacks": _get_trainer_callbacks(cfg),
"logger": tb_logger,
"num_sanity_val_steps": 0,
"progress_bar_refresh_rate": 10,
}
last_checkpoint = os.path.join(cfg.OUTPUT_DIR, "last.ckpt")
if get_filesystem(cfg.OUTPUT_DIR).exists(last_checkpoint):
# resume training from checkpoint
trainer_params["resume_from_checkpoint"] = last_checkpoint
logger.info(f"Resuming training from checkpoint: {last_checkpoint}.")
# pyre-fixme[16]: Module `pl` has no attribute `Trainer`.
trainer = pl.Trainer(**trainer_params)
# TODO: find a better place for event storage
with EventStorage() as storage:
task.storage = storage
model_configs = None
if eval_only:
logger.info(
f"start to evaluate with {num_machines} nodes and {num_gpus} GPUs"
)
trainer.test(task)
else:
logger.info(f"start to train with {num_machines} nodes and {num_gpus} GPUs")
trainer.fit(task)
final_ckpt = os.path.join(cfg.OUTPUT_DIR, FINAL_MODEL_CKPT)
trainer.save_checkpoint(final_ckpt) # for validation monitor
trained_cfg = cfg.clone()
with temp_defrost(trained_cfg):
trained_cfg.MODEL.WEIGHTS = final_ckpt
model_configs = dump_trained_model_configs(cfg.OUTPUT_DIR, {"model_final": trained_cfg})
tb_log_dir = (
tb_logger.output_dir
if isinstance(tb_logger, ManifoldTensorBoardLogger)
else tb_logger.log_dir
)
return TrainOutput(
output_dir=cfg.OUTPUT_DIR,
tensorboard_log_dir=tb_log_dir,
accuracy=task.eval_res,
model_configs=model_configs
)
def build_config(
config_file: str,
task_cls: Type[GeneralizedRCNNTask],
opts: Optional[List[str]] = None,
) -> CfgNode:
"""Build config node from config file
Args:
config_file: Path to a D2go config file
        task_cls: Task class whose get_default_cfg() provides the base config
opts: A list of config overrides. e.g. ["SOLVER.IMS_PER_BATCH", "2"]
"""
cfg = task_cls.get_default_cfg()
cfg.merge_from_file(config_file)
if opts:
cfg.merge_from_list(opts)
return cfg
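# e.g. (hypothetical config path):
#   cfg = build_config("config.yaml", GeneralizedRCNNTask, ["SOLVER.IMS_PER_BATCH", "2"])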
def argument_parser():
parser = basic_argument_parser(distributed=True, requires_output_dir=False)
parser.add_argument(
"--num-gpus", type=int, default=0, help="number of GPUs per machine"
)
return parser
if __name__ == "__main__":
args = argument_parser().parse_args()
task_cls = get_class(args.runner) if args.runner else GeneralizedRCNNTask
cfg = build_config(args.config_file, task_cls, args.opts)
ret = main(
cfg,
args.output_dir,
task_cls,
        eval_only=False,
num_machines=args.num_machines,
num_gpus=args.num_gpus,
num_processes=args.num_processes,
accelerator="ddp" if args.num_gpus > 0 else "ddp_cpu",
)
if get_rank() == 0:
print(ret)
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved
"""
Detection Training Script.
"""
import logging
import detectron2.utils.comm as comm
from d2go.distributed import launch
from d2go.setup import (
basic_argument_parser,
post_mortem_if_fail_for_main,
prepare_for_launch,
setup_after_launch,
)
from d2go.utils.misc import print_metrics_table, dump_trained_model_configs
from torch.nn.parallel import DistributedDataParallel
logger = logging.getLogger("d2go.tools.train_net")
def main(
cfg,
output_dir,
runner=None,
eval_only=False,
# NOTE: always enable resume when running on cluster
resume=True,
):
setup_after_launch(cfg, output_dir, runner)
model = runner.build_model(cfg)
logger.info("Model:\n{}".format(model))
if eval_only:
checkpointer = runner.build_checkpointer(cfg, model, save_dir=output_dir)
        # checkpointer.resume_or_load() will skip all additional checkpointables
        # (such as EMA states), which may not be desired
if resume and checkpointer.has_checkpoint():
checkpoint = checkpointer.resume_or_load(cfg.MODEL.WEIGHTS, resume=resume)
else:
checkpoint = checkpointer.load(cfg.MODEL.WEIGHTS)
train_iter = checkpoint.get("iteration", None)
model.eval()
metrics = runner.do_test(cfg, model, train_iter=train_iter)
print_metrics_table(metrics)
return {
"accuracy": metrics,
"model_configs": {},
"metrics": metrics,
}
if comm.get_world_size() > 1:
model = DistributedDataParallel(
model,
device_ids=None if cfg.MODEL.DEVICE == "cpu" else [comm.get_local_rank()],
broadcast_buffers=False,
find_unused_parameters=cfg.MODEL.DDP_FIND_UNUSED_PARAMETERS,
)
trained_cfgs = runner.do_train(cfg, model, resume=resume)
metrics = runner.do_test(cfg, model)
print_metrics_table(metrics)
# dump config files for trained models
trained_model_configs = dump_trained_model_configs(cfg.OUTPUT_DIR, trained_cfgs)
return {
# for e2e_workflow
"accuracy": metrics,
# for unit_workflow
"model_configs": trained_model_configs,
"metrics": metrics,
}
def run_with_cmdline_args(args):
cfg, output_dir, runner = prepare_for_launch(args)
launch(
post_mortem_if_fail_for_main(main),
num_processes_per_machine=args.num_processes,
num_machines=args.num_machines,
machine_rank=args.machine_rank,
dist_url=args.dist_url,
backend=args.dist_backend,
args=(cfg, output_dir, runner, args.eval_only, args.resume),
)
def cli():
parser = basic_argument_parser(requires_output_dir=False)
parser.add_argument(
"--eval-only", action="store_true", help="perform evaluation only"
)
parser.add_argument(
"--resume",
action="store_true",
help="whether to attempt to resume from the checkpoint directory",
)
run_with_cmdline_args(parser.parse_args())
if __name__ == "__main__":
cli()