Unverified Commit 6a834e98 authored by Francisco Massa, committed by GitHub

Move resnet video models to single location (#1190)

* [WIP] Minor cleanups on R3d

* Move all models to video/resnet.py

* Remove old files

* Make tests less memory intensive

* Lint

* Fix typo and add pretrained arg to training script
parent 4ec38d49
@@ -201,8 +201,7 @@ def main(args):
         pin_memory=True, collate_fn=collate_fn)

     print("Creating model")
-    # model = torchvision.models.video.__dict__[args.model](pretrained=args.pretrained)
-    model = torchvision.models.video.__dict__[args.model]()
+    model = torchvision.models.video.__dict__[args.model](pretrained=args.pretrained)
     model.to(device)
     if args.distributed and args.sync_bn:
         model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
...
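For context, the re-enabled line lets the training script request pretrained weights straight from the model registry. A minimal sketch of the call it performs (the model name, the pretrained flag value, and the clip shape are illustrative, not taken from this diff):

# Minimal sketch of what the updated line in the training script does; the model
# name and pretrained=False are illustrative choices, not part of this diff.
import torch
import torchvision

args_model = "r2plus1d_18"             # any name exposed by torchvision.models.video
model = torchvision.models.video.__dict__[args_model](pretrained=False)

clip = torch.rand(2, 3, 4, 112, 112)   # (batch, channels, frames, height, width)
logits = model(clip)
print(logits.shape)                    # torch.Size([2, 400]) -- 400 Kinetics classes by default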
@@ -61,7 +61,7 @@ class Tester(unittest.TestCase):
     def _test_video_model(self, name):
         # the default input shape is
         # bs * num_channels * clip_len * h * w
-        input_shape = (1, 3, 8, 112, 112)
+        input_shape = (1, 3, 4, 112, 112)
         # test both basicblock and Bottleneck
         model = models.video.__dict__[name](num_classes=50)
         x = torch.rand(input_shape)
@@ -145,6 +145,7 @@ for model_name in get_available_detection_models():
     setattr(Tester, "test_" + model_name, do_test)

 for model_name in get_available_video_models():
     def do_test(self, model_name=model_name):
...
-from .r3d import *
-from .r2plus1d import *
-from .mixed_conv import *
+from .resnet import *
import torch.nn as nn
__all__ = ["Conv3DSimple", "Conv2Plus1D", "Conv3DNoTemporal"]
class Conv3DSimple(nn.Conv3d):
def __init__(self,
in_planes,
out_planes,
midplanes=None,
stride=1,
padding=1):
super(Conv3DSimple, self).__init__(
in_channels=in_planes,
out_channels=out_planes,
kernel_size=(3, 3, 3),
stride=stride,
padding=padding,
bias=False)
@staticmethod
def get_downsample_stride(stride):
return (stride, stride, stride)
class Conv2Plus1D(nn.Sequential):
def __init__(self,
in_planes,
out_planes,
midplanes,
stride=1,
padding=1):
conv1 = [
nn.Conv3d(in_planes, midplanes, kernel_size=(1, 3, 3),
stride=(1, stride, stride), padding=(0, padding, padding),
bias=False),
nn.BatchNorm3d(midplanes),
nn.ReLU(inplace=True),
nn.Conv3d(midplanes, out_planes, kernel_size=(3, 1, 1),
stride=(stride, 1, 1), padding=(padding, 0, 0),
bias=False)
]
super(Conv2Plus1D, self).__init__(*conv1)
@staticmethod
def get_downsample_stride(stride):
return (stride, stride, stride)
class Conv3DNoTemporal(nn.Conv3d):
def __init__(self,
in_planes,
out_planes,
midplanes=None,
stride=1,
padding=1):
super(Conv3DNoTemporal, self).__init__(
in_channels=in_planes,
out_channels=out_planes,
kernel_size=(1, 3, 3),
stride=(1, stride, stride),
padding=(0, padding, padding),
bias=False)
@staticmethod
def get_downsample_stride(stride):
return (1, stride, stride)
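To make the difference between the three conv makers concrete, here is a hedged sketch (the input shape and midplanes=144 are illustrative; after this PR the classes live in torchvision/models/video/resnet.py rather than in this removed _utils.py):

# Hedged sketch comparing the three conv makers; shapes and midplanes=144 are
# illustrative choices, not values taken from this diff.
import torch
from torchvision.models.video.resnet import Conv3DSimple, Conv2Plus1D, Conv3DNoTemporal

x = torch.rand(1, 64, 8, 56, 56)        # (batch, channels, frames, height, width)
full_3d = Conv3DSimple(64, 64)          # full 3x3x3 spatio-temporal kernel
spatial = Conv3DNoTemporal(64, 64)      # 1x3x3 kernel, no temporal extent
factored = Conv2Plus1D(64, 64, 144)     # 1x3x3 followed by 3x1x1, with an intermediate width

# With stride=1 and padding=1 all three preserve the clip shape.
print(full_3d(x).shape, spatial(x).shape, factored(x).shape)

# Only the "no temporal" variant keeps the temporal stride at 1 when downsampling.
print(Conv3DSimple.get_downsample_stride(2))      # (2, 2, 2)
print(Conv3DNoTemporal.get_downsample_stride(2))  # (1, 2, 2)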
import torch.nn as nn
from ._utils import Conv3DSimple, Conv3DNoTemporal
from .video_stems import get_default_stem
from .video_trunk import VideoTrunkBuilder, BasicBlock, Bottleneck
__all__ = ["mc3_18"]
def _mcX(model_depth, X=3, use_pool1=False, **kwargs):
"""Generate mixed convolution network as in
https://arxiv.org/abs/1711.11248
Args:
model_depth (int): trunk depth - supports most resnet depths
X (int): up to which layer the convolutions are 3D
use_pool1 (bool, optional): Add pooling layer to the stem. Defaults to False.
Returns:
nn.Module: mcX video trunk
"""
assert X > 1 and X <= 5
conv_makers = [Conv3DSimple] * (X - 2)
while len(conv_makers) < 5:
conv_makers.append(Conv3DNoTemporal)
if model_depth < 50:
block = BasicBlock
else:
block = Bottleneck
model = VideoTrunkBuilder(block=block, conv_makers=conv_makers, model_depth=model_depth,
stem=get_default_stem(use_pool1=use_pool1), **kwargs)
return model
def mc3_18(use_pool1=False, **kwargs):
"""Constructor for 18 layer Mixed Convolution network as in
https://arxiv.org/abs/1711.11248
Args:
use_pool1 (bool, optional): Include pooling in the resnet stem. Defaults to False.
Returns:
nn.Module: MC3 Network definition
"""
return _mcX(18, 3, use_pool1, **kwargs)
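The layer-selection logic in _mcX above (full 3D convolutions up to layer X, spatial-only convolutions afterwards) reduces, for the MC3 case, to the explicit list that the consolidated constructor passes. A small sketch of that equivalence (the string names stand in for the conv maker classes):

# Sketch of what _mcX resolves to for X=3; it matches the explicit list used by the
# new mc3_18 constructor (conv_makers=[Conv3DSimple] + [Conv3DNoTemporal] * 3).
X = 3
conv_makers = ["Conv3DSimple"] * (X - 2)     # layers below X keep full 3D kernels
while len(conv_makers) < 5:                  # remaining entries drop the temporal extent
    conv_makers.append("Conv3DNoTemporal")
print(conv_makers[:4])                       # only the first four entries feed the builder
# ['Conv3DSimple', 'Conv3DNoTemporal', 'Conv3DNoTemporal', 'Conv3DNoTemporal']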
import torch.nn as nn
from ._utils import Conv2Plus1D
from .video_stems import get_r2plus1d_stem
from .video_trunk import VideoTrunkBuilder, BasicBlock, Bottleneck
__all__ = ["r2plus1d_18"]
def _r2plus1d(model_depth, use_pool1=False, **kwargs):
"""Constructor for R(2+1)D network as described in
https://arxiv.org/abs/1711.11248
Args:
model_depth (int): Depth of the model - standard resnet depths apply
use_pool1 (bool, optional): Should we use the pooling layer? Defaults to False
Returns:
nn.Module: An R(2+1)D video backbone
"""
convs = [Conv2Plus1D] * 4
if model_depth < 50:
block = BasicBlock
else:
block = Bottleneck
model = VideoTrunkBuilder(
block=block, conv_makers=convs, model_depth=model_depth,
stem=get_r2plus1d_stem(use_pool1), **kwargs)
return model
def r2plus1d_18(use_pool1=False, **kwargs):
"""Constructor for the 18 layer deep R(2+1)D network as in
https://arxiv.org/abs/1711.11248
Args:
use_pool1 (bool, optional): Include pooling in the resnet stem. Defaults to False.
Returns:
nn.Module: R(2+1)D-18 network
"""
return _r2plus1d(18, use_pool1, **kwargs)
import torch.nn as nn
from ._utils import Conv3DSimple
from .video_stems import get_default_stem
from .video_trunk import VideoTrunkBuilder, BasicBlock, Bottleneck
__all__ = ["r3d_18"]
def _r3d(model_depth, use_pool1=False, **kwargs):
"""Constructor of a r3d network as in
https://arxiv.org/abs/1711.11248
Args:
model_depth (int): resnet trunk depth
use_pool1 (bool, optional): Add pooling layer to the stem. Defaults to False
Returns:
nn.Module: R3D network trunk
"""
conv_makers = [Conv3DSimple] * 4
if model_depth < 50:
block = BasicBlock
else:
block = Bottleneck
model = VideoTrunkBuilder(block=block, conv_makers=conv_makers, model_depth=model_depth,
stem=get_default_stem(use_pool1=use_pool1), **kwargs)
return model
def r3d_18(use_pool1=False, **kwargs):
"""Construct 18 layer Resnet3D model as in
https://arxiv.org/abs/1711.11248
Args:
use_pool1 (bool, optional): Include pooling in resnet stem. Defaults to False.
Returns:
nn.Module: R3D-18 network
"""
return _r3d(18, use_pool1, **kwargs)
-import inspect
 import torch
 import torch.nn as nn

-from .video_stems import get_default_stem
-from ._utils import Conv3DNoTemporal
+from ..utils import load_state_dict_from_url

-BLOCK_CONFIG = {
-    10: (1, 1, 1, 1),
-    16: (2, 2, 2, 1),
-    18: (2, 2, 2, 2),
-    26: (2, 3, 4, 3),
-    34: (3, 4, 6, 3),
-    50: (3, 4, 6, 3),
-    101: (3, 4, 23, 3),
-    152: (3, 8, 36, 3)
-}
+
+__all__ = ['r3d_18', 'mc3_18', 'r2plus1d_18']
+
+model_urls = {
+    'resnet18': 'https://download.pytorch.org/models/',
+    'resnet34': 'https://download.pytorch.org/models/',
+}
class Conv3DSimple(nn.Conv3d):
def __init__(self,
in_planes,
out_planes,
midplanes=None,
stride=1,
padding=1):
super(Conv3DSimple, self).__init__(
in_channels=in_planes,
out_channels=out_planes,
kernel_size=(3, 3, 3),
stride=stride,
padding=padding,
bias=False)
@staticmethod
def get_downsample_stride(stride):
return (stride, stride, stride)
class Conv2Plus1D(nn.Sequential):
def __init__(self,
in_planes,
out_planes,
midplanes,
stride=1,
padding=1):
super(Conv2Plus1D, self).__init__(
nn.Conv3d(in_planes, midplanes, kernel_size=(1, 3, 3),
stride=(1, stride, stride), padding=(0, padding, padding),
bias=False),
nn.BatchNorm3d(midplanes),
nn.ReLU(inplace=True),
nn.Conv3d(midplanes, out_planes, kernel_size=(3, 1, 1),
stride=(stride, 1, 1), padding=(padding, 0, 0),
bias=False))
@staticmethod
def get_downsample_stride(stride):
return (stride, stride, stride)
class Conv3DNoTemporal(nn.Conv3d):
def __init__(self,
in_planes,
out_planes,
midplanes=None,
stride=1,
padding=1):
super(Conv3DNoTemporal, self).__init__(
in_channels=in_planes,
out_channels=out_planes,
kernel_size=(1, 3, 3),
stride=(1, stride, stride),
padding=(0, padding, padding),
bias=False)
@staticmethod
def get_downsample_stride(stride):
return (1, stride, stride)
 class BasicBlock(nn.Module):

     expansion = 1
@@ -99,30 +159,53 @@ class Bottleneck(nn.Module):
         return out


-class VideoTrunkBuilder(nn.Module):
+class BasicStem(nn.Sequential):
+    """The default conv-batchnorm-relu stem
+    """
+    def __init__(self):
+        super(BasicStem, self).__init__(
+            nn.Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2),
+                      padding=(1, 3, 3), bias=False),
+            nn.BatchNorm3d(64),
+            nn.ReLU(inplace=True))
+
+
+class R2Plus1dStem(nn.Sequential):
+    """R(2+1)D stem is different than the default one as it uses separated 3D convolution
+    """
+    def __init__(self):
+        super(R2Plus1dStem, self).__init__(
+            nn.Conv3d(3, 45, kernel_size=(1, 7, 7),
+                      stride=(1, 2, 2), padding=(0, 3, 3),
+                      bias=False),
+            nn.BatchNorm3d(45),
+            nn.ReLU(inplace=True),
+            nn.Conv3d(45, 64, kernel_size=(3, 1, 1),
+                      stride=(1, 1, 1), padding=(1, 0, 0),
+                      bias=False),
+            nn.BatchNorm3d(64),
+            nn.ReLU(inplace=True))
+
+
+class VideoResNet(nn.Module):

-    def __init__(self, block, conv_makers, model_depth,
-                 stem=None,
-                 num_classes=400,
+    def __init__(self, block, conv_makers, layers,
+                 stem, num_classes=400,
                  zero_init_residual=False):
         """Generic resnet video generator.

         Args:
             block (nn.Module): resnet building block
             conv_makers (list(functions)): generator function for each layer
-            model_depth (int): depth of the model; supports traditional resnet depths.
-            stem (nn.Sequential, optional): Resnet stem, if None, defaults to conv-bn-relu. Defaults to None.
+            layers (List[int]): number of blocks per layer
+            stem (nn.Module, optional): Resnet stem, if None, defaults to conv-bn-relu. Defaults to None.
             num_classes (int, optional): Dimension of the final FC layer. Defaults to 400.
             zero_init_residual (bool, optional): Zero init bottleneck residual BN. Defaults to False.
         """
-        super(VideoTrunkBuilder, self).__init__()
-        layers = BLOCK_CONFIG[model_depth]
+        super(VideoResNet, self).__init__()
         self.inplanes = 64
-        if stem is None:
-            self.conv1 = get_default_stem()
-        else:
-            self.conv1 = stem
+        self.stem = stem()

         self.layer1 = self._make_layer(block, conv_makers[0], 64, layers[0], stride=1)
         self.layer2 = self._make_layer(block, conv_makers[1], 128, layers[1], stride=2)
@@ -141,7 +224,7 @@ class VideoTrunkBuilder(nn.Module):
             nn.init.constant_(m.bn3.weight, 0)

     def forward(self, x):
-        x = self.conv1(x)
+        x = self.stem(x)

         x = self.layer1(x)
         x = self.layer2(x)
@@ -187,3 +270,71 @@ class VideoTrunkBuilder(nn.Module):
         elif isinstance(m, nn.Linear):
             nn.init.normal_(m.weight, 0, 0.01)
             nn.init.constant_(m.bias, 0)
def _video_resnet(arch, pretrained=False, progress=True, **kwargs):
model = VideoResNet(**kwargs)
if pretrained:
state_dict = load_state_dict_from_url(model_urls[arch],
progress=progress)
model.load_state_dict(state_dict)
return model
def r3d_18(pretrained=False, progress=True, **kwargs):
"""Construct 18 layer Resnet3D model as in
https://arxiv.org/abs/1711.11248
Args:
pretrained (bool): If True, returns a model pre-trained on Kinetics-400
progress (bool): If True, displays a progress bar of the download to stderr
Returns:
nn.Module: R3D-18 network
"""
return _video_resnet('r3d_18',
pretrained, progress,
block=BasicBlock,
conv_makers=[Conv3DSimple] * 4,
layers=[2, 2, 2, 2],
stem=BasicStem, **kwargs)
def mc3_18(pretrained=False, progress=True, **kwargs):
"""Constructor for 18 layer Mixed Convolution network as in
https://arxiv.org/abs/1711.11248
Args:
pretrained (bool): If True, returns a model pre-trained on Kinetics-400
progress (bool): If True, displays a progress bar of the download to stderr
Returns:
nn.Module: MC3 Network definition
"""
return _video_resnet('mc3_18',
pretrained, progress,
block=BasicBlock,
conv_makers=[Conv3DSimple] + [Conv3DNoTemporal] * 3,
layers=[2, 2, 2, 2],
stem=BasicStem, **kwargs)
def r2plus1d_18(pretrained=False, progress=True, **kwargs):
"""Constructor for the 18 layer deep R(2+1)D network as in
https://arxiv.org/abs/1711.11248
Args:
pretrained (bool): If True, returns a model pre-trained on Kinetics-400
progress (bool): If True, displays a progress bar of the download to stderr
Returns:
nn.Module: R(2+1)D-18 network
"""
return _video_resnet('r2plus1d_18',
pretrained, progress,
block=BasicBlock,
conv_makers=[Conv2Plus1D] * 4,
layers=[2, 2, 2, 2],
stem=R2Plus1dStem, **kwargs)
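A hedged smoke test of the three consolidated constructors, mirroring the reduced input shape and num_classes used in the updated test_models.py (pretrained weights are not downloaded here):

# Hedged smoke test; clip shape (1, 3, 4, 112, 112) and num_classes=50 follow the
# updated test, everything else is an illustrative choice.
import torch
from torchvision.models import video

clip = torch.rand(1, 3, 4, 112, 112)   # (batch, channels, frames, height, width)
for ctor in (video.r3d_18, video.mc3_18, video.r2plus1d_18):
    model = ctor(pretrained=False, num_classes=50)
    model.eval()
    with torch.no_grad():
        out = model(clip)
    assert out.shape == (1, 50), out.shape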
import torch.nn as nn
def get_default_stem(use_pool1=False):
"""The default conv-batchnorm-relu(-maxpool) stem
Args:
use_pool1 (bool, optional): Should the stem include the default maxpool? Defaults to False.
Returns:
nn.Sequential: Conv1 stem of resnet based models.
"""
m = [
nn.Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2),
padding=(1, 3, 3), bias=False),
nn.BatchNorm3d(64),
nn.ReLU(inplace=True)]
if use_pool1:
m.append(nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1))
return nn.Sequential(*m)
def get_r2plus1d_stem(use_pool1=False):
"""R(2+1)D stem is different than the default one as it uses separated 3D convolution
Args:
use_pool1 (bool, optional): Should the stem contain pool1 layer. Defaults to False.
Returns:
nn.Sequential: the stem of the conv-separated network.
"""
m = [
nn.Conv3d(3, 45, kernel_size=(1, 7, 7),
stride=(1, 2, 2), padding=(0, 3, 3),
bias=False),
nn.BatchNorm3d(45),
nn.ReLU(inplace=True),
nn.Conv3d(45, 64, kernel_size=(3, 1, 1),
stride=(1, 1, 1), padding=(1, 0, 0),
bias=False),
nn.BatchNorm3d(64),
nn.ReLU(inplace=True)]
if use_pool1:
m.append(nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1))
return nn.Sequential(*m)
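To show what the optional pool1 layer does to a clip, a small hedged sketch; it rebuilds the default stem inline so it runs on its own, and the 8-frame 112x112 input is an illustrative choice:

# Hedged sketch of how the optional max-pool changes the stem's output size.
import torch
import torch.nn as nn

stem = nn.Sequential(
    nn.Conv3d(3, 64, kernel_size=(3, 7, 7), stride=(1, 2, 2),
              padding=(1, 3, 3), bias=False),
    nn.BatchNorm3d(64),
    nn.ReLU(inplace=True))
pool1 = nn.MaxPool3d(kernel_size=(3, 3, 3), stride=2, padding=1)

clip = torch.rand(1, 3, 8, 112, 112)    # illustrative 8-frame 112x112 clip
print(stem(clip).shape)                 # torch.Size([1, 64, 8, 56, 56])
print(pool1(stem(clip)).shape)          # torch.Size([1, 64, 4, 28, 28])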