Commit fb54db0f authored by limm's avatar limm
Browse files

add projects code

parent 1ac2e802
Pipeline #2804 canceled with stages
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Dict, List
import torch
import torch.nn.functional as F
from mmpretrain.models import BaseSelfSupervisor
from mmpretrain.registry import MODELS
from mmpretrain.structures import DataSample
@MODELS.register_module()
class VideoMaskFeat(BaseSelfSupervisor):
    """MaskFeat.

    Implementation of `Masked Feature Prediction for Self-Supervised Visual
    Pre-Training <https://arxiv.org/abs/2112.09133>`_.
    """

    def loss(self, inputs: List[torch.Tensor], data_samples: List[DataSample],
             **kwargs) -> Dict[str, torch.Tensor]:
        """The forward function in training.

        Args:
            inputs (List[torch.Tensor]): The input images.
            data_samples (List[DataSample]): All elements required
                during the forward function.

        Returns:
            Dict[str, torch.Tensor]: A dictionary of loss components.
        """
        # Gather each sample's patch mask into one (B, ...) boolean tensor.
        mask = torch.stack(
            [data_sample.mask.value for data_sample in data_samples])
        mask = mask.to(torch.bool)
        video = inputs[0]
        # Fold the leading dims (presumably batch and num_clips — TODO
        # confirm the incoming layout) so the backbone gets a 5-D tensor.
        video = video.view((-1, ) + video.shape[2:])  # B, C, T, H, W
        latent = self.backbone(video, mask)
        B, L, C = latent[0].shape
        # The neck consumes a flat (B*L, C) token matrix; restore the
        # per-sample token layout on its prediction afterwards.
        pred = self.neck([latent[0].view(B * L, C)])
        pred = pred[0].view(B, L, -1)
        # generate hog target: subsample frames by the temporal patch stride
        # so one target frame corresponds to one temporal patch.
        video = video[:, :, ::self.backbone.patch_stride[0], :, :]
        video = video.transpose(1, 2)  # B, T, C, H, W
        # NOTE(review): the target generator appears to need B and T to
        # regroup per-frame HOG maps after the flatten below — confirm.
        self.target_generator.B = video.size(0)
        self.target_generator.T = video.size(1)
        video = video.flatten(0, 1)  # B*T, C, H, W
        hog = self.target_generator(video)
        # Resize the input-resolution mask down to the backbone's output
        # resolution before computing the masked-prediction loss.
        mask = self._get_output_mask(mask)
        loss = self.head(pred, hog, mask)
        losses = dict(loss=loss)
        return losses

    def _get_output_mask(self, mask: torch.Tensor) -> torch.Tensor:
        """Interpolate ``mask`` to the backbone's final output patch size."""
        # Last entry of the last stage's recorded patch resolution; set by
        # the backbone's forward pass (MaskFeatMViT stores it per stage).
        size = self.backbone.out_patch_resolution[-1][-1]
        # F.interpolate's default (nearest) resampling of the float mask.
        output_mask = F.interpolate(mask.float(), size=size)
        return output_mask
# Copyright (c) OpenMMLab. All rights reserved.
from typing import List, Optional, Sequence, Tuple, Union
import torch
import torch.nn as nn
import torch.nn.functional as F
from mmaction.models import MViT
from mmaction.models.backbones.mvit import resize_pos_embed
from mmpretrain.registry import MODELS
@MODELS.register_module()
class MaskFeatMViT(MViT):
    """MViT backbone variant for MaskFeat video pre-training.

    Extends ``mmaction.models.MViT`` with a learnable mask token that
    replaces the embeddings of masked patches before the transformer
    blocks run (see :meth:`forward`).
    """

    # Extra architecture presets registered on top of MViT's built-in ones.
    arch_zoo = {
        'maskfeat-small': {
            'embed_dims': 96,
            'num_layers': 16,
            'num_heads': 1,
            'downscale_indices': [1, 3],
            'dim_mul_indices': [1, 3, 14]
        },
        'maskfeat-large': {
            'embed_dims': 144,
            'num_layers': 48,
            'num_heads': 2,
            'downscale_indices': [2, 8],
            'dim_mul_indices': [2, 8, 44]
        },
    }

    def __init__(
        self,
        arch: str = 'base',
        spatial_size: int = 224,
        temporal_size: int = 16,
        in_channels: int = 3,
        out_scales: Union[int, Sequence[int]] = -1,
        drop_path_rate: float = 0,
        use_abs_pos_embed: bool = False,
        interpolate_mode: str = 'trilinear',
        pool_kernel: tuple = (3, 3, 3),
        dim_mul: int = 2,
        head_mul: int = 2,
        adaptive_kv_stride: tuple = (1, 8, 8),
        rel_pos_embed: bool = True,
        residual_pooling: bool = True,
        dim_mul_in_attention: bool = True,
        with_cls_token: bool = True,
        output_cls_token: bool = True,
        rel_pos_zero_init: bool = False,
        mlp_ratio: float = 4,
        qkv_bias: bool = True,
        norm_cfg: dict = dict(type='LN', eps=1e-6),
        patch_cfg: dict = dict(
            kernel_size=(3, 7, 7), stride=(2, 4, 4), padding=(1, 3, 3)),
        init_cfg: Optional[Union[dict, List[dict]]] = [
            dict(type='TruncNormal', layer=['Conv2d', 'Conv3d'], std=0.02),
            dict(type='TruncNormal', layer='Linear', std=0.02, bias=0.),
            dict(type='Constant', layer='LayerNorm', val=1., bias=0.02),
        ]
    ) -> None:
        """All arguments are forwarded unchanged to ``MViT.__init__``;
        this subclass only adds the learnable mask token and records the
        patch-embedding stride for consumers (e.g. target generation)."""
        super().__init__(
            arch=arch,
            spatial_size=spatial_size,
            temporal_size=temporal_size,
            in_channels=in_channels,
            out_scales=out_scales,
            drop_path_rate=drop_path_rate,
            use_abs_pos_embed=use_abs_pos_embed,
            interpolate_mode=interpolate_mode,
            pool_kernel=pool_kernel,
            dim_mul=dim_mul,
            head_mul=head_mul,
            adaptive_kv_stride=adaptive_kv_stride,
            rel_pos_embed=rel_pos_embed,
            residual_pooling=residual_pooling,
            dim_mul_in_attention=dim_mul_in_attention,
            with_cls_token=with_cls_token,
            output_cls_token=output_cls_token,
            rel_pos_zero_init=rel_pos_zero_init,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            norm_cfg=norm_cfg,
            patch_cfg=patch_cfg,
            init_cfg=init_cfg)
        # Learnable embedding substituted for every masked patch token.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, self.embed_dims))
        # (T, H, W) stride of the patch embedding, exposed for callers.
        self.patch_stride = patch_cfg['stride']

    def init_weights(self) -> None:
        """Initialize mask token and cls token."""
        super().init_weights()
        if (isinstance(self.init_cfg, dict)
                and self.init_cfg['type'] == 'Pretrained'):
            # Suppress default init if use pretrained model.
            return
        nn.init.trunc_normal_(self.cls_token, std=.02)
        nn.init.trunc_normal_(self.mask_token, std=.02)

    def forward(self, x: torch.Tensor,
                mask: torch.Tensor) -> Tuple[torch.Tensor]:
        """Embed ``x``, replace masked patches with the mask token, and run
        the transformer blocks, returning the outputs of the requested
        stages. Also stores each output stage's patch resolution on
        ``self.out_patch_resolution`` as a side effect."""
        x, patch_resolution = self.patch_embed(x)
        B, L, C = x.shape
        T, H, W = patch_resolution
        mask_tokens = self.mask_token.expand(B, L, -1)
        # Resize the mask to the patch grid's spatial size; assumes mask is
        # laid out (B, T, H0, W0) so dim 1 is treated as channels by
        # F.interpolate — TODO confirm against the mask generator.
        mask = F.interpolate(mask.float(), size=(H, W))
        mask = mask.flatten(1).unsqueeze(-1)
        # Blend: keep visible tokens, substitute mask token where mask == 1.
        x = x * (1 - mask) + mask_tokens * mask
        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)
        if self.use_abs_pos_embed:
            x = x + resize_pos_embed(
                self.pos_embed,
                self.patch_resolution,
                patch_resolution,
                mode=self.interpolate_mode,
                num_extra_tokens=self.num_extra_tokens)
        # if not self.with_cls_token:
        #     # Remove class token for transformer encoder input
        #     x = x[:, 1:]
        outs = []
        self.out_patch_resolution = []
        for i, block in enumerate(self.blocks):
            x, patch_resolution = block(x, patch_resolution)
            if i in self.stage_indices:
                stage_index = self.stage_indices[i]
                if stage_index in self.out_scales:
                    # Record this stage's resolution so downstream code can
                    # resize masks to match the output token grid.
                    self.out_patch_resolution.append(patch_resolution)
                    x = getattr(self, f'norm{stage_index}')(x)
                    if not self.output_cls_token:
                        out = x[:, 1:]
                    else:
                        out = x
                    outs.append(out)
        return tuple(outs)
# Copyright (c) OpenMMLab. All rights reserved.
import math
import random
from typing import Optional, Tuple
import numpy as np
from mmcv.transforms.base import BaseTransform
from mmpretrain.registry import TRANSFORMS
@TRANSFORMS.register_module()
class MaskFeatMaskGenerator3D(BaseTransform):
    """Generate a random block-wise 3D (temporal x spatial) mask for video.

    Added Keys:

    - mask

    This module is borrowed from
    https://github.com/facebookresearch/SlowFast/blob/main/slowfast/datasets/transform.py

    Args:
        input_size (Sequence[int]): The ``(T, H, W)`` patch-grid size of the
            mask to generate.
        num_masking_patches (int): The total number of patches to be masked.
        min_num_patches (int): The minimum number of patches to be masked
            in the process of generating mask. Defaults to 4.
        max_num_patches (int, optional): The maximum number of patches to be
            masked in the process of generating mask. Defaults to None,
            meaning ``num_masking_patches``.
        min_aspect (float): The minimum aspect ratio of mask blocks. Defaults
            to 0.3.
        max_aspect (float, optional): The maximum aspect ratio of mask blocks.
            Defaults to None, meaning ``1 / min_aspect``.
    """

    def __init__(self,
                 input_size: int,
                 num_masking_patches: int,
                 min_num_patches: int = 4,
                 max_num_patches: Optional[int] = None,
                 min_aspect: float = 0.3,
                 max_aspect: Optional[float] = None) -> None:
        self.temporal, self.height, self.width = input_size
        self.num_masking_patches = num_masking_patches
        self.min_num_patches = min_num_patches
        self.max_num_patches = (
            num_masking_patches
            if max_num_patches is None else max_num_patches)
        max_aspect = max_aspect or 1 / min_aspect
        # Sample aspect ratios log-uniformly so wide and tall blocks are
        # equally likely.
        self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect))

    def get_shape(self) -> Tuple[int, int, int]:
        """Get the shape of mask.

        Returns:
            Tuple[int, int, int]: The shape of mask.
        """
        return self.temporal, self.height, self.width

    def _mask(self, mask: np.ndarray, max_mask_patches: int) -> int:
        """Try to mask one random 3D block in-place.

        Args:
            mask (np.ndarray): The mask to be generated.
            max_mask_patches (int): The maximum number of patches to be masked.

        Returns:
            int: The number of newly masked patches (0 if no attempt fit).
        """
        delta = 0
        # Rejection-sample up to 100 candidate blocks.
        for _ in range(100):
            target_area = random.uniform(self.min_num_patches,
                                         self.max_num_patches)
            aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
            h = int(round(math.sqrt(target_area * aspect_ratio)))
            w = int(round(math.sqrt(target_area / aspect_ratio)))
            t = random.randint(1, self.temporal)
            if w < self.width and h < self.height:
                top = random.randint(0, self.height - h)
                left = random.randint(0, self.width - w)
                front = random.randint(0, self.temporal - t)
                region = mask[front:front + t, top:top + h, left:left + w]
                num_masked = region.sum()
                # Accept only if the block adds some new coverage without
                # exceeding the remaining masking budget.
                new_patches = h * w * t - num_masked
                if 0 < new_patches <= max_mask_patches:
                    # Vectorized fill: equivalent to flipping every 0 -> 1
                    # inside the block and counting the flips.
                    delta = int(new_patches)
                    region[:] = 1
                if delta > 0:
                    break
        return delta

    def transform(self, results: dict) -> dict:
        """Method to generate random block mask.

        Args:
            results (dict): Result dict from previous pipeline.

        Returns:
            dict: Result dict with added key ``mask``.
        """
        # NOTE: was ``np.int``, an alias removed in NumPy 1.24; the builtin
        # ``int`` produces the same platform-default integer dtype.
        mask = np.zeros(shape=self.get_shape(), dtype=int)
        mask_count = 0
        while mask_count < self.num_masking_patches:
            max_mask_patches = self.num_masking_patches - mask_count
            delta = self._mask(mask, max_mask_patches)
            if delta == 0:
                # No candidate block fit the remaining budget; stop early.
                break
            else:
                mask_count += delta
        results.update({'mask': mask})
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(temporal={self.temporal}, '
        repr_str += f'height={self.height}, '
        repr_str += f'width={self.width}, '
        repr_str += f'num_masking_patches={self.num_masking_patches}, '
        repr_str += f'min_num_patches={self.min_num_patches}, '
        repr_str += f'max_num_patches={self.max_num_patches}, '
        repr_str += f'log_aspect_ratio={self.log_aspect_ratio})'
        return repr_str
#!/usr/bin/env bash
# Distributed training launcher.
#
# Usage:
#   ./dist_train.sh CONFIG GPUS [extra train.py args...]
#
# Environment overrides:
#   NNODES       number of nodes (default: 1)
#   NODE_RANK    rank of this node (default: 0)
#   PORT         master port (default: 29500)
#   MASTER_ADDR  master address (default: 127.0.0.1)

CONFIG=$1
GPUS=$2
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

# Quote "$0", "$CONFIG" and "${@:3}" so paths/arguments containing spaces
# are passed intact instead of being word-split.
PYTHONPATH="$(dirname "$0")/..":$PYTHONPATH \
python -m torch.distributed.launch \
    --nnodes=$NNODES \
    --node_rank=$NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --nproc_per_node=$GPUS \
    --master_port=$PORT \
    "$(dirname "$0")/train.py" \
    "$CONFIG" \
    --launcher pytorch "${@:3}"
#!/usr/bin/env bash
# Slurm training launcher.
#
# Usage:
#   ./slurm_train.sh PARTITION JOB_NAME CONFIG [extra train.py args...]
#
# Environment overrides:
#   GPUS            total number of GPUs (default: 8)
#   GPUS_PER_NODE   GPUs requested per node (default: 8)
#   CPUS_PER_TASK   CPUs per task (default: 5)
#   SRUN_ARGS       extra arguments forwarded to srun

# Echo commands as they run, for job-log debugging.
set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""}
PY_ARGS=${@:4}

# One task per GPU; train.py picks up ranks from the slurm launcher.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/train.py ${CONFIG} --launcher="slurm" ${PY_ARGS}
# Copyright (c) OpenMMLab. All rights reserved.
import argparse
import os
import os.path as osp
from mmengine.config import Config, DictAction
from mmengine.runner import Runner
def parse_args():
    """Parse command-line arguments for training.

    Returns:
        argparse.Namespace: Parsed arguments. As a side effect, ensures the
        ``LOCAL_RANK`` environment variable is set so distributed launchers
        that pass ``--local_rank`` on the command line still work.
    """
    parser = argparse.ArgumentParser(description='Train a model')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume',
        nargs='?',
        type=str,
        const='auto',  # bare `--resume` (no value) means auto-resume
        help='If specify checkpoint path, resume from it, while if not '
        'specify, try to auto resume from the latest checkpoint '
        'in the work directory.')
    parser.add_argument(
        '--amp',
        action='store_true',
        help='enable automatic-mixed-precision training')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    # torch.distributed.launch passes --local_rank but some consumers read
    # the LOCAL_RANK env var instead; mirror one into the other.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)
    return args
def main():
    """Entry point: load the config, apply CLI overrides, and run training."""
    args = parse_args()

    # load config
    cfg = Config.fromfile(args.config)
    cfg.launcher = args.launcher
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        # NOTE(review): assumes the config path has at least two components
        # (e.g. 'configs/xxx.py'); a bare filename would raise IndexError.
        work_type = args.config.split('/')[1]
        cfg.work_dir = osp.join('./work_dirs', work_type,
                                osp.splitext(osp.basename(args.config))[0])

    # enable automatic-mixed-precision training
    if args.amp is True:
        optim_wrapper = cfg.optim_wrapper.get('type', 'OptimWrapper')
        # Only the default wrapper can be transparently swapped for AMP.
        assert optim_wrapper in ['OptimWrapper', 'AmpOptimWrapper'], \
            '`--amp` is not supported for custom optimizer wrapper type ' \
            f'`{optim_wrapper}`.'
        cfg.optim_wrapper.type = 'AmpOptimWrapper'
        cfg.optim_wrapper.setdefault('loss_scale', 'dynamic')

    # resume training: 'auto' resumes from the latest checkpoint in work_dir,
    # an explicit path resumes from that checkpoint.
    if args.resume == 'auto':
        cfg.resume = True
        cfg.load_from = None
    elif args.resume is not None:
        cfg.resume = True
        cfg.load_from = args.resume

    # build the runner from config
    runner = Runner.from_cfg(cfg)
    # start training
    runner.train()


if __name__ == '__main__':
    main()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment