Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
mmpretrain
Commits
fb54db0f
Commit
fb54db0f
authored
Jun 24, 2025
by
limm
Browse files
add projects code
parent
1ac2e802
Pipeline
#2804
canceled with stages
Changes
66
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
470 additions
and
0 deletions
+470
-0
projects/maskfeat_video/models/maskfeat.py
projects/maskfeat_video/models/maskfeat.py
+59
-0
projects/maskfeat_video/models/maskfeat_mvit.py
projects/maskfeat_video/models/maskfeat_mvit.py
+146
-0
projects/maskfeat_video/models/transforms.py
projects/maskfeat_video/models/transforms.py
+130
-0
projects/maskfeat_video/tools/dist_train.sh
projects/maskfeat_video/tools/dist_train.sh
+19
-0
projects/maskfeat_video/tools/slurm_train.sh
projects/maskfeat_video/tools/slurm_train.sh
+23
-0
projects/maskfeat_video/tools/train.py
projects/maskfeat_video/tools/train.py
+93
-0
No files found.
projects/maskfeat_video/models/maskfeat.py
0 → 100644
View file @
fb54db0f
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
Dict
,
List
import
torch
import
torch.nn.functional
as
F
from
mmpretrain.models
import
BaseSelfSupervisor
from
mmpretrain.registry
import
MODELS
from
mmpretrain.structures
import
DataSample
@MODELS.register_module()
class VideoMaskFeat(BaseSelfSupervisor):
    """MaskFeat.

    Implementation of `Masked Feature Prediction for Self-Supervised Visual
    Pre-Training <https://arxiv.org/abs/2112.09133>`_.
    """

    def loss(self, inputs: List[torch.Tensor],
             data_samples: List[DataSample],
             **kwargs) -> Dict[str, torch.Tensor]:
        """The forward function in training.

        Args:
            inputs (List[torch.Tensor]): The input images.
            data_samples (List[DataSample]): All elements required
                during the forward function.

        Returns:
            Dict[str, torch.Tensor]: A dictionary of loss components.
        """
        # Collect the per-sample block masks into one batch tensor.
        mask = torch.stack(
            [data_sample.mask.value for data_sample in data_samples])
        mask = mask.to(torch.bool)

        video = inputs[0]
        # Fold any leading clip dimension into the batch dimension.
        video = video.view((-1, ) + video.shape[2:])  # B, C, T, H, W

        # Backbone consumes the video plus the mask (mask tokens are
        # substituted inside the backbone); neck maps latent features to
        # HOG predictions.
        latent = self.backbone(video, mask)
        B, L, C = latent[0].shape
        pred = self.neck([latent[0].view(B * L, C)])
        pred = pred[0].view(B, L, -1)

        # generate hog target
        # Subsample frames by the temporal patch stride so the HOG target
        # aligns with the backbone's temporal patch grid.
        video = video[:, :, ::self.backbone.patch_stride[0], :, :]
        video = video.transpose(1, 2)  # B, T, C, H, W
        # target_generator reads B and T to reshape its output per clip.
        self.target_generator.B = video.size(0)
        self.target_generator.T = video.size(1)
        video = video.flatten(0, 1)  # B*T, C, H, W
        hog = self.target_generator(video)

        # Resize the input-resolution mask to the backbone's output patch
        # resolution before computing the loss on masked locations only.
        mask = self._get_output_mask(mask)
        loss = self.head(pred, hog, mask)
        losses = dict(loss=loss)
        return losses

    def _get_output_mask(self, mask: torch.Tensor) -> torch.Tensor:
        """Resize the mask to the backbone's final output patch resolution.

        Args:
            mask (torch.Tensor): The boolean patch mask at input patch
                resolution.

        Returns:
            torch.Tensor: A float mask interpolated to the spatial size of
            the backbone's last output stage.
        """
        # out_patch_resolution is recorded by the backbone during forward;
        # take the spatial size of the last stage.
        size = self.backbone.out_patch_resolution[-1][-1]
        output_mask = F.interpolate(mask.float(), size=size)
        return output_mask
projects/maskfeat_video/models/maskfeat_mvit.py
0 → 100644
View file @
fb54db0f
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
List
,
Optional
,
Sequence
,
Tuple
,
Union
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmaction.models
import
MViT
from
mmaction.models.backbones.mvit
import
resize_pos_embed
from
mmpretrain.registry
import
MODELS
@MODELS.register_module()
class MaskFeatMViT(MViT):
    """MViT backbone for MaskFeat video pre-training.

    Extends ``mmaction``'s MViT with a learnable mask token: embeddings of
    masked patches are replaced by the mask token before the transformer
    blocks are applied.
    """

    # Architecture presets used by MaskFeat in addition to the parent
    # MViT presets selectable via ``arch``.
    arch_zoo = {
        'maskfeat-small': {
            'embed_dims': 96,
            'num_layers': 16,
            'num_heads': 1,
            'downscale_indices': [1, 3],
            'dim_mul_indices': [1, 3, 14]
        },
        'maskfeat-large': {
            'embed_dims': 144,
            'num_layers': 48,
            'num_heads': 2,
            'downscale_indices': [2, 8],
            'dim_mul_indices': [2, 8, 44]
        },
    }

    def __init__(self,
                 arch: str = 'base',
                 spatial_size: int = 224,
                 temporal_size: int = 16,
                 in_channels: int = 3,
                 out_scales: Union[int, Sequence[int]] = -1,
                 drop_path_rate: float = 0,
                 use_abs_pos_embed: bool = False,
                 interpolate_mode: str = 'trilinear',
                 pool_kernel: tuple = (3, 3, 3),
                 dim_mul: int = 2,
                 head_mul: int = 2,
                 adaptive_kv_stride: tuple = (1, 8, 8),
                 rel_pos_embed: bool = True,
                 residual_pooling: bool = True,
                 dim_mul_in_attention: bool = True,
                 with_cls_token: bool = True,
                 output_cls_token: bool = True,
                 rel_pos_zero_init: bool = False,
                 mlp_ratio: float = 4,
                 qkv_bias: bool = True,
                 norm_cfg: dict = dict(type='LN', eps=1e-6),
                 patch_cfg: dict = dict(
                     kernel_size=(3, 7, 7),
                     stride=(2, 4, 4),
                     padding=(1, 3, 3)),
                 # NOTE(review): mutable defaults below follow the OpenMMLab
                 # config convention; they are read-only here.
                 init_cfg: Optional[Union[dict, List[dict]]] = [
                     dict(
                         type='TruncNormal',
                         layer=['Conv2d', 'Conv3d'],
                         std=0.02),
                     dict(type='TruncNormal', layer='Linear', std=0.02,
                          bias=0.),
                     dict(type='Constant', layer='LayerNorm', val=1.,
                          bias=0.02),
                 ]) -> None:
        """Initialize the backbone; all arguments are forwarded to MViT."""
        super().__init__(
            arch=arch,
            spatial_size=spatial_size,
            temporal_size=temporal_size,
            in_channels=in_channels,
            out_scales=out_scales,
            drop_path_rate=drop_path_rate,
            use_abs_pos_embed=use_abs_pos_embed,
            interpolate_mode=interpolate_mode,
            pool_kernel=pool_kernel,
            dim_mul=dim_mul,
            head_mul=head_mul,
            adaptive_kv_stride=adaptive_kv_stride,
            rel_pos_embed=rel_pos_embed,
            residual_pooling=residual_pooling,
            dim_mul_in_attention=dim_mul_in_attention,
            with_cls_token=with_cls_token,
            output_cls_token=output_cls_token,
            rel_pos_zero_init=rel_pos_zero_init,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            norm_cfg=norm_cfg,
            patch_cfg=patch_cfg,
            init_cfg=init_cfg)

        # Learnable token substituted for the embeddings of masked patches.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, self.embed_dims))
        # Kept so callers (e.g. VideoMaskFeat.loss) can align targets with
        # the patch grid.
        self.patch_stride = patch_cfg['stride']

    def init_weights(self) -> None:
        """Initialize mask token and cls token."""
        super().init_weights()

        if (isinstance(self.init_cfg, dict)
                and self.init_cfg['type'] == 'Pretrained'):
            # Suppress default init if use pretrained model.
            return

        nn.init.trunc_normal_(self.cls_token, std=.02)
        nn.init.trunc_normal_(self.mask_token, std=.02)

    def forward(self, x: torch.Tensor,
                mask: torch.Tensor) -> Tuple[torch.Tensor]:
        """Forward with masking.

        Args:
            x (torch.Tensor): Input video tensor fed to the patch embedding.
            mask (torch.Tensor): Patch mask; resized here to the spatial
                patch resolution. Non-zero entries mark masked positions.

        Returns:
            Tuple[torch.Tensor]: Features of the selected output stages.
        """
        x, patch_resolution = self.patch_embed(x)
        B, L, C = x.shape
        T, H, W = patch_resolution

        # Blend mask tokens into the sequence: masked positions take the
        # mask token, unmasked positions keep their patch embedding.
        mask_tokens = self.mask_token.expand(B, L, -1)
        mask = F.interpolate(mask.float(), size=(H, W))
        mask = mask.flatten(1).unsqueeze(-1)
        x = x * (1 - mask) + mask_tokens * mask

        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        if self.use_abs_pos_embed:
            x = x + resize_pos_embed(
                self.pos_embed,
                self.patch_resolution,
                patch_resolution,
                mode=self.interpolate_mode,
                num_extra_tokens=self.num_extra_tokens)

        # if not self.with_cls_token:
        #     # Remove class token for transformer encoder input
        #     x = x[:, 1:]

        outs = []
        # Record per-stage patch resolutions so downstream code (e.g.
        # VideoMaskFeat._get_output_mask) can resize masks to match.
        self.out_patch_resolution = []
        for i, block in enumerate(self.blocks):
            x, patch_resolution = block(x, patch_resolution)

            if i in self.stage_indices:
                stage_index = self.stage_indices[i]
                if stage_index in self.out_scales:
                    self.out_patch_resolution.append(patch_resolution)
                    x = getattr(self, f'norm{stage_index}')(x)
                    if not self.output_cls_token:
                        out = x[:, 1:]
                    else:
                        out = x
                    outs.append(out)

        return tuple(outs)
projects/maskfeat_video/models/transforms.py
0 → 100644
View file @
fb54db0f
# Copyright (c) OpenMMLab. All rights reserved.
import
math
import
random
from
typing
import
Optional
,
Tuple
import
numpy
as
np
from
mmcv.transforms.base
import
BaseTransform
from
mmpretrain.registry
import
TRANSFORMS
@TRANSFORMS.register_module()
class MaskFeatMaskGenerator3D(BaseTransform):
    """Generate mask for video.

    Added Keys:

    - mask

    This module is borrowed from
    https://github.com/facebookresearch/SlowFast/blob/main/slowfast/datasets/transform.py

    Args:
        input_size (int): The size of input video.
        num_masking_patches (int): The number of patches to be masked.
        min_num_patches (int): The minimum number of patches to be masked
            in the process of generating mask. Defaults to 4.
        max_num_patches (int, optional): The maximum number of patches to be
            masked in the process of generating mask. Defaults to None.
        min_aspect (float): The minimum aspect ratio of mask blocks. Defaults
            to 0.3.
        max_aspect (float, optional): The maximum aspect ratio of mask blocks.
            Defaults to None.
    """

    def __init__(self,
                 input_size: int,
                 num_masking_patches: int,
                 min_num_patches: int = 4,
                 max_num_patches: Optional[int] = None,
                 min_aspect: float = 0.3,
                 max_aspect: Optional[float] = None) -> None:
        # input_size is unpacked as (temporal, height, width) patch counts.
        self.temporal, self.height, self.width = input_size

        self.num_masking_patches = num_masking_patches

        self.min_num_patches = min_num_patches
        # Cap a single block at the total budget when no explicit max given.
        self.max_num_patches = (
            num_masking_patches if max_num_patches is None
            else max_num_patches)

        # Aspect ratios are sampled log-uniformly in [min_aspect, max_aspect].
        max_aspect = max_aspect or 1 / min_aspect
        self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect))

    def get_shape(self) -> Tuple[int, int, int]:
        """Get the shape of mask.

        Returns:
            Tuple[int, int, int]: The shape of mask.
        """
        return self.temporal, self.height, self.width

    def _mask(self, mask: np.ndarray, max_mask_patches: int) -> int:
        """Generate mask recursively.

        Args:
            mask (np.ndarray): The mask to be generated.
            max_mask_patches (int): The maximum number of patches to be masked.

        Returns:
            int: The number of patches masked.
        """
        delta = 0
        # Try at most 100 random blocks before giving up for this call.
        for _ in range(100):
            target_area = random.uniform(self.min_num_patches,
                                         self.max_num_patches)
            aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
            h = int(round(math.sqrt(target_area * aspect_ratio)))
            w = int(round(math.sqrt(target_area / aspect_ratio)))
            t = random.randint(1, self.temporal)  # !
            if w < self.width and h < self.height:
                top = random.randint(0, self.height - h)
                left = random.randint(0, self.width - w)
                front = random.randint(0, self.temporal - t)

                num_masked = mask[front:front + t, top:top + h,
                                  left:left + w].sum()
                # Overlap: accept only if the newly masked cells (block size
                # minus already-masked cells) fit in the remaining budget.
                if 0 < h * w * t - num_masked <= max_mask_patches:
                    for i in range(front, front + t):
                        for j in range(top, top + h):
                            for k in range(left, left + w):
                                if mask[i, j, k] == 0:
                                    mask[i, j, k] = 1
                                    delta += 1

                if delta > 0:
                    break
        return delta

    def transform(self, results: dict) -> dict:
        """Method to generate random block mask.

        Args:
            results (dict): Result dict from previous pipeline.

        Returns:
            dict: Result dict with added key ``mask``.
        """
        # FIX: ``np.int`` was removed in NumPy 1.24; the builtin ``int``
        # is the exact type the old alias referred to.
        mask = np.zeros(shape=self.get_shape(), dtype=int)
        mask_count = 0
        # Keep placing random blocks until the masking budget is reached or
        # no further block can be placed (_mask returns 0).
        while mask_count < self.num_masking_patches:
            max_mask_patches = self.num_masking_patches - mask_count
            delta = self._mask(mask, max_mask_patches)
            if delta == 0:
                break
            else:
                mask_count += delta

        results.update({'mask': mask})
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(temporal={self.temporal}, '
        repr_str += f'height={self.height}, '
        repr_str += f'width={self.width}, '
        repr_str += f'num_masking_patches={self.num_masking_patches}, '
        repr_str += f'min_num_patches={self.min_num_patches}, '
        repr_str += f'max_num_patches={self.max_num_patches}, '
        repr_str += f'log_aspect_ratio={self.log_aspect_ratio})'
        return repr_str
projects/maskfeat_video/tools/dist_train.sh
0 → 100644
View file @
fb54db0f
#!/usr/bin/env bash
# Launch multi-GPU training via torch.distributed.launch.
# Usage: dist_train.sh CONFIG GPUS [extra args forwarded to train.py]
# Override NNODES / NODE_RANK / PORT / MASTER_ADDR via environment variables.

CONFIG=$1
GPUS=$2
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

# Prepend the project root (parent of this script's dir) to PYTHONPATH so
# the project's modules are importable by train.py.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch \
    --nnodes=$NNODES \
    --node_rank=$NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --nproc_per_node=$GPUS \
    --master_port=$PORT \
    $(dirname "$0")/train.py \
    $CONFIG \
    --launcher pytorch ${@:3}
projects/maskfeat_video/tools/slurm_train.sh
0 → 100644
View file @
fb54db0f
#!/usr/bin/env bash
# Launch training on a Slurm cluster.
# Usage: slurm_train.sh PARTITION JOB_NAME CONFIG [extra args forwarded to train.py]
# Override GPUS / GPUS_PER_NODE / CPUS_PER_TASK / SRUN_ARGS via environment
# variables.

set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""}
PY_ARGS=${@:4}

# Prepend the project root (parent of this script's dir) to PYTHONPATH so
# the project's modules are importable by train.py.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/train.py ${CONFIG} --launcher="slurm" ${PY_ARGS}
projects/maskfeat_video/tools/train.py
0 → 100644
View file @
fb54db0f
# Copyright (c) OpenMMLab. All rights reserved.
import
argparse
import
os
import
os.path
as
osp
from
mmengine.config
import
Config
,
DictAction
from
mmengine.runner
import
Runner
def parse_args():
    """Parse command line arguments for training.

    Also propagates ``--local_rank`` into the ``LOCAL_RANK`` environment
    variable for launchers that read it from the environment.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    parser = argparse.ArgumentParser(description='Train a model')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume',
        nargs='?',
        type=str,
        const='auto',
        # FIX: corrected "checkpint" typo in the user-facing help text.
        help='If specify checkpoint path, resume from it, while if not '
        'specify, try to auto resume from the latest checkpoint '
        'in the work directory.')
    parser.add_argument(
        '--amp',
        action='store_true',
        help='enable automatic-mixed-precision training')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    # Distributed launchers expect LOCAL_RANK in the environment; fall back
    # to the CLI value when it is not already set.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    return args
def main():
    """Entry point: build a Runner from the config and start training."""
    args = parse_args()

    # load config
    cfg = Config.fromfile(args.config)
    cfg.launcher = args.launcher
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        # NOTE(review): assumes the config path has at least one '/' segment
        # (e.g. 'configs/xxx.py'); a bare filename would raise IndexError.
        work_type = args.config.split('/')[1]
        cfg.work_dir = osp.join('./work_dirs', work_type,
                                osp.splitext(osp.basename(args.config))[0])

    # enable automatic-mixed-precision training
    if args.amp is True:
        # Only the default OptimWrapper (or an explicit AmpOptimWrapper) can
        # be switched to AMP; custom wrapper types are rejected.
        optim_wrapper = cfg.optim_wrapper.get('type', 'OptimWrapper')
        assert optim_wrapper in ['OptimWrapper', 'AmpOptimWrapper'], \
            '`--amp` is not supported custom optimizer wrapper type ' \
            f'`{optim_wrapper}.'
        cfg.optim_wrapper.type = 'AmpOptimWrapper'
        cfg.optim_wrapper.setdefault('loss_scale', 'dynamic')

    # resume training
    if args.resume == 'auto':
        # Auto-resume from the latest checkpoint in the work directory.
        cfg.resume = True
        cfg.load_from = None
    elif args.resume is not None:
        # Resume from the explicitly given checkpoint path.
        cfg.resume = True
        cfg.load_from = args.resume

    # build the runner from config
    runner = Runner.from_cfg(cfg)

    # start training
    runner.train()
# Script entry point guard.
if __name__ == '__main__':
    main()
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment