Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
mmpretrain
Commits
fb54db0f
Commit
fb54db0f
authored
Jun 24, 2025
by
limm
Browse files
add projects code
parent
1ac2e802
Pipeline
#2804
canceled with stages
Changes
66
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
470 additions
and
0 deletions
+470
-0
projects/maskfeat_video/models/maskfeat.py
projects/maskfeat_video/models/maskfeat.py
+59
-0
projects/maskfeat_video/models/maskfeat_mvit.py
projects/maskfeat_video/models/maskfeat_mvit.py
+146
-0
projects/maskfeat_video/models/transforms.py
projects/maskfeat_video/models/transforms.py
+130
-0
projects/maskfeat_video/tools/dist_train.sh
projects/maskfeat_video/tools/dist_train.sh
+19
-0
projects/maskfeat_video/tools/slurm_train.sh
projects/maskfeat_video/tools/slurm_train.sh
+23
-0
projects/maskfeat_video/tools/train.py
projects/maskfeat_video/tools/train.py
+93
-0
No files found.
projects/maskfeat_video/models/maskfeat.py
0 → 100644
View file @
fb54db0f
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
Dict
,
List
import
torch
import
torch.nn.functional
as
F
from
mmpretrain.models
import
BaseSelfSupervisor
from
mmpretrain.registry
import
MODELS
from
mmpretrain.structures
import
DataSample
@MODELS.register_module()
class VideoMaskFeat(BaseSelfSupervisor):
    """MaskFeat.

    Implementation of `Masked Feature Prediction for Self-Supervised Visual
    Pre-Training <https://arxiv.org/abs/2112.09133>`_.
    """

    def loss(self, inputs: List[torch.Tensor],
             data_samples: List[DataSample],
             **kwargs) -> Dict[str, torch.Tensor]:
        """The forward function in training.

        Args:
            inputs (List[torch.Tensor]): The input images.
            data_samples (List[DataSample]): All elements required
                during the forward function.

        Returns:
            Dict[str, torch.Tensor]: A dictionary of loss components.
        """
        # Collect the per-sample block masks into one batch tensor.
        mask = torch.stack(
            [data_sample.mask.value for data_sample in data_samples])
        mask = mask.to(torch.bool)

        video = inputs[0]
        # Fold any leading clip dimension into the batch dimension.
        video = video.view((-1, ) + video.shape[2:])  # B, C, T, H, W

        # Backbone consumes the video plus the mask (mask tokens are
        # substituted inside the backbone); neck maps latent features to
        # HOG predictions.
        latent = self.backbone(video, mask)
        B, L, C = latent[0].shape
        pred = self.neck([latent[0].view(B * L, C)])
        pred = pred[0].view(B, L, -1)

        # generate hog target
        # Subsample frames by the temporal patch stride so the HOG target
        # aligns with the backbone's temporal patch grid.
        video = video[:, :, ::self.backbone.patch_stride[0], :, :]
        video = video.transpose(1, 2)  # B, T, C, H, W
        # target_generator reads B and T to reshape its output per clip.
        self.target_generator.B = video.size(0)
        self.target_generator.T = video.size(1)
        video = video.flatten(0, 1)  # B*T, C, H, W
        hog = self.target_generator(video)

        # Resize the input-resolution mask to the backbone's output patch
        # resolution before computing the loss on masked locations only.
        mask = self._get_output_mask(mask)
        loss = self.head(pred, hog, mask)
        losses = dict(loss=loss)
        return losses

    def _get_output_mask(self, mask: torch.Tensor) -> torch.Tensor:
        """Resize the mask to the backbone's final output patch resolution.

        Args:
            mask (torch.Tensor): The boolean patch mask at input patch
                resolution.

        Returns:
            torch.Tensor: A float mask interpolated to the spatial size of
            the backbone's last output stage.
        """
        # out_patch_resolution is recorded by the backbone during forward;
        # take the spatial size of the last stage.
        size = self.backbone.out_patch_resolution[-1][-1]
        output_mask = F.interpolate(mask.float(), size=size)
        return output_mask
projects/maskfeat_video/models/maskfeat_mvit.py
0 → 100644
View file @
fb54db0f
# Copyright (c) OpenMMLab. All rights reserved.
from
typing
import
List
,
Optional
,
Sequence
,
Tuple
,
Union
import
torch
import
torch.nn
as
nn
import
torch.nn.functional
as
F
from
mmaction.models
import
MViT
from
mmaction.models.backbones.mvit
import
resize_pos_embed
from
mmpretrain.registry
import
MODELS
@MODELS.register_module()
class MaskFeatMViT(MViT):
    """MViT backbone for MaskFeat video pre-training.

    Extends ``mmaction``'s MViT with a learnable mask token: embeddings of
    masked patches are replaced by the mask token before the transformer
    blocks are applied.
    """

    # Architecture presets used by MaskFeat in addition to the parent
    # MViT presets selectable via ``arch``.
    arch_zoo = {
        'maskfeat-small': {
            'embed_dims': 96,
            'num_layers': 16,
            'num_heads': 1,
            'downscale_indices': [1, 3],
            'dim_mul_indices': [1, 3, 14]
        },
        'maskfeat-large': {
            'embed_dims': 144,
            'num_layers': 48,
            'num_heads': 2,
            'downscale_indices': [2, 8],
            'dim_mul_indices': [2, 8, 44]
        },
    }

    def __init__(self,
                 arch: str = 'base',
                 spatial_size: int = 224,
                 temporal_size: int = 16,
                 in_channels: int = 3,
                 out_scales: Union[int, Sequence[int]] = -1,
                 drop_path_rate: float = 0,
                 use_abs_pos_embed: bool = False,
                 interpolate_mode: str = 'trilinear',
                 pool_kernel: tuple = (3, 3, 3),
                 dim_mul: int = 2,
                 head_mul: int = 2,
                 adaptive_kv_stride: tuple = (1, 8, 8),
                 rel_pos_embed: bool = True,
                 residual_pooling: bool = True,
                 dim_mul_in_attention: bool = True,
                 with_cls_token: bool = True,
                 output_cls_token: bool = True,
                 rel_pos_zero_init: bool = False,
                 mlp_ratio: float = 4,
                 qkv_bias: bool = True,
                 norm_cfg: dict = dict(type='LN', eps=1e-6),
                 patch_cfg: dict = dict(
                     kernel_size=(3, 7, 7),
                     stride=(2, 4, 4),
                     padding=(1, 3, 3)),
                 # NOTE(review): mutable defaults below follow the OpenMMLab
                 # config convention; they are read-only here.
                 init_cfg: Optional[Union[dict, List[dict]]] = [
                     dict(
                         type='TruncNormal',
                         layer=['Conv2d', 'Conv3d'],
                         std=0.02),
                     dict(type='TruncNormal', layer='Linear', std=0.02,
                          bias=0.),
                     dict(type='Constant', layer='LayerNorm', val=1.,
                          bias=0.02),
                 ]) -> None:
        """Initialize the backbone; all arguments are forwarded to MViT."""
        super().__init__(
            arch=arch,
            spatial_size=spatial_size,
            temporal_size=temporal_size,
            in_channels=in_channels,
            out_scales=out_scales,
            drop_path_rate=drop_path_rate,
            use_abs_pos_embed=use_abs_pos_embed,
            interpolate_mode=interpolate_mode,
            pool_kernel=pool_kernel,
            dim_mul=dim_mul,
            head_mul=head_mul,
            adaptive_kv_stride=adaptive_kv_stride,
            rel_pos_embed=rel_pos_embed,
            residual_pooling=residual_pooling,
            dim_mul_in_attention=dim_mul_in_attention,
            with_cls_token=with_cls_token,
            output_cls_token=output_cls_token,
            rel_pos_zero_init=rel_pos_zero_init,
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            norm_cfg=norm_cfg,
            patch_cfg=patch_cfg,
            init_cfg=init_cfg)

        # Learnable token substituted for the embeddings of masked patches.
        self.mask_token = nn.Parameter(torch.zeros(1, 1, self.embed_dims))
        # Kept so callers (e.g. VideoMaskFeat.loss) can align targets with
        # the patch grid.
        self.patch_stride = patch_cfg['stride']

    def init_weights(self) -> None:
        """Initialize mask token and cls token."""
        super().init_weights()

        if (isinstance(self.init_cfg, dict)
                and self.init_cfg['type'] == 'Pretrained'):
            # Suppress default init if use pretrained model.
            return

        nn.init.trunc_normal_(self.cls_token, std=.02)
        nn.init.trunc_normal_(self.mask_token, std=.02)

    def forward(self, x: torch.Tensor,
                mask: torch.Tensor) -> Tuple[torch.Tensor]:
        """Forward with masking.

        Args:
            x (torch.Tensor): Input video tensor fed to the patch embedding.
            mask (torch.Tensor): Patch mask; resized here to the spatial
                patch resolution. Non-zero entries mark masked positions.

        Returns:
            Tuple[torch.Tensor]: Features of the selected output stages.
        """
        x, patch_resolution = self.patch_embed(x)
        B, L, C = x.shape
        T, H, W = patch_resolution

        # Blend mask tokens into the sequence: masked positions take the
        # mask token, unmasked positions keep their patch embedding.
        mask_tokens = self.mask_token.expand(B, L, -1)
        mask = F.interpolate(mask.float(), size=(H, W))
        mask = mask.flatten(1).unsqueeze(-1)
        x = x * (1 - mask) + mask_tokens * mask

        cls_tokens = self.cls_token.expand(B, -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        if self.use_abs_pos_embed:
            x = x + resize_pos_embed(
                self.pos_embed,
                self.patch_resolution,
                patch_resolution,
                mode=self.interpolate_mode,
                num_extra_tokens=self.num_extra_tokens)

        # if not self.with_cls_token:
        #     # Remove class token for transformer encoder input
        #     x = x[:, 1:]

        outs = []
        # Record per-stage patch resolutions so downstream code (e.g.
        # VideoMaskFeat._get_output_mask) can resize masks to match.
        self.out_patch_resolution = []
        for i, block in enumerate(self.blocks):
            x, patch_resolution = block(x, patch_resolution)

            if i in self.stage_indices:
                stage_index = self.stage_indices[i]
                if stage_index in self.out_scales:
                    self.out_patch_resolution.append(patch_resolution)
                    x = getattr(self, f'norm{stage_index}')(x)
                    if not self.output_cls_token:
                        out = x[:, 1:]
                    else:
                        out = x
                    outs.append(out)

        return tuple(outs)
projects/maskfeat_video/models/transforms.py
0 → 100644
View file @
fb54db0f
# Copyright (c) OpenMMLab. All rights reserved.
import
math
import
random
from
typing
import
Optional
,
Tuple
import
numpy
as
np
from
mmcv.transforms.base
import
BaseTransform
from
mmpretrain.registry
import
TRANSFORMS
@TRANSFORMS.register_module()
class MaskFeatMaskGenerator3D(BaseTransform):
    """Generate mask for video.

    Added Keys:

    - mask

    This module is borrowed from
    https://github.com/facebookresearch/SlowFast/blob/main/slowfast/datasets/transform.py

    Args:
        input_size (int): The size of input video.
        num_masking_patches (int): The number of patches to be masked.
        min_num_patches (int): The minimum number of patches to be masked
            in the process of generating mask. Defaults to 4.
        max_num_patches (int, optional): The maximum number of patches to be
            masked in the process of generating mask. Defaults to None.
        min_aspect (float): The minimum aspect ratio of mask blocks. Defaults
            to 0.3.
        max_aspect (float, optional): The maximum aspect ratio of mask blocks.
            Defaults to None.
    """

    def __init__(self,
                 input_size: int,
                 num_masking_patches: int,
                 min_num_patches: int = 4,
                 max_num_patches: Optional[int] = None,
                 min_aspect: float = 0.3,
                 max_aspect: Optional[float] = None) -> None:
        # input_size is unpacked as (temporal, height, width) patch counts.
        self.temporal, self.height, self.width = input_size

        self.num_masking_patches = num_masking_patches

        self.min_num_patches = min_num_patches
        # Cap a single block at the total budget when no explicit max given.
        self.max_num_patches = (
            num_masking_patches if max_num_patches is None
            else max_num_patches)

        # Aspect ratios are sampled log-uniformly in [min_aspect, max_aspect].
        max_aspect = max_aspect or 1 / min_aspect
        self.log_aspect_ratio = (math.log(min_aspect), math.log(max_aspect))

    def get_shape(self) -> Tuple[int, int, int]:
        """Get the shape of mask.

        Returns:
            Tuple[int, int, int]: The shape of mask.
        """
        return self.temporal, self.height, self.width

    def _mask(self, mask: np.ndarray, max_mask_patches: int) -> int:
        """Generate mask recursively.

        Args:
            mask (np.ndarray): The mask to be generated.
            max_mask_patches (int): The maximum number of patches to be masked.

        Returns:
            int: The number of patches masked.
        """
        delta = 0
        # Try at most 100 random blocks before giving up for this call.
        for _ in range(100):
            target_area = random.uniform(self.min_num_patches,
                                         self.max_num_patches)
            aspect_ratio = math.exp(random.uniform(*self.log_aspect_ratio))
            h = int(round(math.sqrt(target_area * aspect_ratio)))
            w = int(round(math.sqrt(target_area / aspect_ratio)))
            t = random.randint(1, self.temporal)  # !
            if w < self.width and h < self.height:
                top = random.randint(0, self.height - h)
                left = random.randint(0, self.width - w)
                front = random.randint(0, self.temporal - t)

                num_masked = mask[front:front + t, top:top + h,
                                  left:left + w].sum()
                # Overlap: accept only if the newly masked cells (block size
                # minus already-masked cells) fit in the remaining budget.
                if 0 < h * w * t - num_masked <= max_mask_patches:
                    for i in range(front, front + t):
                        for j in range(top, top + h):
                            for k in range(left, left + w):
                                if mask[i, j, k] == 0:
                                    mask[i, j, k] = 1
                                    delta += 1

                if delta > 0:
                    break
        return delta

    def transform(self, results: dict) -> dict:
        """Method to generate random block mask.

        Args:
            results (dict): Result dict from previous pipeline.

        Returns:
            dict: Result dict with added key ``mask``.
        """
        # FIX: ``np.int`` was removed in NumPy 1.24; the builtin ``int``
        # is the exact type the old alias referred to.
        mask = np.zeros(shape=self.get_shape(), dtype=int)
        mask_count = 0
        # Keep placing random blocks until the masking budget is reached or
        # no further block can be placed (_mask returns 0).
        while mask_count < self.num_masking_patches:
            max_mask_patches = self.num_masking_patches - mask_count
            delta = self._mask(mask, max_mask_patches)
            if delta == 0:
                break
            else:
                mask_count += delta

        results.update({'mask': mask})
        return results

    def __repr__(self) -> str:
        repr_str = self.__class__.__name__
        repr_str += f'(temporal={self.temporal}, '
        repr_str += f'height={self.height}, '
        repr_str += f'width={self.width}, '
        repr_str += f'num_masking_patches={self.num_masking_patches}, '
        repr_str += f'min_num_patches={self.min_num_patches}, '
        repr_str += f'max_num_patches={self.max_num_patches}, '
        repr_str += f'log_aspect_ratio={self.log_aspect_ratio})'
        return repr_str
projects/maskfeat_video/tools/dist_train.sh
0 → 100644
View file @
fb54db0f
#!/usr/bin/env bash
# Launch multi-GPU training via torch.distributed.launch.
# Usage: dist_train.sh CONFIG GPUS [extra args forwarded to train.py]
# Override NNODES / NODE_RANK / PORT / MASTER_ADDR via environment variables.

CONFIG=$1
GPUS=$2
NNODES=${NNODES:-1}
NODE_RANK=${NODE_RANK:-0}
PORT=${PORT:-29500}
MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}

# Prepend the project root (parent of this script's dir) to PYTHONPATH so
# the project's modules are importable by train.py.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
python -m torch.distributed.launch \
    --nnodes=$NNODES \
    --node_rank=$NODE_RANK \
    --master_addr=$MASTER_ADDR \
    --nproc_per_node=$GPUS \
    --master_port=$PORT \
    $(dirname "$0")/train.py \
    $CONFIG \
    --launcher pytorch ${@:3}
projects/maskfeat_video/tools/slurm_train.sh
0 → 100644
View file @
fb54db0f
#!/usr/bin/env bash
# Launch training on a Slurm cluster.
# Usage: slurm_train.sh PARTITION JOB_NAME CONFIG [extra args forwarded to train.py]
# Override GPUS / GPUS_PER_NODE / CPUS_PER_TASK / SRUN_ARGS via environment
# variables.

set -x

PARTITION=$1
JOB_NAME=$2
CONFIG=$3
GPUS=${GPUS:-8}
GPUS_PER_NODE=${GPUS_PER_NODE:-8}
CPUS_PER_TASK=${CPUS_PER_TASK:-5}
SRUN_ARGS=${SRUN_ARGS:-""}
PY_ARGS=${@:4}

# Prepend the project root (parent of this script's dir) to PYTHONPATH so
# the project's modules are importable by train.py.
PYTHONPATH="$(dirname $0)/..":$PYTHONPATH \
srun -p ${PARTITION} \
    --job-name=${JOB_NAME} \
    --gres=gpu:${GPUS_PER_NODE} \
    --ntasks=${GPUS} \
    --ntasks-per-node=${GPUS_PER_NODE} \
    --cpus-per-task=${CPUS_PER_TASK} \
    --kill-on-bad-exit=1 \
    ${SRUN_ARGS} \
    python -u tools/train.py ${CONFIG} --launcher="slurm" ${PY_ARGS}
projects/maskfeat_video/tools/train.py
0 → 100644
View file @
fb54db0f
# Copyright (c) OpenMMLab. All rights reserved.
import
argparse
import
os
import
os.path
as
osp
from
mmengine.config
import
Config
,
DictAction
from
mmengine.runner
import
Runner
def parse_args():
    """Parse command line arguments for training.

    Also propagates ``--local_rank`` into the ``LOCAL_RANK`` environment
    variable for launchers that read it from the environment.

    Returns:
        argparse.Namespace: The parsed arguments.
    """
    parser = argparse.ArgumentParser(description='Train a model')
    parser.add_argument('config', help='train config file path')
    parser.add_argument('--work-dir', help='the dir to save logs and models')
    parser.add_argument(
        '--resume',
        nargs='?',
        type=str,
        const='auto',
        # FIX: corrected "checkpint" typo in the user-facing help text.
        help='If specify checkpoint path, resume from it, while if not '
        'specify, try to auto resume from the latest checkpoint '
        'in the work directory.')
    parser.add_argument(
        '--amp',
        action='store_true',
        help='enable automatic-mixed-precision training')
    parser.add_argument(
        '--cfg-options',
        nargs='+',
        action=DictAction,
        help='override some settings in the used config, the key-value pair '
        'in xxx=yyy format will be merged into config file. If the value to '
        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
        'Note that the quotation marks are necessary and that no white space '
        'is allowed.')
    parser.add_argument(
        '--launcher',
        choices=['none', 'pytorch', 'slurm', 'mpi'],
        default='none',
        help='job launcher')
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()
    # Distributed launchers expect LOCAL_RANK in the environment; fall back
    # to the CLI value when it is not already set.
    if 'LOCAL_RANK' not in os.environ:
        os.environ['LOCAL_RANK'] = str(args.local_rank)

    return args
def main():
    """Entry point: build a Runner from the config and start training."""
    args = parse_args()

    # load config
    cfg = Config.fromfile(args.config)
    cfg.launcher = args.launcher
    if args.cfg_options is not None:
        cfg.merge_from_dict(args.cfg_options)

    # work_dir is determined in this priority: CLI > segment in file > filename
    if args.work_dir is not None:
        # update configs according to CLI args if args.work_dir is not None
        cfg.work_dir = args.work_dir
    elif cfg.get('work_dir', None) is None:
        # use config filename as default work_dir if cfg.work_dir is None
        # NOTE(review): assumes the config path has at least one '/' segment
        # (e.g. 'configs/xxx.py'); a bare filename would raise IndexError.
        work_type = args.config.split('/')[1]
        cfg.work_dir = osp.join('./work_dirs', work_type,
                                osp.splitext(osp.basename(args.config))[0])

    # enable automatic-mixed-precision training
    if args.amp is True:
        # Only the default OptimWrapper (or an explicit AmpOptimWrapper) can
        # be switched to AMP; custom wrapper types are rejected.
        optim_wrapper = cfg.optim_wrapper.get('type', 'OptimWrapper')
        assert optim_wrapper in ['OptimWrapper', 'AmpOptimWrapper'], \
            '`--amp` is not supported custom optimizer wrapper type ' \
            f'`{optim_wrapper}.'
        cfg.optim_wrapper.type = 'AmpOptimWrapper'
        cfg.optim_wrapper.setdefault('loss_scale', 'dynamic')

    # resume training
    if args.resume == 'auto':
        # Auto-resume from the latest checkpoint in the work directory.
        cfg.resume = True
        cfg.load_from = None
    elif args.resume is not None:
        # Resume from the explicitly given checkpoint path.
        cfg.resume = True
        cfg.load_from = args.resume

    # build the runner from config
    runner = Runner.from_cfg(cfg)

    # start training
    runner.train()
# Script entry point guard.
if __name__ == '__main__':
    main()
Prev
1
2
3
4
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment