Commit 1345fab2 authored by luopl's avatar luopl
Browse files

Initial commit

parents
Pipeline #1263 canceled with stages
This source diff could not be displayed because it is too large. You can view the blob instead.
import argparse
import os
from functools import partial
from test import create_test_data_loader
from typing import Dict, List, Tuple
import accelerate
import cv2
import numpy as np
import torch
import torch.utils.data as data
from accelerate import Accelerator
from PIL import Image
from tqdm import tqdm
from util.lazy_load import Config
from util.logger import setup_logger
from util.utils import load_checkpoint, load_state_dict
from util.visualize import plot_bounding_boxes_on_image_cv2
def is_image(file_path):
    """Return True if *file_path* can be opened as an image by PIL, else False.

    Any failure (missing file, unsupported or corrupt format) is treated as
    "not an image" — this is used only to filter a directory listing.
    """
    try:
        # `with` guarantees the file handle is closed even when decoding the
        # header fails midway; the original open()/close() pair leaked on error.
        with Image.open(file_path):
            return True
    except Exception:
        # A bare `except:` (original code) would also swallow
        # KeyboardInterrupt/SystemExit; `Exception` keeps the best-effort
        # behavior without masking interpreter-level signals.
        return False
def parse_args():
    """Define and parse the command-line interface of the inference script."""
    cli = argparse.ArgumentParser(description="Inference a detector")
    # dataset: where to read images and how many loader workers to use
    cli.add_argument("--image-dir", type=str, required=True)
    cli.add_argument("--workers", type=int, default=0)
    # model definition and weights
    cli.add_argument("--model-config", type=str, required=True)
    cli.add_argument("--checkpoint", type=str, required=True)
    # visualization output directory and confidence threshold
    cli.add_argument("--show-dir", type=str, default=None)
    cli.add_argument("--show-conf", type=float, default=0.5)
    # drawing options forwarded to the plotting helper
    cli.add_argument("--font-scale", type=float, default=1.0)
    cli.add_argument("--box-thick", type=int, default=1)
    cli.add_argument("--fill-alpha", type=float, default=0.2)
    cli.add_argument("--text-box-color", type=int, nargs="+", default=(255, 255, 255))
    cli.add_argument("--text-font-color", type=int, nargs="+", default=None)
    cli.add_argument("--text-alpha", type=float, default=1.0)
    # engine
    cli.add_argument("--seed", type=int, default=42)
    return cli.parse_args()
class InferenceDataset(data.Dataset):
    """Dataset yielding one raw RGB CHW uint8 tensor per image file in a directory."""

    def __init__(self, root):
        # list the directory once and keep only entries PIL can open as images
        candidates = (os.path.join(root, name) for name in os.listdir(root))
        self.images = [path for path in candidates if is_image(path)]
        assert len(self.images) > 0, "No images found"

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        # disable OpenCV threading/OpenCL inside DataLoader workers to avoid
        # oversubscription and fork-related issues
        cv2.setNumThreads(0)
        cv2.ocl.setUseOpenCL(False)
        # np.fromfile + imdecode handles paths that cv2.imread cannot (e.g. non-ASCII)
        raw = np.fromfile(self.images[index], dtype=np.uint8)
        decoded = cv2.imdecode(raw, -1)
        # OpenCV decodes as BGR HWC; convert to RGB and move channels first
        chw = cv2.cvtColor(decoded, cv2.COLOR_BGR2RGB).transpose(2, 0, 1)
        return torch.tensor(chw)
def inference():
    """Run detector inference over a directory of images.

    Loads the model from --model-config/--checkpoint, predicts on every image
    under --image-dir, and (when --show-dir is given) writes visualization
    images as a side effect of a second DataLoader pass.
    """
    args = parse_args()
    # set fixed seed and deterministic_algorithms
    accelerator = Accelerator()
    accelerate.utils.set_seed(args.seed, device_specific=False)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    # deterministic in low version pytorch leads to RuntimeError
    # torch.use_deterministic_algorithms(True, warn_only=True)
    # setup logger
    for logger_name in ["py.warnings", "accelerate", os.path.basename(os.getcwd())]:
        setup_logger(distributed_rank=accelerator.local_process_index, name=logger_name)
    dataset = InferenceDataset(args.image_dir)
    data_loader = create_test_data_loader(
        dataset, accelerator=accelerator, batch_size=1, num_workers=args.workers
    )
    # get inference results from model output
    model = Config(args.model_config).model.eval()
    checkpoint = load_checkpoint(args.checkpoint)
    # training checkpoints wrap the weights under a "model" key; unwrap if so
    if isinstance(checkpoint, Dict) and "model" in checkpoint:
        checkpoint = checkpoint["model"]
    load_state_dict(model, checkpoint)
    model = accelerator.prepare_model(model)
    with torch.inference_mode():
        predictions = []
        for index, images in enumerate(tqdm(data_loader)):
            prediction = model(images)[0]
            # change torch.Tensor to CPU
            for key in prediction:
                prediction[key] = prediction[key].to("cpu", non_blocking=True)
            # NOTE(review): indexing dataset.images by batch index assumes
            # batch_size=1 and no shuffling, which matches the loader above
            image_name = data_loader.dataset.images[index]
            image = images[0].to("cpu", non_blocking=True)
            prediction = {"image_name": image_name, "image": image, "output": prediction}
            predictions.append(prediction)
    # save visualization results
    if args.show_dir:
        os.makedirs(args.show_dir, exist_ok=True)
        # create a dummy dataset for visualization with multi-workers
        data_loader = create_test_data_loader(
            predictions, accelerator=accelerator, batch_size=1, num_workers=args.workers
        )
        data_loader.collate_fn = partial(_visualize_batch_for_infer, classes=model.CLASSES, **vars(args))
        # iterating the loader triggers the collate_fn, whose side effect is
        # writing the visualization images; the list itself is discarded
        [None for _ in tqdm(data_loader)]
def _visualize_batch_for_infer(
    batch: Tuple[Dict],
    classes: List[str],
    show_conf: float = 0.0,
    show_dir: str = None,
    font_scale: float = 1.0,
    box_thick: int = 3,
    fill_alpha: float = 0.2,
    text_box_color: Tuple[int] = (255, 255, 255),
    text_font_color: Tuple[int] = None,
    text_alpha: float = 0.5,
    **kwargs,  # Not useful
):
    """Collate hook that renders one prediction and saves it under *show_dir*.

    Installed as a DataLoader collate_fn in `inference` so drawing can happen
    in loader worker processes; *batch* is a 1-tuple holding one prediction
    dict with keys image_name / image / output.
    """
    # relies on dict insertion order (image_name, image, output) — matches
    # how `inference` builds each prediction dict
    image_name, image, output = batch[0].values()
    # plot bounding boxes on image
    image = image.numpy().transpose(1, 2, 0)  # CHW tensor -> HWC numpy array
    image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)  # back to OpenCV's BGR order
    image = plot_bounding_boxes_on_image_cv2(
        image=image,
        boxes=output["boxes"],
        labels=output["labels"],
        scores=output.get("scores", None),
        classes=classes,
        show_conf=show_conf,
        font_scale=font_scale,
        box_thick=box_thick,
        fill_alpha=fill_alpha,
        text_box_color=text_box_color,
        text_font_color=text_font_color,
        text_alpha=text_alpha,
    )
    # save next to the original basename inside show_dir
    cv2.imwrite(os.path.join(show_dir, os.path.basename(image_name)), image)
# script entry point: run inference when executed directly
if __name__ == "__main__":
    inference()
import argparse
import datetime
import os
import pprint
import re
import time
import accelerate
import torch
from accelerate import Accelerator, DistributedDataParallelKwargs
from accelerate.logging import get_logger
from accelerate.tracking import TensorBoardTracker
from accelerate.utils import ProjectConfiguration
from torch.utils import data
from util.collate_fn import collate_fn
from util.engine import evaluate_acc, train_one_epoch_acc
from util.group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups
from util.lazy_load import Config
from util.misc import default_setup, encode_labels, fixed_generator, seed_worker
from util.utils import HighestCheckpoint, load_checkpoint, load_state_dict
def parse_args():
    """Define and parse the command-line interface of the training script."""
    parser = argparse.ArgumentParser(description="Train a detector")
    parser.add_argument("--config-file", default="configs/train_config.py")
    parser.add_argument(
        "--mixed-precision",
        type=str,
        default=None,
        choices=["no", "fp16", "bf16", "fp8"],
        # the fragments are joined with explicit trailing spaces; the original
        # implicit concatenation rendered as "Choosebetween ... 1.10.and"
        help="Whether to use mixed precision. Choose "
        "between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >= 1.10. "
        "and an Nvidia Ampere GPU.",
    )
    parser.add_argument(
        "--accumulate-steps", type=int, default=1, help="Steps to accumulate gradients"
    )
    parser.add_argument("--seed", type=int, help="Random seed")
    parser.add_argument("--use-deterministic-algorithms", action="store_true")
    # backends accepted by accelerate's torch.compile integration
    dynamo_backend = ["no", "eager", "aot_eager", "inductor", "aot_ts_nvfuser", "nvprims_nvfuser"]
    dynamo_backend += ["cudagraphs", "ofi", "fx2trt", "onnxrt", "tensorrt", "ipex", "tvm"]
    parser.add_argument(
        "--dynamo-backend",
        type=str,
        default="no",
        choices=dynamo_backend,
        help="""
        Set to one of the possible dynamo backends to optimize the training with torch dynamo.
        See https://pytorch.org/docs/stable/torch.compiler.html and
        https://huggingface.co/docs/accelerate/main/en/package_reference/utilities#accelerate.utils.DynamoBackend
        """,
    )
    args = parser.parse_args()
    return args
def train():
    """End-to-end training loop driven by a lazy config file.

    Resolves the output/resume directories, builds data loaders, model,
    optimizer and scheduler from the config, wraps everything with
    HuggingFace Accelerate, then trains and evaluates for cfg.num_epochs.
    """
    args = parse_args()
    cfg = Config(args.config_file, partials=("lr_scheduler", "optimizer", "param_dicts"))
    # modify output directory
    if getattr(cfg, "output_dir", None) is None:
        if hasattr(cfg, "resume_from_checkpoint") and os.path.isdir(str(cfg.resume_from_checkpoint)):
            # default path: xxxx-xx-xx-yy_yy_yy/checkpoints/{checkpoint_1}
            if "checkpoints" in os.listdir(cfg.resume_from_checkpoint):
                # if given output_dir, find the newest checkpoint under checkpoints directory
                output_dir = os.path.join(cfg.resume_from_checkpoint, "checkpoints")
                folders = [os.path.join(output_dir, folder) for folder in os.listdir(output_dir)]
                # sort by the trailing integer in each checkpoint folder name
                folders.sort(
                    key=lambda folder:
                    list(map(int, re.findall(r"[\/]?([0-9]+)(?=[^\/]*$)", folder)))[0]
                )
                cfg.resume_from_checkpoint = folders[-1]
            # .../output_dir/checkpoints/checkpoint_N -> output_dir
            if "checkpoints" in os.path.dirname(cfg.resume_from_checkpoint):
                cfg.output_dir = os.path.dirname(os.path.dirname(cfg.resume_from_checkpoint))
        else:
            # make sure all processes have same output directory
            accelerate.utils.wait_for_everyone()
            cfg.output_dir = os.path.join(
                "checkpoints",
                os.path.basename(cfg.model_path).split(".")[0],
                "train",
                datetime.datetime.now().strftime("%Y-%m-%d-%H_%M_%S"),
            )
    # Initialize accelerator
    project_config = ProjectConfiguration(
        project_dir=cfg.output_dir, total_limit=5, automatic_checkpoint_naming=True
    )
    tensorboard_tracker = TensorBoardTracker(run_name="tf_log", logging_dir=cfg.output_dir)
    kwargs = DistributedDataParallelKwargs(find_unused_parameters=cfg.find_unused_parameters)
    accelerator = Accelerator(
        log_with=tensorboard_tracker,
        project_config=project_config,
        mixed_precision=args.mixed_precision,
        gradient_accumulation_steps=args.accumulate_steps,
        dynamo_backend=args.dynamo_backend,
        step_scheduler_with_optimizer=False,
        kwargs_handlers=[kwargs],
    )
    accelerator.init_trackers("det_train")
    default_setup(args, cfg, accelerator)
    # instantiate dataset
    params = dict(num_workers=cfg.num_workers, collate_fn=collate_fn)
    params.update(dict(pin_memory=cfg.pin_memory, persistent_workers=True))
    if args.use_deterministic_algorithms:
        # set using deterministic algorithms
        torch.backends.cudnn.benchmark = False
        torch.use_deterministic_algorithms(True, warn_only=True)
        params.update({"worker_init_fn": seed_worker, "generator": fixed_generator()})
    # we use group_based sampler, which increases training speed slightly
    group_ids = create_aspect_ratio_groups(cfg.train_dataset, k=3)
    train_batch_sampler = GroupedBatchSampler(
        data.RandomSampler(cfg.train_dataset), group_ids, cfg.batch_size
    )
    train_loader = data.DataLoader(cfg.train_dataset, batch_sampler=train_batch_sampler, **params)
    test_loader = data.DataLoader(cfg.test_dataset, 1, shuffle=False, **params)
    # instantiate model, optimizer and lr_scheduler
    model = Config(cfg.model_path).model
    if accelerator.use_distributed:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
    optimizer = cfg.optimizer(cfg.param_dicts(model))
    lr_scheduler = cfg.lr_scheduler(optimizer)
    # register dataset class information into the model, useful for inference
    cat_ids = list(range(max(cfg.train_dataset.coco.cats.keys()) + 1))
    classes = tuple(cfg.train_dataset.coco.cats.get(c, {"name": "none"})["name"] for c in cat_ids)
    model.register_buffer("_classes_", torch.tensor(encode_labels(classes)))
    # log the configurations
    logger = get_logger(os.path.basename(os.getcwd()) + "." + __name__)
    # prepare for distributed training
    model, optimizer, train_loader, test_loader, lr_scheduler = accelerator.prepare(
        model, optimizer, train_loader, test_loader, lr_scheduler
    )
    if getattr(cfg, "resume_from_checkpoint", None) is not None:
        if os.path.isdir(str(cfg.resume_from_checkpoint)):
            # a directory means a full Accelerate state (model + optimizer + epoch)
            accelerator.load_state(cfg.resume_from_checkpoint)
            path = os.path.basename(cfg.resume_from_checkpoint)
            # checkpoint folders are named ..._{epoch}; resume from the next epoch
            cfg.starting_epoch = int(path.split("_")[-1]) + 1
            accelerator.project_configuration.iteration = cfg.starting_epoch
            logger.info(f"resume training of {cfg.output_dir}, from {path}")
        elif os.path.isfile(str(cfg.resume_from_checkpoint)):
            # a file means pretrained weights only
            checkpoint = load_checkpoint(cfg.resume_from_checkpoint)
            checkpoint = checkpoint["model"] if "model" in checkpoint else checkpoint
            load_state_dict(accelerator.unwrap_model(model), checkpoint)
            # overwrite _classes_ in checkpoint with current datasets categories
            model.register_buffer("_classes_", torch.tensor(encode_labels(classes)))
            logger.info(
                f"load pretrained from {cfg.resume_from_checkpoint}, output_dir is {cfg.output_dir}"
            )
        else:
            # NOTE(review): logger.warn is deprecated; prefer logger.warning
            logger.warn("resume_from_checkpoint is not a path or a file, skip loading")
    else:
        n_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        logger.info("model parameters: {}".format(n_params))
    logger.info("optimizer: {}".format(optimizer))
    logger.info("lr_scheduler: {}".format(pprint.pformat(lr_scheduler.state_dict())))
    # save dataset name, useful for inference
    if accelerator.is_main_process:
        label_file = os.path.join(cfg.output_dir, "label_names.txt")
        with open(label_file, "w") as f:
            caid_name = [f"{k} {v['name']}" for k, v in cfg.train_dataset.coco.cats.items()]
            caid_name = "\n".join(caid_name)
            f.write(caid_name)
        logger.info(f"Label names is saved to {label_file}")
    logger.info("Start training")
    start_time = time.perf_counter()
    highest_checkpoint = HighestCheckpoint(accelerator, model)
    for epoch in range(cfg.starting_epoch, cfg.num_epochs):
        train_one_epoch_acc(
            model=model,
            optimizer=optimizer,
            data_loader=train_loader,
            epoch=epoch,
            print_freq=cfg.print_freq,
            max_grad_norm=cfg.max_norm,
            accelerator=accelerator,
        )
        lr_scheduler.step()
        # we save model and labels together
        accelerator.save_state(safe_serialization=False)
        logger.info("Start evaluation")
        coco_evaluator = evaluate_acc(model, test_loader, epoch, accelerator)
        # save best results
        cur_ap, cur_ap50 = coco_evaluator.coco_eval["bbox"].stats[:2]
        highest_checkpoint.update(ap=cur_ap, ap50=cur_ap50)
    total_time = time.perf_counter() - start_time
    total_time = str(datetime.timedelta(seconds=int(total_time)))
    logger.info("Training time: {}".format(total_time))
    accelerator.end_training()


# script entry point: run training when executed directly
if __name__ == "__main__":
    train()
# 模型唯一标识
modelCode=730
# 模型名称
modelName=salience_detr_pytorch
# 模型描述
modelDescription=Salience_DETR:用层次显著性滤波细化增强检测变换器的推理和训练
# 应用场景
appScenario=训练,推理,科研,制造,医疗,家居,教育
# 框架类型
frameType=Pytorch
import inspect
import logging
import os
from typing import Dict
from omegaconf import DictConfig
from torch import nn
from util.utils import load_state_dict as _load_state_dict
class BaseBackbone:
    """Shared helpers for backbone factory classes.

    Subclasses provide a ``model_arch`` mapping of architecture names to
    configs; this base supplies weight loading, module freezing, and the
    default/preset/override parameter-merging logic.
    """

    @staticmethod
    def load_state_dict(model: nn.Module, state_dict: Dict):
        """Load *state_dict* into *model*; a ``None`` state_dict is a no-op."""
        if state_dict is None:
            return
        assert isinstance(state_dict, Dict), "state_dict must be OrderedDict."
        _load_state_dict(model, state_dict)

    @staticmethod
    def freeze_module(module: nn.Module):
        """Switch *module* to eval mode and disable gradients on all its parameters."""
        module.eval()
        for param in module.parameters():
            param.requires_grad = False

    def get_instantiate_config(self, func_name, arch, extra_params):
        """Build the instantiation config for *arch*, merging parameter sources.

        For every parameter of *func_name*'s signature, the value is chosen
        with precedence: extra_params > arch preset > signature default, and
        written back into the arch config (plain Dict or omegaconf DictConfig).
        """
        # log some necessary information about backbone
        logger = logging.getLogger(os.path.basename(os.getcwd()) + "." + __name__)
        assert arch is None or arch in self.model_arch, \
            f"Expected architecture in {self.model_arch.keys()} but got {arch}"
        logger.info(f"Backbone architecture: {arch}")
        # start from the arch preset (empty when no arch is requested)
        model_config = {} if arch is None else self.model_arch[arch]
        for name, param in inspect.signature(func_name).parameters.items():
            # signature default is the lowest-priority fallback
            chosen = None if param.default is inspect.Parameter.empty else param.default
            if isinstance(model_config, Dict):
                preset = model_config.get(name, None)
            elif isinstance(model_config, DictConfig):
                preset = getattr(model_config, name, None)
            else:
                preset = None
            # arch preset overrides the default; explicit extras override both
            if preset is not None:
                chosen = preset
            override = extra_params.get(name, None)
            if override is not None:
                chosen = override
            # write the winning value back into the config container
            if isinstance(model_config, Dict):
                model_config[name] = chosen
            elif isinstance(model_config, DictConfig):
                setattr(model_config, name, chosen)
            else:
                raise TypeError("Only Dict and DictConfig supported.")
        return model_config
from functools import partial
from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union
import torch
from torch import Tensor, nn
from torch.nn import functional as F
from torchvision.models.feature_extraction import create_feature_extractor
from torchvision.ops.stochastic_depth import StochasticDepth
from models.backbones.base_backbone import BaseBackbone
from models.bricks.misc import Conv2dNormActivation, Permute
from util.lazy_load import LazyCall as L
from util.lazy_load import instantiate
from util.utils import load_checkpoint
class LayerNorm2d(nn.LayerNorm):
    """LayerNorm applied over the channel dimension of NCHW feature maps."""

    def forward(self, x: Tensor) -> Tensor:
        # NCHW -> NHWC so the trailing dimension matches normalized_shape
        nhwc = x.permute(0, 2, 3, 1)
        normed = F.layer_norm(nhwc, self.normalized_shape, self.weight, self.bias, self.eps)
        # restore the original NCHW layout
        return normed.permute(0, 3, 1, 2)
class CNBlock(nn.Module):
    """ConvNeXt residual block.

    Depthwise 7x7 conv -> LayerNorm -> pointwise MLP (4x expansion, GELU),
    scaled by a learnable per-channel factor and regularized with stochastic
    depth before the residual addition.
    """

    def __init__(
        self,
        dim,
        layer_scale: float,
        stochastic_depth_prob: float,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        if norm_layer is None:
            norm_layer = partial(nn.LayerNorm, eps=1e-6)
        self.block = nn.Sequential(
            # depthwise convolution (groups=dim keeps channels independent)
            nn.Conv2d(dim, dim, kernel_size=7, padding=3, groups=dim, bias=True),
            # NCHW -> NHWC so LayerNorm/Linear operate on the channel dim
            Permute([0, 2, 3, 1]),
            norm_layer(dim),
            nn.Linear(in_features=dim, out_features=4 * dim, bias=True),
            nn.GELU(),
            nn.Linear(in_features=4 * dim, out_features=dim, bias=True),
            # NHWC -> NCHW
            Permute([0, 3, 1, 2]),
        )
        # learnable per-channel scaling of the residual branch
        self.layer_scale = nn.Parameter(torch.ones(dim, 1, 1) * layer_scale)
        self.stochastic_depth = StochasticDepth(stochastic_depth_prob, "row")

    def forward(self, input: Tensor) -> Tensor:
        result = self.layer_scale * self.block(input)
        # randomly drops the whole branch per sample during training only
        result = self.stochastic_depth(result)
        result += input
        return result
class CNBlockConfig:
    """Per-stage ConvNeXt configuration (Section 3 of the ConvNeXt paper).

    ``out_channels`` of ``None`` marks the final stage, which has no
    downsampling layer after it.
    """

    def __init__(
        self,
        input_channels: int,
        out_channels: Optional[int],
        num_layers: int,
    ) -> None:
        self.input_channels = input_channels
        self.out_channels = out_channels
        self.num_layers = num_layers

    def __repr__(self) -> str:
        fields = (
            "input_channels={input_channels}",
            ", out_channels={out_channels}",
            ", num_layers={num_layers}",
        )
        template = self.__class__.__name__ + "(" + "".join(fields) + ")"
        return template.format(**self.__dict__)
class ConvNeXt(nn.Module):
    """ConvNeXt classification network built from per-stage CNBlockConfig entries.

    Structure: 4x4/stride-4 patchify stem, then for each config entry a stack
    of ``num_layers`` blocks followed (when out_channels is set) by a
    LayerNorm + 2x2/stride-2 downsampling conv, and a norm+linear classifier.
    """

    def __init__(
        self,
        block_setting: List[CNBlockConfig],
        stochastic_depth_prob: float = 0.0,
        layer_scale: float = 1e-6,
        num_classes: int = 1000,
        block: Optional[Callable[..., nn.Module]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        **kwargs: Any,  # absorbs unused config keys; not forwarded anywhere
    ) -> None:
        super().__init__()
        if not block_setting:
            raise ValueError("The block_setting should not be empty")
        elif not (
            isinstance(block_setting, Sequence) and all([isinstance(s, CNBlockConfig) for s in block_setting])
        ):
            raise TypeError("The block_setting should be List[CNBlockConfig]")
        if block is None:
            block = CNBlock
        if norm_layer is None:
            norm_layer = partial(LayerNorm2d, eps=1e-6)
        layers: List[nn.Module] = []
        # Stem: patchify with a 4x4 conv of stride 4
        firstconv_output_channels = block_setting[0].input_channels
        layers.append(
            Conv2dNormActivation(
                3,
                firstconv_output_channels,
                kernel_size=4,
                stride=4,
                padding=0,
                norm_layer=norm_layer,
                activation_layer=None,
                bias=True,
            )
        )
        total_stage_blocks = sum(cnf.num_layers for cnf in block_setting)
        stage_block_id = 0
        for cnf in block_setting:
            # Bottlenecks
            stage: List[nn.Module] = []
            for _ in range(cnf.num_layers):
                # adjust stochastic depth probability based on the depth of the stage block
                sd_prob = stochastic_depth_prob * stage_block_id / (total_stage_blocks - 1.0)
                stage.append(block(cnf.input_channels, layer_scale, sd_prob))
                stage_block_id += 1
            layers.append(nn.Sequential(*stage))
            if cnf.out_channels is not None:
                # Downsampling between stages: norm then 2x2 stride-2 conv
                layers.append(
                    nn.Sequential(
                        norm_layer(cnf.input_channels),
                        nn.Conv2d(cnf.input_channels, cnf.out_channels, kernel_size=2, stride=2),
                    )
                )
        self.features = nn.Sequential(*layers)
        self.avgpool = nn.AdaptiveAvgPool2d(1)
        lastblock = block_setting[-1]
        # the final stage may have no downsampling, so its width is input_channels
        lastconv_output_channels = (
            lastblock.out_channels if lastblock.out_channels is not None else lastblock.input_channels
        )
        self.classifier = nn.Sequential(
            norm_layer(lastconv_output_channels), nn.Flatten(1),
            nn.Linear(lastconv_output_channels, num_classes)
        )
        # truncated-normal init for all conv/linear weights, zero biases
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
                nn.init.trunc_normal_(m.weight, std=0.02)
                if m.bias is not None:
                    nn.init.zeros_(m.bias)

    def _forward_impl(self, x: Tensor) -> Tensor:
        x = self.features(x)
        x = self.avgpool(x)
        x = self.classifier(x)
        return x

    def forward(self, x: Tensor) -> Tensor:
        return self._forward_impl(x)
class ConvNeXtBackbone(BaseBackbone):
    """Factory producing a ConvNeXt feature extractor for detection.

    NOTE(review): ``__new__`` is used as a factory — it returns a torchvision
    feature extractor, not a ConvNeXtBackbone instance, and its first
    parameter is named ``self`` where ``cls`` would be conventional.
    """
    # yapf: disable
    model_weights = {
        # The following weights are from torchvision
        "conv_t": "https://download.pytorch.org/models/convnext_tiny-983f1562.pth",
        "conv_s": "https://download.pytorch.org/models/convnext_small-0c510722.pth",
        "conv_b": "https://download.pytorch.org/models/convnext_base-6075fbad.pth",
        "conv_l": "https://download.pytorch.org/models/convnext_large-ea097f82.pth",
    }
    # lazy configs per architecture; the extra "url" key is popped in __new__
    model_arch = {
        "conv_t": L(ConvNeXt)(
            block_setting=[
                CNBlockConfig(96, 192, 3),
                CNBlockConfig(192, 384, 3),
                CNBlockConfig(384, 768, 9),
                CNBlockConfig(768, None, 3),
            ],
            stochastic_depth_prob=0.1,
            url=model_weights["conv_t"],
        ),
        "conv_s": L(ConvNeXt)(
            block_setting=[
                CNBlockConfig(96, 192, 3),
                CNBlockConfig(192, 384, 3),
                CNBlockConfig(384, 768, 27),
                CNBlockConfig(768, None, 3),
            ],
            stochastic_depth_prob=0.4,
            url=model_weights["conv_s"],
        ),
        "conv_b": L(ConvNeXt)(
            block_setting=[
                CNBlockConfig(128, 256, 3),
                CNBlockConfig(256, 512, 3),
                CNBlockConfig(512, 1024, 27),
                CNBlockConfig(1024, None, 3),
            ],
            stochastic_depth_prob=0.5,
            url=model_weights["conv_b"],
        ),
        "conv_l": L(ConvNeXt)(
            block_setting=[
                CNBlockConfig(192, 384, 3),
                CNBlockConfig(384, 768, 3),
                CNBlockConfig(768, 1536, 27),
                CNBlockConfig(1536, None, 3),
            ],
            stochastic_depth_prob=0.5,
            url=model_weights["conv_l"],
        )
    }
    # yapf: enable

    def __new__(
        self,
        arch: str,
        weights: Union[str, Dict] = None,
        return_indices: Tuple[int] = (0, 1, 2, 3),
        freeze_indices: Tuple = (),
        **kwargs,
    ):
        """Build a ConvNeXt, load weights, freeze stages, and wrap it as a
        multi-scale feature extractor returning the stages in *return_indices*.
        """
        # get parameters and instantiate backbone
        model_config = self.get_instantiate_config(self, ConvNeXt, arch, kwargs)
        default_weight = model_config.pop("url", None)
        convnext = instantiate(model_config)
        # load state dict (checkpoints may wrap weights under a "model" key)
        weights = load_checkpoint(default_weight if weights is None else weights)
        if isinstance(weights, Dict):
            weights = weights["model"] if "model" in weights else weights
        self.load_state_dict(convnext, weights)
        # freeze stages
        self._freeze_stages(self, convnext, freeze_indices)
        # create feature extractor; in self.features, stage blocks sit at odd
        # indices (2*idx+1) interleaved with stem/downsample layers
        return_layers = [f"features.{2 * idx + 1}" for idx in return_indices]
        convnext = create_feature_extractor(convnext, return_layers)
        convnext.num_channels = [model_config.block_setting[i].input_channels for i in return_indices]
        return convnext

    def _freeze_stages(self, model: nn.Module, freeze_indices: Tuple[int]):
        """Freeze the stem plus the stages (and their downsample layers) in *freeze_indices*."""
        # freeze stem
        if len(freeze_indices) > 0:
            self.freeze_module(model.features[0])
        for idx in freeze_indices:
            # freeze layers
            self.freeze_module(model.features[2 * idx + 1])
            # freeze downsample layers
            if 2 * idx + 2 < len(model.features):
                self.freeze_module(model.features[2 * idx + 2])
This diff is collapsed.
from typing import Callable, Dict, List, Optional, Tuple, Type, Union
import torch
from torch import Tensor, nn
from torchvision.models.feature_extraction import create_feature_extractor
from models.bricks.misc import FrozenBatchNorm2d
from models.backbones.base_backbone import BaseBackbone
from models.bricks.deform_conv2d_pack import DeformConv2dPack
from util.lazy_load import LazyCall as L
from util.lazy_load import instantiate
from util.utils import load_checkpoint
def conv3x3(
    in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1
) -> nn.Conv2d:
    """3x3 convolution with padding"""
    conv_kwargs = dict(
        kernel_size=3,
        stride=stride,
        padding=dilation,  # padding == dilation keeps spatial size for stride 1
        groups=groups,
        dilation=dilation,
        bias=False,  # a norm layer always follows, making the bias redundant
    )
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)
def conv3x3_dcn(
    in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1
) -> DeformConv2dPack:
    """3x3 deformable convolution with padding.

    Mirrors :func:`conv3x3` but uses DeformConv2dPack, which additionally
    learns per-position sampling offsets.
    """
    return DeformConv2dPack(
        in_planes,
        out_planes,
        kernel_size=3,
        stride=stride,
        padding=dilation,  # same padding rule as conv3x3
        groups=groups,
        bias=False,
        dilation=dilation,
    )
def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
    """1x1 convolution"""
    # pointwise channel projection; bias omitted because a norm layer follows
    return nn.Conv2d(in_planes, out_planes, stride=stride, kernel_size=1, bias=False)
class BasicBlock(nn.Module):
    """ResNet basic residual block: two 3x3 convs with a skip connection.

    When ``with_dcn`` is set, the second conv is replaced by a deformable
    convolution. Used by ResNet-18/34 style networks.
    """
    # output channels = planes * expansion (no expansion for BasicBlock)
    expansion: int = 1

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
        groups: int = 1,
        base_width: int = 64,
        dilation: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        with_dcn: bool = False,
    ) -> None:
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError("BasicBlock only supports groups=1 and base_width=64")
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        if with_dcn:
            self.conv2 = conv3x3_dcn(planes, planes)
        else:
            self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        # project the identity when spatial size or channels changed
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        out = self.relu(out)
        return out
class Bottleneck(nn.Module):
    """ResNet bottleneck block: 1x1 reduce -> 3x3 -> 1x1 expand, with skip.

    When ``with_dcn`` is set, the 3x3 conv is replaced by a deformable
    convolution.
    """
    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
    # according to "Deep residual learning for image recognition" https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
    expansion: int = 4

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
        groups: int = 1,
        base_width: int = 64,
        dilation: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        with_dcn: bool = False,
    ) -> None:
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        # inner width scales with base_width and groups (ResNeXt/wide variants)
        width = int(planes * (base_width / 64.0)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        if with_dcn:
            self.conv2 = conv3x3_dcn(width, width, stride, groups, dilation)
        else:
            self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)
        # project the identity when spatial size or channels changed
        if self.downsample is not None:
            identity = self.downsample(x)
        out += identity
        out = self.relu(out)
        return out
class ResNet(nn.Module):
    """torchvision-style ResNet with an extra ``stage_with_dcn`` switch that
    enables deformable convolutions per stage (layer1..layer4)."""

    def __init__(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        layers: List[int],
        num_classes: int = 1000,
        zero_init_residual: bool = False,
        groups: int = 1,
        width_per_group: int = 64,
        replace_stride_with_dilation: Optional[List[bool]] = None,
        stage_with_dcn: Optional[List[bool]] = None,  # we only add an extra parameter
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        if stage_with_dcn is None:
            stage_with_dcn = [False] * 4
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer
        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError(
                "replace_stride_with_dilation should be None "
                f"or a 3-element tuple, got {replace_stride_with_dilation}"
            )
        self.groups = groups
        self.base_width = width_per_group
        # stem: 7x7/stride-2 conv then 3x3/stride-2 max pool
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], with_dcn=stage_with_dcn[0])
        self.layer2 = self._make_layer(
            block,
            128,
            layers[1],
            stride=2,
            dilate=replace_stride_with_dilation[0],
            with_dcn=stage_with_dcn[1],
        )
        self.layer3 = self._make_layer(
            block,
            256,
            layers[2],
            stride=2,
            dilate=replace_stride_with_dilation[1],
            with_dcn=stage_with_dcn[2],
        )
        self.layer4 = self._make_layer(
            block,
            512,
            layers[3],
            stride=2,
            dilate=replace_stride_with_dilation[2],
            with_dcn=stage_with_dcn[3],
        )
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)
        # standard kaiming/constant initialization
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck) and m.bn3.weight is not None:
                    nn.init.constant_(m.bn3.weight, 0)  # type: ignore[arg-type]
                elif isinstance(m, BasicBlock) and m.bn2.weight is not None:
                    nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]

    def _make_layer(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        planes: int,
        blocks: int,
        stride: int = 1,
        dilate: bool = False,
        with_dcn: bool = False,
    ) -> nn.Sequential:
        """Build one ResNet stage of *blocks* residual blocks; the first block
        carries the stride (or dilation) and the channel projection."""
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            # trade stride for dilation to keep spatial resolution
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            # 1x1 projection so the identity matches the block output
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )
        layers = []
        layers.append(
            block(
                self.inplanes,
                planes,
                stride,
                downsample,
                self.groups,
                self.base_width,
                previous_dilation,
                norm_layer,
                with_dcn,
            )
        )
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    groups=self.groups,
                    base_width=self.base_width,
                    dilation=self.dilation,
                    norm_layer=norm_layer,
                    with_dcn=with_dcn,
                )
            )
        return nn.Sequential(*layers)

    def _forward_impl(self, x: Tensor) -> Tensor:
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

    def forward(self, x: Tensor) -> Tensor:
        return self._forward_impl(x)
class ResNetBackbone(BaseBackbone):
    """Factory producing a ResNet feature extractor for detection.

    NOTE(review): ``__new__`` is used as a factory — it returns a torchvision
    feature extractor, not a ResNetBackbone instance, and its first parameter
    is named ``self`` where ``cls`` would be conventional.
    """
    # yapf: disable
    model_weights = {
        # The following weights are from torchvision
        "resnet18": "https://download.pytorch.org/models/resnet18-f37072fd.pth",
        "resnet34": "https://download.pytorch.org/models/resnet34-b627a593.pth",
        "resnet50_v1": "https://download.pytorch.org/models/resnet50-0676ba61.pth",
        "resnet50_v2": "https://download.pytorch.org/models/resnet50-11ad3fa6.pth",
        "resnet101_v1": "https://download.pytorch.org/models/resnet101-63fe2227.pth",
        "resnet101_v2": "https://download.pytorch.org/models/resnet101-cd907fc2.pth",
        "resnet152_v1": "https://download.pytorch.org/models/resnet152-394f9c45.pth",
        "resnet152_v2": "https://download.pytorch.org/models/resnet152-f82ba261.pth",
        "resnext50_32x4d_v1": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth",
        "resnext50_32x4d_v2": "https://download.pytorch.org/models/resnext50_32x4d-1a0047aa.pth",
        "resnext101_32x8d_v1": "https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth",
        "resnext101_32x8d_v2": "https://download.pytorch.org/models/resnext101_32x8d-110c445d.pth",
        "resnext101_64x4d": "https://download.pytorch.org/models/resnext101_64x4d-173b62eb.pth",
        "wide_resnet50_2_v1": "https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth",
        "wide_resnet50_2_v2": "https://download.pytorch.org/models/wide_resnet50_2-9ba9bcbe.pth",
        "wide_resnet101_2_v1": "https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth",
        "wide_resnet101_2_v2": "https://download.pytorch.org/models/wide_resnet101_2-d733dc28.pth",
        # The following weights are transformed from mmpretrain
        "resnext101_32x4d":
            "https://github.com/xiuqhou/pretrained_weights/releases/download/v1.0.1-beta/resnext101_32x4d-e0fa3dd5.pth",
    }
    # lazy configs per architecture; the extra "url" key is popped in __new__
    model_arch = {
        "resnet18": L(ResNet)(block=BasicBlock, layers=(2, 2, 2, 2), url=model_weights["resnet18"]),
        "resnet34": L(ResNet)(block=BasicBlock, layers=(3, 4, 6, 3), url=model_weights["resnet34"]),
        "resnet50": L(ResNet)(block=Bottleneck, layers=(3, 4, 6, 3), url=model_weights["resnet50_v2"]),
        "resnet101": L(ResNet)(block=Bottleneck, layers=(3, 4, 23, 3), url=model_weights["resnet101_v2"]),
        "resnet152": L(ResNet)(block=Bottleneck, layers=(3, 8, 36, 3), url=model_weights["resnet152_v2"]),
        "resnext50_32x4d": L(ResNet)(
            block=Bottleneck,
            layers=(3, 4, 6, 3),
            groups=32,
            width_per_group=4,
            url=model_weights["resnext50_32x4d_v2"],
        ),
        "resnext101_32x4d": L(ResNet)(
            block=Bottleneck,
            layers=(3, 4, 23, 3),
            groups=32,
            width_per_group=4,
            url=model_weights["resnext101_32x4d"],
        ),
        "resnext101_32x8d": L(ResNet)(
            block=Bottleneck,
            layers=(3, 4, 23, 3),
            groups=32,
            width_per_group=8,
            url=model_weights["resnext101_32x8d_v2"],
        ),
        "resnext101_64x4d": L(ResNet)(
            block=Bottleneck,
            layers=(3, 4, 23, 3),
            groups=64,
            width_per_group=4,
            url=model_weights["resnext101_64x4d"],
        ),
        "wide_resnet50_2": L(ResNet)(
            block=Bottleneck,
            layers=(3, 4, 6, 3),
            width_per_group=64 * 2,
            url=model_weights["wide_resnet50_2_v2"],
        ),
        "wide_resnet101_2": L(ResNet)(
            block=Bottleneck,
            layers=(3, 4, 23, 3),
            width_per_group=64 * 2,
            url=model_weights["wide_resnet101_2_v2"],
        ),
    }
    # yapf: enable

    def __new__(
        self,
        arch: str,
        weights: Dict = None,
        return_indices: Tuple[int] = (0, 1, 2, 3),
        freeze_indices: Tuple = (),
        **kwargs,
    ):
        """Build a ResNet, load weights, freeze stages, and wrap it as a
        multi-scale feature extractor returning layer{i+1} for *return_indices*.
        """
        # get parameters and instantiate backbone
        model_config = self.get_instantiate_config(self, ResNet, arch, kwargs)
        default_weight = model_config.pop("url", None)
        resnet = instantiate(model_config)
        # load state dict (checkpoints may wrap weights under a "model" key)
        weights = load_checkpoint(default_weight if weights is None else weights)
        if isinstance(weights, Dict):
            weights = weights["model"] if "model" in weights else weights
        self.load_state_dict(resnet, weights)
        # freeze stages
        self._freeze_stages(self, resnet, freeze_indices)
        # create feature extractor; FrozenBatchNorm2d must be kept as a leaf
        # so FX tracing does not descend into it
        return_layers = [f"layer{idx + 1}" for idx in return_indices]
        resnet = create_feature_extractor(
            resnet, return_layers, tracer_kwargs={"leaf_modules": [FrozenBatchNorm2d]}
        )
        # layer{i+1} outputs 64 * expansion * 2**i channels
        resnet.num_channels = [64 * model_config.block.expansion * 2**idx for idx in return_indices]
        return resnet

    def _freeze_stages(self, model: nn.Module, freeze_indices: Tuple[int]):
        """Freeze the stem plus the residual stages listed in *freeze_indices*."""
        # freeze stem
        if len(freeze_indices) > 0:
            self.freeze_module(model.conv1)
            self.freeze_module(model.bn1)
        # freeze layers
        for idx in freeze_indices:
            self.freeze_module(model.get_submodule(f"layer{idx+1}"))
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment