v1.0

3a6df602 · chenzk · 3a6df602 · 3a6df602 · 3a6df602 · 3a6df602
Commit 3a6df602 authored Jun 13, 2024 by chenzk
20 changed files
--- a/sam/repvit_sam/utils/onnx.py
+++ b/sam/repvit_sam/utils/onnx.py
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from math import floor
+import torch
+import torch.nn as nn
+from torch.nn import functional as F
+
+from typing import Tuple, List
+
+from ..modeling import Sam
+from .amg import calculate_stability_score
+
+
+class SamOnnxModel(nn.Module):
+    """
+    This model should not be called directly, but is used in ONNX export.
+    It combines the prompt encoder, mask decoder, and mask postprocessing of Sam,
+    with some functions modified to enable model tracing. Also supports extra
+    options controlling what information is returned. See the ONNX export
+    script for details.
+    """
+
+    def __init__(
+        self,
+        model: Sam,
+        return_single_mask: bool,
+        use_stability_score: bool = False,
+        return_extra_metrics: bool = False,
+    ) -> None:
+        """
+        Arguments:
+          model (Sam): the model whose prompt encoder and mask decoder are wrapped.
+          return_single_mask (bool): if True, forward() keeps only the mask
+            chosen by select_masks().
+          use_stability_score (bool): if True, forward() replaces the decoder's
+            scores with calculate_stability_score() of the low-resolution masks.
+          return_extra_metrics (bool): if True, forward() additionally returns
+            stability scores and areas computed on the upscaled masks.
+        """
+        super().__init__()
+        self.mask_decoder = model.mask_decoder
+        self.model = model
+        self.img_size = model.image_encoder.img_size
+        self.return_single_mask = return_single_mask
+        self.use_stability_score = use_stability_score
+        # Offset handed to calculate_stability_score() in forward().
+        self.stability_score_offset = 1.0
+        self.return_extra_metrics = return_extra_metrics
+
+    @staticmethod
+    def resize_longest_image_size(
+        input_image_size: torch.Tensor, longest_side: int
+    ) -> torch.Tensor:
+        """
+        Scale a size tensor so that its largest entry equals 'longest_side',
+        rounding each entry to the nearest integer. Returns an int64 tensor.
+        """
+        input_image_size = input_image_size.to(torch.float32)
+        scale = longest_side / torch.max(input_image_size)
+        transformed_size = scale * input_image_size
+        # floor(x + 0.5) rounds half up using tensor ops only.
+        transformed_size = torch.floor(transformed_size + 0.5).to(torch.int64)
+        return transformed_size
+
+    def _embed_points(self, point_coords: torch.Tensor, point_labels: torch.Tensor) -> torch.Tensor:
+        """
+        Embed point prompts using arithmetic masking on the labels instead of
+        Python branches.
+        """
+        # Shift by half a pixel, then normalize by the model's input image size.
+        point_coords = point_coords + 0.5
+        point_coords = point_coords / self.img_size
+        point_embedding = self.model.prompt_encoder.pe_layer._pe_encoding(point_coords)
+        point_labels = point_labels.unsqueeze(-1).expand_as(point_embedding)
+
+        # Label -1: zero the positional encoding and substitute the
+        # "not a point" embedding.
+        point_embedding = point_embedding * (point_labels != -1)
+        point_embedding = point_embedding + self.model.prompt_encoder.not_a_point_embed.weight * (
+            point_labels == -1
+        )
+
+        # Add the learned embedding for label i wherever the label equals i.
+        for i in range(self.model.prompt_encoder.num_point_embeddings):
+            point_embedding = point_embedding + self.model.prompt_encoder.point_embeddings[
+                i
+            ].weight * (point_labels == i)
+
+        return point_embedding
+
+    def _embed_masks(self, input_mask: torch.Tensor, has_mask_input: torch.Tensor) -> torch.Tensor:
+        """
+        Blend the downscaled input mask with the "no mask" embedding, weighted
+        by 'has_mask_input' and (1 - has_mask_input) respectively.
+        """
+        # NOTE(review): has_mask_input is presumably 0.0 or 1.0 -- confirm with callers.
+        mask_embedding = has_mask_input * self.model.prompt_encoder.mask_downscaling(input_mask)
+        mask_embedding = mask_embedding + (
+            1 - has_mask_input
+        ) * self.model.prompt_encoder.no_mask_embed.weight.reshape(1, -1, 1, 1)
+        return mask_embedding
+
+    def mask_postprocessing(self, masks: torch.Tensor, orig_im_size: torch.Tensor) -> torch.Tensor:
+        """
+        Upscale masks to the model input resolution, crop them to the resized
+        (pre-padding) image extent, then resize them to 'orig_im_size'.
+        """
+        masks = F.interpolate(
+            masks,
+            size=(self.img_size, self.img_size),
+            mode="bilinear",
+            align_corners=False,
+        )
+
+        # Extent of the image after longest-side resizing; the crop below
+        # discards everything outside it.
+        prepadded_size = self.resize_longest_image_size(orig_im_size, self.img_size).to(torch.int64)
+        masks = masks[..., : prepadded_size[0], : prepadded_size[1]]  # type: ignore
+
+        orig_im_size = orig_im_size.to(torch.int64)
+        h, w = orig_im_size[0], orig_im_size[1]
+        masks = F.interpolate(masks, size=(h, w), mode="bilinear", align_corners=False)
+        return masks
+
+    def select_masks(
+        self, masks: torch.Tensor, iou_preds: torch.Tensor, num_points: int
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """
+        Keep, per batch entry, the mask with the highest reweighted score,
+        together with its score.
+        """
+        # Determine if we should return the multiclick mask or not from the number of points.
+        # The reweighting is used to avoid control flow.
+        # Only index 0 is reweighted, by 1000 * (num_points - 2.5): a large
+        # bonus when num_points >= 3 and a large penalty when num_points <= 2.
+        score_reweight = torch.tensor(
+            [[1000] + [0] * (self.model.mask_decoder.num_mask_tokens - 1)]
+        ).to(iou_preds.device)
+        score = iou_preds + (num_points - 2.5) * score_reweight
+        best_idx = torch.argmax(score, dim=1)
+        masks = masks[torch.arange(masks.shape[0]), best_idx, :, :].unsqueeze(1)
+        iou_preds = iou_preds[torch.arange(masks.shape[0]), best_idx].unsqueeze(1)
+
+        return masks, iou_preds
+
+    @torch.no_grad()
+    def forward(
+        self,
+        image_embeddings: torch.Tensor,
+        point_coords: torch.Tensor,
+        point_labels: torch.Tensor,
+        mask_input: torch.Tensor,
+        has_mask_input: torch.Tensor,
+        orig_im_size: torch.Tensor,
+    ):
+        """
+        Run prompt embedding, mask decoding and mask postprocessing.
+
+        Returns (upscaled_masks, scores, masks), or, when return_extra_metrics
+        is set, (upscaled_masks, scores, stability_scores, areas, masks).
+        'masks' are the decoder outputs before postprocessing.
+        """
+        sparse_embedding = self._embed_points(point_coords, point_labels)
+        dense_embedding = self._embed_masks(mask_input, has_mask_input)
+
+        masks, scores = self.model.mask_decoder.predict_masks(
+            image_embeddings=image_embeddings,
+            image_pe=self.model.prompt_encoder.get_dense_pe(),
+            sparse_prompt_embeddings=sparse_embedding,
+            dense_prompt_embeddings=dense_embedding,
+        )
+
+        # Optionally replace the decoder's scores with stability scores of
+        # the low-resolution masks.
+        if self.use_stability_score:
+            scores = calculate_stability_score(
+                masks, self.model.mask_threshold, self.stability_score_offset
+            )
+
+        if self.return_single_mask:
+            masks, scores = self.select_masks(masks, scores, point_coords.shape[1])
+
+        upscaled_masks = self.mask_postprocessing(masks, orig_im_size)
+
+        if self.return_extra_metrics:
+            stability_scores = calculate_stability_score(
+                upscaled_masks, self.model.mask_threshold, self.stability_score_offset
+            )
+            # Per-mask count of pixels above the mask threshold.
+            areas = (upscaled_masks > self.model.mask_threshold).sum(-1).sum(-1)
+            return upscaled_masks, scores, stability_scores, areas, masks
+
+        return upscaled_masks, scores, masks
+
--- a/sam/repvit_sam/utils/transforms.py
+++ b/sam/repvit_sam/utils/transforms.py
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import numpy as np
+import torch
+from torch.nn import functional as F
+from torchvision.transforms.functional import resize, to_pil_image  # type: ignore
+
+from copy import deepcopy
+from typing import Tuple
+
+
+class ResizeLongestSide:
+    """
+    Resizes images to the longest side 'target_length', as well as provides
+    methods for resizing coordinates and boxes. Provides methods for
+    transforming both numpy array and batched torch tensors.
+    """
+
+    def __init__(self, target_length: int) -> None:
+        self.target_length = target_length
+
+    def apply_image(self, image: np.ndarray) -> np.ndarray:
+        """
+        Expects a numpy array with shape HxWxC in uint8 format.
+        """
+        target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], self.target_length)
+        return np.array(resize(to_pil_image(image), target_size))
+
+    def apply_coords(self, coords: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
+        """
+        Expects a numpy array of length 2 in the final dimension. Requires the
+        original image size in (H, W) format.
+        """
+        old_h, old_w = original_size
+        new_h, new_w = self.get_preprocess_shape(
+            original_size[0], original_size[1], self.target_length
+        )
+        coords = deepcopy(coords).astype(float)
+        coords[..., 0] = coords[..., 0] * (new_w / old_w)
+        coords[..., 1] = coords[..., 1] * (new_h / old_h)
+        return coords
+
+    def apply_boxes(self, boxes: np.ndarray, original_size: Tuple[int, ...]) -> np.ndarray:
+        """
+        Expects a numpy array shape Bx4. Requires the original image size
+        in (H, W) format.
+        """
+        boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size)
+        return boxes.reshape(-1, 4)
+
+    def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor:
+        """
+        Expects batched images with shape BxCxHxW and float format. This
+        transformation may not exactly match apply_image. apply_image is
+        the transformation expected by the model.
+        """
+        # Expects an image in BCHW format. May not exactly match apply_image.
+        target_size = self.get_preprocess_shape(image.shape[2], image.shape[3], self.target_length)
+        return F.interpolate(
+            image, target_size, mode="bilinear", align_corners=False, antialias=True
+        )
+
+    def apply_coords_torch(
+        self, coords: torch.Tensor, original_size: Tuple[int, ...]
+    ) -> torch.Tensor:
+        """
+        Expects a torch tensor with length 2 in the last dimension. Requires the
+        original image size in (H, W) format.
+        """
+        old_h, old_w = original_size
+        new_h, new_w = self.get_preprocess_shape(
+            original_size[0], original_size[1], self.target_length
+        )
+        coords = deepcopy(coords).to(torch.float)
+        coords[..., 0] = coords[..., 0] * (new_w / old_w)
+        coords[..., 1] = coords[..., 1] * (new_h / old_h)
+        return coords
+
+    def apply_boxes_torch(
+        self, boxes: torch.Tensor, original_size: Tuple[int, ...]
+    ) -> torch.Tensor:
+        """
+        Expects a torch tensor with shape Bx4. Requires the original image
+        size in (H, W) format.
+        """
+        boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size)
+        return boxes.reshape(-1, 4)
+
+    @staticmethod
+    def get_preprocess_shape(oldh: int, oldw: int, long_side_length: int) -> Tuple[int, int]:
+        """
+        Compute the output size given input size and target long side length.
+        """
+        scale = long_side_length * 1.0 / max(oldh, oldw)
+        newh, neww = oldh * scale, oldw * scale
+        neww = int(neww + 0.5)
+        newh = int(newh + 0.5)
+        return (newh, neww)
--- a/sam/scripts/amg.py
+++ b/sam/scripts/amg.py
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import cv2  # type: ignore
+
+from repvit_sam import SamAutomaticMaskGenerator, sam_model_registry
+
+import argparse
+import json
+import os
+from typing import Any, Dict, List
+
+parser = argparse.ArgumentParser(
+    description=(
+        "Runs automatic mask generation on an input image or directory of images, "
+        "and outputs masks as either PNGs or COCO-style RLEs. Requires open-cv, "
+        "as well as pycocotools if saving in RLE format."
+    )
+)
+
+parser.add_argument(
+    "--input",
+    type=str,
+    required=True,
+    help="Path to either a single input image or folder of images.",
+)
+
+parser.add_argument(
+    "--output",
+    type=str,
+    required=True,
+    help=(
+        "Path to the directory where masks will be output. Output will be either a folder "
+        "of PNGs per image or a single json with COCO-style masks."
+    ),
+)
+
+parser.add_argument(
+    "--model-type",
+    type=str,
+    required=True,
+    help="The type of model to load, in ['default', 'vit_h', 'vit_l', 'vit_b']",
+)
+
+parser.add_argument(
+    "--checkpoint",
+    type=str,
+    required=True,
+    help="The path to the SAM checkpoint to use for mask generation.",
+)
+
+parser.add_argument("--device", type=str, default="cuda", help="The device to run generation on.")
+
+parser.add_argument(
+    "--convert-to-rle",
+    action="store_true",
+    help=(
+        "Save masks as COCO RLEs in a single json instead of as a folder of PNGs. "
+        "Requires pycocotools."
+    ),
+)
+
+amg_settings = parser.add_argument_group("AMG Settings")
+
+amg_settings.add_argument(
+    "--points-per-side",
+    type=int,
+    default=None,
+    help="Generate masks by sampling a grid over the image with this many points to a side.",
+)
+
+amg_settings.add_argument(
+    "--points-per-batch",
+    type=int,
+    default=None,
+    help="How many input points to process simultaneously in one batch.",
+)
+
+amg_settings.add_argument(
+    "--pred-iou-thresh",
+    type=float,
+    default=None,
+    help="Exclude masks with a predicted score from the model that is lower than this threshold.",
+)
+
+amg_settings.add_argument(
+    "--stability-score-thresh",
+    type=float,
+    default=None,
+    help="Exclude masks with a stability score lower than this threshold.",
+)
+
+amg_settings.add_argument(
+    "--stability-score-offset",
+    type=float,
+    default=None,
+    help="Larger values perturb the mask more when measuring stability score.",
+)
+
+amg_settings.add_argument(
+    "--box-nms-thresh",
+    type=float,
+    default=None,
+    help="The overlap threshold for excluding a duplicate mask.",
+)
+
+amg_settings.add_argument(
+    "--crop-n-layers",
+    type=int,
+    default=None,
+    help=(
+        "If >0, mask generation is run on smaller crops of the image to generate more masks. "
+        "The value sets how many different scales to crop at."
+    ),
+)
+
+amg_settings.add_argument(
+    "--crop-nms-thresh",
+    type=float,
+    default=None,
+    help="The overlap threshold for excluding duplicate masks across different crops.",
+)
+
+amg_settings.add_argument(
+    "--crop-overlap-ratio",
+    type=int,
+    default=None,
+    help="Larger numbers mean image crops will overlap more.",
+)
+
+amg_settings.add_argument(
+    "--crop-n-points-downscale-factor",
+    type=int,
+    default=None,
+    help="The number of points-per-side in each layer of crop is reduced by this factor.",
+)
+
+amg_settings.add_argument(
+    "--min-mask-region-area",
+    type=int,
+    default=None,
+    help=(
+        "Disconnected mask regions or holes with area smaller than this value "
+        "in pixels are removed by postprocessing."
+    ),
+)
+
+
+def write_masks_to_folder(masks: List[Dict[str, Any]], path: str) -> None:
+    header = "id,area,bbox_x0,bbox_y0,bbox_w,bbox_h,point_input_x,point_input_y,predicted_iou,stability_score,crop_box_x0,crop_box_y0,crop_box_w,crop_box_h"  # noqa
+    metadata = [header]
+    for i, mask_data in enumerate(masks):
+        mask = mask_data["segmentation"]
+        filename = f"{i}.png"
+        cv2.imwrite(os.path.join(path, filename), mask * 255)
+        mask_metadata = [
+            str(i),
+            str(mask_data["area"]),
+            *[str(x) for x in mask_data["bbox"]],
+            *[str(x) for x in mask_data["point_coords"][0]],
+            str(mask_data["predicted_iou"]),
+            str(mask_data["stability_score"]),
+            *[str(x) for x in mask_data["crop_box"]],
+        ]
+        row = ",".join(mask_metadata)
+        metadata.append(row)
+    metadata_path = os.path.join(path, "metadata.csv")
+    with open(metadata_path, "w") as f:
+        f.write("\n".join(metadata))
+
+    return
+
+
+def get_amg_kwargs(args):
+    amg_kwargs = {
+        "points_per_side": args.points_per_side,
+        "points_per_batch": args.points_per_batch,
+        "pred_iou_thresh": args.pred_iou_thresh,
+        "stability_score_thresh": args.stability_score_thresh,
+        "stability_score_offset": args.stability_score_offset,
+        "box_nms_thresh": args.box_nms_thresh,
+        "crop_n_layers": args.crop_n_layers,
+        "crop_nms_thresh": args.crop_nms_thresh,
+        "crop_overlap_ratio": args.crop_overlap_ratio,
+        "crop_n_points_downscale_factor": args.crop_n_points_downscale_factor,
+        "min_mask_region_area": args.min_mask_region_area,
+    }
+    amg_kwargs = {k: v for k, v in amg_kwargs.items() if v is not None}
+    return amg_kwargs
+
+
+def main(args: argparse.Namespace) -> None:
+    print("Loading model...")
+    sam = sam_model_registry[args.model_type](checkpoint=args.checkpoint)
+    _ = sam.to(device=args.device)
+    output_mode = "coco_rle" if args.convert_to_rle else "binary_mask"
+    amg_kwargs = get_amg_kwargs(args)
+    generator = SamAutomaticMaskGenerator(sam, output_mode=output_mode, **amg_kwargs)
+
+    if not os.path.isdir(args.input):
+        targets = [args.input]
+    else:
+        targets = [
+            f for f in os.listdir(args.input) if not os.path.isdir(os.path.join(args.input, f))
+        ]
+        targets = [os.path.join(args.input, f) for f in targets]
+
+    os.makedirs(args.output, exist_ok=True)
+
+    for t in targets:
+        print(f"Processing '{t}'...")
+        image = cv2.imread(t)
+        if image is None:
+            print(f"Could not load '{t}' as an image, skipping...")
+            continue
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+        masks = generator.generate(image)
+
+        base = os.path.basename(t)
+        base = os.path.splitext(base)[0]
+        save_base = os.path.join(args.output, base)
+        if output_mode == "binary_mask":
+            os.makedirs(save_base, exist_ok=False)
+            write_masks_to_folder(masks, save_base)
+        else:
+            save_file = save_base + ".json"
+            with open(save_file, "w") as f:
+                json.dump(masks, f)
+    print("Done!")
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    main(args)
--- a/sam/scripts/export_coreml_decoder.py
+++ b/sam/scripts/export_coreml_decoder.py
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from repvit_sam import sam_model_registry
+from repvit_sam.utils.coreml import SamCoreMLModel
+
+import argparse
+import warnings
+
+parser = argparse.ArgumentParser(
+    description="Export the SAM prompt encoder and mask decoder to an ONNX model."
+)
+
+parser.add_argument(
+    "--checkpoint", type=str, required=True, help="The path to the SAM model checkpoint."
+)
+
+parser.add_argument(
+    "--output", type=str, required=False, help="The filename to save the ONNX model to."
+)
+
+parser.add_argument(
+    "--model-type",
+    type=str,
+    required=True,
+    help="In ['default', 'vit_h', 'vit_l', 'vit_b']. Which type of SAM model to export.",
+)
+
+parser.add_argument(
+    "--return-single-mask",
+    action="store_true",
+    default=True,
+    help=(
+        "If true, the exported ONNX model will only return the best mask, "
+        "instead of returning multiple masks. For high resolution images "
+        "this can improve runtime when upscaling masks is expensive."
+    ),
+)
+
+parser.add_argument(
+    "--opset",
+    type=int,
+    default=17,
+    help="The ONNX opset version to use. Must be >=11",
+)
+
+parser.add_argument(
+    "--quantize-out",
+    type=str,
+    default=None,
+    help=(
+        "If set, will quantize the model and save it with this name. "
+        "Quantization is performed with quantize_dynamic from onnxruntime.quantization.quantize."
+    ),
+)
+
+parser.add_argument(
+    "--gelu-approximate",
+    action="store_true",
+    help=(
+        "Replace GELU operations with approximations using tanh. Useful "
+        "for some runtimes that have slow or unimplemented erf ops, used in GELU."
+    ),
+)
+
+parser.add_argument(
+    "--use-stability-score",
+    action="store_true",
+    help=(
+        "Replaces the model's predicted mask quality score with the stability "
+        "score calculated on the low resolution masks using an offset of 1.0. "
+    ),
+)
+
+parser.add_argument(
+    "--return-extra-metrics",
+    action="store_true",
+    help=(
+        "The model will return five results: (masks, scores, stability_scores, "
+        "areas, low_res_logits) instead of the usual three. This can be "
+        "significantly slower for high resolution outputs."
+    ),
+)
+
+parser.add_argument('--precision', default='fp16', type=str)
+
+@torch.no_grad()
+def run_export(
+    model_type: str,
+    checkpoint: str,
+    output: str,
+    opset: int,
+    return_single_mask: bool,
+    gelu_approximate: bool = False,
+    use_stability_score: bool = False,
+    return_extra_metrics=False,
+):
+    print("Loading model...")
+    sam = sam_model_registry[model_type](checkpoint=checkpoint)
+
+    onnx_model = SamCoreMLModel(
+        model=sam,
+        orig_img_size=[1024, 1024],
+        return_single_mask=return_single_mask,
+        use_stability_score=use_stability_score,
+        return_extra_metrics=return_extra_metrics,
+    )
+    onnx_model.eval()
+
+    dynamic_axes = {
+        "point_coords": {1: "num_points"},
+        "point_labels": {1: "num_points"},
+    }
+
+    embed_dim = sam.prompt_encoder.embed_dim
+    embed_size = sam.prompt_encoder.image_embedding_size
+    mask_input_size = [4 * x for x in embed_size]
+    dummy_inputs = {
+        "image_embeddings": torch.randn(1, embed_dim, *embed_size, dtype=torch.float),
+        "point_coords": torch.randint(low=0, high=1024, size=(1, 5, 2), dtype=torch.float),
+        "point_labels": torch.randint(low=0, high=4, size=(1, 5), dtype=torch.float),
+        "mask_input": torch.randn(1, 1, *mask_input_size, dtype=torch.float),
+        "has_mask_input": torch.tensor([1], dtype=torch.float),
+    }
+    traced_model = torch.jit.trace(onnx_model, example_inputs=list(dummy_inputs.values()))
+    out = traced_model(**dummy_inputs)
+
+    output_names = ["masks", "iou_predictions", "low_res_masks"]
+
+    import coremltools as ct
+
+    # Using image_input in the inputs parameter:
+    # Convert to Core ML neural network using the Unified Conversion API.
+    model = ct.convert(
+        traced_model,
+        inputs=[
+            ct.TensorType(name='image_embeddings', shape=dummy_inputs['image_embeddings'].shape),
+            ct.TensorType(name='point_coords', shape=ct.Shape(shape=(1, ct.RangeDim(lower_bound=0, upper_bound=5,default=1), 2))),
+            ct.TensorType(name='point_labels', shape=ct.Shape(shape=(1, ct.RangeDim(lower_bound=0, upper_bound=5,default=1)))),
+            ct.TensorType(name='mask_input', shape=dummy_inputs['mask_input'].shape),
+            ct.TensorType(name='has_mask_input', shape=dummy_inputs['has_mask_input'].shape),
+        ],
+        compute_precision=ct.precision.FLOAT16 if args.precision=='fp16' else ct.precision.FLOAT32
+    )
+
+    # Save the converted model.
+    model.save(f"coreml/sam_decoder.mlpackage")
+
+
+
+def to_numpy(tensor):
+    return tensor.cpu().numpy()
+
+
+if __name__ == "__main__":
+    args = parser.parse_args()
+    run_export(
+        model_type=args.model_type,
+        checkpoint=args.checkpoint,
+        output=args.output,
+        opset=args.opset,
+        return_single_mask=args.return_single_mask,
+        gelu_approximate=args.gelu_approximate,
+        use_stability_score=args.use_stability_score,
+        return_extra_metrics=args.return_extra_metrics,
+    )
--- a/sam/scripts/export_coreml_encoder.py
+++ b/sam/scripts/export_coreml_encoder.py
+import torch
+
+from timm import create_model
+
+import torch
+import torchvision
+from argparse import ArgumentParser
+from timm.models import create_model
+import repvit_sam.modeling
+
+parser = ArgumentParser()
+
+parser.add_argument('--model', default='vit_t', type=str)
+parser.add_argument('--resolution', default=224, type=int)
+parser.add_argument('--ckpt', default=None, type=str)
+parser.add_argument('--samckpt', default=None, type=str)
+parser.add_argument('--precision', default='fp16', type=str)
+
+if __name__ == "__main__":
+    # Load a pre-trained version of MobileNetV2
+    args = parser.parse_args()
+    model = create_model(args.model)
+    if args.ckpt:
+        model.load_state_dict(torch.load(args.ckpt)['model'])
+    if args.samckpt:
+        state = torch.load(args.samckpt, map_location='cpu')
+        new_state = {}
+        for k, v in state.items():
+            if not 'image_encoder' in k:
+                continue
+            new_state[k.replace('image_encoder.', '')] = v
+        model.load_state_dict(new_state)
+    model.eval()
+
+    # Trace the model with random data.
+    resolution = args.resolution
+    example_input = torch.rand(1, 3, resolution, resolution) 
+    traced_model = torch.jit.trace(model, example_input)
+    out = traced_model(example_input)
+
+    import coremltools as ct
+
+    # Using image_input in the inputs parameter:
+    # Convert to Core ML neural network using the Unified Conversion API.
+    model = ct.convert(
+        traced_model,
+        inputs=[ct.TensorType(shape=example_input.shape)],
+        compute_precision=ct.precision.FLOAT16 if args.precision=='fp16' else ct.precision.FLOAT32
+    )
+
+    # Save the converted model.
+    model.save(f"coreml/{args.model}_{resolution}.mlpackage")
--- a/sam/scripts/export_onnx_model.py
+++ b/sam/scripts/export_onnx_model.py
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+import torch
+
+from repvit_sam import sam_model_registry
+from repvit_sam.utils.onnx import SamOnnxModel
+
+import argparse
+import warnings
+
+try:
+    import onnxruntime  # type: ignore
+
+    onnxruntime_exists = True
+except ImportError:
+    onnxruntime_exists = False
+
+parser = argparse.ArgumentParser(
+    description="Export the SAM prompt encoder and mask decoder to an ONNX model."
+)
+
+parser.add_argument(
+    "--checkpoint", type=str, required=True, help="The path to the SAM model checkpoint."
+)
+
+parser.add_argument(
+    "--output", type=str, required=True, help="The filename to save the ONNX model to."
+)
+
+parser.add_argument(
+    "--model-type",
+    type=str,
+    required=True,
+    help="In ['default', 'vit_h', 'vit_l', 'vit_b']. Which type of SAM model to export.",
+)
+
+parser.add_argument(
+    "--return-single-mask",
+    action="store_true",
+    default=True,
+    help=(
+        "If true, the exported ONNX model will only return the best mask, "
+        "instead of returning multiple masks. For high resolution images "
+        "this can improve runtime when upscaling masks is expensive."
+    ),
+)
+
+parser.add_argument(
+    "--opset",
+    type=int,
+    default=17,
+    help="The ONNX opset version to use. Must be >=11",
+)
+
+parser.add_argument(
+    "--quantize-out",
+    type=str,
+    default=None,
+    help=(
+        "If set, will quantize the model and save it with this name. "
+        "Quantization is performed with quantize_dynamic from onnxruntime.quantization.quantize."
+    ),
+)
+
+parser.add_argument(
+    "--gelu-approximate",
+    action="store_true",
+    help=(
+        "Replace GELU operations with approximations using tanh. Useful "
+        "for some runtimes that have slow or unimplemented erf ops, used in GELU."
+    ),
+)
+
+parser.add_argument(
+    "--use-stability-score",
+    action="store_true",
+    help=(
+        "Replaces the model's predicted mask quality score with the stability "
+        "score calculated on the low resolution masks using an offset of 1.0. "
+    ),
+)
+
+parser.add_argument(
+    "--return-extra-metrics",
+    action="store_true",
+    help=(
+        "The model will return five results: (masks, scores, stability_scores, "
+        "areas, low_res_logits) instead of the usual three. This can be "
+        "significantly slower for high resolution outputs."
+    ),
+)
+
+
+def run_export(
+    model_type: str,
+    checkpoint: str,
+    output: str,
+    opset: int,
+    return_single_mask: bool,
+    gelu_approximate: bool = False,
+    use_stability_score: bool = False,
+    return_extra_metrics=False,
+):
+    print("Loading model...")
+    sam = sam_model_registry[model_type](checkpoint=checkpoint)
+
+    onnx_model = SamOnnxModel(
+        model=sam,
+        return_single_mask=return_single_mask,
+        use_stability_score=use_stability_score,
+        return_extra_metrics=return_extra_metrics,
+    )
+    onnx_model.eval()
+
+    if gelu_approximate:
+        for n, m in onnx_model.named_modules():
+            if isinstance(m, torch.nn.GELU):
+                m.approximate = "tanh"
+
+    dynamic_axes = {
+        "point_coords": {1: "num_points"},
+        "point_labels": {1: "num_points"},
+    }
+
+    embed_dim = sam.prompt_encoder.embed_dim
+    embed_size = sam.prompt_encoder.image_embedding_size
+    mask_input_size = [4 * x for x in embed_size]
+    dummy_inputs = {
+        "image_embeddings": torch.randn(1, embed_dim, *embed_size, dtype=torch.float),
+        "point_coords": torch.randint(low=0, high=1024, size=(1, 5, 2), dtype=torch.float),
+        "point_labels": torch.randint(low=0, high=4, size=(1, 5), dtype=torch.float),
+        "mask_input": torch.randn(1, 1, *mask_input_size, dtype=torch.float),
+        "has_mask_input": torch.tensor([1], dtype=torch.float),
+        "orig_im_size": torch.tensor([1500, 2250], dtype=torch.float),
+    }
+
+    # _ = onnx_model(**dummy_inputs)
+
+    output_names = ["masks", "iou_predictions", "low_res_masks"]
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings("ignore", category=torch.jit.TracerWarning)
+        warnings.filterwarnings("ignore", category=UserWarning)
+        with open(output, "wb") as f:
+            print(f"Exporting onnx model to {output}...")
+            torch.onnx.export(
+                onnx_model,
+                tuple(dummy_inputs.values()),
+                f,
+                export_params=True,
+                verbose=False,
+                opset_version=16,
+                do_constant_folding=True,
+                input_names=list(dummy_inputs.keys()),
+                output_names=output_names,
+                dynamic_axes=dynamic_axes,
+            )
+        
+        img_size = onnx_model.model.image_encoder.img_size
+        tmp = torch.ones((1, 3, img_size, img_size), dtype=torch.float)
+        torch.onnx.export(onnx_model.model.image_encoder, tmp, "repvit_sam_image_encoder.onnx", opset_version=11,input_names=["image"], output_names=["image_embeddings"])
+
+    if onnxruntime_exists:
+        ort_inputs = {k: to_numpy(v) for k, v in dummy_inputs.items()}
+        # set cpu provider default
+        providers = ["CPUExecutionProvider"]
+        ort_session = onnxruntime.InferenceSession(output, providers=providers)
+        _ = ort_session.run(None, ort_inputs)
+        print("Model has successfully been run with ONNXRuntime.")
+
+
+def to_numpy(tensor):
+    return tensor.cpu().numpy()
+
+
+if __name__ == "__main__":
+    # Script entry point: export the model using the command-line options,
+    # then optionally write a dynamically quantized copy of the exported file.
+    args = parser.parse_args()
+    run_export(
+        model_type=args.model_type,
+        checkpoint=args.checkpoint,
+        output=args.output,
+        opset=args.opset,
+        return_single_mask=args.return_single_mask,
+        gelu_approximate=args.gelu_approximate,
+        use_stability_score=args.use_stability_score,
+        return_extra_metrics=args.return_extra_metrics,
+    )
+
+    if args.quantize_out is not None:
+        assert onnxruntime_exists, "onnxruntime is required to quantize the model."
+        # Imported lazily so the export path works without onnxruntime.
+        from onnxruntime.quantization import QuantType  # type: ignore
+        from onnxruntime.quantization.quantize import quantize_dynamic  # type: ignore
+
+        print(f"Quantizing model and writing to {args.quantize_out}...")
+        # ``optimize_model`` is deliberately not passed: newer onnxruntime
+        # releases removed the keyword (passing it raises TypeError), while
+        # older releases default it to True, which is the value used before.
+        quantize_dynamic(
+            model_input=args.output,
+            model_output=args.quantize_out,
+            per_channel=False,
+            reduce_range=False,
+            weight_type=QuantType.QUInt8,
+        )
+        print("Done!")
--- a/sam/setup.cfg
+++ b/sam/setup.cfg
+# Import-sorting (isort) settings; `repvit_sam` gets its own MYSELF section.
+[isort]
+line_length=100
+multi_line_output=3
+include_trailing_comma=True
+known_standard_library=numpy,setuptools
+skip_glob=*/__init__.py
+known_myself=repvit_sam
+known_third_party=matplotlib,cv2,torch,torchvision,pycocotools,onnx,black,isort
+no_lines_before=STDLIB,THIRDPARTY
+sections=FUTURE,STDLIB,THIRDPARTY,MYSELF,FIRSTPARTY,LOCALFOLDER
+default_section=FIRSTPARTY
--- a/sam/setup.py
+++ b/sam/setup.py
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+
+# This source code is licensed under the license found in the
+# LICENSE file in the root directory of this source tree.
+
+from setuptools import find_packages, setup
+
+# Package definition for ``repvit_sam``. There are no mandatory dependencies;
+# optional tooling is grouped under the ``all`` and ``dev`` extras.
+setup(
+    name="repvit_sam",
+    version="1.0",
+    install_requires=[],
+    # ``exclude`` expects a sequence of package-name patterns. A bare string
+    # would be iterated character by character, so it is wrapped in a tuple.
+    packages=find_packages(exclude=("notebooks",)),
+    extras_require={
+        "all": ["matplotlib", "pycocotools", "opencv-python", "onnx", "onnxruntime"],
+        "dev": ["flake8", "isort", "black", "mypy"],
+    },
+)
--- a/segmentation/.gitignore
+++ b/segmentation/.gitignore
+pretrain
+work_dirs
+data
+seg_pretrain
\ No newline at end of file
--- a/segmentation/README.md
+++ b/segmentation/README.md
+# Semantic Segmentation 
+
+Segmentation on ADE20K is implemented based on [MMSegmentation](https://github.com/open-mmlab/mmsegmentation).
+
+## Models
+| Model | mIoU | Latency | Ckpt | Log |
+|:---------------|:----:|:---:|:--:|:--:|
+| RepViT-M1.1 |   40.6   |     4.9ms    |   [M1.1](https://github.com/THU-MIG/RepViT/releases/download/v1.0/repvit_m1_1_ade20k.pth)   | [M1.1](./logs/repvit_m1_1_ade20k.json) |
+| RepViT-M1.5 |   43.6   |     6.4ms    |   [M1.5](https://github.com/THU-MIG/RepViT/releases/download/v1.0/repvit_m1_5_ade20k.pth)   | [M1.5](./logs/repvit_m1_5_ade20k.json) |
+| RepViT-M2.3 |   46.1   |     9.9ms    |   [M2.3](https://github.com/THU-MIG/RepViT/releases/download/v1.0/repvit_m2_3_ade20k.pth)   | [M2.3](./logs/repvit_m2_3_ade20k.json) |
+
+The backbone latency is measured with image crops of 512x512 on iPhone 12 by Core ML Tools.
+
+## Requirements
+Install [mmcv-full](https://github.com/open-mmlab/mmcv) and [MMSegmentation v0.30.0](https://github.com/open-mmlab/mmsegmentation/tree/v0.30.0). 
+Later versions should work as well. 
+The easiest way is to install them via [MIM](https://github.com/open-mmlab/mim):
+```
+pip install -U openmim
+mim install mmcv-full==1.7.1
+mim install mmseg==0.30.0
+```
+
+## Data preparation
+
+We benchmark RepViT on the challenging ADE20K dataset, which can be downloaded and prepared following the [instructions in MMSeg](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#prepare-datasets). 
+The data should appear as: 
+```
+├── segmentation
+│   ├── data
+│   │   ├── ade
+│   │   │   ├── ADEChallengeData2016
+│   │   │   │   ├── annotations
+│   │   │   │   │   ├── training
+│   │   │   │   │   ├── validation
+│   │   │   │   ├── images
+│   │   │   │   │   ├── training
+│   │   │   │   │   ├── validation
+
+```
+
+
+
+## Testing
+
+We provide a multi-GPU testing script. Specify the config file, the checkpoint, and the number of GPUs to use: 
+```
+./tools/dist_test.sh config_file path/to/checkpoint #GPUs --eval mIoU
+```
+
+For example, to test RepViT-M1.1 on ADE20K on an 8-GPU machine, 
+
+```
+./tools/dist_test.sh configs/sem_fpn/fpn_repvit_m1_1_ade20k_40k.py path/to/repvit_m1_1_ade20k.pth 8 --eval mIoU
+```
+
+## Training 
+Download ImageNet-1K pretrained weights into `./pretrain` 
+
+We provide a PyTorch distributed data parallel (DDP) training script, `dist_train.sh`. For example, to train RepViT-M1.1 on an 8-GPU machine: 
+```
+./tools/dist_train.sh configs/sem_fpn/fpn_repvit_m1_1_ade20k_40k.py 8
+```
+Tips: specify configs and #GPUs!
--- a/segmentation/align_resize.py
+++ b/segmentation/align_resize.py
+import mmcv
+import numpy as np
+from mmcv.utils import deprecated_api_warning, is_tuple_of
+from numpy import random
+
+from mmseg.datasets.builder import PIPELINES
+
+
+@PIPELINES.register_module()
+class AlignResize(object):
+    """Resize images and segmentation maps, aligning sizes to a divisor.
+
+    When ``keep_ratio`` is True the image is rescaled with
+    ``mmcv.imrescale`` and then resized once more so that its height and
+    width are rounded up to multiples of ``size_divisor``. When
+    ``keep_ratio`` is False the image is resized directly to the target
+    scale, and the result is asserted to already be a multiple of
+    ``size_divisor`` in both dimensions.
+
+    Args:
+        img_scale (tuple | list[tuple] | None): Target scale(s). A single
+            tuple is wrapped into a one-element list.
+        multiscale_mode (str): Either 'range' or 'value'. Only validated
+            when ``ratio_range`` is None.
+        ratio_range (tuple[float] | None): ``(min_ratio, max_ratio)`` used
+            to randomly rescale the base scale.
+        keep_ratio (bool): Whether to rescale with ``mmcv.imrescale``
+            (True) or resize with ``mmcv.imresize`` (False).
+        size_divisor (int): Output height and width are multiples of this
+            value.
+    """
+
+    def __init__(self,
+                 img_scale=None,
+                 multiscale_mode='range',
+                 ratio_range=None,
+                 keep_ratio=True,
+                 size_divisor=32):
+        if img_scale is None:
+            self.img_scale = None
+        else:
+            if isinstance(img_scale, list):
+                self.img_scale = img_scale
+            else:
+                self.img_scale = [img_scale]
+            assert mmcv.is_list_of(self.img_scale, tuple)
+
+        if ratio_range is not None:
+            # mode 1: given img_scale=None and a range of image ratio
+            # mode 2: given a scale and a range of image ratio
+            assert self.img_scale is None or len(self.img_scale) == 1
+        else:
+            # mode 3 and 4: given multiple scales or a range of scales
+            assert multiscale_mode in ['value', 'range']
+
+        self.multiscale_mode = multiscale_mode
+        self.ratio_range = ratio_range
+        self.keep_ratio = keep_ratio
+        self.size_divisor = size_divisor
+
+    @staticmethod
+    def random_select(img_scales):
+        """Randomly select an img_scale from given candidates.
+
+        Args:
+            img_scales (list[tuple]): Images scales for selection.
+
+        Returns:
+            (tuple, int): Returns a tuple ``(img_scale, scale_idx)``,
+                where ``img_scale`` is the selected image scale and
+                ``scale_idx`` is the selected index in the given candidates.
+        """
+
+        assert mmcv.is_list_of(img_scales, tuple)
+        scale_idx = np.random.randint(len(img_scales))
+        img_scale = img_scales[scale_idx]
+        return img_scale, scale_idx
+
+    @staticmethod
+    def random_sample(img_scales):
+        """Randomly sample an img_scale when ``multiscale_mode=='range'``.
+
+        Args:
+            img_scales (list[tuple]): Images scale range for sampling.
+                There must be two tuples in img_scales, which specify the lower
+                and upper bound of image scales.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(img_scale, None)``, where
+                ``img_scale`` is sampled scale and None is just a placeholder
+                to be consistent with :func:`random_select`.
+        """
+
+        assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
+        img_scale_long = [max(s) for s in img_scales]
+        img_scale_short = [min(s) for s in img_scales]
+        # ``np.random.randint`` excludes the upper bound, hence the ``+ 1``.
+        long_edge = np.random.randint(
+            min(img_scale_long),
+            max(img_scale_long) + 1)
+        short_edge = np.random.randint(
+            min(img_scale_short),
+            max(img_scale_short) + 1)
+        img_scale = (long_edge, short_edge)
+        return img_scale, None
+
+    @staticmethod
+    def random_sample_ratio(img_scale, ratio_range):
+        """Randomly sample an img_scale when ``ratio_range`` is specified.
+
+        A ratio will be randomly sampled from the range specified by
+        ``ratio_range``. Then it would be multiplied with ``img_scale`` to
+        generate sampled scale.
+
+        Args:
+            img_scale (tuple): Images scale base to multiply with ratio.
+            ratio_range (tuple[float]): The minimum and maximum ratio to scale
+                the ``img_scale``.
+
+        Returns:
+            (tuple, None): Returns a tuple ``(scale, None)``, where
+                ``scale`` is sampled ratio multiplied with ``img_scale`` and
+                None is just a placeholder to be consistent with
+                :func:`random_select`.
+        """
+
+        assert isinstance(img_scale, tuple) and len(img_scale) == 2
+        min_ratio, max_ratio = ratio_range
+        assert min_ratio <= max_ratio
+        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
+        scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
+        return scale, None
+
+    def _random_scale(self, results):
+        """Randomly sample an img_scale according to ``ratio_range`` and
+        ``multiscale_mode``.
+
+        If ``ratio_range`` is specified, a ratio will be sampled and be
+        multiplied with ``img_scale``.
+        If multiple scales are specified by ``img_scale``, a scale will be
+        sampled according to ``multiscale_mode``.
+        Otherwise, single scale will be used.
+
+        Args:
+            results (dict): Result dict from :obj:`dataset`.
+
+        Returns:
+            None: Two new keys ``'scale'`` and ``'scale_idx'`` are added
+                into ``results`` in place, which would be used by subsequent
+                pipelines.
+        """
+
+        if self.ratio_range is not None:
+            if self.img_scale is None:
+                # No base scale given: use the input image size as the base.
+                h, w = results['img'].shape[:2]
+                scale, scale_idx = self.random_sample_ratio((w, h),
+                                                            self.ratio_range)
+            else:
+                scale, scale_idx = self.random_sample_ratio(
+                    self.img_scale[0], self.ratio_range)
+        elif len(self.img_scale) == 1:
+            scale, scale_idx = self.img_scale[0], 0
+        elif self.multiscale_mode == 'range':
+            scale, scale_idx = self.random_sample(self.img_scale)
+        elif self.multiscale_mode == 'value':
+            scale, scale_idx = self.random_select(self.img_scale)
+        else:
+            raise NotImplementedError
+
+        results['scale'] = scale
+        results['scale_idx'] = scale_idx
+
+    def _align(self, img, size_divisor, interpolation=None):
+        """Resize ``img`` so both sides are multiples of ``size_divisor``.
+
+        Height and width are each rounded up to the next multiple of
+        ``size_divisor``. When ``interpolation`` is None, ``mmcv.imresize``
+        is called without an explicit interpolation argument.
+        """
+        align_h = int(np.ceil(img.shape[0] / size_divisor)) * size_divisor
+        align_w = int(np.ceil(img.shape[1] / size_divisor)) * size_divisor
+        if interpolation is None:
+            img = mmcv.imresize(img, (align_w, align_h))
+        else:
+            img = mmcv.imresize(img, (align_w, align_h), interpolation=interpolation)
+        return img
+
+    def _assert_aligned(self, h, w, name):
+        """Assert that ``h`` and ``w`` are multiples of ``size_divisor``."""
+        divisor = self.size_divisor
+        assert int(np.ceil(h / divisor)) * divisor == h and \
+               int(np.ceil(w / divisor)) * divisor == w, \
+            "{} size not align. h:{} w:{}".format(name, h, w)
+
+    def _resize_img(self, results):
+        """Resize images with ``results['scale']``."""
+        if self.keep_ratio:
+            # The scale returned by imrescale is discarded: the scale
+            # factors are recomputed below from the aligned output shape.
+            img, _ = mmcv.imrescale(
+                results['img'], results['scale'], return_scale=True)
+            #### align ####
+            img = self._align(img, self.size_divisor)
+            # the w_scale and h_scale has minor difference
+            # a real fix should be done in the mmcv.imrescale in the future
+            new_h, new_w = img.shape[:2]
+            h, w = results['img'].shape[:2]
+            w_scale = new_w / w
+            h_scale = new_h / h
+        else:
+            img, w_scale, h_scale = mmcv.imresize(
+                results['img'], results['scale'], return_scale=True)
+
+            h, w = img.shape[:2]
+            self._assert_aligned(h, w, "img")
+        scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
+                                dtype=np.float32)
+        results['img'] = img
+        results['img_shape'] = img.shape
+        results['pad_shape'] = img.shape  # in case that there is no padding
+        results['scale_factor'] = scale_factor
+        results['keep_ratio'] = self.keep_ratio
+
+    def _resize_seg(self, results):
+        """Resize semantic segmentation map with ``results['scale']``."""
+        for key in results.get('seg_fields', []):
+            if self.keep_ratio:
+                gt_seg = mmcv.imrescale(
+                    results[key], results['scale'], interpolation='nearest')
+                gt_seg = self._align(gt_seg, self.size_divisor, interpolation='nearest')
+            else:
+                gt_seg = mmcv.imresize(
+                    results[key], results['scale'], interpolation='nearest')
+                h, w = gt_seg.shape[:2]
+                self._assert_aligned(h, w, "gt_seg")
+            results[key] = gt_seg
+
+    def __call__(self, results):
+        """Call function to resize images and semantic segmentation maps.
+
+        A scale is sampled only when ``results`` does not already contain
+        a ``'scale'`` key.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor',
+                'keep_ratio' keys are added into result dict.
+        """
+
+        if 'scale' not in results:
+            self._random_scale(results)
+        self._resize_img(results)
+        self._resize_seg(results)
+        return results
+
+    def __repr__(self):
+        """Return the class name followed by all constructor settings."""
+        repr_str = self.__class__.__name__
+        repr_str += (f'(img_scale={self.img_scale}, '
+                     f'multiscale_mode={self.multiscale_mode}, '
+                     f'ratio_range={self.ratio_range}, '
+                     f'keep_ratio={self.keep_ratio}, '
+                     f'size_divisor={self.size_divisor})')
+        return repr_str
--- a/segmentation/configs/_base_/datasets/ade20k.py
+++ b/segmentation/configs/_base_/datasets/ade20k.py
+# dataset settings
+dataset_type = 'ADE20KDataset'
+data_root = 'data/ade/ADEChallengeData2016'
+# Per-channel mean/std passed to the 'Normalize' steps of both pipelines.
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+# Size used by 'RandomCrop' and 'Pad' in the training pipeline.
+crop_size = (512, 512)
+# Training pipeline: load, random resize, crop, flip, photometric distortion,
+# normalize, pad to crop_size, then bundle image and ground-truth map.
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', reduce_zero_label=True),
+    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+# Test pipeline: single scale, no flipping. 'AlignResize' is the transform
+# registered in segmentation/align_resize.py; here it rounds the resized
+# image up to multiples of 32 (size_divisor).
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2048, 512),
+        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+        flip=False,
+        transforms=[
+            dict(type='AlignResize', keep_ratio=True, size_divisor=32),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+# Data loading settings. The training set is wrapped in 'RepeatDataset'
+# (times=50); 'val' and 'test' both read the validation split.
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(
+        type='RepeatDataset',
+        times=50,
+        dataset=dict(
+            type=dataset_type,
+            data_root=data_root,
+            img_dir='images/training',
+            ann_dir='annotations/training',
+            pipeline=train_pipeline)),
+    val=dict(
+        type=dataset_type,
+        data_root=data_root,
+        img_dir='images/validation',
+        ann_dir='annotations/validation',
+        pipeline=test_pipeline),
+    test=dict(
+        type=dataset_type,
+        data_root=data_root,
+        img_dir='images/validation',
+        ann_dir='annotations/validation',
+        pipeline=test_pipeline))
--- a/segmentation/configs/_base_/default_runtime.py
+++ b/segmentation/configs/_base_/default_runtime.py
+# Shared runtime defaults: logging, distributed backend, checkpoint paths
+# and workflow.
+# yapf:disable
+log_config = dict(
+    interval=50,
+    hooks=[
+        dict(type='TextLoggerHook', by_epoch=False),
+        # dict(type='TensorboardLoggerHook')
+    ])
+# yapf:enable
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+# No checkpoint path is set for loading or resuming by default.
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
+cudnn_benchmark = True
--- a/segmentation/configs/_base_/models/fpn_r50.py
+++ b/segmentation/configs/_base_/models/fpn_r50.py
+# model settings
+# Base Semantic-FPN model: ResNetV1c-50 backbone, FPN neck, FPNHead decoder.
+# The configs under configs/sem_fpn/ list this file in `_base_` and set their
+# own backbone type, neck `in_channels` and decode-head `num_classes`.
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+    type='EncoderDecoder',
+    pretrained='open-mmlab://resnet50_v1c',
+    backbone=dict(
+        type='ResNetV1c',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        dilations=(1, 1, 1, 1),
+        strides=(1, 2, 2, 2),
+        norm_cfg=norm_cfg,
+        norm_eval=False,
+        style='pytorch',
+        contract_dilation=True),
+    neck=dict(
+        type='FPN',
+        in_channels=[256, 512, 1024, 2048],
+        out_channels=256,
+        num_outs=4),
+    decode_head=dict(
+        type='FPNHead',
+        in_channels=[256, 256, 256, 256],
+        in_index=[0, 1, 2, 3],
+        feature_strides=[4, 8, 16, 32],
+        channels=128,
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'))
--- a/segmentation/configs/_base_/schedules/schedule_160k.py
+++ b/segmentation/configs/_base_/schedules/schedule_160k.py
+# 160k-iteration schedule: SGD with a 'poly' learning-rate policy;
+# a checkpoint is saved and mIoU evaluated every 16k iterations.
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=160000)
+checkpoint_config = dict(by_epoch=False, interval=16000)
+evaluation = dict(interval=16000, metric='mIoU')
--- a/segmentation/configs/_base_/schedules/schedule_20k.py
+++ b/segmentation/configs/_base_/schedules/schedule_20k.py
+# 20k-iteration schedule: SGD with a 'poly' learning-rate policy;
+# a checkpoint is saved and mIoU evaluated every 2k iterations.
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=20000)
+checkpoint_config = dict(by_epoch=False, interval=2000)
+evaluation = dict(interval=2000, metric='mIoU')
--- a/segmentation/configs/_base_/schedules/schedule_40k.py
+++ b/segmentation/configs/_base_/schedules/schedule_40k.py
+# 40k-iteration schedule: SGD with a 'poly' learning-rate policy;
+# a checkpoint is saved and mIoU evaluated every 4k iterations.
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=40000)
+checkpoint_config = dict(by_epoch=False, interval=4000)
+evaluation = dict(interval=4000, metric='mIoU')
--- a/segmentation/configs/_base_/schedules/schedule_80k.py
+++ b/segmentation/configs/_base_/schedules/schedule_80k.py
+# 80k-iteration schedule: SGD with a 'poly' learning-rate policy;
+# a checkpoint is saved and mIoU evaluated every 8k iterations.
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
+# runtime settings
+runner = dict(type='IterBasedRunner', max_iters=80000)
+checkpoint_config = dict(by_epoch=False, interval=8000)
+evaluation = dict(interval=8000, metric='mIoU')
--- a/segmentation/configs/sem_fpn/fpn_repvit_m1_1_ade20k_40k.py
+++ b/segmentation/configs/sem_fpn/fpn_repvit_m1_1_ade20k_40k.py
+# Semantic FPN with a RepViT-M1.1 backbone on ADE20K.
+# Optimizer, LR policy and runner are defined in this file; no schedule file
+# is listed in `_base_`.
+_base_ = [
+    '../_base_/models/fpn_r50.py',
+    '../_base_/datasets/ade20k.py',
+    '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        type='repvit_m1_1',
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='pretrain/repvit_m1_1_distill_300e.pth',
+        ),
+        out_indices = [3,7,21,24]
+    ),
+    neck=dict(in_channels=[64, 128, 256, 512]),
+    decode_head=dict(num_classes=150))
+
+gpu_multiples = 2  # we use 8 gpu instead of 4 in mmsegmentation, so lr*2 and max_iters/2
+# optimizer
+optimizer = dict(type='AdamW', lr=0.0001 * gpu_multiples, weight_decay=0.0001)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-6, by_epoch=False)
+# runtime settings
+# With gpu_multiples = 2 these evaluate to 40000 iterations in total and a
+# checkpoint/evaluation interval of 4000 iterations.
+runner = dict(type='IterBasedRunner', max_iters=80000 // gpu_multiples)
+checkpoint_config = dict(by_epoch=False, interval=8000 // gpu_multiples)
+evaluation = dict(interval=8000 // gpu_multiples, metric='mIoU')
--- a/segmentation/configs/sem_fpn/fpn_repvit_m1_5_ade20k_40k.py
+++ b/segmentation/configs/sem_fpn/fpn_repvit_m1_5_ade20k_40k.py
+# Semantic FPN with a RepViT-M1.5 backbone on ADE20K.
+# Optimizer, LR policy and runner are defined in this file; no schedule file
+# is listed in `_base_`.
+_base_ = [
+    '../_base_/models/fpn_r50.py',
+    '../_base_/datasets/ade20k.py',
+    '../_base_/default_runtime.py'
+]
+# model settings
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        type='repvit_m1_5',
+        style='pytorch',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint='pretrain/repvit_m1_5_distill_300e.pth',
+        ),
+        out_indices=[5, 11, 37, 42]
+    ),
+    neck=dict(in_channels=[64, 128, 256, 512]),
+    decode_head=dict(num_classes=150))
+
+gpu_multiples = 2  # we use 8 gpu instead of 4 in mmsegmentation, so lr*2 and max_iters/2
+# optimizer
+optimizer = dict(type='AdamW', lr=0.0001 * gpu_multiples, weight_decay=0.0001)
+optimizer_config = dict()
+# learning policy
+lr_config = dict(policy='poly', power=0.9, min_lr=1e-6, by_epoch=False)
+# runtime settings
+# With gpu_multiples = 2 these evaluate to 40000 iterations in total and a
+# checkpoint/evaluation interval of 4000 iterations.
+runner = dict(type='IterBasedRunner', max_iters=80000 // gpu_multiples)
+checkpoint_config = dict(by_epoch=False, interval=8000 // gpu_multiples)
+evaluation = dict(interval=8000 // gpu_multiples, metric='mIoU')