Commit f55a786e authored by luopl

Initial commit

["accordion", "aeroplane", "airconditioner", "antenna", "artillery", "ashtray", "atrium", "babycarriage", "bag", "ball", "balloon", "bambooweaving", "barrel", "baseballbat", "basket", "basketballbackboard", "bathtub", "bed", "bedclothes", "beer", "bell", "bench", "bicycle", "binoculars", "bird", "birdcage", "birdfeeder", "birdnest", "blackboard", "board", "boat", "bone", "book", "bottle", "bottleopener", "bowl", "box", "bracelet", "brick", "bridge", "broom", "brush", "bucket", "building", "bus", "cabinet", "cabinetdoor", "cage", "cake", "calculator", "calendar", "camel", "camera", "cameralens", "can", "candle", "candleholder", "cap", "car", "card", "cart", "case", "casetterecorder", "cashregister", "cat", "cd", "cdplayer", "ceiling", "cellphone", "cello", "chain", "chair", "chessboard", "chicken", "chopstick", "clip", "clippers", "clock", "closet", "cloth", "clothestree", "coffee", "coffeemachine", "comb", "computer", "concrete", "cone", "container", "controlbooth", "controller", "cooker", "copyingmachine", "coral", "cork", "corkscrew", "counter", "court", "cow", "crabstick", "crane", "crate", "cross", "crutch", "cup", "curtain", "cushion", "cuttingboard", "dais", "disc", "disccase", "dishwasher", "dock", "dog", "dolphin", "door", "drainer", "dray", "drinkdispenser", "drinkingmachine", "drop", "drug", "drum", "drumkit", "duck", "dumbbell", "earphone", "earrings", "egg", "electricfan", "electriciron", "electricpot", "electricsaw", "electronickeyboard", "engine", "envelope", "equipment", "escalator", "exhibitionbooth", "extinguisher", "eyeglass", "fan", "faucet", "faxmachine", "fence", "ferriswheel", "fireextinguisher", "firehydrant", "fireplace", "fish", "fishtank", "fishbowl", "fishingnet", "fishingpole", "flag", "flagstaff", "flame", "flashlight", "floor", "flower", "fly", "foam", "food", "footbridge", "forceps", "fork", "forklift", "fountain", "fox", "frame", "fridge", "frog", "fruit", "funnel", "furnace", "gamecontroller", "gamemachine", "gascylinder", "gashood", "gasstove", "giftbox", "glass", "glassmarble", "globe", "glove", "goal", "grandstand", "grass", "gravestone", "ground", "guardrail", "guitar", "gun", "hammer", "handcart", "handle", "handrail", "hanger", "harddiskdrive", "hat", "hay", "headphone", "heater", "helicopter", "helmet", "holder", "hook", "horse", "horse-drawncarriage", "hot-airballoon", "hydrovalve", "ice", "inflatorpump", "ipod", "iron", "ironingboard", "jar", "kart", "kettle", "key", "keyboard", "kitchenrange", "kite", "knife", "knifeblock", "ladder", "laddertruck", "ladle", "laptop", "leaves", "lid", "lifebuoy", "light", "lightbulb", "lighter", "line", "lion", "lobster", "lock", "machine", "mailbox", "mannequin", "map", "mask", "mat", "matchbook", "mattress", "menu", "metal", "meterbox", "microphone", "microwave", "mirror", "missile", "model", "money", "monkey", "mop", "motorbike", "mountain", "mouse", "mousepad", "musicalinstrument", "napkin", "net", "newspaper", "oar", "ornament", "outlet", "oven", "oxygenbottle", "pack", "pan", "paper", "paperbox", "papercutter", "parachute", "parasol", "parterre", "patio", "pelage", "pen", "pencontainer", "pencil", "person", "photo", "piano", "picture", "pig", "pillar", "pillow", "pipe", "pitcher", "plant", "plastic", "plate", "platform", "player", "playground", "pliers", "plume", "poker", "pokerchip", "pole", "pooltable", "postcard", "poster", "pot", "pottedplant", "printer", "projector", "pumpkin", "rabbit", "racket", "radiator", "radio", "rail", "rake", "ramp", "rangehood", "receiver", "recorder", "recreationalmachines", 
"remotecontrol", "road", "robot", "rock", "rocket", "rockinghorse", "rope", "rug", "ruler", "runway", "saddle", "sand", "saw", "scale", "scanner", "scissors", "scoop", "screen", "screwdriver", "sculpture", "scythe", "sewer", "sewingmachine", "shed", "sheep", "shell", "shelves", "shoe", "shoppingcart", "shovel", "sidecar", "sidewalk", "sign", "signallight", "sink", "skateboard", "ski", "sky", "sled", "slippers", "smoke", "snail", "snake", "snow", "snowmobiles", "sofa", "spanner", "spatula", "speaker", "speedbump", "spicecontainer", "spoon", "sprayer", "squirrel", "stage", "stair", "stapler", "stick", "stickynote", "stone", "stool", "stove", "straw", "stretcher", "sun", "sunglass", "sunshade", "surveillancecamera", "swan", "sweeper", "swimring", "swimmingpool", "swing", "switch", "table", "tableware", "tank", "tap", "tape", "tarp", "telephone", "telephonebooth", "tent", "tire", "toaster", "toilet", "tong", "tool", "toothbrush", "towel", "toy", "toycar", "track", "train", "trampoline", "trashbin", "tray", "tree", "tricycle", "tripod", "trophy", "truck", "tube", "turtle", "tvmonitor", "tweezers", "typewriter", "umbrella", "unknown", "vacuumcleaner", "vendingmachine", "videocamera", "videogameconsole", "videoplayer", "videotape", "violin", "wakeboard", "wall", "wallet", "wardrobe", "washingmachine", "watch", "water", "waterdispenser", "waterpipe", "waterskateboard", "watermelon", "whale", "wharf", "wheel", "wheelchair", "window", "windowblinds", "wineglass", "wire", "wood", "wool"]
["aeroplane", "bag", "bed", "bedclothes", "bench", "bicycle", "bird", "boat", "book", "bottle", "building", "bus", "cabinet", "car", "cat", "ceiling", "chair", "cloth", "computer", "cow", "cup", "curtain", "dog", "door", "fence", "floor", "flower", "food", "grass", "ground", "horse", "keyboard", "light", "motorbike", "mountain", "mouse", "person", "plate", "platform", "pottedplant", "road", "rock", "sheep", "shelves", "sidewalk", "sign", "sky", "snow", "sofa", "diningtable", "track", "train", "tree", "truck", "tvmonitor", "wall", "water", "window", "wood"]
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (c) Facebook, Inc. and its affiliates.
import os
from pathlib import Path
import numpy as np
import tqdm
from PIL import Image
def convert(input, output):
img = np.asarray(Image.open(input))
assert img.dtype == np.uint8
img = img - 1 # 0 (ignore) becomes 255. others are shifted by 1
Image.fromarray(img).save(output)
if __name__ == "__main__":
dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "ADEChallengeData2016"
for name in ["validation"]:
annotation_dir = dataset_dir / "annotations" / name
output_dir = dataset_dir / "annotations_detectron2" / name
output_dir.mkdir(parents=True, exist_ok=True)
for file in tqdm.tqdm(list(annotation_dir.iterdir())):
output_file = output_dir / file.name
convert(file, output_file)
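# A possible invocation (the script path below is an assumption; only the "validation"
# split is converted, matching the loop above):
#   DETECTRON2_DATASETS=/path/to/datasets python datasets/prepare_ade20k_sem_seg.py
# Each uint8 mask is shifted by -1, so the original ignore label 0 wraps around to 255
# and classes 1-150 become train ids 0-149.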
import os
import os.path as osp
from pathlib import Path
import tqdm
from glob import glob
import numpy as np
from PIL import Image
COCO_CATEGORIES = [{'color': [220, 20, 60], 'isthing': 1, 'id': 0, 'name': 'person', 'trainId': 0},
{'color': [119, 11, 32], 'isthing': 1, 'id': 1, 'name': 'bicycle', 'trainId': 1},
{'color': [0, 0, 142], 'isthing': 1, 'id': 2, 'name': 'car', 'trainId': 2},
{'color': [0, 0, 230], 'isthing': 1, 'id': 3, 'name': 'motorcycle', 'trainId': 3},
{'color': [106, 0, 228], 'isthing': 1, 'id': 4, 'name': 'airplane', 'trainId': 4},
{'color': [0, 60, 100], 'isthing': 1, 'id': 5, 'name': 'bus', 'trainId': 5},
{'color': [0, 80, 100], 'isthing': 1, 'id': 6, 'name': 'train', 'trainId': 6},
{'color': [0, 0, 70], 'isthing': 1, 'id': 7, 'name': 'truck', 'trainId': 7},
{'color': [0, 0, 192], 'isthing': 1, 'id': 8, 'name': 'boat', 'trainId': 8},
{'color': [250, 170, 30], 'isthing': 1, 'id': 9, 'name': 'traffic light', 'trainId': 9},
{'color': [100, 170, 30], 'isthing': 1, 'id': 10, 'name': 'fire hydrant', 'trainId': 10},
{'color': [220, 220, 0], 'isthing': 1, 'id': 12, 'name': 'stop sign', 'trainId': 11},
{'color': [175, 116, 175], 'isthing': 1, 'id': 13, 'name': 'parking meter', 'trainId': 12},
{'color': [250, 0, 30], 'isthing': 1, 'id': 14, 'name': 'bench', 'trainId': 13},
{'color': [165, 42, 42], 'isthing': 1, 'id': 15, 'name': 'bird', 'trainId': 14},
{'color': [255, 77, 255], 'isthing': 1, 'id': 16, 'name': 'cat', 'trainId': 15},
{'color': [0, 226, 252], 'isthing': 1, 'id': 17, 'name': 'dog', 'trainId': 16},
{'color': [182, 182, 255], 'isthing': 1, 'id': 18, 'name': 'horse', 'trainId': 17},
{'color': [0, 82, 0], 'isthing': 1, 'id': 19, 'name': 'sheep', 'trainId': 18},
{'color': [120, 166, 157], 'isthing': 1, 'id': 20, 'name': 'cow', 'trainId': 19},
{'color': [110, 76, 0], 'isthing': 1, 'id': 21, 'name': 'elephant', 'trainId': 20},
{'color': [174, 57, 255], 'isthing': 1, 'id': 22, 'name': 'bear', 'trainId': 21},
{'color': [199, 100, 0], 'isthing': 1, 'id': 23, 'name': 'zebra', 'trainId': 22},
{'color': [72, 0, 118], 'isthing': 1, 'id': 24, 'name': 'giraffe', 'trainId': 23},
{'color': [255, 179, 240], 'isthing': 1, 'id': 26, 'name': 'backpack', 'trainId': 24},
{'color': [0, 125, 92], 'isthing': 1, 'id': 27, 'name': 'umbrella', 'trainId': 25},
{'color': [209, 0, 151], 'isthing': 1, 'id': 30, 'name': 'handbag', 'trainId': 26},
{'color': [188, 208, 182], 'isthing': 1, 'id': 31, 'name': 'tie', 'trainId': 27},
{'color': [0, 220, 176], 'isthing': 1, 'id': 32, 'name': 'suitcase', 'trainId': 28},
{'color': [255, 99, 164], 'isthing': 1, 'id': 33, 'name': 'frisbee', 'trainId': 29},
{'color': [92, 0, 73], 'isthing': 1, 'id': 34, 'name': 'skis', 'trainId': 30},
{'color': [133, 129, 255], 'isthing': 1, 'id': 35, 'name': 'snowboard', 'trainId': 31},
{'color': [78, 180, 255], 'isthing': 1, 'id': 36, 'name': 'sports ball', 'trainId': 32},
{'color': [0, 228, 0], 'isthing': 1, 'id': 37, 'name': 'kite', 'trainId': 33},
{'color': [174, 255, 243], 'isthing': 1, 'id': 38, 'name': 'baseball bat', 'trainId': 34},
{'color': [45, 89, 255], 'isthing': 1, 'id': 39, 'name': 'baseball glove', 'trainId': 35},
{'color': [134, 134, 103], 'isthing': 1, 'id': 40, 'name': 'skateboard', 'trainId': 36},
{'color': [145, 148, 174], 'isthing': 1, 'id': 41, 'name': 'surfboard', 'trainId': 37},
{'color': [255, 208, 186], 'isthing': 1, 'id': 42, 'name': 'tennis racket', 'trainId': 38},
{'color': [197, 226, 255], 'isthing': 1, 'id': 43, 'name': 'bottle', 'trainId': 39},
{'color': [171, 134, 1], 'isthing': 1, 'id': 45, 'name': 'wine glass', 'trainId': 40},
{'color': [109, 63, 54], 'isthing': 1, 'id': 46, 'name': 'cup', 'trainId': 41},
{'color': [207, 138, 255], 'isthing': 1, 'id': 47, 'name': 'fork', 'trainId': 42},
{'color': [151, 0, 95], 'isthing': 1, 'id': 48, 'name': 'knife', 'trainId': 43},
{'color': [9, 80, 61], 'isthing': 1, 'id': 49, 'name': 'spoon', 'trainId': 44},
{'color': [84, 105, 51], 'isthing': 1, 'id': 50, 'name': 'bowl', 'trainId': 45},
{'color': [74, 65, 105], 'isthing': 1, 'id': 51, 'name': 'banana', 'trainId': 46},
{'color': [166, 196, 102], 'isthing': 1, 'id': 52, 'name': 'apple', 'trainId': 47},
{'color': [208, 195, 210], 'isthing': 1, 'id': 53, 'name': 'sandwich', 'trainId': 48},
{'color': [255, 109, 65], 'isthing': 1, 'id': 54, 'name': 'orange', 'trainId': 49},
{'color': [0, 143, 149], 'isthing': 1, 'id': 55, 'name': 'broccoli', 'trainId': 50},
{'color': [179, 0, 194], 'isthing': 1, 'id': 56, 'name': 'carrot', 'trainId': 51},
{'color': [209, 99, 106], 'isthing': 1, 'id': 57, 'name': 'hot dog', 'trainId': 52},
{'color': [5, 121, 0], 'isthing': 1, 'id': 58, 'name': 'pizza', 'trainId': 53},
{'color': [227, 255, 205], 'isthing': 1, 'id': 59, 'name': 'donut', 'trainId': 54},
{'color': [147, 186, 208], 'isthing': 1, 'id': 60, 'name': 'cake', 'trainId': 55},
{'color': [153, 69, 1], 'isthing': 1, 'id': 61, 'name': 'chair', 'trainId': 56},
{'color': [3, 95, 161], 'isthing': 1, 'id': 62, 'name': 'couch', 'trainId': 57},
{'color': [163, 255, 0], 'isthing': 1, 'id': 63, 'name': 'potted plant', 'trainId': 58},
{'color': [119, 0, 170], 'isthing': 1, 'id': 64, 'name': 'bed', 'trainId': 59},
{'color': [0, 182, 199], 'isthing': 1, 'id': 66, 'name': 'dining table', 'trainId': 60},
{'color': [0, 165, 120], 'isthing': 1, 'id': 69, 'name': 'toilet', 'trainId': 61},
{'color': [183, 130, 88], 'isthing': 1, 'id': 71, 'name': 'tv', 'trainId': 62},
{'color': [95, 32, 0], 'isthing': 1, 'id': 72, 'name': 'laptop', 'trainId': 63},
{'color': [130, 114, 135], 'isthing': 1, 'id': 73, 'name': 'mouse', 'trainId': 64},
{'color': [110, 129, 133], 'isthing': 1, 'id': 74, 'name': 'remote', 'trainId': 65},
{'color': [166, 74, 118], 'isthing': 1, 'id': 75, 'name': 'keyboard', 'trainId': 66},
{'color': [219, 142, 185], 'isthing': 1, 'id': 76, 'name': 'cell phone', 'trainId': 67},
{'color': [79, 210, 114], 'isthing': 1, 'id': 77, 'name': 'microwave', 'trainId': 68},
{'color': [178, 90, 62], 'isthing': 1, 'id': 78, 'name': 'oven', 'trainId': 69},
{'color': [65, 70, 15], 'isthing': 1, 'id': 79, 'name': 'toaster', 'trainId': 70},
{'color': [127, 167, 115], 'isthing': 1, 'id': 80, 'name': 'sink', 'trainId': 71},
{'color': [59, 105, 106], 'isthing': 1, 'id': 81, 'name': 'refrigerator', 'trainId': 72},
{'color': [142, 108, 45], 'isthing': 1, 'id': 83, 'name': 'book', 'trainId': 73},
{'color': [196, 172, 0], 'isthing': 1, 'id': 84, 'name': 'clock', 'trainId': 74},
{'color': [95, 54, 80], 'isthing': 1, 'id': 85, 'name': 'vase', 'trainId': 75},
{'color': [128, 76, 255], 'isthing': 1, 'id': 86, 'name': 'scissors', 'trainId': 76},
{'color': [201, 57, 1], 'isthing': 1, 'id': 87, 'name': 'teddy bear', 'trainId': 77},
{'color': [246, 0, 122], 'isthing': 1, 'id': 88, 'name': 'hair drier', 'trainId': 78},
{'color': [191, 162, 208], 'isthing': 1, 'id': 89, 'name': 'toothbrush', 'trainId': 79},
{'id': 91, 'name': 'banner', 'supercategory': 'textile', 'trainId': 80},
{'id': 92, 'name': 'blanket', 'supercategory': 'textile', 'trainId': 81},
{'id': 93, 'name': 'branch', 'supercategory': 'plant', 'trainId': 82},
{'id': 94, 'name': 'bridge', 'supercategory': 'building', 'trainId': 83},
{'id': 95, 'name': 'building-other', 'supercategory': 'building', 'trainId': 84},
{'id': 96, 'name': 'bush', 'supercategory': 'plant', 'trainId': 85},
{'id': 97, 'name': 'cabinet', 'supercategory': 'furniture-stuff', 'trainId': 86},
{'id': 98, 'name': 'cage', 'supercategory': 'structural', 'trainId': 87},
{'id': 99, 'name': 'cardboard', 'supercategory': 'raw-material', 'trainId': 88},
{'id': 100, 'name': 'carpet', 'supercategory': 'floor', 'trainId': 89},
{'id': 101, 'name': 'ceiling-other', 'supercategory': 'ceiling', 'trainId': 90},
{'id': 102, 'name': 'ceiling-tile', 'supercategory': 'ceiling', 'trainId': 91},
{'id': 103, 'name': 'cloth', 'supercategory': 'textile', 'trainId': 92},
{'id': 104, 'name': 'clothes', 'supercategory': 'textile', 'trainId': 93},
{'id': 105, 'name': 'clouds', 'supercategory': 'sky', 'trainId': 94},
{'id': 106, 'name': 'counter', 'supercategory': 'furniture-stuff', 'trainId': 95},
{'id': 107, 'name': 'cupboard', 'supercategory': 'furniture-stuff', 'trainId': 96},
{'id': 108, 'name': 'curtain', 'supercategory': 'textile', 'trainId': 97},
{'id': 109, 'name': 'desk-stuff', 'supercategory': 'furniture-stuff', 'trainId': 98},
{'id': 110, 'name': 'dirt', 'supercategory': 'ground', 'trainId': 99},
{'id': 111, 'name': 'door-stuff', 'supercategory': 'furniture-stuff', 'trainId': 100},
{'id': 112, 'name': 'fence', 'supercategory': 'structural', 'trainId': 101},
{'id': 113, 'name': 'floor-marble', 'supercategory': 'floor', 'trainId': 102},
{'id': 114, 'name': 'floor-other', 'supercategory': 'floor', 'trainId': 103},
{'id': 115, 'name': 'floor-stone', 'supercategory': 'floor', 'trainId': 104},
{'id': 116, 'name': 'floor-tile', 'supercategory': 'floor', 'trainId': 105},
{'id': 117, 'name': 'floor-wood', 'supercategory': 'floor', 'trainId': 106},
{'id': 118, 'name': 'flower', 'supercategory': 'plant', 'trainId': 107},
{'id': 119, 'name': 'fog', 'supercategory': 'water', 'trainId': 108},
{'id': 120, 'name': 'food-other', 'supercategory': 'food-stuff', 'trainId': 109},
{'id': 121, 'name': 'fruit', 'supercategory': 'food-stuff', 'trainId': 110},
{'id': 122, 'name': 'furniture-other', 'supercategory': 'furniture-stuff', 'trainId': 111},
{'id': 123, 'name': 'grass', 'supercategory': 'plant', 'trainId': 112},
{'id': 124, 'name': 'gravel', 'supercategory': 'ground', 'trainId': 113},
{'id': 125, 'name': 'ground-other', 'supercategory': 'ground', 'trainId': 114},
{'id': 126, 'name': 'hill', 'supercategory': 'solid', 'trainId': 115},
{'id': 127, 'name': 'house', 'supercategory': 'building', 'trainId': 116},
{'id': 128, 'name': 'leaves', 'supercategory': 'plant', 'trainId': 117},
{'id': 129, 'name': 'light', 'supercategory': 'furniture-stuff', 'trainId': 118},
{'id': 130, 'name': 'mat', 'supercategory': 'textile', 'trainId': 119},
{'id': 131, 'name': 'metal', 'supercategory': 'raw-material', 'trainId': 120},
{'id': 132, 'name': 'mirror-stuff', 'supercategory': 'furniture-stuff', 'trainId': 121},
{'id': 133, 'name': 'moss', 'supercategory': 'plant', 'trainId': 122},
{'id': 134, 'name': 'mountain', 'supercategory': 'solid', 'trainId': 123},
{'id': 135, 'name': 'mud', 'supercategory': 'ground', 'trainId': 124},
{'id': 136, 'name': 'napkin', 'supercategory': 'textile', 'trainId': 125},
{'id': 137, 'name': 'net', 'supercategory': 'structural', 'trainId': 126},
{'id': 138, 'name': 'paper', 'supercategory': 'raw-material', 'trainId': 127},
{'id': 139, 'name': 'pavement', 'supercategory': 'ground', 'trainId': 128},
{'id': 140, 'name': 'pillow', 'supercategory': 'textile', 'trainId': 129},
{'id': 141, 'name': 'plant-other', 'supercategory': 'plant', 'trainId': 130},
{'id': 142, 'name': 'plastic', 'supercategory': 'raw-material', 'trainId': 131},
{'id': 143, 'name': 'platform', 'supercategory': 'ground', 'trainId': 132},
{'id': 144, 'name': 'playingfield', 'supercategory': 'ground', 'trainId': 133},
{'id': 145, 'name': 'railing', 'supercategory': 'structural', 'trainId': 134},
{'id': 146, 'name': 'railroad', 'supercategory': 'ground', 'trainId': 135},
{'id': 147, 'name': 'river', 'supercategory': 'water', 'trainId': 136},
{'id': 148, 'name': 'road', 'supercategory': 'ground', 'trainId': 137},
{'id': 149, 'name': 'rock', 'supercategory': 'solid', 'trainId': 138},
{'id': 150, 'name': 'roof', 'supercategory': 'building', 'trainId': 139},
{'id': 151, 'name': 'rug', 'supercategory': 'textile', 'trainId': 140},
{'id': 152, 'name': 'salad', 'supercategory': 'food-stuff', 'trainId': 141},
{'id': 153, 'name': 'sand', 'supercategory': 'ground', 'trainId': 142},
{'id': 154, 'name': 'sea', 'supercategory': 'water', 'trainId': 143},
{'id': 155, 'name': 'shelf', 'supercategory': 'furniture-stuff', 'trainId': 144},
{'id': 156, 'name': 'sky-other', 'supercategory': 'sky', 'trainId': 145},
{'id': 157, 'name': 'skyscraper', 'supercategory': 'building', 'trainId': 146},
{'id': 158, 'name': 'snow', 'supercategory': 'ground', 'trainId': 147},
{'id': 159, 'name': 'solid-other', 'supercategory': 'solid', 'trainId': 148},
{'id': 160, 'name': 'stairs', 'supercategory': 'furniture-stuff', 'trainId': 149},
{'id': 161, 'name': 'stone', 'supercategory': 'solid', 'trainId': 150},
{'id': 162, 'name': 'straw', 'supercategory': 'plant', 'trainId': 151},
{'id': 163, 'name': 'structural-other', 'supercategory': 'structural', 'trainId': 152},
{'id': 164, 'name': 'table', 'supercategory': 'furniture-stuff', 'trainId': 153},
{'id': 165, 'name': 'tent', 'supercategory': 'building', 'trainId': 154},
{'id': 166, 'name': 'textile-other', 'supercategory': 'textile', 'trainId': 155},
{'id': 167, 'name': 'towel', 'supercategory': 'textile', 'trainId': 156},
{'id': 168, 'name': 'tree', 'supercategory': 'plant', 'trainId': 157},
{'id': 169, 'name': 'vegetable', 'supercategory': 'food-stuff', 'trainId': 158},
{'id': 170, 'name': 'wall-brick', 'supercategory': 'wall', 'trainId': 159},
{'id': 171, 'name': 'wall-concrete', 'supercategory': 'wall', 'trainId': 160},
{'id': 172, 'name': 'wall-other', 'supercategory': 'wall', 'trainId': 161},
{'id': 173, 'name': 'wall-panel', 'supercategory': 'wall', 'trainId': 162},
{'id': 174, 'name': 'wall-stone', 'supercategory': 'wall', 'trainId': 163},
{'id': 175, 'name': 'wall-tile', 'supercategory': 'wall', 'trainId': 164},
{'id': 176, 'name': 'wall-wood', 'supercategory': 'wall', 'trainId': 165},
{'id': 177, 'name': 'water-other', 'supercategory': 'water', 'trainId': 166},
{'id': 178, 'name': 'waterdrops', 'supercategory': 'water', 'trainId': 167},
{'id': 179, 'name': 'window-blind', 'supercategory': 'window', 'trainId': 168},
{'id': 180, 'name': 'window-other', 'supercategory': 'window', 'trainId': 169},
{'id': 181, 'name': 'wood', 'supercategory': 'solid', 'trainId': 170}]
if __name__ == "__main__":
dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets")) / "coco-stuff"
id_map = {}
for cat in COCO_CATEGORIES:
id_map[cat["id"]] = cat["trainId"]
for name in ["train2017", "val2017"]:
annotation_dir = dataset_dir / "annotations" / name
output_dir = dataset_dir / "annotations_detectron2" / name
output_dir.mkdir(parents=True, exist_ok=True)
for file in tqdm.tqdm(list(annotation_dir.iterdir())):
output_file = output_dir / file.name
lab = np.asarray(Image.open(file))
assert lab.dtype == np.uint8
output = np.zeros_like(lab, dtype=np.uint8) + 255
for obj_id in np.unique(lab):
if obj_id in id_map:
output[lab == obj_id] = id_map[obj_id]
Image.fromarray(output).save(output_file)
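# Pixels whose COCO-Stuff id does not appear in COCO_CATEGORIES stay at 255 and are
# therefore ignored during training/evaluation; all other ids are compressed to the
# contiguous trainId range 0-170. A possible invocation (script path is an assumption):
#   DETECTRON2_DATASETS=/path/to/datasets python datasets/prepare_coco_stuff_sem_seg.py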
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
import tqdm
import os
import os.path as osp
from pathlib import Path
import numpy as np
from PIL import Image
import scipy.io
def convert_pc459(mask_path, new_mask_path):
mat = scipy.io.loadmat(mask_path)
mask = mat['LabelMap']
mask = mask - 1
min_value = np.amin(mask)
assert min_value >= 0, min_value
Image.fromarray(mask).save(new_mask_path, "TIFF")
if __name__ == "__main__":
dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
print('Caution: we only generate the validation set!')
pc_path = dataset_dir / "VOCdevkit/VOC2010"
val_list = open(pc_path / "pascalcontext_val.txt", "r")
pc459_labels = open(pc_path / "labels.txt", "r")
pc459_dict = {}
for line in pc459_labels.readlines():
if ':' in line:
idx, name = line.split(':')
idx = int(idx.strip())
name = name.strip()
pc459_dict[name] = idx
pc459_dir = pc_path / "annotations_detectron2" / "pc459_val"
pc459_dir.mkdir(parents=True, exist_ok=True)
for line in tqdm.tqdm(val_list.readlines()):
fileid = line.strip()
ori_mask = f'{pc_path}/trainval/{fileid}.mat'
pc459_dst = f'{pc459_dir}/{fileid}.tif'
if osp.exists(ori_mask):
convert_pc459(ori_mask, pc459_dst)
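# The remapped PASCAL-Context-459 labels span 0-458, which does not fit in an 8-bit PNG,
# so the masks are written as TIFF to preserve the wider integer range. A possible
# invocation (script path is an assumption):
#   DETECTRON2_DATASETS=/path/to/datasets python datasets/prepare_pascal_context_459.py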
import os
import tqdm
import json
import numpy as np
from pathlib import Path
from PIL import Image
from pycocotools import mask as m
_mapping = np.sort(
np.array([
0, 2, 259, 260, 415, 324, 9, 258, 144, 18, 19, 22, 23, 397, 25, 284,
158, 159, 416, 33, 162, 420, 454, 295, 296, 427, 44, 45, 46, 308, 59,
440, 445, 31, 232, 65, 354, 424, 68, 326, 72, 458, 34, 207, 80, 355,
85, 347, 220, 349, 360, 98, 187, 104, 105, 366, 189, 368, 113, 115
]))
_key = np.array(range(len(_mapping))).astype('uint8')
_key = _key - 1
_map = {}
for (k, v) in zip(_mapping, _key):
_map[k] = v
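# Note: because _key is uint8, subtracting 1 makes the entry for background (the leading
# 0 in _mapping) wrap around to 255, i.e. the ignore label, while the remaining 59
# PASCAL-Context classes receive contiguous train ids 0-58.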
def generate_labels(img_id, anno, out_dir):
def _class_to_index(mask, _map):
out = np.ones_like(mask, dtype=np.uint8) * 255
for k, v in _map.items():
out[mask == k] = v
return out
mask = Image.fromarray(
_class_to_index(anno, _map))
#_class_to_index(detail.getMask(img_id), _map))
filename = img_id['file_name']
mask.save(os.path.join(out_dir, filename.replace('jpg', 'png')))
return os.path.splitext(os.path.basename(filename))[0]
if __name__ == '__main__':
dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
voc_path = dataset_dir / "VOCdevkit" / "VOC2010"
out_dir = voc_path / "annotations_detectron2" / "pc59_val"
json_path = voc_path / "trainval_merged.json"
os.makedirs(out_dir, exist_ok=True)
img_dir = out_dir / "JPEGImages"
print("loading annotations...")
data = json.load(open(json_path, 'r'))
val_images = {d['image_id'] : d for d in data['images'] if d['phase'] == "val"}
annos = {}
print("building annotations...")
for ann in data['annos_segmentation']:
key = ann['image_id']
if key in val_images.keys():
if key in annos.keys():
annos[key].append(ann)
else:
annos[key] = [ann]
for k, v in annos.items():
mask = np.zeros((val_images[k]['height'], val_images[k]['width']))
for c in v:
x = m.decode(c['segmentation'])
mask[np.nonzero(x)] = c['category_id']
annos[k] = mask
print("converting annotations...")
for id, dat in tqdm.tqdm(val_images.items()):
generate_labels(dat, annos[id], out_dir=out_dir)
print("done")
# Copyright (c) Facebook, Inc. and its affiliates.
# Copyright (c) Meta Platforms, Inc. All Rights Reserved
# Modified by Feng Liang from https://github.com/MendelXu/zsseg.baseline/blob/master/datasets/prepare_voc_sem_seg.py
# Modified by Heeseong Shin from https://github.com/facebookresearch/ov-seg/blob/main/datasets/prepare_voc_sem_seg.py
import os
import os.path as osp
from pathlib import Path
import tqdm
import numpy as np
from PIL import Image
clsID_to_trID = {
0: 255,
1: 0,
2: 1,
3: 2,
4: 3,
5: 4,
6: 5,
7: 6,
8: 7,
9: 8,
10: 9,
11: 10,
12: 11,
13: 12,
14: 13,
15: 14,
16: 15,
17: 16,
18: 17,
19: 18,
20: 19,
255: 255,
}
clsID_to_trID_bg = clsID_to_trID.copy()
clsID_to_trID_bg[0] = 20
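# clsID_to_trID maps the 20 VOC object classes to train ids 0-19 and sends background (0)
# and the void label (255) to 255 (ignore); the _bg variant instead keeps background as an
# extra 21st class with train id 20.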
def convert_to_trainID(
maskpath, out_mask_dir, is_train, clsID_to_trID=clsID_to_trID, suffix=""
):
mask = np.array(Image.open(maskpath))
mask_copy = np.ones_like(mask, dtype=np.uint8) * 255
for clsID, trID in clsID_to_trID.items():
mask_copy[mask == clsID] = trID
seg_filename = (
osp.join(out_mask_dir, "train" + suffix, osp.basename(maskpath))
if is_train
else osp.join(out_mask_dir, "val" + suffix, osp.basename(maskpath))
)
if len(np.unique(mask_copy)) == 1 and np.unique(mask_copy)[0] == 255:
return
Image.fromarray(mask_copy).save(seg_filename, "PNG")
if __name__ == "__main__":
dataset_dir = Path(os.getenv("DETECTRON2_DATASETS", "datasets"))
print('Caution: we only generate the validation set!')
voc_path = dataset_dir / "VOCdevkit" / "VOC2012"
out_mask_dir = voc_path / "annotations_detectron2"
out_mask_dir_bg = voc_path / "annotations_detectron2_bg"
#out_image_dir = voc_path / "images_detectron2"
for name in ["val"]:
os.makedirs((out_mask_dir / name), exist_ok=True)
os.makedirs((out_mask_dir_bg / name), exist_ok=True)
#os.makedirs((out_image_dir / name), exist_ok=True)
val_list = [
osp.join(voc_path, "SegmentationClassAug", f + ".png")
for f in np.loadtxt(osp.join(voc_path, "ImageSets/Segmentation/val.txt"), dtype=str).tolist()
]
for file in tqdm.tqdm(val_list):
convert_to_trainID(file, out_mask_dir, is_train=False)
convert_to_trainID(file, out_mask_dir_bg, is_train=False, clsID_to_trID=clsID_to_trID_bg)
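# Only the val split is generated here, and masks that contain nothing but the ignore
# label are skipped (see the early return above). A possible invocation (the script path,
# and the source of SegmentationClassAug, commonly the SBD-augmented annotations, are
# assumptions):
#   DETECTRON2_DATASETS=/path/to/datasets python datasets/prepare_voc_sem_seg.py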
["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor"]
["aeroplane", "bicycle", "bird", "boat", "bottle", "bus", "car", "cat", "chair", "cow", "diningtable", "dog", "horse", "motorbike", "person", "pottedplant", "sheep", "sofa", "train", "tvmonitor", "bag", "bed", "bench", "book", "building", "cabinet", "ceiling", "cloth", "computer", "cup", "door", "fence", "floor", "flower", "food", "grass", "ground", "keyboard", "light", "mountain", "mouse", "curtain", "platform", "sign", "plate", "road", "rock", "shelves", "sidewalk", "sky", "snow", "bedclothes", "track", "tree", "truck", "wall", "water", "window", "wood"]
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py
import argparse
import glob
import multiprocessing as mp
import os
# fmt: off
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# fmt: on
import tempfile
import time
import warnings
import cv2
import numpy as np
import tqdm
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.utils.logger import setup_logger
from sed import add_sed_config
# from predictor import VisualizationDemo
from visualizer import VisualizationGt
from PIL import Image
# constants
WINDOW_NAME = "MaskFormer demo"
def setup_cfg(args):
# load config from file and command-line arguments
cfg = get_cfg()
add_deeplab_config(cfg)
add_sed_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.merge_from_list(args.opts)
cfg.freeze()
return cfg
def get_parser():
parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs")
parser.add_argument(
"--config-file",
default="configs/ade20k-150/maskformer_R50_bs16_160k.yaml",
metavar="FILE",
help="path to config file",
)
parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
parser.add_argument("--video-input", help="Path to video file.")
parser.add_argument(
"--input",
nargs="+",
help="A list of space separated input images; "
"or a single glob pattern such as 'directory/*.jpg'",
)
# parser.add_argument(
# "--gt",
# nargs="+",
# help="A list of space seperated ground truth images;"
# "or a single glob pattern such as 'directory/*.png'"
# )
parser.add_argument(
"--gt",
# type="str",
help="ground truth path of segmentation"
)
parser.add_argument(
"--output",
help="A file or directory to save output visualizations. "
"If not given, will show output in an OpenCV window.",
)
parser.add_argument(
"--confidence-threshold",
type=float,
default=0.5,
help="Minimum score for instance predictions to be shown",
)
parser.add_argument(
"--opts",
help="Modify config options using the command-line 'KEY VALUE' pairs",
default=[],
nargs=argparse.REMAINDER,
)
return parser
def test_opencv_video_format(codec, file_ext):
with tempfile.TemporaryDirectory(prefix="video_format_test") as dir:
filename = os.path.join(dir, "test_file" + file_ext)
writer = cv2.VideoWriter(
filename=filename,
fourcc=cv2.VideoWriter_fourcc(*codec),
fps=float(30),
frameSize=(10, 10),
isColor=True,
)
[writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)]
writer.release()
if os.path.isfile(filename):
return True
return False
if __name__ == "__main__":
mp.set_start_method("spawn", force=True)
args = get_parser().parse_args()
setup_logger(name="fvcore")
logger = setup_logger()
logger.info("Arguments: " + str(args))
cfg = setup_cfg(args)
demo = VisualizationGt(cfg)
gt_path = args.gt
if args.input:
if len(args.input) == 1:
args.input = glob.glob(os.path.expanduser(args.input[0]))
assert args.input, "The input path(s) was not found"
for path in tqdm.tqdm(args.input, disable=not args.output):
# use PIL, to be consistent with evaluation
img = read_image(path, format="BGR")
start_time = time.time()
predictions = {}
gt_file = os.path.join(gt_path, os.path.splitext(os.path.basename(path))[0] + '.png')
# import pdb; pdb.set_trace()
predictions['sem_seg'] = np.asarray(Image.open(gt_file))
predictions, visualized_output = demo.run_on_image(img, predictions)
logger.info(
"{}: {} in {:.2f}s".format(
path,
"detected {} instances".format(len(predictions["instances"]))
if "instances" in predictions
else "finished",
time.time() - start_time,
)
)
if args.output:
if os.path.isdir(args.output):
assert os.path.isdir(args.output), args.output
out_filename = os.path.join(args.output, os.path.basename(path))
else:
assert len(args.input) == 1, "Please specify a directory with args.output"
out_filename = args.output
visualized_output.save(out_filename)
else:
cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
if cv2.waitKey(0) == 27:
break # esc to quit
elif args.webcam:
assert args.input is None, "Cannot have both --input and --webcam!"
assert args.output is None, "output not yet supported with --webcam!"
cam = cv2.VideoCapture(0)
for vis in tqdm.tqdm(demo.run_on_video(cam)):
cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
cv2.imshow(WINDOW_NAME, vis)
if cv2.waitKey(1) == 27:
break # esc to quit
cam.release()
cv2.destroyAllWindows()
elif args.video_input:
video = cv2.VideoCapture(args.video_input)
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
frames_per_second = video.get(cv2.CAP_PROP_FPS)
num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
basename = os.path.basename(args.video_input)
codec, file_ext = (
("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4")
)
if codec == "mp4v":
warnings.warn("x264 codec not available, switching to mp4v")
if args.output:
if os.path.isdir(args.output):
output_fname = os.path.join(args.output, basename)
output_fname = os.path.splitext(output_fname)[0] + file_ext
else:
output_fname = args.output
assert not os.path.isfile(output_fname), output_fname
output_file = cv2.VideoWriter(
filename=output_fname,
# some installation of opencv may not support x264 (due to its license),
# you can try other format (e.g. MPEG)
fourcc=cv2.VideoWriter_fourcc(*codec),
fps=float(frames_per_second),
frameSize=(width, height),
isColor=True,
)
assert os.path.isfile(args.video_input)
for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
if args.output:
output_file.write(vis_frame)
else:
cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
cv2.imshow(basename, vis_frame)
if cv2.waitKey(1) == 27:
break # esc to quit
video.release()
if args.output:
output_file.release()
else:
cv2.destroyAllWindows()
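# A possible invocation of this ground-truth visualization demo (the script name and all
# paths below are illustrative, not taken from the repository):
#   python demo/visualize_gt.py --config-file configs/convnextB_768.yaml \
#       --input "datasets/VOCdevkit/VOC2012/JPEGImages/*.jpg" \
#       --gt datasets/VOCdevkit/VOC2012/annotations_detectron2/val \
#       --output vis_gt/
# --gt should point at a directory of PNG label maps whose basenames match the inputs.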
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py
import argparse
import glob
import multiprocessing as mp
import os
# fmt: off
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# fmt: on
import tempfile
import time
import warnings
import cv2
import numpy as np
import tqdm
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.utils.logger import setup_logger
from sed import add_sed_config
from predictor import VisualizationDemo
# constants
WINDOW_NAME = "MaskFormer demo"
def setup_cfg(args):
# load config from file and command-line arguments
cfg = get_cfg()
add_deeplab_config(cfg)
add_sed_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.merge_from_list(args.opts)
cfg.freeze()
return cfg
def get_parser():
parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs")
parser.add_argument(
"--config-file",
#default="configs/ade20k-150/maskformer_R50_bs16_160k.yaml",
default='configs/convnextB_768.yaml',
metavar="FILE",
help="path to config file",
)
parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
parser.add_argument("--video-input", help="Path to video file.")
parser.add_argument(
"--input",
nargs="+",
help="A list of space separated input images; "
"or a single glob pattern such as 'directory/*.jpg'",
)
parser.add_argument(
"--output",
help="A file or directory to save output visualizations. "
"If not given, will show output in an OpenCV window.",
)
parser.add_argument(
"--confidence-threshold",
type=float,
default=0.5,
help="Minimum score for instance predictions to be shown",
)
parser.add_argument(
"--opts",
help="Modify config options using the command-line 'KEY VALUE' pairs",
default=[],
nargs=argparse.REMAINDER,
)
return parser
def test_opencv_video_format(codec, file_ext):
with tempfile.TemporaryDirectory(prefix="video_format_test") as dir:
filename = os.path.join(dir, "test_file" + file_ext)
writer = cv2.VideoWriter(
filename=filename,
fourcc=cv2.VideoWriter_fourcc(*codec),
fps=float(30),
frameSize=(10, 10),
isColor=True,
)
[writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)]
writer.release()
if os.path.isfile(filename):
return True
return False
if __name__ == "__main__":
mp.set_start_method("spawn", force=True)
args = get_parser().parse_args()
setup_logger(name="fvcore")
logger = setup_logger()
logger.info("Arguments: " + str(args))
cfg = setup_cfg(args)
demo = VisualizationDemo(cfg)
if args.input:
if len(args.input) == 1:
args.input = glob.glob(os.path.expanduser(args.input[0]))
assert args.input, "The input path(s) was not found"
for path in tqdm.tqdm(args.input, disable=not args.output):
# use PIL, to be consistent with evaluation
img = read_image(path, format="BGR")
start_time = time.time()
predictions, visualized_output = demo.run_on_image(img)
logger.info(
"{}: {} in {:.2f}s".format(
path,
"detected {} instances".format(len(predictions["instances"]))
if "instances" in predictions
else "finished",
time.time() - start_time,
)
)
if args.output:
if os.path.isdir(args.output):
assert os.path.isdir(args.output), args.output
out_filename = os.path.join(args.output, os.path.basename(path))
else:
assert len(args.input) == 1, "Please specify a directory with args.output"
out_filename = args.output
visualized_output.save(out_filename)
else:
cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
if cv2.waitKey(0) == 27:
break # esc to quit
elif args.webcam:
assert args.input is None, "Cannot have both --input and --webcam!"
assert args.output is None, "output not yet supported with --webcam!"
cam = cv2.VideoCapture(0)
for vis in tqdm.tqdm(demo.run_on_video(cam)):
cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
cv2.imshow(WINDOW_NAME, vis)
if cv2.waitKey(1) == 27:
break # esc to quit
cam.release()
cv2.destroyAllWindows()
elif args.video_input:
video = cv2.VideoCapture(args.video_input)
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
frames_per_second = video.get(cv2.CAP_PROP_FPS)
num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
basename = os.path.basename(args.video_input)
codec, file_ext = (
("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4")
)
if codec == "mp4v":
warnings.warn("x264 codec not available, switching to mp4v")
if args.output:
if os.path.isdir(args.output):
output_fname = os.path.join(args.output, basename)
output_fname = os.path.splitext(output_fname)[0] + file_ext
else:
output_fname = args.output
assert not os.path.isfile(output_fname), output_fname
output_file = cv2.VideoWriter(
filename=output_fname,
# some installation of opencv may not support x264 (due to its license),
# you can try other format (e.g. MPEG)
fourcc=cv2.VideoWriter_fourcc(*codec),
fps=float(frames_per_second),
frameSize=(width, height),
isColor=True,
)
assert os.path.isfile(args.video_input)
for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
if args.output:
output_file.write(vis_frame)
else:
cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
cv2.imshow(basename, vis_frame)
if cv2.waitKey(1) == 27:
break # esc to quit
video.release()
if args.output:
output_file.release()
else:
cv2.destroyAllWindows()
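# A possible invocation for visualizing model predictions (paths and the checkpoint file
# are illustrative; MODEL.WEIGHTS is the standard detectron2 key for loading weights):
#   python demo/demo.py --config-file configs/convnextB_768.yaml \
#       --input "images/*.jpg" --output vis_pred/ \
#       --opts MODEL.WEIGHTS /path/to/checkpoint.pth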
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from: https://github.com/facebookresearch/detectron2/blob/master/demo/demo.py
import argparse
import glob
import multiprocessing as mp
import os
# fmt: off
import sys
sys.path.insert(1, os.path.join(sys.path[0], '..'))
# fmt: on
import tempfile
import time
import warnings
import cv2
import numpy as np
import tqdm
from detectron2.config import get_cfg
from detectron2.data.detection_utils import read_image
from detectron2.projects.deeplab import add_deeplab_config
from detectron2.utils.logger import setup_logger
from mask_former import add_mask_former_config
# from predictor import VisualizationDemo
from visualizer import VisualizationGt
from PIL import Image
# constants
WINDOW_NAME = "MaskFormer demo"
def setup_cfg(args):
# load config from file and command-line arguments
cfg = get_cfg()
add_deeplab_config(cfg)
add_mask_former_config(cfg)
cfg.merge_from_file(args.config_file)
cfg.merge_from_list(args.opts)
cfg.freeze()
return cfg
def get_parser():
parser = argparse.ArgumentParser(description="Detectron2 demo for builtin configs")
parser.add_argument(
"--config-file",
default="configs/ade20k-150/maskformer_R50_bs16_160k.yaml",
metavar="FILE",
help="path to config file",
)
parser.add_argument("--webcam", action="store_true", help="Take inputs from webcam.")
parser.add_argument("--video-input", help="Path to video file.")
parser.add_argument(
"--input",
nargs="+",
help="A list of space separated input images; "
"or a single glob pattern such as 'directory/*.jpg'",
)
# parser.add_argument(
# "--gt",
# nargs="+",
# help="A list of space seperated ground truth images;"
# "or a single glob pattern such as 'directory/*.png'"
# )
parser.add_argument(
"--gt",
# type="str",
help="ground truth path of segmentation"
)
parser.add_argument(
"--output",
help="A file or directory to save output visualizations. "
"If not given, will show output in an OpenCV window.",
)
parser.add_argument(
"--confidence-threshold",
type=float,
default=0.5,
help="Minimum score for instance predictions to be shown",
)
parser.add_argument(
"--opts",
help="Modify config options using the command-line 'KEY VALUE' pairs",
default=[],
nargs=argparse.REMAINDER,
)
return parser
def test_opencv_video_format(codec, file_ext):
with tempfile.TemporaryDirectory(prefix="video_format_test") as dir:
filename = os.path.join(dir, "test_file" + file_ext)
writer = cv2.VideoWriter(
filename=filename,
fourcc=cv2.VideoWriter_fourcc(*codec),
fps=float(30),
frameSize=(10, 10),
isColor=True,
)
[writer.write(np.zeros((10, 10, 3), np.uint8)) for _ in range(30)]
writer.release()
if os.path.isfile(filename):
return True
return False
if __name__ == "__main__":
mp.set_start_method("spawn", force=True)
args = get_parser().parse_args()
setup_logger(name="fvcore")
logger = setup_logger()
logger.info("Arguments: " + str(args))
cfg = setup_cfg(args)
demo = VisualizationGt(cfg)
gt_path = args.gt
if args.input:
if len(args.input) == 1:
args.input = glob.glob(os.path.expanduser(args.input[0]))
assert args.input, "The input path(s) was not found"
for path in tqdm.tqdm(args.input, disable=not args.output):
# use PIL, to be consistent with evaluation
img = read_image(path, format="BGR")
start_time = time.time()
predictions = {}
gt_file = os.path.join(gt_path, os.path.splitext(os.path.basename(path))[0] + '.png')
# import pdb; pdb.set_trace()
predictions['sem_seg'] = np.asarray(Image.open(gt_file))
predictions, visualized_output = demo.run_on_image(img, predictions)
logger.info(
"{}: {} in {:.2f}s".format(
path,
"detected {} instances".format(len(predictions["instances"]))
if "instances" in predictions
else "finished",
time.time() - start_time,
)
)
if args.output:
if os.path.isdir(args.output):
assert os.path.isdir(args.output), args.output
out_filename = os.path.join(args.output, os.path.basename(path))
else:
assert len(args.input) == 1, "Please specify a directory with args.output"
out_filename = args.output
visualized_output.save(out_filename)
else:
cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
cv2.imshow(WINDOW_NAME, visualized_output.get_image()[:, :, ::-1])
if cv2.waitKey(0) == 27:
break # esc to quit
elif args.webcam:
assert args.input is None, "Cannot have both --input and --webcam!"
assert args.output is None, "output not yet supported with --webcam!"
cam = cv2.VideoCapture(0)
for vis in tqdm.tqdm(demo.run_on_video(cam)):
cv2.namedWindow(WINDOW_NAME, cv2.WINDOW_NORMAL)
cv2.imshow(WINDOW_NAME, vis)
if cv2.waitKey(1) == 27:
break # esc to quit
cam.release()
cv2.destroyAllWindows()
elif args.video_input:
video = cv2.VideoCapture(args.video_input)
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
frames_per_second = video.get(cv2.CAP_PROP_FPS)
num_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
basename = os.path.basename(args.video_input)
codec, file_ext = (
("x264", ".mkv") if test_opencv_video_format("x264", ".mkv") else ("mp4v", ".mp4")
)
if codec == "mp4v":
warnings.warn("x264 codec not available, switching to mp4v")
if args.output:
if os.path.isdir(args.output):
output_fname = os.path.join(args.output, basename)
output_fname = os.path.splitext(output_fname)[0] + file_ext
else:
output_fname = args.output
assert not os.path.isfile(output_fname), output_fname
output_file = cv2.VideoWriter(
filename=output_fname,
# some installation of opencv may not support x264 (due to its license),
# you can try other format (e.g. MPEG)
fourcc=cv2.VideoWriter_fourcc(*codec),
fps=float(frames_per_second),
frameSize=(width, height),
isColor=True,
)
assert os.path.isfile(args.video_input)
for vis_frame in tqdm.tqdm(demo.run_on_video(video), total=num_frames):
if args.output:
output_file.write(vis_frame)
else:
cv2.namedWindow(basename, cv2.WINDOW_NORMAL)
cv2.imshow(basename, vis_frame)
if cv2.waitKey(1) == 27:
break # esc to quit
video.release()
if args.output:
output_file.release()
else:
cv2.destroyAllWindows()
# Copyright (c) Facebook, Inc. and its affiliates.
# Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py
import atexit
import bisect
import multiprocessing as mp
from collections import deque
import cv2
import torch
from detectron2.data import MetadataCatalog
from detectron2.engine.defaults import DefaultPredictor
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode, Visualizer
class VisualizationDemo(object):
def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
"""
Args:
cfg (CfgNode):
instance_mode (ColorMode):
parallel (bool): whether to run the model in different processes from visualization.
Useful since the visualization logic can be slow.
"""
self.metadata = MetadataCatalog.get(
cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
)
self.cpu_device = torch.device("cpu")
self.instance_mode = instance_mode
self.parallel = parallel
if parallel:
num_gpu = torch.cuda.device_count()
self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
else:
self.predictor = DefaultPredictor(cfg)
def run_on_image(self, image):
"""
Args:
image (np.ndarray): an image of shape (H, W, C) (in BGR order).
This is the format used by OpenCV.
Returns:
predictions (dict): the output of the model.
vis_output (VisImage): the visualized image output.
"""
vis_output = None
predictions = self.predictor(image)
# Convert image from OpenCV BGR format to Matplotlib RGB format.
image = image[:, :, ::-1]
visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
if "panoptic_seg" in predictions:
panoptic_seg, segments_info = predictions["panoptic_seg"]
vis_output = visualizer.draw_panoptic_seg_predictions(
panoptic_seg.to(self.cpu_device), segments_info
)
else:
if "sem_seg" in predictions:
vis_output = visualizer.draw_sem_seg(
predictions["sem_seg"].argmax(dim=0).to(self.cpu_device),
alpha=0.4,
)
if "instances" in predictions:
instances = predictions["instances"].to(self.cpu_device)
vis_output = visualizer.draw_instance_predictions(predictions=instances)
return predictions, vis_output
def _frame_from_video(self, video):
while video.isOpened():
success, frame = video.read()
if success:
yield frame
else:
break
def run_on_video(self, video):
"""
Visualizes predictions on frames of the input video.
Args:
video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
either a webcam or a video file.
Yields:
ndarray: BGR visualizations of each video frame.
"""
video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
def process_predictions(frame, predictions):
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
if "panoptic_seg" in predictions:
panoptic_seg, segments_info = predictions["panoptic_seg"]
vis_frame = video_visualizer.draw_panoptic_seg_predictions(
frame, panoptic_seg.to(self.cpu_device), segments_info
)
elif "instances" in predictions:
predictions = predictions["instances"].to(self.cpu_device)
vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
elif "sem_seg" in predictions:
vis_frame = video_visualizer.draw_sem_seg(
frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
)
# Converts Matplotlib RGB format to OpenCV BGR format
vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
return vis_frame
frame_gen = self._frame_from_video(video)
if self.parallel:
buffer_size = self.predictor.default_buffer_size
frame_data = deque()
for cnt, frame in enumerate(frame_gen):
frame_data.append(frame)
self.predictor.put(frame)
if cnt >= buffer_size:
frame = frame_data.popleft()
predictions = self.predictor.get()
yield process_predictions(frame, predictions)
while len(frame_data):
frame = frame_data.popleft()
predictions = self.predictor.get()
yield process_predictions(frame, predictions)
else:
for frame in frame_gen:
yield process_predictions(frame, self.predictor(frame))
class AsyncPredictor:
"""
A predictor that runs the model asynchronously, possibly on >1 GPUs.
Because rendering the visualization takes a considerable amount of time,
this helps improve throughput a little bit when rendering videos.
"""
class _StopToken:
pass
class _PredictWorker(mp.Process):
def __init__(self, cfg, task_queue, result_queue):
self.cfg = cfg
self.task_queue = task_queue
self.result_queue = result_queue
super().__init__()
def run(self):
predictor = DefaultPredictor(self.cfg)
while True:
task = self.task_queue.get()
if isinstance(task, AsyncPredictor._StopToken):
break
idx, data = task
result = predictor(data)
self.result_queue.put((idx, result))
def __init__(self, cfg, num_gpus: int = 1):
"""
Args:
cfg (CfgNode):
num_gpus (int): if 0, will run on CPU
"""
num_workers = max(num_gpus, 1)
self.task_queue = mp.Queue(maxsize=num_workers * 3)
self.result_queue = mp.Queue(maxsize=num_workers * 3)
self.procs = []
for gpuid in range(max(num_gpus, 1)):
cfg = cfg.clone()
cfg.defrost()
cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
self.procs.append(
AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
)
self.put_idx = 0
self.get_idx = 0
self.result_rank = []
self.result_data = []
for p in self.procs:
p.start()
atexit.register(self.shutdown)
def put(self, image):
self.put_idx += 1
self.task_queue.put((self.put_idx, image))
def get(self):
self.get_idx += 1 # the index needed for this request
if len(self.result_rank) and self.result_rank[0] == self.get_idx:
res = self.result_data[0]
del self.result_data[0], self.result_rank[0]
return res
while True:
# make sure the results are returned in the correct order
idx, res = self.result_queue.get()
if idx == self.get_idx:
return res
insert = bisect.bisect(self.result_rank, idx)
self.result_rank.insert(insert, idx)
self.result_data.insert(insert, res)
def __len__(self):
return self.put_idx - self.get_idx
def __call__(self, image):
self.put(image)
return self.get()
def shutdown(self):
for _ in self.procs:
self.task_queue.put(AsyncPredictor._StopToken())
@property
def default_buffer_size(self):
return len(self.procs) * 5
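# Minimal usage sketch for VisualizationDemo (illustrative; the config file path is an
# assumption, and project-specific config additions such as add_sed_config must be
# registered before merge_from_file, as in the demo scripts above):
#   from detectron2.config import get_cfg
#   from detectron2.data.detection_utils import read_image
#   cfg = get_cfg(); ...; cfg.merge_from_file("configs/convnextB_768.yaml"); cfg.freeze()
#   demo = VisualizationDemo(cfg)
#   predictions, vis_output = demo.run_on_image(read_image("example.jpg", format="BGR"))
#   vis_output.save("example_vis.jpg")
# With parallel=True, AsyncPredictor runs one DefaultPredictor worker per GPU and uses
# the put/get indices to return results in submission order.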
# Copyright (c) Facebook, Inc. and its affiliates.
# Copied from: https://github.com/facebookresearch/detectron2/blob/master/demo/predictor.py
import atexit
import bisect
import multiprocessing as mp
from collections import deque
import cv2
import torch
from detectron2.data import MetadataCatalog
from detectron2.engine.defaults import DefaultPredictor
from detectron2.utils.video_visualizer import VideoVisualizer
from detectron2.utils.visualizer import ColorMode, Visualizer
class VisualizationGt(object):
def __init__(self, cfg, instance_mode=ColorMode.IMAGE, parallel=False):
"""
Args:
cfg (CfgNode):
instance_mode (ColorMode):
parallel (bool): whether to run the model in different processes from visualization.
Useful since the visualization logic can be slow.
"""
self.metadata = MetadataCatalog.get(
cfg.DATASETS.TEST[0] if len(cfg.DATASETS.TEST) else "__unused"
)
self.cpu_device = torch.device("cpu")
self.instance_mode = instance_mode
self.parallel = parallel
if parallel:
num_gpu = torch.cuda.device_count()
self.predictor = AsyncPredictor(cfg, num_gpus=num_gpu)
else:
self.predictor = DefaultPredictor(cfg)
def run_on_image(self, image, predictions):
"""
Args:
image (np.ndarray): an image of shape (H, W, C) (in BGR order).
This is the format used by OpenCV.
Returns:
predictions (dict): the output of the model.
vis_output (VisImage): the visualized image output.
"""
vis_output = None
# predictions = self.predictor(image)
# Convert image from OpenCV BGR format to Matplotlib RGB format.
image = image[:, :, ::-1]
visualizer = Visualizer(image, self.metadata, instance_mode=self.instance_mode)
if "panoptic_seg" in predictions:
panoptic_seg, segments_info = predictions["panoptic_seg"]
vis_output = visualizer.draw_panoptic_seg_predictions(
panoptic_seg.to(self.cpu_device), segments_info
)
else:
if "sem_seg" in predictions:
vis_output = visualizer.draw_sem_seg(
predictions["sem_seg"]
)
if "instances" in predictions:
instances = predictions["instances"].to(self.cpu_device)
vis_output = visualizer.draw_instance_predictions(predictions=instances)
return predictions, vis_output
def _frame_from_video(self, video):
while video.isOpened():
success, frame = video.read()
if success:
yield frame
else:
break
def run_on_video(self, video):
"""
Visualizes predictions on frames of the input video.
Args:
video (cv2.VideoCapture): a :class:`VideoCapture` object, whose source can be
either a webcam or a video file.
Yields:
ndarray: BGR visualizations of each video frame.
"""
video_visualizer = VideoVisualizer(self.metadata, self.instance_mode)
def process_predictions(frame, predictions):
frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
if "panoptic_seg" in predictions:
panoptic_seg, segments_info = predictions["panoptic_seg"]
vis_frame = video_visualizer.draw_panoptic_seg_predictions(
frame, panoptic_seg.to(self.cpu_device), segments_info
)
elif "instances" in predictions:
predictions = predictions["instances"].to(self.cpu_device)
vis_frame = video_visualizer.draw_instance_predictions(frame, predictions)
elif "sem_seg" in predictions:
vis_frame = video_visualizer.draw_sem_seg(
frame, predictions["sem_seg"].argmax(dim=0).to(self.cpu_device)
)
# Converts Matplotlib RGB format to OpenCV BGR format
vis_frame = cv2.cvtColor(vis_frame.get_image(), cv2.COLOR_RGB2BGR)
return vis_frame
frame_gen = self._frame_from_video(video)
if self.parallel:
buffer_size = self.predictor.default_buffer_size
frame_data = deque()
for cnt, frame in enumerate(frame_gen):
frame_data.append(frame)
self.predictor.put(frame)
if cnt >= buffer_size:
frame = frame_data.popleft()
predictions = self.predictor.get()
yield process_predictions(frame, predictions)
while len(frame_data):
frame = frame_data.popleft()
predictions = self.predictor.get()
yield process_predictions(frame, predictions)
else:
for frame in frame_gen:
yield process_predictions(frame, self.predictor(frame))
class AsyncPredictor:
"""
A predictor that runs the model asynchronously, possibly on >1 GPUs.
Because rendering the visualization takes a considerable amount of time,
this helps improve throughput a little bit when rendering videos.
"""
class _StopToken:
pass
class _PredictWorker(mp.Process):
def __init__(self, cfg, task_queue, result_queue):
self.cfg = cfg
self.task_queue = task_queue
self.result_queue = result_queue
super().__init__()
def run(self):
predictor = DefaultPredictor(self.cfg)
while True:
task = self.task_queue.get()
if isinstance(task, AsyncPredictor._StopToken):
break
idx, data = task
result = predictor(data)
self.result_queue.put((idx, result))
def __init__(self, cfg, num_gpus: int = 1):
"""
Args:
cfg (CfgNode):
num_gpus (int): if 0, will run on CPU
"""
num_workers = max(num_gpus, 1)
self.task_queue = mp.Queue(maxsize=num_workers * 3)
self.result_queue = mp.Queue(maxsize=num_workers * 3)
self.procs = []
for gpuid in range(max(num_gpus, 1)):
cfg = cfg.clone()
cfg.defrost()
cfg.MODEL.DEVICE = "cuda:{}".format(gpuid) if num_gpus > 0 else "cpu"
self.procs.append(
AsyncPredictor._PredictWorker(cfg, self.task_queue, self.result_queue)
)
self.put_idx = 0
self.get_idx = 0
self.result_rank = []
self.result_data = []
for p in self.procs:
p.start()
atexit.register(self.shutdown)
def put(self, image):
self.put_idx += 1
self.task_queue.put((self.put_idx, image))
def get(self):
self.get_idx += 1 # the index needed for this request
if len(self.result_rank) and self.result_rank[0] == self.get_idx:
res = self.result_data[0]
del self.result_data[0], self.result_rank[0]
return res
while True:
# make sure the results are returned in the correct order
idx, res = self.result_queue.get()
if idx == self.get_idx:
return res
insert = bisect.bisect(self.result_rank, idx)
self.result_rank.insert(insert, idx)
self.result_data.insert(insert, res)
def __len__(self):
return self.put_idx - self.get_idx
def __call__(self, image):
self.put(image)
return self.get()
def shutdown(self):
for _ in self.procs:
self.task_queue.put(AsyncPredictor._StopToken())
@property
def default_buffer_size(self):
return len(self.procs) * 5
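# Unlike VisualizationDemo, run_on_image here never calls the predictor: the caller passes
# in a predictions dict (e.g. {"sem_seg": <H x W label array>} loaded from a ground-truth
# PNG, as in the visualization demo above) and the class only renders it with Visualizer.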