Commit a4372e17 authored by huchen

Merge branch 'hepj_work' into 'main'

Add conformer code

See merge request dcutoolkit/deeplearing/dlexamples_new!7
parents 7f99c1c3 142dcf29
import itertools
import logging
import os.path as osp
import tempfile
from collections import OrderedDict
import numpy as np
from mmcv.utils import print_log
from terminaltables import AsciiTable
from .builder import DATASETS
from .coco import CocoDataset
@DATASETS.register_module()
class LVISV05Dataset(CocoDataset):
CLASSES = (
'acorn', 'aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock',
'alcohol', 'alligator', 'almond', 'ambulance', 'amplifier', 'anklet',
'antenna', 'apple', 'apple_juice', 'applesauce', 'apricot', 'apron',
'aquarium', 'armband', 'armchair', 'armoire', 'armor', 'artichoke',
'trash_can', 'ashtray', 'asparagus', 'atomizer', 'avocado', 'award',
'awning', 'ax', 'baby_buggy', 'basketball_backboard', 'backpack',
'handbag', 'suitcase', 'bagel', 'bagpipe', 'baguet', 'bait', 'ball',
'ballet_skirt', 'balloon', 'bamboo', 'banana', 'Band_Aid', 'bandage',
'bandanna', 'banjo', 'banner', 'barbell', 'barge', 'barrel',
'barrette', 'barrow', 'baseball_base', 'baseball', 'baseball_bat',
'baseball_cap', 'baseball_glove', 'basket', 'basketball_hoop',
'basketball', 'bass_horn', 'bat_(animal)', 'bath_mat', 'bath_towel',
'bathrobe', 'bathtub', 'batter_(food)', 'battery', 'beachball', 'bead',
'beaker', 'bean_curd', 'beanbag', 'beanie', 'bear', 'bed',
'bedspread', 'cow', 'beef_(food)', 'beeper', 'beer_bottle', 'beer_can',
'beetle', 'bell', 'bell_pepper', 'belt', 'belt_buckle', 'bench',
'beret', 'bib', 'Bible', 'bicycle', 'visor', 'binder', 'binoculars',
'bird', 'birdfeeder', 'birdbath', 'birdcage', 'birdhouse',
'birthday_cake', 'birthday_card', 'biscuit_(bread)', 'pirate_flag',
'black_sheep', 'blackboard', 'blanket', 'blazer', 'blender', 'blimp',
'blinker', 'blueberry', 'boar', 'gameboard', 'boat', 'bobbin',
'bobby_pin', 'boiled_egg', 'bolo_tie', 'deadbolt', 'bolt', 'bonnet',
'book', 'book_bag', 'bookcase', 'booklet', 'bookmark',
'boom_microphone', 'boot', 'bottle', 'bottle_opener', 'bouquet',
'bow_(weapon)', 'bow_(decorative_ribbons)', 'bow-tie', 'bowl',
'pipe_bowl', 'bowler_hat', 'bowling_ball', 'bowling_pin',
'boxing_glove', 'suspenders', 'bracelet', 'brass_plaque', 'brassiere',
'bread-bin', 'breechcloth', 'bridal_gown', 'briefcase',
'bristle_brush', 'broccoli', 'broach', 'broom', 'brownie',
'brussels_sprouts', 'bubble_gum', 'bucket', 'horse_buggy', 'bull',
'bulldog', 'bulldozer', 'bullet_train', 'bulletin_board',
'bulletproof_vest', 'bullhorn', 'corned_beef', 'bun', 'bunk_bed',
'buoy', 'burrito', 'bus_(vehicle)', 'business_card', 'butcher_knife',
'butter', 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car',
'cabinet', 'locker', 'cake', 'calculator', 'calendar', 'calf',
'camcorder', 'camel', 'camera', 'camera_lens', 'camper_(vehicle)',
'can', 'can_opener', 'candelabrum', 'candle', 'candle_holder',
'candy_bar', 'candy_cane', 'walking_cane', 'canister', 'cannon',
'canoe', 'cantaloup', 'canteen', 'cap_(headwear)', 'bottle_cap',
'cape', 'cappuccino', 'car_(automobile)', 'railcar_(part_of_a_train)',
'elevator_car', 'car_battery', 'identity_card', 'card', 'cardigan',
'cargo_ship', 'carnation', 'horse_carriage', 'carrot', 'tote_bag',
'cart', 'carton', 'cash_register', 'casserole', 'cassette', 'cast',
'cat', 'cauliflower', 'caviar', 'cayenne_(spice)', 'CD_player',
'celery', 'cellular_telephone', 'chain_mail', 'chair', 'chaise_longue',
'champagne', 'chandelier', 'chap', 'checkbook', 'checkerboard',
'cherry', 'chessboard', 'chest_of_drawers_(furniture)',
'chicken_(animal)', 'chicken_wire', 'chickpea', 'Chihuahua',
'chili_(vegetable)', 'chime', 'chinaware', 'crisp_(potato_chip)',
'poker_chip', 'chocolate_bar', 'chocolate_cake', 'chocolate_milk',
'chocolate_mousse', 'choker', 'chopping_board', 'chopstick',
'Christmas_tree', 'slide', 'cider', 'cigar_box', 'cigarette',
'cigarette_case', 'cistern', 'clarinet', 'clasp', 'cleansing_agent',
'clementine', 'clip', 'clipboard', 'clock', 'clock_tower',
'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster', 'coat',
'coat_hanger', 'coatrack', 'cock', 'coconut', 'coffee_filter',
'coffee_maker', 'coffee_table', 'coffeepot', 'coil', 'coin',
'colander', 'coleslaw', 'coloring_material', 'combination_lock',
'pacifier', 'comic_book', 'computer_keyboard', 'concrete_mixer',
'cone', 'control', 'convertible_(automobile)', 'sofa_bed', 'cookie',
'cookie_jar', 'cooking_utensil', 'cooler_(for_food)',
'cork_(bottle_plug)', 'corkboard', 'corkscrew', 'edible_corn',
'cornbread', 'cornet', 'cornice', 'cornmeal', 'corset',
'romaine_lettuce', 'costume', 'cougar', 'coverall', 'cowbell',
'cowboy_hat', 'crab_(animal)', 'cracker', 'crape', 'crate', 'crayon',
'cream_pitcher', 'credit_card', 'crescent_roll', 'crib', 'crock_pot',
'crossbar', 'crouton', 'crow', 'crown', 'crucifix', 'cruise_ship',
'police_cruiser', 'crumb', 'crutch', 'cub_(animal)', 'cube',
'cucumber', 'cufflink', 'cup', 'trophy_cup', 'cupcake', 'hair_curler',
'curling_iron', 'curtain', 'cushion', 'custard', 'cutting_tool',
'cylinder', 'cymbal', 'dachshund', 'dagger', 'dartboard',
'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk',
'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', 'tux',
'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher',
'dishwasher_detergent', 'diskette', 'dispenser', 'Dixie_cup', 'dog',
'dog_collar', 'doll', 'dollar', 'dolphin', 'domestic_ass', 'eye_mask',
'doorbell', 'doorknob', 'doormat', 'doughnut', 'dove', 'dragonfly',
'drawer', 'underdrawers', 'dress', 'dress_hat', 'dress_suit',
'dresser', 'drill', 'drinking_fountain', 'drone', 'dropper',
'drum_(musical_instrument)', 'drumstick', 'duck', 'duckling',
'duct_tape', 'duffel_bag', 'dumbbell', 'dumpster', 'dustpan',
'Dutch_oven', 'eagle', 'earphone', 'earplug', 'earring', 'easel',
'eclair', 'eel', 'egg', 'egg_roll', 'egg_yolk', 'eggbeater',
'eggplant', 'electric_chair', 'refrigerator', 'elephant', 'elk',
'envelope', 'eraser', 'escargot', 'eyepatch', 'falcon', 'fan',
'faucet', 'fedora', 'ferret', 'Ferris_wheel', 'ferry', 'fig_(fruit)',
'fighter_jet', 'figurine', 'file_cabinet', 'file_(tool)', 'fire_alarm',
'fire_engine', 'fire_extinguisher', 'fire_hose', 'fireplace',
'fireplug', 'fish', 'fish_(food)', 'fishbowl', 'fishing_boat',
'fishing_rod', 'flag', 'flagpole', 'flamingo', 'flannel', 'flash',
'flashlight', 'fleece', 'flip-flop_(sandal)', 'flipper_(footwear)',
'flower_arrangement', 'flute_glass', 'foal', 'folding_chair',
'food_processor', 'football_(American)', 'football_helmet',
'footstool', 'fork', 'forklift', 'freight_car', 'French_toast',
'freshener', 'frisbee', 'frog', 'fruit_juice', 'fruit_salad',
'frying_pan', 'fudge', 'funnel', 'futon', 'gag', 'garbage',
'garbage_truck', 'garden_hose', 'gargle', 'gargoyle', 'garlic',
'gasmask', 'gazelle', 'gelatin', 'gemstone', 'giant_panda',
'gift_wrap', 'ginger', 'giraffe', 'cincture',
'glass_(drink_container)', 'globe', 'glove', 'goat', 'goggles',
'goldfish', 'golf_club', 'golfcart', 'gondola_(boat)', 'goose',
'gorilla', 'gourd', 'surgical_gown', 'grape', 'grasshopper', 'grater',
'gravestone', 'gravy_boat', 'green_bean', 'green_onion', 'griddle',
'grillroom', 'grinder_(tool)', 'grits', 'grizzly', 'grocery_bag',
'guacamole', 'guitar', 'gull', 'gun', 'hair_spray', 'hairbrush',
'hairnet', 'hairpin', 'ham', 'hamburger', 'hammer', 'hammock',
'hamper', 'hamster', 'hair_dryer', 'hand_glass', 'hand_towel',
'handcart', 'handcuff', 'handkerchief', 'handle', 'handsaw',
'hardback_book', 'harmonium', 'hat', 'hatbox', 'hatch', 'veil',
'headband', 'headboard', 'headlight', 'headscarf', 'headset',
'headstall_(for_horses)', 'hearing_aid', 'heart', 'heater',
'helicopter', 'helmet', 'heron', 'highchair', 'hinge', 'hippopotamus',
'hockey_stick', 'hog', 'home_plate_(baseball)', 'honey', 'fume_hood',
'hook', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce',
'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear',
'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate',
'ice_tea', 'igniter', 'incense', 'inhaler', 'iPod',
'iron_(for_clothing)', 'ironing_board', 'jacket', 'jam', 'jean',
'jeep', 'jelly_bean', 'jersey', 'jet_plane', 'jewelry', 'joystick',
'jumpsuit', 'kayak', 'keg', 'kennel', 'kettle', 'key', 'keycard',
'kilt', 'kimono', 'kitchen_sink', 'kitchen_table', 'kite', 'kitten',
'kiwi_fruit', 'knee_pad', 'knife', 'knight_(chess_piece)',
'knitting_needle', 'knob', 'knocker_(on_a_door)', 'koala', 'lab_coat',
'ladder', 'ladle', 'ladybug', 'lamb_(animal)', 'lamb-chop', 'lamp',
'lamppost', 'lampshade', 'lantern', 'lanyard', 'laptop_computer',
'lasagna', 'latch', 'lawn_mower', 'leather', 'legging_(clothing)',
'Lego', 'lemon', 'lemonade', 'lettuce', 'license_plate', 'life_buoy',
'life_jacket', 'lightbulb', 'lightning_rod', 'lime', 'limousine',
'linen_paper', 'lion', 'lip_balm', 'lipstick', 'liquor', 'lizard',
'Loafer_(type_of_shoe)', 'log', 'lollipop', 'lotion',
'speaker_(stero_equipment)', 'loveseat', 'machine_gun', 'magazine',
'magnet', 'mail_slot', 'mailbox_(at_home)', 'mallet', 'mammoth',
'mandarin_orange', 'manger', 'manhole', 'map', 'marker', 'martini',
'mascot', 'mashed_potato', 'masher', 'mask', 'mast',
'mat_(gym_equipment)', 'matchbox', 'mattress', 'measuring_cup',
'measuring_stick', 'meatball', 'medicine', 'melon', 'microphone',
'microscope', 'microwave_oven', 'milestone', 'milk', 'minivan',
'mint_candy', 'mirror', 'mitten', 'mixer_(kitchen_tool)', 'money',
'monitor_(computer_equipment) computer_monitor', 'monkey', 'motor',
'motor_scooter', 'motor_vehicle', 'motorboat', 'motorcycle',
'mound_(baseball)', 'mouse_(animal_rodent)',
'mouse_(computer_equipment)', 'mousepad', 'muffin', 'mug', 'mushroom',
'music_stool', 'musical_instrument', 'nailfile', 'nameplate', 'napkin',
'neckerchief', 'necklace', 'necktie', 'needle', 'nest', 'newsstand',
'nightshirt', 'nosebag_(for_animals)', 'noseband_(for_animals)',
'notebook', 'notepad', 'nut', 'nutcracker', 'oar', 'octopus_(food)',
'octopus_(animal)', 'oil_lamp', 'olive_oil', 'omelet', 'onion',
'orange_(fruit)', 'orange_juice', 'oregano', 'ostrich', 'ottoman',
'overalls_(clothing)', 'owl', 'packet', 'inkpad', 'pad', 'paddle',
'padlock', 'paintbox', 'paintbrush', 'painting', 'pajamas', 'palette',
'pan_(for_cooking)', 'pan_(metal_container)', 'pancake', 'pantyhose',
'papaya', 'paperclip', 'paper_plate', 'paper_towel', 'paperback_book',
'paperweight', 'parachute', 'parakeet', 'parasail_(sports)',
'parchment', 'parka', 'parking_meter', 'parrot',
'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport',
'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter',
'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'pegboard',
'pelican', 'pen', 'pencil', 'pencil_box', 'pencil_sharpener',
'pendulum', 'penguin', 'pennant', 'penny_(coin)', 'pepper',
'pepper_mill', 'perfume', 'persimmon', 'baby', 'pet', 'petfood',
'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano',
'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow',
'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball',
'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)',
'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat',
'plate', 'platter', 'playing_card', 'playpen', 'pliers',
'plow_(farm_equipment)', 'pocket_watch', 'pocketknife',
'poker_(fire_stirring_tool)', 'pole', 'police_van', 'polo_shirt',
'poncho', 'pony', 'pool_table', 'pop_(soda)', 'portrait',
'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', 'potato',
'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', 'printer',
'projectile_(weapon)', 'projector', 'propeller', 'prune', 'pudding',
'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', 'puncher', 'puppet',
'puppy', 'quesadilla', 'quiche', 'quilt', 'rabbit', 'race_car',
'racket', 'radar', 'radiator', 'radio_receiver', 'radish', 'raft',
'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', 'rat',
'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt',
'recliner', 'record_player', 'red_cabbage', 'reflector',
'remote_control', 'rhinoceros', 'rib_(food)', 'rifle', 'ring',
'river_boat', 'road_map', 'robe', 'rocking_chair', 'roller_skate',
'Rollerblade', 'rolling_pin', 'root_beer',
'router_(computer_equipment)', 'rubber_band', 'runner_(carpet)',
'plastic_bag', 'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag',
'safety_pin', 'sail', 'salad', 'salad_plate', 'salami',
'salmon_(fish)', 'salmon_(food)', 'salsa', 'saltshaker',
'sandal_(type_of_shoe)', 'sandwich', 'satchel', 'saucepan', 'saucer',
'sausage', 'sawhorse', 'saxophone', 'scale_(measuring_instrument)',
'scarecrow', 'scarf', 'school_bus', 'scissors', 'scoreboard',
'scrambled_eggs', 'scraper', 'scratcher', 'screwdriver',
'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane',
'seashell', 'seedling', 'serving_dish', 'sewing_machine', 'shaker',
'shampoo', 'shark', 'sharpener', 'Sharpie', 'shaver_(electric)',
'shaving_cream', 'shawl', 'shears', 'sheep', 'shepherd_dog',
'sherbert', 'shield', 'shirt', 'shoe', 'shopping_bag', 'shopping_cart',
'short_pants', 'shot_glass', 'shoulder_bag', 'shovel', 'shower_head',
'shower_curtain', 'shredder_(for_paper)', 'sieve', 'signboard', 'silo',
'sink', 'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka',
'ski_pole', 'skirt', 'sled', 'sleeping_bag', 'sling_(bandage)',
'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman',
'snowmobile', 'soap', 'soccer_ball', 'sock', 'soda_fountain',
'carbonated_water', 'sofa', 'softball', 'solar_array', 'sombrero',
'soup', 'soup_bowl', 'soupspoon', 'sour_cream', 'soya_milk',
'space_shuttle', 'sparkler_(fireworks)', 'spatula', 'spear',
'spectacles', 'spice_rack', 'spider', 'sponge', 'spoon', 'sportswear',
'spotlight', 'squirrel', 'stapler_(stapling_machine)', 'starfish',
'statue_(sculpture)', 'steak_(food)', 'steak_knife',
'steamer_(kitchen_appliance)', 'steering_wheel', 'stencil',
'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', 'stirrer',
'stirrup', 'stockings_(leg_wear)', 'stool', 'stop_sign', 'brake_light',
'stove', 'strainer', 'strap', 'straw_(for_drinking)', 'strawberry',
'street_sign', 'streetlight', 'string_cheese', 'stylus', 'subwoofer',
'sugar_bowl', 'sugarcane_(plant)', 'suit_(clothing)', 'sunflower',
'sunglasses', 'sunhat', 'sunscreen', 'surfboard', 'sushi', 'mop',
'sweat_pants', 'sweatband', 'sweater', 'sweatshirt', 'sweet_potato',
'swimsuit', 'sword', 'syringe', 'Tabasco_sauce', 'table-tennis_table',
'table', 'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag',
'taillight', 'tambourine', 'army_tank', 'tank_(storage_vessel)',
'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure',
'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup',
'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth',
'telephone_pole', 'telephoto_lens', 'television_camera',
'television_set', 'tennis_ball', 'tennis_racket', 'tequila',
'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread',
'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', 'tinfoil',
'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', 'toaster_oven',
'toilet', 'toilet_tissue', 'tomato', 'tongs', 'toolbox', 'toothbrush',
'toothpaste', 'toothpick', 'cover', 'tortilla', 'tow_truck', 'towel',
'towel_rack', 'toy', 'tractor_(farm_equipment)', 'traffic_light',
'dirt_bike', 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline',
'tray', 'tree_house', 'trench_coat', 'triangle_(musical_instrument)',
'tricycle', 'tripod', 'trousers', 'truck', 'truffle_(chocolate)',
'trunk', 'vat', 'turban', 'turkey_(bird)', 'turkey_(food)', 'turnip',
'turtle', 'turtleneck_(clothing)', 'typewriter', 'umbrella',
'underwear', 'unicycle', 'urinal', 'urn', 'vacuum_cleaner', 'valve',
'vase', 'vending_machine', 'vent', 'videotape', 'vinegar', 'violin',
'vodka', 'volleyball', 'vulture', 'waffle', 'waffle_iron', 'wagon',
'wagon_wheel', 'walking_stick', 'wall_clock', 'wall_socket', 'wallet',
'walrus', 'wardrobe', 'wasabi', 'automatic_washer', 'watch',
'water_bottle', 'water_cooler', 'water_faucet', 'water_filter',
'water_heater', 'water_jug', 'water_gun', 'water_scooter', 'water_ski',
'water_tower', 'watering_can', 'watermelon', 'weathervane', 'webcam',
'wedding_cake', 'wedding_ring', 'wet_suit', 'wheel', 'wheelchair',
'whipped_cream', 'whiskey', 'whistle', 'wick', 'wig', 'wind_chime',
'windmill', 'window_box_(for_plants)', 'windshield_wiper', 'windsock',
'wine_bottle', 'wine_bucket', 'wineglass', 'wing_chair',
'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon', 'wreath',
'wrench', 'wristband', 'wristlet', 'yacht', 'yak', 'yogurt',
'yoke_(animal_equipment)', 'zebra', 'zucchini')
def load_annotations(self, ann_file):
"""Load annotation from lvis style annotation file.
Args:
ann_file (str): Path of annotation file.
Returns:
list[dict]: Annotation info from LVIS api.
"""
try:
import lvis
assert lvis.__version__ >= '10.5.3'
from lvis import LVIS
except AssertionError:
raise AssertionError('Incompatible version of lvis is installed. '
'Run pip uninstall lvis first. Then run pip '
'install mmlvis to install open-mmlab forked '
'lvis. ')
except ImportError:
raise ImportError('Package lvis is not installed. Please run pip '
'install mmlvis to install open-mmlab forked '
'lvis.')
self.coco = LVIS(ann_file)
self.cat_ids = self.coco.get_cat_ids()
self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
self.img_ids = self.coco.get_img_ids()
data_infos = []
for i in self.img_ids:
info = self.coco.load_imgs([i])[0]
if info['file_name'].startswith('COCO'):
# Convert from the COCO 2014 file naming convention of
# COCO_[train/val/test]2014_000000000000.jpg to the 2017
# naming convention of 000000000000.jpg
# (LVIS v1 will fix this naming issue)
info['filename'] = info['file_name'][-16:]
else:
info['filename'] = info['file_name']
data_infos.append(info)
return data_infos
def evaluate(self,
results,
metric='bbox',
logger=None,
jsonfile_prefix=None,
classwise=False,
proposal_nums=(100, 300, 1000),
iou_thrs=np.arange(0.5, 0.96, 0.05)):
"""Evaluation in LVIS protocol.
Args:
results (list[list | tuple]): Testing results of the dataset.
metric (str | list[str]): Metrics to be evaluated. Options are
'bbox', 'segm', 'proposal', 'proposal_fast'.
logger (logging.Logger | str | None): Logger used for printing
related information during evaluation. Default: None.
jsonfile_prefix (str | None): The prefix of output json files, which
includes the file path and the filename prefix, e.g. "a/b/prefix".
If not specified, a temp file will be created. Default: None.
classwise (bool): Whether to evaluate the AP for each class.
proposal_nums (Sequence[int]): Proposal number used for evaluating
recalls, such as recall@100, recall@1000.
Default: (100, 300, 1000).
iou_thrs (Sequence[float]): IoU thresholds used for evaluating
recalls. If set to a list, the average recall of all IoUs will
also be computed. Default: np.arange(0.5, 0.96, 0.05).
Returns:
dict[str, float]: LVIS style metrics.
"""
try:
import lvis
assert lvis.__version__ >= '10.5.3'
from lvis import LVISResults, LVISEval
except AssertionError:
raise AssertionError('Incompatible version of lvis is installed. '
'Run pip uninstall lvis first. Then run pip '
'install mmlvis to install open-mmlab forked '
'lvis. ')
except ImportError:
raise ImportError('Package lvis is not installed. Please run pip '
'install mmlvis to install open-mmlab forked '
'lvis.')
assert isinstance(results, list), 'results must be a list'
assert len(results) == len(self), (
'The length of results is not equal to the dataset len: {} != {}'.
format(len(results), len(self)))
metrics = metric if isinstance(metric, list) else [metric]
allowed_metrics = ['bbox', 'segm', 'proposal', 'proposal_fast']
for metric in metrics:
if metric not in allowed_metrics:
raise KeyError('metric {} is not supported'.format(metric))
if jsonfile_prefix is None:
tmp_dir = tempfile.TemporaryDirectory()
jsonfile_prefix = osp.join(tmp_dir.name, 'results')
else:
tmp_dir = None
result_files = self.results2json(results, jsonfile_prefix)
eval_results = OrderedDict()
# get original api
lvis_gt = self.coco
for metric in metrics:
msg = 'Evaluating {}...'.format(metric)
if logger is None:
msg = '\n' + msg
print_log(msg, logger=logger)
if metric == 'proposal_fast':
ar = self.fast_eval_recall(
results, proposal_nums, iou_thrs, logger='silent')
log_msg = []
for i, num in enumerate(proposal_nums):
eval_results['AR@{}'.format(num)] = ar[i]
log_msg.append('\nAR@{}\t{:.4f}'.format(num, ar[i]))
log_msg = ''.join(log_msg)
print_log(log_msg, logger=logger)
continue
if metric not in result_files:
raise KeyError('{} is not in results'.format(metric))
try:
lvis_dt = LVISResults(lvis_gt, result_files[metric])
except IndexError:
print_log(
'The testing results of the whole dataset are empty.',
logger=logger,
level=logging.ERROR)
break
iou_type = 'bbox' if metric == 'proposal' else metric
lvis_eval = LVISEval(lvis_gt, lvis_dt, iou_type)
lvis_eval.params.imgIds = self.img_ids
if metric == 'proposal':
lvis_eval.params.useCats = 0
lvis_eval.params.maxDets = list(proposal_nums)
lvis_eval.evaluate()
lvis_eval.accumulate()
lvis_eval.summarize()
for k, v in lvis_eval.get_results().items():
if k.startswith('AR'):
val = float('{:.3f}'.format(float(v)))
eval_results[k] = val
else:
lvis_eval.evaluate()
lvis_eval.accumulate()
lvis_eval.summarize()
lvis_results = lvis_eval.get_results()
if classwise: # Compute per-category AP
# from https://github.com/facebookresearch/detectron2/
precisions = lvis_eval.eval['precision']
# precision: (iou, recall, cls, area range, max dets)
assert len(self.cat_ids) == precisions.shape[2]
results_per_category = []
for idx, catId in enumerate(self.cat_ids):
# area range index 0: all area ranges
# max dets index -1: typically 100 per image
nm = self.coco.load_cats(catId)[0]
precision = precisions[:, :, idx, 0, -1]
precision = precision[precision > -1]
if precision.size:
ap = np.mean(precision)
else:
ap = float('nan')
results_per_category.append(
(f'{nm["name"]}', f'{float(ap):0.3f}'))
num_columns = min(6, len(results_per_category) * 2)
results_flatten = list(
itertools.chain(*results_per_category))
headers = ['category', 'AP'] * (num_columns // 2)
results_2d = itertools.zip_longest(*[
results_flatten[i::num_columns]
for i in range(num_columns)
])
table_data = [headers]
table_data += [result for result in results_2d]
table = AsciiTable(table_data)
print_log('\n' + table.table, logger=logger)
for k, v in lvis_results.items():
if k.startswith('AP'):
key = '{}_{}'.format(metric, k)
val = float('{:.3f}'.format(float(v)))
eval_results[key] = val
ap_summary = ' '.join([
'{}:{:.3f}'.format(k, float(v))
for k, v in lvis_results.items() if k.startswith('AP')
])
eval_results['{}_mAP_copypaste'.format(metric)] = ap_summary
lvis_eval.print_results()
if tmp_dir is not None:
tmp_dir.cleanup()
return eval_results
LVISDataset = LVISV05Dataset
DATASETS.register_module(name='LVISDataset', module=LVISDataset)
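# --- Hedged usage sketch (added for illustration; not part of the original
# file). Both registered names resolve to the same class, so a config may use
# either ``type='LVISDataset'`` or ``type='LVISV05Dataset'``; this assumes
# the standard mmcv ``Registry.get`` semantics.
assert DATASETS.get('LVISDataset') is DATASETS.get('LVISV05Dataset')
def _lvis_eval_demo(detections):
    """Illustrative sketch only: ``detections`` must hold one result entry
    per image (see the assertion in ``evaluate``). The annotation path below
    is a hypothetical placeholder, and calling this requires the lvis
    package plus real annotation files."""
    dataset = LVISV05Dataset(
        ann_file='path/to/lvis_v0.5_val.json',  # hypothetical path
        pipeline=[])
    return dataset.evaluate(detections, metric='bbox', classwise=True)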
@DATASETS.register_module()
class LVISV1Dataset(LVISDataset):
CLASSES = (
'aerosol_can', 'air_conditioner', 'airplane', 'alarm_clock', 'alcohol',
'alligator', 'almond', 'ambulance', 'amplifier', 'anklet', 'antenna',
'apple', 'applesauce', 'apricot', 'apron', 'aquarium',
'arctic_(type_of_shoe)', 'armband', 'armchair', 'armoire', 'armor',
'artichoke', 'trash_can', 'ashtray', 'asparagus', 'atomizer',
'avocado', 'award', 'awning', 'ax', 'baboon', 'baby_buggy',
'basketball_backboard', 'backpack', 'handbag', 'suitcase', 'bagel',
'bagpipe', 'baguet', 'bait', 'ball', 'ballet_skirt', 'balloon',
'bamboo', 'banana', 'Band_Aid', 'bandage', 'bandanna', 'banjo',
'banner', 'barbell', 'barge', 'barrel', 'barrette', 'barrow',
'baseball_base', 'baseball', 'baseball_bat', 'baseball_cap',
'baseball_glove', 'basket', 'basketball', 'bass_horn', 'bat_(animal)',
'bath_mat', 'bath_towel', 'bathrobe', 'bathtub', 'batter_(food)',
'battery', 'beachball', 'bead', 'bean_curd', 'beanbag', 'beanie',
'bear', 'bed', 'bedpan', 'bedspread', 'cow', 'beef_(food)', 'beeper',
'beer_bottle', 'beer_can', 'beetle', 'bell', 'bell_pepper', 'belt',
'belt_buckle', 'bench', 'beret', 'bib', 'Bible', 'bicycle', 'visor',
'billboard', 'binder', 'binoculars', 'bird', 'birdfeeder', 'birdbath',
'birdcage', 'birdhouse', 'birthday_cake', 'birthday_card',
'pirate_flag', 'black_sheep', 'blackberry', 'blackboard', 'blanket',
'blazer', 'blender', 'blimp', 'blinker', 'blouse', 'blueberry',
'gameboard', 'boat', 'bob', 'bobbin', 'bobby_pin', 'boiled_egg',
'bolo_tie', 'deadbolt', 'bolt', 'bonnet', 'book', 'bookcase',
'booklet', 'bookmark', 'boom_microphone', 'boot', 'bottle',
'bottle_opener', 'bouquet', 'bow_(weapon)', 'bow_(decorative_ribbons)',
'bow-tie', 'bowl', 'pipe_bowl', 'bowler_hat', 'bowling_ball', 'box',
'boxing_glove', 'suspenders', 'bracelet', 'brass_plaque', 'brassiere',
'bread-bin', 'bread', 'breechcloth', 'bridal_gown', 'briefcase',
'broccoli', 'broach', 'broom', 'brownie', 'brussels_sprouts',
'bubble_gum', 'bucket', 'horse_buggy', 'bull', 'bulldog', 'bulldozer',
'bullet_train', 'bulletin_board', 'bulletproof_vest', 'bullhorn',
'bun', 'bunk_bed', 'buoy', 'burrito', 'bus_(vehicle)', 'business_card',
'butter', 'butterfly', 'button', 'cab_(taxi)', 'cabana', 'cabin_car',
'cabinet', 'locker', 'cake', 'calculator', 'calendar', 'calf',
'camcorder', 'camel', 'camera', 'camera_lens', 'camper_(vehicle)',
'can', 'can_opener', 'candle', 'candle_holder', 'candy_bar',
'candy_cane', 'walking_cane', 'canister', 'canoe', 'cantaloup',
'canteen', 'cap_(headwear)', 'bottle_cap', 'cape', 'cappuccino',
'car_(automobile)', 'railcar_(part_of_a_train)', 'elevator_car',
'car_battery', 'identity_card', 'card', 'cardigan', 'cargo_ship',
'carnation', 'horse_carriage', 'carrot', 'tote_bag', 'cart', 'carton',
'cash_register', 'casserole', 'cassette', 'cast', 'cat', 'cauliflower',
'cayenne_(spice)', 'CD_player', 'celery', 'cellular_telephone',
'chain_mail', 'chair', 'chaise_longue', 'chalice', 'chandelier',
'chap', 'checkbook', 'checkerboard', 'cherry', 'chessboard',
'chicken_(animal)', 'chickpea', 'chili_(vegetable)', 'chime',
'chinaware', 'crisp_(potato_chip)', 'poker_chip', 'chocolate_bar',
'chocolate_cake', 'chocolate_milk', 'chocolate_mousse', 'choker',
'chopping_board', 'chopstick', 'Christmas_tree', 'slide', 'cider',
'cigar_box', 'cigarette', 'cigarette_case', 'cistern', 'clarinet',
'clasp', 'cleansing_agent', 'cleat_(for_securing_rope)', 'clementine',
'clip', 'clipboard', 'clippers_(for_plants)', 'cloak', 'clock',
'clock_tower', 'clothes_hamper', 'clothespin', 'clutch_bag', 'coaster',
'coat', 'coat_hanger', 'coatrack', 'cock', 'cockroach',
'cocoa_(beverage)', 'coconut', 'coffee_maker', 'coffee_table',
'coffeepot', 'coil', 'coin', 'colander', 'coleslaw',
'coloring_material', 'combination_lock', 'pacifier', 'comic_book',
'compass', 'computer_keyboard', 'condiment', 'cone', 'control',
'convertible_(automobile)', 'sofa_bed', 'cooker', 'cookie',
'cooking_utensil', 'cooler_(for_food)', 'cork_(bottle_plug)',
'corkboard', 'corkscrew', 'edible_corn', 'cornbread', 'cornet',
'cornice', 'cornmeal', 'corset', 'costume', 'cougar', 'coverall',
'cowbell', 'cowboy_hat', 'crab_(animal)', 'crabmeat', 'cracker',
'crape', 'crate', 'crayon', 'cream_pitcher', 'crescent_roll', 'crib',
'crock_pot', 'crossbar', 'crouton', 'crow', 'crowbar', 'crown',
'crucifix', 'cruise_ship', 'police_cruiser', 'crumb', 'crutch',
'cub_(animal)', 'cube', 'cucumber', 'cufflink', 'cup', 'trophy_cup',
'cupboard', 'cupcake', 'hair_curler', 'curling_iron', 'curtain',
'cushion', 'cylinder', 'cymbal', 'dagger', 'dalmatian', 'dartboard',
'date_(fruit)', 'deck_chair', 'deer', 'dental_floss', 'desk',
'detergent', 'diaper', 'diary', 'die', 'dinghy', 'dining_table', 'tux',
'dish', 'dish_antenna', 'dishrag', 'dishtowel', 'dishwasher',
'dishwasher_detergent', 'dispenser', 'diving_board', 'Dixie_cup',
'dog', 'dog_collar', 'doll', 'dollar', 'dollhouse', 'dolphin',
'domestic_ass', 'doorknob', 'doormat', 'doughnut', 'dove', 'dragonfly',
'drawer', 'underdrawers', 'dress', 'dress_hat', 'dress_suit',
'dresser', 'drill', 'drone', 'dropper', 'drum_(musical_instrument)',
'drumstick', 'duck', 'duckling', 'duct_tape', 'duffel_bag', 'dumbbell',
'dumpster', 'dustpan', 'eagle', 'earphone', 'earplug', 'earring',
'easel', 'eclair', 'eel', 'egg', 'egg_roll', 'egg_yolk', 'eggbeater',
'eggplant', 'electric_chair', 'refrigerator', 'elephant', 'elk',
'envelope', 'eraser', 'escargot', 'eyepatch', 'falcon', 'fan',
'faucet', 'fedora', 'ferret', 'Ferris_wheel', 'ferry', 'fig_(fruit)',
'fighter_jet', 'figurine', 'file_cabinet', 'file_(tool)', 'fire_alarm',
'fire_engine', 'fire_extinguisher', 'fire_hose', 'fireplace',
'fireplug', 'first-aid_kit', 'fish', 'fish_(food)', 'fishbowl',
'fishing_rod', 'flag', 'flagpole', 'flamingo', 'flannel', 'flap',
'flash', 'flashlight', 'fleece', 'flip-flop_(sandal)',
'flipper_(footwear)', 'flower_arrangement', 'flute_glass', 'foal',
'folding_chair', 'food_processor', 'football_(American)',
'football_helmet', 'footstool', 'fork', 'forklift', 'freight_car',
'French_toast', 'freshener', 'frisbee', 'frog', 'fruit_juice',
'frying_pan', 'fudge', 'funnel', 'futon', 'gag', 'garbage',
'garbage_truck', 'garden_hose', 'gargle', 'gargoyle', 'garlic',
'gasmask', 'gazelle', 'gelatin', 'gemstone', 'generator',
'giant_panda', 'gift_wrap', 'ginger', 'giraffe', 'cincture',
'glass_(drink_container)', 'globe', 'glove', 'goat', 'goggles',
'goldfish', 'golf_club', 'golfcart', 'gondola_(boat)', 'goose',
'gorilla', 'gourd', 'grape', 'grater', 'gravestone', 'gravy_boat',
'green_bean', 'green_onion', 'griddle', 'grill', 'grits', 'grizzly',
'grocery_bag', 'guitar', 'gull', 'gun', 'hairbrush', 'hairnet',
'hairpin', 'halter_top', 'ham', 'hamburger', 'hammer', 'hammock',
'hamper', 'hamster', 'hair_dryer', 'hand_glass', 'hand_towel',
'handcart', 'handcuff', 'handkerchief', 'handle', 'handsaw',
'hardback_book', 'harmonium', 'hat', 'hatbox', 'veil', 'headband',
'headboard', 'headlight', 'headscarf', 'headset',
'headstall_(for_horses)', 'heart', 'heater', 'helicopter', 'helmet',
'heron', 'highchair', 'hinge', 'hippopotamus', 'hockey_stick', 'hog',
'home_plate_(baseball)', 'honey', 'fume_hood', 'hook', 'hookah',
'hornet', 'horse', 'hose', 'hot-air_balloon', 'hotplate', 'hot_sauce',
'hourglass', 'houseboat', 'hummingbird', 'hummus', 'polar_bear',
'icecream', 'popsicle', 'ice_maker', 'ice_pack', 'ice_skate',
'igniter', 'inhaler', 'iPod', 'iron_(for_clothing)', 'ironing_board',
'jacket', 'jam', 'jar', 'jean', 'jeep', 'jelly_bean', 'jersey',
'jet_plane', 'jewel', 'jewelry', 'joystick', 'jumpsuit', 'kayak',
'keg', 'kennel', 'kettle', 'key', 'keycard', 'kilt', 'kimono',
'kitchen_sink', 'kitchen_table', 'kite', 'kitten', 'kiwi_fruit',
'knee_pad', 'knife', 'knitting_needle', 'knob', 'knocker_(on_a_door)',
'koala', 'lab_coat', 'ladder', 'ladle', 'ladybug', 'lamb_(animal)',
'lamb-chop', 'lamp', 'lamppost', 'lampshade', 'lantern', 'lanyard',
'laptop_computer', 'lasagna', 'latch', 'lawn_mower', 'leather',
'legging_(clothing)', 'Lego', 'legume', 'lemon', 'lemonade', 'lettuce',
'license_plate', 'life_buoy', 'life_jacket', 'lightbulb',
'lightning_rod', 'lime', 'limousine', 'lion', 'lip_balm', 'liquor',
'lizard', 'log', 'lollipop', 'speaker_(stero_equipment)', 'loveseat',
'machine_gun', 'magazine', 'magnet', 'mail_slot', 'mailbox_(at_home)',
'mallard', 'mallet', 'mammoth', 'manatee', 'mandarin_orange', 'manger',
'manhole', 'map', 'marker', 'martini', 'mascot', 'mashed_potato',
'masher', 'mask', 'mast', 'mat_(gym_equipment)', 'matchbox',
'mattress', 'measuring_cup', 'measuring_stick', 'meatball', 'medicine',
'melon', 'microphone', 'microscope', 'microwave_oven', 'milestone',
'milk', 'milk_can', 'milkshake', 'minivan', 'mint_candy', 'mirror',
'mitten', 'mixer_(kitchen_tool)', 'money',
'monitor_(computer_equipment) computer_monitor', 'monkey', 'motor',
'motor_scooter', 'motor_vehicle', 'motorcycle', 'mound_(baseball)',
'mouse_(computer_equipment)', 'mousepad', 'muffin', 'mug', 'mushroom',
'music_stool', 'musical_instrument', 'nailfile', 'napkin',
'neckerchief', 'necklace', 'necktie', 'needle', 'nest', 'newspaper',
'newsstand', 'nightshirt', 'nosebag_(for_animals)',
'noseband_(for_animals)', 'notebook', 'notepad', 'nut', 'nutcracker',
'oar', 'octopus_(food)', 'octopus_(animal)', 'oil_lamp', 'olive_oil',
'omelet', 'onion', 'orange_(fruit)', 'orange_juice', 'ostrich',
'ottoman', 'oven', 'overalls_(clothing)', 'owl', 'packet', 'inkpad',
'pad', 'paddle', 'padlock', 'paintbrush', 'painting', 'pajamas',
'palette', 'pan_(for_cooking)', 'pan_(metal_container)', 'pancake',
'pantyhose', 'papaya', 'paper_plate', 'paper_towel', 'paperback_book',
'paperweight', 'parachute', 'parakeet', 'parasail_(sports)', 'parasol',
'parchment', 'parka', 'parking_meter', 'parrot',
'passenger_car_(part_of_a_train)', 'passenger_ship', 'passport',
'pastry', 'patty_(food)', 'pea_(food)', 'peach', 'peanut_butter',
'pear', 'peeler_(tool_for_fruit_and_vegetables)', 'wooden_leg',
'pegboard', 'pelican', 'pen', 'pencil', 'pencil_box',
'pencil_sharpener', 'pendulum', 'penguin', 'pennant', 'penny_(coin)',
'pepper', 'pepper_mill', 'perfume', 'persimmon', 'person', 'pet',
'pew_(church_bench)', 'phonebook', 'phonograph_record', 'piano',
'pickle', 'pickup_truck', 'pie', 'pigeon', 'piggy_bank', 'pillow',
'pin_(non_jewelry)', 'pineapple', 'pinecone', 'ping-pong_ball',
'pinwheel', 'tobacco_pipe', 'pipe', 'pistol', 'pita_(bread)',
'pitcher_(vessel_for_liquid)', 'pitchfork', 'pizza', 'place_mat',
'plate', 'platter', 'playpen', 'pliers', 'plow_(farm_equipment)',
'plume', 'pocket_watch', 'pocketknife', 'poker_(fire_stirring_tool)',
'pole', 'polo_shirt', 'poncho', 'pony', 'pool_table', 'pop_(soda)',
'postbox_(public)', 'postcard', 'poster', 'pot', 'flowerpot', 'potato',
'potholder', 'pottery', 'pouch', 'power_shovel', 'prawn', 'pretzel',
'printer', 'projectile_(weapon)', 'projector', 'propeller', 'prune',
'pudding', 'puffer_(fish)', 'puffin', 'pug-dog', 'pumpkin', 'puncher',
'puppet', 'puppy', 'quesadilla', 'quiche', 'quilt', 'rabbit',
'race_car', 'racket', 'radar', 'radiator', 'radio_receiver', 'radish',
'raft', 'rag_doll', 'raincoat', 'ram_(animal)', 'raspberry', 'rat',
'razorblade', 'reamer_(juicer)', 'rearview_mirror', 'receipt',
'recliner', 'record_player', 'reflector', 'remote_control',
'rhinoceros', 'rib_(food)', 'rifle', 'ring', 'river_boat', 'road_map',
'robe', 'rocking_chair', 'rodent', 'roller_skate', 'Rollerblade',
'rolling_pin', 'root_beer', 'router_(computer_equipment)',
'rubber_band', 'runner_(carpet)', 'plastic_bag',
'saddle_(on_an_animal)', 'saddle_blanket', 'saddlebag', 'safety_pin',
'sail', 'salad', 'salad_plate', 'salami', 'salmon_(fish)',
'salmon_(food)', 'salsa', 'saltshaker', 'sandal_(type_of_shoe)',
'sandwich', 'satchel', 'saucepan', 'saucer', 'sausage', 'sawhorse',
'saxophone', 'scale_(measuring_instrument)', 'scarecrow', 'scarf',
'school_bus', 'scissors', 'scoreboard', 'scraper', 'screwdriver',
'scrubbing_brush', 'sculpture', 'seabird', 'seahorse', 'seaplane',
'seashell', 'sewing_machine', 'shaker', 'shampoo', 'shark',
'sharpener', 'Sharpie', 'shaver_(electric)', 'shaving_cream', 'shawl',
'shears', 'sheep', 'shepherd_dog', 'sherbert', 'shield', 'shirt',
'shoe', 'shopping_bag', 'shopping_cart', 'short_pants', 'shot_glass',
'shoulder_bag', 'shovel', 'shower_head', 'shower_cap',
'shower_curtain', 'shredder_(for_paper)', 'signboard', 'silo', 'sink',
'skateboard', 'skewer', 'ski', 'ski_boot', 'ski_parka', 'ski_pole',
'skirt', 'skullcap', 'sled', 'sleeping_bag', 'sling_(bandage)',
'slipper_(footwear)', 'smoothie', 'snake', 'snowboard', 'snowman',
'snowmobile', 'soap', 'soccer_ball', 'sock', 'sofa', 'softball',
'solar_array', 'sombrero', 'soup', 'soup_bowl', 'soupspoon',
'sour_cream', 'soya_milk', 'space_shuttle', 'sparkler_(fireworks)',
'spatula', 'spear', 'spectacles', 'spice_rack', 'spider', 'crawfish',
'sponge', 'spoon', 'sportswear', 'spotlight', 'squid_(food)',
'squirrel', 'stagecoach', 'stapler_(stapling_machine)', 'starfish',
'statue_(sculpture)', 'steak_(food)', 'steak_knife', 'steering_wheel',
'stepladder', 'step_stool', 'stereo_(sound_system)', 'stew', 'stirrer',
'stirrup', 'stool', 'stop_sign', 'brake_light', 'stove', 'strainer',
'strap', 'straw_(for_drinking)', 'strawberry', 'street_sign',
'streetlight', 'string_cheese', 'stylus', 'subwoofer', 'sugar_bowl',
'sugarcane_(plant)', 'suit_(clothing)', 'sunflower', 'sunglasses',
'sunhat', 'surfboard', 'sushi', 'mop', 'sweat_pants', 'sweatband',
'sweater', 'sweatshirt', 'sweet_potato', 'swimsuit', 'sword',
'syringe', 'Tabasco_sauce', 'table-tennis_table', 'table',
'table_lamp', 'tablecloth', 'tachometer', 'taco', 'tag', 'taillight',
'tambourine', 'army_tank', 'tank_(storage_vessel)',
'tank_top_(clothing)', 'tape_(sticky_cloth_or_paper)', 'tape_measure',
'tapestry', 'tarp', 'tartan', 'tassel', 'tea_bag', 'teacup',
'teakettle', 'teapot', 'teddy_bear', 'telephone', 'telephone_booth',
'telephone_pole', 'telephoto_lens', 'television_camera',
'television_set', 'tennis_ball', 'tennis_racket', 'tequila',
'thermometer', 'thermos_bottle', 'thermostat', 'thimble', 'thread',
'thumbtack', 'tiara', 'tiger', 'tights_(clothing)', 'timer', 'tinfoil',
'tinsel', 'tissue_paper', 'toast_(food)', 'toaster', 'toaster_oven',
'toilet', 'toilet_tissue', 'tomato', 'tongs', 'toolbox', 'toothbrush',
'toothpaste', 'toothpick', 'cover', 'tortilla', 'tow_truck', 'towel',
'towel_rack', 'toy', 'tractor_(farm_equipment)', 'traffic_light',
'dirt_bike', 'trailer_truck', 'train_(railroad_vehicle)', 'trampoline',
'tray', 'trench_coat', 'triangle_(musical_instrument)', 'tricycle',
'tripod', 'trousers', 'truck', 'truffle_(chocolate)', 'trunk', 'vat',
'turban', 'turkey_(food)', 'turnip', 'turtle', 'turtleneck_(clothing)',
'typewriter', 'umbrella', 'underwear', 'unicycle', 'urinal', 'urn',
'vacuum_cleaner', 'vase', 'vending_machine', 'vent', 'vest',
'videotape', 'vinegar', 'violin', 'vodka', 'volleyball', 'vulture',
'waffle', 'waffle_iron', 'wagon', 'wagon_wheel', 'walking_stick',
'wall_clock', 'wall_socket', 'wallet', 'walrus', 'wardrobe',
'washbasin', 'automatic_washer', 'watch', 'water_bottle',
'water_cooler', 'water_faucet', 'water_heater', 'water_jug',
'water_gun', 'water_scooter', 'water_ski', 'water_tower',
'watering_can', 'watermelon', 'weathervane', 'webcam', 'wedding_cake',
'wedding_ring', 'wet_suit', 'wheel', 'wheelchair', 'whipped_cream',
'whistle', 'wig', 'wind_chime', 'windmill', 'window_box_(for_plants)',
'windshield_wiper', 'windsock', 'wine_bottle', 'wine_bucket',
'wineglass', 'blinder_(for_horses)', 'wok', 'wolf', 'wooden_spoon',
'wreath', 'wrench', 'wristband', 'wristlet', 'yacht', 'yogurt',
'yoke_(animal_equipment)', 'zebra', 'zucchini')
def load_annotations(self, ann_file):
try:
import lvis
assert lvis.__version__ >= '10.5.3'
from lvis import LVIS
except AssertionError:
raise AssertionError('Incompatible version of lvis is installed. '
'Run pip uninstall lvis first. Then run pip '
'install mmlvis to install open-mmlab forked '
'lvis. ')
except ImportError:
raise ImportError('Package lvis is not installed. Please run pip '
'install mmlvis to install open-mmlab forked '
'lvis.')
self.coco = LVIS(ann_file)
self.cat_ids = self.coco.get_cat_ids()
self.cat2label = {cat_id: i for i, cat_id in enumerate(self.cat_ids)}
self.img_ids = self.coco.get_img_ids()
data_infos = []
for i in self.img_ids:
info = self.coco.load_imgs([i])[0]
# coco_url is used in LVISv1 instead of file_name
# e.g. http://images.cocodataset.org/train2017/000000391895.jpg
# the train/val split is specified in the url
info['filename'] = info['coco_url'].replace(
'http://images.cocodataset.org/', '')
data_infos.append(info)
return data_infos
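# --- Hedged sketch (illustration only, not part of the original file): the
# LVIS v1 filename convention above keeps the train/val prefix because only
# the host part of ``coco_url`` is stripped.
_demo_url = 'http://images.cocodataset.org/train2017/000000391895.jpg'
assert _demo_url.replace('http://images.cocodataset.org/',
                         '') == 'train2017/000000391895.jpg'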
from .auto_augment import (AutoAugment, BrightnessTransform, ColorTransform,
ContrastTransform, EqualizeTransform, Rotate, Shear,
Translate)
from .compose import Compose
from .formating import (Collect, DefaultFormatBundle, ImageToTensor,
ToDataContainer, ToTensor, Transpose, to_tensor)
from .instaboost import InstaBoost
from .loading import (LoadAnnotations, LoadImageFromFile, LoadImageFromWebcam,
LoadMultiChannelImageFromFiles, LoadProposals)
from .test_time_aug import MultiScaleFlipAug
from .transforms import (Albu, CutOut, Expand, MinIoURandomCrop, Normalize,
Pad, PhotoMetricDistortion, RandomCenterCropPad,
RandomCrop, RandomFlip, Resize, SegRescale)
__all__ = [
'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer',
'Transpose', 'Collect', 'DefaultFormatBundle', 'LoadAnnotations',
'LoadImageFromFile', 'LoadImageFromWebcam',
'LoadMultiChannelImageFromFiles', 'LoadProposals', 'MultiScaleFlipAug',
'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 'Normalize', 'SegRescale',
'MinIoURandomCrop', 'Expand', 'PhotoMetricDistortion', 'Albu',
'InstaBoost', 'RandomCenterCropPad', 'AutoAugment', 'CutOut', 'Shear',
'Rotate', 'ColorTransform', 'EqualizeTransform', 'BrightnessTransform',
'ContrastTransform', 'Translate'
]
import copy
import cv2
import mmcv
import numpy as np
from ..builder import PIPELINES
from .compose import Compose
_MAX_LEVEL = 10
def level_to_value(level, max_value):
"""Map from level to values based on max_value."""
return (level / _MAX_LEVEL) * max_value
def enhance_level_to_value(level, a=1.8, b=0.1):
"""Map from level to values."""
return (level / _MAX_LEVEL) * a + b
def random_negative(value, random_negative_prob):
"""Randomly negate value based on random_negative_prob."""
return -value if np.random.rand() < random_negative_prob else value
def bbox2fields():
"""The key correspondence from bboxes to labels, masks and
segmentations."""
bbox2label = {
'gt_bboxes': 'gt_labels',
'gt_bboxes_ignore': 'gt_labels_ignore'
}
bbox2mask = {
'gt_bboxes': 'gt_masks',
'gt_bboxes_ignore': 'gt_masks_ignore'
}
bbox2seg = {
'gt_bboxes': 'gt_semantic_seg',
}
return bbox2label, bbox2mask, bbox2seg
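# --- Hedged sanity sketch for the helpers above (illustration only, not
# part of the original file): level 10 maps to ``max_value``, and the
# probability endpoints of ``random_negative`` are deterministic because
# ``np.random.rand()`` lies in [0, 1).
assert level_to_value(10, 0.3) == 0.3
assert level_to_value(5, 30) == 15.0
assert abs(enhance_level_to_value(10) - (1.8 + 0.1)) < 1e-9
assert random_negative(3, 0.0) == 3   # prob 0: value is never negated
assert random_negative(3, 1.0) == -3  # prob 1: value is always negated
assert bbox2fields()[0]['gt_bboxes'] == 'gt_labels'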
@PIPELINES.register_module()
class AutoAugment(object):
"""Auto augmentation.
This data augmentation is proposed in `Learning Data Augmentation
Strategies for Object Detection <https://arxiv.org/pdf/1906.11172>`_.
TODO: Implement the 'Sharpness' transform.
Args:
policies (list[list[dict]]): The policies of auto augmentation. Each
policy in ``policies`` is a specific augmentation policy, and is
composed by several augmentations (dict). When AutoAugment is
called, a random policy in ``policies`` will be selected to
augment images.
Examples:
>>> replace = (104, 116, 124)
>>> policies = [
>>> [
>>> dict(type='Sharpness', prob=0.0, level=8),
>>> dict(
>>> type='Shear',
>>> prob=0.4,
>>> level=0,
>>> replace=replace,
>>> axis='x')
>>> ],
>>> [
>>> dict(
>>> type='Rotate',
>>> prob=0.6,
>>> level=10,
>>> replace=replace),
>>> dict(type='Color', prob=1.0, level=6)
>>> ]
>>> ]
>>> augmentation = AutoAugment(policies)
>>> img = np.ones((100, 100, 3))
>>> gt_bboxes = np.ones((10, 4))
>>> results = dict(img=img, gt_bboxes=gt_bboxes)
>>> results = augmentation(results)
"""
def __init__(self, policies):
assert isinstance(policies, list) and len(policies) > 0, \
'Policies must be a non-empty list.'
for policy in policies:
assert isinstance(policy, list) and len(policy) > 0, \
'Each policy in policies must be a non-empty list.'
for augment in policy:
assert isinstance(augment, dict) and 'type' in augment, \
'Each specific augmentation must be a dict with key' \
' "type".'
self.policies = copy.deepcopy(policies)
self.transforms = [Compose(policy) for policy in self.policies]
def __call__(self, results):
transform = np.random.choice(self.transforms)
return transform(results)
def __repr__(self):
return f'{self.__class__.__name__}(policies={self.policies})'
@PIPELINES.register_module()
class Shear(object):
"""Apply Shear Transformation to image (and its corresponding bbox, mask,
segmentation).
Args:
level (int | float): The level should be in range [0,_MAX_LEVEL].
img_fill_val (int | float | tuple): The filled values for image border.
If float, the same fill value will be used for all the three
channels of image. If tuple, it should have 3 elements.
seg_ignore_label (int): The fill value used for segmentation map.
Note this value must equal ``ignore_label`` in ``semantic_head``
of the corresponding config. Default 255.
prob (float): The probability for performing Shear and should be in
range [0, 1].
direction (str): The direction for shear, either "horizontal"
or "vertical".
max_shear_magnitude (float): The maximum magnitude for Shear
transformation.
random_negative_prob (float): The probability that turns the
offset negative. Should be in range [0,1]
interpolation (str): Same as in :func:`mmcv.imshear`.
"""
def __init__(self,
level,
img_fill_val=128,
seg_ignore_label=255,
prob=0.5,
direction='horizontal',
max_shear_magnitude=0.3,
random_negative_prob=0.5,
interpolation='bilinear'):
assert isinstance(level, (int, float)), 'The level must be type ' \
f'int or float, got {type(level)}.'
assert 0 <= level <= _MAX_LEVEL, 'The level should be in range ' \
f'[0,{_MAX_LEVEL}], got {level}.'
if isinstance(img_fill_val, (float, int)):
img_fill_val = tuple([float(img_fill_val)] * 3)
elif isinstance(img_fill_val, tuple):
assert len(img_fill_val) == 3, 'img_fill_val as tuple must ' \
f'have 3 elements. got {len(img_fill_val)}.'
img_fill_val = tuple([float(val) for val in img_fill_val])
else:
raise ValueError(
'img_fill_val must be float or tuple with 3 elements.')
assert np.all([0 <= val <= 255 for val in img_fill_val]), 'all ' \
'elements of img_fill_val should be in range [0,255]. ' \
f'got {img_fill_val}.'
assert 0 <= prob <= 1.0, 'The probability of shear should be in ' \
f'range [0,1]. got {prob}.'
assert direction in ('horizontal', 'vertical'), 'direction must ' \
f'be either "horizontal" or "vertical". got {direction}.'
assert isinstance(max_shear_magnitude, float), 'max_shear_magnitude ' \
f'should be type float. got {type(max_shear_magnitude)}.'
assert 0. <= max_shear_magnitude <= 1., 'max_shear_magnitude ' \
'should be in range [0,1]. ' \
f'got {max_shear_magnitude}.'
self.level = level
self.magnitude = level_to_value(level, max_shear_magnitude)
self.img_fill_val = img_fill_val
self.seg_ignore_label = seg_ignore_label
self.prob = prob
self.direction = direction
self.max_shear_magnitude = max_shear_magnitude
self.random_negative_prob = random_negative_prob
self.interpolation = interpolation
def _shear_img(self,
results,
magnitude,
direction='horizontal',
interpolation='bilinear'):
"""Shear the image.
Args:
results (dict): Result dict from loading pipeline.
magnitude (int | float): The magnitude used for shear.
direction (str): The direction for shear, either "horizontal"
or "vertical".
interpolation (str): Same as in :func:`mmcv.imshear`.
"""
for key in results.get('img_fields', ['img']):
img = results[key]
img_sheared = mmcv.imshear(
img,
magnitude,
direction,
border_value=self.img_fill_val,
interpolation=interpolation)
results[key] = img_sheared.astype(img.dtype)
def _shear_bboxes(self, results, magnitude):
"""Shear the bboxes."""
h, w, c = results['img_shape']
if self.direction == 'horizontal':
shear_matrix = np.stack([[1, magnitude],
[0, 1]]).astype(np.float32) # [2, 2]
else:
shear_matrix = np.stack([[1, 0], [magnitude,
1]]).astype(np.float32)
for key in results.get('bbox_fields', []):
min_x, min_y, max_x, max_y = np.split(
results[key], results[key].shape[-1], axis=-1)
coordinates = np.stack([[min_x, min_y], [max_x, min_y],
[min_x, max_y],
[max_x, max_y]]) # [4, 2, nb_box, 1]
coordinates = coordinates[..., 0].transpose(
(2, 1, 0)).astype(np.float32) # [nb_box, 2, 4]
new_coords = np.matmul(shear_matrix[None, :, :],
coordinates) # [nb_box, 2, 4]
min_x = np.min(new_coords[:, 0, :], axis=-1)
min_y = np.min(new_coords[:, 1, :], axis=-1)
max_x = np.max(new_coords[:, 0, :], axis=-1)
max_y = np.max(new_coords[:, 1, :], axis=-1)
min_x = np.clip(min_x, a_min=0, a_max=w)
min_y = np.clip(min_y, a_min=0, a_max=h)
max_x = np.clip(max_x, a_min=min_x, a_max=w)
max_y = np.clip(max_y, a_min=min_y, a_max=h)
results[key] = np.stack([min_x, min_y, max_x, max_y],
axis=-1).astype(results[key].dtype)
def _shear_masks(self,
results,
magnitude,
direction='horizontal',
fill_val=0,
interpolation='bilinear'):
"""Shear the masks."""
h, w, c = results['img_shape']
for key in results.get('mask_fields', []):
masks = results[key]
results[key] = masks.shear((h, w),
magnitude,
direction,
border_value=fill_val,
interpolation=interpolation)
def _shear_seg(self,
results,
magnitude,
direction='horizontal',
fill_val=255,
interpolation='bilinear'):
"""Shear the segmentation maps."""
for key in results.get('seg_fields', []):
seg = results[key]
results[key] = mmcv.imshear(
seg,
magnitude,
direction,
border_value=fill_val,
interpolation=interpolation).astype(seg.dtype)
def _filter_invalid(self, results, min_bbox_size=0):
"""Filter bboxes and corresponding masks too small after shear
augmentation."""
bbox2label, bbox2mask, _ = bbox2fields()
for key in results.get('bbox_fields', []):
bbox_w = results[key][:, 2] - results[key][:, 0]
bbox_h = results[key][:, 3] - results[key][:, 1]
valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size)
valid_inds = np.nonzero(valid_inds)[0]
results[key] = results[key][valid_inds]
# label fields. e.g. gt_labels and gt_labels_ignore
label_key = bbox2label.get(key)
if label_key in results:
results[label_key] = results[label_key][valid_inds]
# mask fields, e.g. gt_masks and gt_masks_ignore
mask_key = bbox2mask.get(key)
if mask_key in results:
results[mask_key] = results[mask_key][valid_inds]
def __call__(self, results):
"""Call function to shear images, bounding boxes, masks and semantic
segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Sheared results.
"""
if np.random.rand() > self.prob:
return results
magnitude = random_negative(self.magnitude, self.random_negative_prob)
self._shear_img(results, magnitude, self.direction, self.interpolation)
self._shear_bboxes(results, magnitude)
# fill_val set to 0 for background of mask.
self._shear_masks(
results,
magnitude,
self.direction,
fill_val=0,
interpolation=self.interpolation)
self._shear_seg(
results,
magnitude,
self.direction,
fill_val=self.seg_ignore_label,
interpolation=self.interpolation)
self._filter_invalid(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(level={self.level}, '
repr_str += f'img_fill_val={self.img_fill_val}, '
repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
repr_str += f'prob={self.prob}, '
repr_str += f'direction={self.direction}, '
repr_str += f'max_shear_magnitude={self.max_shear_magnitude}, '
repr_str += f'random_negative_prob={self.random_negative_prob}, '
repr_str += f'interpolation={self.interpolation})'
return repr_str
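# --- Hedged sketch of the bbox shear math used in ``_shear_bboxes`` above
# (illustration only, not part of the original file): a horizontal shear
# maps a corner (x, y) to (x + magnitude * y, y).
def _shear_corner_demo():
    magnitude = 0.5
    shear_matrix = np.array([[1, magnitude], [0, 1]], dtype=np.float32)
    corner = np.array([10., 20.], dtype=np.float32)  # (x, y)
    sheared = shear_matrix @ corner
    assert np.allclose(sheared, [10. + magnitude * 20., 20.])
    return sheared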
@PIPELINES.register_module()
class Rotate(object):
"""Apply Rotate Transformation to image (and its corresponding bbox, mask,
segmentation).
Args:
level (int | float): The level should be in range [0,_MAX_LEVEL].
scale (int | float): Isotropic scale factor. Same in
``mmcv.imrotate``.
center (int | float | tuple[float]): Center point (w, h) of the
rotation in the source image. If None, the center of the
image will be used. Same in ``mmcv.imrotate``.
img_fill_val (int | float | tuple): The fill value for image border.
If float, the same value will be used for all the three
channels of image. If tuple, it should have 3 elements (e.g.
equal to the number of channels of the image).
seg_ignore_label (int): The fill value used for segmentation map.
Note this value must equal ``ignore_label`` in ``semantic_head``
of the corresponding config. Default 255.
prob (float): The probability of performing the transformation,
which should be in range [0, 1].
max_rotate_angle (int | float): The maximum angle for the rotate
transformation.
random_negative_prob (float): The probability that turns the
offset negative.
"""
def __init__(self,
level,
scale=1,
center=None,
img_fill_val=128,
seg_ignore_label=255,
prob=0.5,
max_rotate_angle=30,
random_negative_prob=0.5):
assert isinstance(level, (int, float)), \
f'The level must be type int or float. got {type(level)}.'
assert 0 <= level <= _MAX_LEVEL, \
f'The level should be in range [0,{_MAX_LEVEL}]. got {level}.'
assert isinstance(scale, (int, float)), \
f'The scale must be type int or float. got type {type(scale)}.'
if isinstance(center, (int, float)):
center = (center, center)
elif isinstance(center, tuple):
assert len(center) == 2, 'center with type tuple must have '\
f'2 elements. got {len(center)} elements.'
else:
assert center is None, 'center must be None or type int, '\
f'float or tuple, got type {type(center)}.'
if isinstance(img_fill_val, (float, int)):
img_fill_val = tuple([float(img_fill_val)] * 3)
elif isinstance(img_fill_val, tuple):
assert len(img_fill_val) == 3, 'img_fill_val as tuple must '\
f'have 3 elements. got {len(img_fill_val)}.'
img_fill_val = tuple([float(val) for val in img_fill_val])
else:
raise ValueError(
'img_fill_val must be float or tuple with 3 elements.')
assert np.all([0 <= val <= 255 for val in img_fill_val]), \
'all elements of img_fill_val should be in range [0,255]. '\
f'got {img_fill_val}.'
assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\
f'got {prob}.'
assert isinstance(max_rotate_angle, (int, float)), 'max_rotate_angle '\
f'should be type int or float. got type {type(max_rotate_angle)}.'
self.level = level
self.scale = scale
# Rotation angle in degrees. Positive values mean
# clockwise rotation.
self.angle = level_to_value(level, max_rotate_angle)
self.center = center
self.img_fill_val = img_fill_val
self.seg_ignore_label = seg_ignore_label
self.prob = prob
self.max_rotate_angle = max_rotate_angle
self.random_negative_prob = random_negative_prob
def _rotate_img(self, results, angle, center=None, scale=1.0):
"""Rotate the image.
Args:
results (dict): Result dict from loading pipeline.
angle (float): Rotation angle in degrees, positive values
mean clockwise rotation. Same in ``mmcv.imrotate``.
center (tuple[float], optional): Center point (w, h) of the
rotation. Same in ``mmcv.imrotate``.
scale (int | float): Isotropic scale factor. Same in
``mmcv.imrotate``.
"""
for key in results.get('img_fields', ['img']):
img = results[key].copy()
img_rotated = mmcv.imrotate(
img, angle, center, scale, border_value=self.img_fill_val)
results[key] = img_rotated.astype(img.dtype)
def _rotate_bboxes(self, results, rotate_matrix):
"""Rotate the bboxes."""
h, w, c = results['img_shape']
for key in results.get('bbox_fields', []):
min_x, min_y, max_x, max_y = np.split(
results[key], results[key].shape[-1], axis=-1)
coordinates = np.stack([[min_x, min_y], [max_x, min_y],
[min_x, max_y],
[max_x, max_y]]) # [4, 2, nb_bbox, 1]
# pad 1 to convert from format [x, y] to homogeneous
# coordinates format [x, y, 1]
coordinates = np.concatenate(
(coordinates,
np.ones((4, 1, coordinates.shape[2], 1), coordinates.dtype)),
axis=1) # [4, 3, nb_bbox, 1]
coordinates = coordinates.transpose(
(2, 0, 1, 3)) # [nb_bbox, 4, 3, 1]
rotated_coords = np.matmul(rotate_matrix,
coordinates) # [nb_bbox, 4, 2, 1]
rotated_coords = rotated_coords[..., 0] # [nb_bbox, 4, 2]
min_x, min_y = np.min(
rotated_coords[:, :, 0], axis=1), np.min(
rotated_coords[:, :, 1], axis=1)
max_x, max_y = np.max(
rotated_coords[:, :, 0], axis=1), np.max(
rotated_coords[:, :, 1], axis=1)
min_x, min_y = np.clip(
min_x, a_min=0, a_max=w), np.clip(
min_y, a_min=0, a_max=h)
max_x, max_y = np.clip(
max_x, a_min=min_x, a_max=w), np.clip(
max_y, a_min=min_y, a_max=h)
results[key] = np.stack([min_x, min_y, max_x, max_y],
axis=-1).astype(results[key].dtype)
def _rotate_masks(self,
results,
angle,
center=None,
scale=1.0,
fill_val=0):
"""Rotate the masks."""
h, w, c = results['img_shape']
for key in results.get('mask_fields', []):
masks = results[key]
results[key] = masks.rotate((h, w), angle, center, scale, fill_val)
def _rotate_seg(self,
results,
angle,
center=None,
scale=1.0,
fill_val=255):
"""Rotate the segmentation map."""
for key in results.get('seg_fields', []):
seg = results[key].copy()
results[key] = mmcv.imrotate(
seg, angle, center, scale,
border_value=fill_val).astype(seg.dtype)
def _filter_invalid(self, results, min_bbox_size=0):
"""Filter bboxes and corresponding masks too small after rotate
augmentation."""
bbox2label, bbox2mask, _ = bbox2fields()
for key in results.get('bbox_fields', []):
bbox_w = results[key][:, 2] - results[key][:, 0]
bbox_h = results[key][:, 3] - results[key][:, 1]
valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size)
valid_inds = np.nonzero(valid_inds)[0]
results[key] = results[key][valid_inds]
# label fields. e.g. gt_labels and gt_labels_ignore
label_key = bbox2label.get(key)
if label_key in results:
results[label_key] = results[label_key][valid_inds]
# mask fields, e.g. gt_masks and gt_masks_ignore
mask_key = bbox2mask.get(key)
if mask_key in results:
results[mask_key] = results[mask_key][valid_inds]
def __call__(self, results):
"""Call function to rotate images, bounding boxes, masks and semantic
segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Rotated results.
"""
if np.random.rand() > self.prob:
return results
h, w = results['img'].shape[:2]
center = self.center
if center is None:
center = ((w - 1) * 0.5, (h - 1) * 0.5)
angle = random_negative(self.angle, self.random_negative_prob)
self._rotate_img(results, angle, center, self.scale)
rotate_matrix = cv2.getRotationMatrix2D(center, -angle, self.scale)
self._rotate_bboxes(results, rotate_matrix)
self._rotate_masks(results, angle, center, self.scale, fill_val=0)
self._rotate_seg(
results, angle, center, self.scale, fill_val=self.seg_ignore_label)
self._filter_invalid(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(level={self.level}, '
repr_str += f'scale={self.scale}, '
repr_str += f'center={self.center}, '
repr_str += f'img_fill_val={self.img_fill_val}, '
repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
repr_str += f'prob={self.prob}, '
repr_str += f'max_rotate_angle={self.max_rotate_angle}, '
repr_str += f'random_negative_prob={self.random_negative_prob})'
return repr_str
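# Editor's sketch (not part of the original file): an illustrative config for
# the ``Rotate`` transform above. The numeric values are assumptions chosen
# only to show the expected argument types; like any registered pipeline step
# it would be built via ``build_from_cfg(cfg, PIPELINES)``.
def _example_rotate_cfg():
    """Return an illustrative config dict for ``Rotate``."""
    return dict(
        type='Rotate',
        level=5,               # in [0, _MAX_LEVEL]
        prob=0.5,              # rotate half of the samples
        img_fill_val=128,      # gray fill for the exposed image border
        seg_ignore_label=255,  # must equal ignore_label in semantic_head
        max_rotate_angle=30)   # level is scaled into [0, 30] degrees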
@PIPELINES.register_module()
class Translate(object):
"""Translate the images, bboxes, masks and segmentation maps horizontally
or vertically.
Args:
        level (int | float): The level for Translate, which should be in
            range [0, _MAX_LEVEL].
prob (float): The probability for performing translation and
should be in range [0, 1].
img_fill_val (int | float | tuple): The filled value for image
border. If float, the same fill value will be used for all
            the three channels of the image. If tuple, it should have 3
            elements (i.e., one value per image channel).
seg_ignore_label (int): The fill value used for segmentation map.
            Note this value must equal ``ignore_label`` in ``semantic_head``
of the corresponding config. Default 255.
direction (str): The translate direction, either "horizontal"
or "vertical".
        max_translate_offset (int | float): The maximum pixel offset for
Translate.
random_negative_prob (float): The probability that turns the
offset negative.
        min_size (int | float): The minimum pixel size for filtering
invalid bboxes after the translation.
"""
def __init__(self,
level,
prob=0.5,
img_fill_val=128,
seg_ignore_label=255,
direction='horizontal',
max_translate_offset=250.,
random_negative_prob=0.5,
min_size=0):
assert isinstance(level, (int, float)), \
'The level must be type int or float.'
assert 0 <= level <= _MAX_LEVEL, \
'The level used for calculating Translate\'s offset should be ' \
'in range [0,_MAX_LEVEL]'
assert 0 <= prob <= 1.0, \
'The probability of translation should be in range [0, 1].'
if isinstance(img_fill_val, (float, int)):
img_fill_val = tuple([float(img_fill_val)] * 3)
elif isinstance(img_fill_val, tuple):
assert len(img_fill_val) == 3, \
'img_fill_val as tuple must have 3 elements.'
img_fill_val = tuple([float(val) for val in img_fill_val])
else:
raise ValueError('img_fill_val must be type float or tuple.')
assert np.all([0 <= val <= 255 for val in img_fill_val]), \
            'all elements of img_fill_val should be in range [0, 255].'
assert direction in ('horizontal', 'vertical'), \
'direction should be "horizontal" or "vertical".'
assert isinstance(max_translate_offset, (int, float)), \
'The max_translate_offset must be type int or float.'
# the offset used for translation
self.offset = int(level_to_value(level, max_translate_offset))
self.level = level
self.prob = prob
self.img_fill_val = img_fill_val
self.seg_ignore_label = seg_ignore_label
self.direction = direction
self.max_translate_offset = max_translate_offset
self.random_negative_prob = random_negative_prob
self.min_size = min_size
def _translate_img(self, results, offset, direction='horizontal'):
"""Translate the image.
Args:
results (dict): Result dict from loading pipeline.
offset (int | float): The offset for translate.
direction (str): The translate direction, either "horizontal"
or "vertical".
"""
for key in results.get('img_fields', ['img']):
img = results[key].copy()
results[key] = mmcv.imtranslate(
img, offset, direction, self.img_fill_val).astype(img.dtype)
def _translate_bboxes(self, results, offset):
"""Shift bboxes horizontally or vertically, according to offset."""
h, w, c = results['img_shape']
for key in results.get('bbox_fields', []):
min_x, min_y, max_x, max_y = np.split(
results[key], results[key].shape[-1], axis=-1)
if self.direction == 'horizontal':
min_x = np.maximum(0, min_x + offset)
max_x = np.minimum(w, max_x + offset)
elif self.direction == 'vertical':
min_y = np.maximum(0, min_y + offset)
max_y = np.minimum(h, max_y + offset)
            # bboxes translated outside the image will be filtered out, along
            # with the corresponding masks, by invoking ``_filter_invalid``.
results[key] = np.concatenate([min_x, min_y, max_x, max_y],
axis=-1)
def _translate_masks(self,
results,
offset,
direction='horizontal',
fill_val=0):
"""Translate masks horizontally or vertically."""
h, w, c = results['img_shape']
for key in results.get('mask_fields', []):
masks = results[key]
results[key] = masks.translate((h, w), offset, direction, fill_val)
def _translate_seg(self,
results,
offset,
direction='horizontal',
fill_val=255):
"""Translate segmentation maps horizontally or vertically."""
for key in results.get('seg_fields', []):
seg = results[key].copy()
results[key] = mmcv.imtranslate(seg, offset, direction,
fill_val).astype(seg.dtype)
def _filter_invalid(self, results, min_size=0):
"""Filter bboxes and masks too small or translated out of image."""
bbox2label, bbox2mask, _ = bbox2fields()
for key in results.get('bbox_fields', []):
bbox_w = results[key][:, 2] - results[key][:, 0]
bbox_h = results[key][:, 3] - results[key][:, 1]
valid_inds = (bbox_w > min_size) & (bbox_h > min_size)
valid_inds = np.nonzero(valid_inds)[0]
results[key] = results[key][valid_inds]
# label fields. e.g. gt_labels and gt_labels_ignore
label_key = bbox2label.get(key)
if label_key in results:
results[label_key] = results[label_key][valid_inds]
# mask fields, e.g. gt_masks and gt_masks_ignore
mask_key = bbox2mask.get(key)
if mask_key in results:
results[mask_key] = results[mask_key][valid_inds]
return results
def __call__(self, results):
"""Call function to translate images, bounding boxes, masks and
semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Translated results.
"""
if np.random.rand() > self.prob:
return results
offset = random_negative(self.offset, self.random_negative_prob)
self._translate_img(results, offset, self.direction)
self._translate_bboxes(results, offset)
        # fill_val defaults to 0 for BitmapMasks and None for PolygonMasks.
self._translate_masks(results, offset, self.direction)
# fill_val set to ``seg_ignore_label`` for the ignored value
# of segmentation map.
self._translate_seg(
results, offset, self.direction, fill_val=self.seg_ignore_label)
self._filter_invalid(results, min_size=self.min_size)
return results
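# Editor's sketch (not part of the original file): an illustrative config for
# the ``Translate`` transform above; the values are assumptions for
# demonstration only.
def _example_translate_cfg():
    """Return an illustrative config dict for ``Translate``."""
    return dict(
        type='Translate',
        level=5,                    # scaled to an integer pixel offset
        prob=0.5,
        direction='horizontal',     # or 'vertical'
        max_translate_offset=250.,  # level is scaled into [0, 250] pixels
        random_negative_prob=0.5)   # half of the offsets turn negative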
@PIPELINES.register_module()
class ColorTransform(object):
"""Apply Color transformation to image. The bboxes, masks, and
segmentations are not modified.
Args:
level (int | float): Should be in range [0,_MAX_LEVEL].
prob (float): The probability for performing Color transformation.
"""
def __init__(self, level, prob=0.5):
assert isinstance(level, (int, float)), \
'The level must be type int or float.'
assert 0 <= level <= _MAX_LEVEL, \
'The level should be in range [0,_MAX_LEVEL].'
assert 0 <= prob <= 1.0, \
'The probability should be in range [0,1].'
self.level = level
self.prob = prob
self.factor = enhance_level_to_value(level)
def _adjust_color_img(self, results, factor=1.0):
"""Apply Color transformation to image."""
for key in results.get('img_fields', ['img']):
            # NOTE: by default the image is expected to be in BGR format
img = results[key]
results[key] = mmcv.adjust_color(img, factor).astype(img.dtype)
def __call__(self, results):
"""Call function for Color transformation.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Colored results.
"""
if np.random.rand() > self.prob:
return results
self._adjust_color_img(results, self.factor)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(level={self.level}, '
repr_str += f'prob={self.prob})'
return repr_str
@PIPELINES.register_module()
class EqualizeTransform(object):
"""Apply Equalize transformation to image. The bboxes, masks and
segmentations are not modified.
Args:
prob (float): The probability for performing Equalize transformation.
"""
def __init__(self, prob=0.5):
assert 0 <= prob <= 1.0, \
'The probability should be in range [0,1].'
self.prob = prob
def _imequalize(self, results):
"""Equalizes the histogram of one image."""
for key in results.get('img_fields', ['img']):
img = results[key]
results[key] = mmcv.imequalize(img).astype(img.dtype)
def __call__(self, results):
"""Call function for Equalize transformation.
Args:
results (dict): Results dict from loading pipeline.
Returns:
dict: Results after the transformation.
"""
if np.random.rand() > self.prob:
return results
self._imequalize(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
        repr_str += f'(prob={self.prob})'
        return repr_str
@PIPELINES.register_module()
class BrightnessTransform(object):
"""Apply Brightness transformation to image. The bboxes, masks and
segmentations are not modified.
Args:
level (int | float): Should be in range [0,_MAX_LEVEL].
prob (float): The probability for performing Brightness transformation.
"""
def __init__(self, level, prob=0.5):
assert isinstance(level, (int, float)), \
'The level must be type int or float.'
assert 0 <= level <= _MAX_LEVEL, \
'The level should be in range [0,_MAX_LEVEL].'
assert 0 <= prob <= 1.0, \
'The probability should be in range [0,1].'
self.level = level
self.prob = prob
self.factor = enhance_level_to_value(level)
def _adjust_brightness_img(self, results, factor=1.0):
"""Adjust the brightness of image."""
for key in results.get('img_fields', ['img']):
img = results[key]
results[key] = mmcv.adjust_brightness(img,
factor).astype(img.dtype)
def __call__(self, results):
"""Call function for Brightness transformation.
Args:
results (dict): Results dict from loading pipeline.
Returns:
dict: Results after the transformation.
"""
if np.random.rand() > self.prob:
return results
self._adjust_brightness_img(results, self.factor)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(level={self.level}, '
repr_str += f'prob={self.prob})'
return repr_str
@PIPELINES.register_module()
class ContrastTransform(object):
"""Apply Contrast transformation to image. The bboxes, masks and
segmentations are not modified.
Args:
level (int | float): Should be in range [0,_MAX_LEVEL].
prob (float): The probability for performing Contrast transformation.
"""
def __init__(self, level, prob=0.5):
assert isinstance(level, (int, float)), \
'The level must be type int or float.'
assert 0 <= level <= _MAX_LEVEL, \
'The level should be in range [0,_MAX_LEVEL].'
assert 0 <= prob <= 1.0, \
'The probability should be in range [0,1].'
self.level = level
self.prob = prob
self.factor = enhance_level_to_value(level)
def _adjust_contrast_img(self, results, factor=1.0):
"""Adjust the image contrast."""
for key in results.get('img_fields', ['img']):
img = results[key]
results[key] = mmcv.adjust_contrast(img, factor).astype(img.dtype)
def __call__(self, results):
"""Call function for Contrast transformation.
Args:
results (dict): Results dict from loading pipeline.
Returns:
dict: Results after the transformation.
"""
if np.random.rand() > self.prob:
return results
self._adjust_contrast_img(results, self.factor)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(level={self.level}, '
repr_str += f'prob={self.prob})'
return repr_str
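# Editor's sketch (not part of the original file): the four photometric
# transforms above share the same (level, prob) interface, so they can be
# chained as one sub-policy; the values below are assumptions for
# illustration only.
def _example_color_policy():
    """Return an illustrative photometric sub-policy."""
    return [
        dict(type='ColorTransform', level=6, prob=0.5),
        dict(type='EqualizeTransform', prob=0.4),
        dict(type='BrightnessTransform', level=5, prob=0.3),
        dict(type='ContrastTransform', level=5, prob=0.3),
    ]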
import collections
from mmcv.utils import build_from_cfg
from ..builder import PIPELINES
@PIPELINES.register_module()
class Compose(object):
"""Compose multiple transforms sequentially.
Args:
        transforms (Sequence[dict | callable]): Sequence of transform objects
            or config dicts to be composed.
"""
def __init__(self, transforms):
assert isinstance(transforms, collections.abc.Sequence)
self.transforms = []
for transform in transforms:
if isinstance(transform, dict):
transform = build_from_cfg(transform, PIPELINES)
self.transforms.append(transform)
elif callable(transform):
self.transforms.append(transform)
else:
raise TypeError('transform must be callable or a dict')
def __call__(self, data):
"""Call function to apply transforms sequentially.
Args:
data (dict): A result dict contains the data to transform.
Returns:
dict: Transformed data.
"""
for t in self.transforms:
data = t(data)
if data is None:
return None
return data
def __repr__(self):
format_string = self.__class__.__name__ + '('
for t in self.transforms:
format_string += '\n'
format_string += f' {t}'
format_string += '\n)'
return format_string
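# Editor's sketch (not part of the original file): ``Compose`` accepts both
# registered config dicts and plain callables, so an ad-hoc step can be mixed
# into a pipeline; the identity lambda below is an assumption for
# illustration.
def _example_compose():
    """Build a tiny pipeline mixing a config dict and a callable."""
    return Compose([
        dict(type='LoadImageFromFile'),  # built through build_from_cfg
        lambda results: results,         # any callable(dict) -> dict works
    ])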
from collections.abc import Sequence
import mmcv
import numpy as np
import torch
from mmcv.parallel import DataContainer as DC
from ..builder import PIPELINES
def to_tensor(data):
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
"""
if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not mmcv.is_str(data):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f'type {type(data)} cannot be converted to tensor.')
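# Editor's sketch (not part of the original file): the conversion performed
# by ``to_tensor`` for each supported input type, written as assertions.
def _example_to_tensor():
    """Check the tensor produced for each supported input type."""
    assert to_tensor(np.zeros((2, 2))).shape == (2, 2)  # ndarray -> Tensor
    assert to_tensor([1, 2, 3]).shape == (3,)           # Sequence -> Tensor
    assert to_tensor(1).dtype == torch.int64            # int -> LongTensor
    assert to_tensor(1.0).dtype == torch.float32        # float -> FloatTensor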
@PIPELINES.register_module()
class ToTensor(object):
"""Convert some results to :obj:`torch.Tensor` by given keys.
Args:
keys (Sequence[str]): Keys that need to be converted to Tensor.
"""
def __init__(self, keys):
self.keys = keys
def __call__(self, results):
"""Call function to convert data in results to :obj:`torch.Tensor`.
Args:
results (dict): Result dict contains the data to convert.
Returns:
dict: The result dict contains the data converted
to :obj:`torch.Tensor`.
"""
for key in self.keys:
results[key] = to_tensor(results[key])
return results
def __repr__(self):
return self.__class__.__name__ + f'(keys={self.keys})'
@PIPELINES.register_module()
class ImageToTensor(object):
"""Convert image to :obj:`torch.Tensor` by given keys.
The dimension order of input image is (H, W, C). The pipeline will convert
    it to (C, H, W). If only 2 dimensions (H, W) are given, the output will
    be (1, H, W).
Args:
keys (Sequence[str]): Key of images to be converted to Tensor.
"""
def __init__(self, keys):
self.keys = keys
def __call__(self, results):
"""Call function to convert image in results to :obj:`torch.Tensor` and
transpose the channel order.
Args:
results (dict): Result dict contains the image data to convert.
Returns:
dict: The result dict contains the image converted
to :obj:`torch.Tensor` and transposed to (C, H, W) order.
"""
for key in self.keys:
img = results[key]
if len(img.shape) < 3:
img = np.expand_dims(img, -1)
results[key] = to_tensor(img.transpose(2, 0, 1))
return results
def __repr__(self):
return self.__class__.__name__ + f'(keys={self.keys})'
@PIPELINES.register_module()
class Transpose(object):
"""Transpose some results by given keys.
Args:
keys (Sequence[str]): Keys of results to be transposed.
order (Sequence[int]): Order of transpose.
"""
def __init__(self, keys, order):
self.keys = keys
self.order = order
def __call__(self, results):
"""Call function to transpose the channel order of data in results.
Args:
results (dict): Result dict contains the data to transpose.
Returns:
dict: The result dict contains the data transposed to \
``self.order``.
"""
for key in self.keys:
results[key] = results[key].transpose(self.order)
return results
def __repr__(self):
return self.__class__.__name__ + \
f'(keys={self.keys}, order={self.order})'
@PIPELINES.register_module()
class ToDataContainer(object):
"""Convert results to :obj:`mmcv.DataContainer` by given fields.
Args:
fields (Sequence[dict]): Each field is a dict like
``dict(key='xxx', **kwargs)``. The ``key`` in result will
be converted to :obj:`mmcv.DataContainer` with ``**kwargs``.
Default: ``(dict(key='img', stack=True), dict(key='gt_bboxes'),
dict(key='gt_labels'))``.
"""
def __init__(self,
fields=(dict(key='img', stack=True), dict(key='gt_bboxes'),
dict(key='gt_labels'))):
self.fields = fields
def __call__(self, results):
"""Call function to convert data in results to
:obj:`mmcv.DataContainer`.
Args:
results (dict): Result dict contains the data to convert.
Returns:
dict: The result dict contains the data converted to \
:obj:`mmcv.DataContainer`.
"""
for field in self.fields:
field = field.copy()
key = field.pop('key')
results[key] = DC(results[key], **field)
return results
def __repr__(self):
return self.__class__.__name__ + f'(fields={self.fields})'
@PIPELINES.register_module()
class DefaultFormatBundle(object):
"""Default formatting bundle.
It simplifies the pipeline of formatting common fields, including "img",
"proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg".
These fields are formatted as follows.
- img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
- proposals: (1)to tensor, (2)to DataContainer
- gt_bboxes: (1)to tensor, (2)to DataContainer
- gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
- gt_labels: (1)to tensor, (2)to DataContainer
- gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True)
- gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \
(3)to DataContainer (stack=True)
"""
def __call__(self, results):
"""Call function to transform and format common fields in results.
Args:
results (dict): Result dict contains the data to convert.
Returns:
dict: The result dict contains the data that is formatted with \
default bundle.
"""
if 'img' in results:
img = results['img']
# add default meta keys
results = self._add_default_meta_keys(results)
if len(img.shape) < 3:
img = np.expand_dims(img, -1)
img = np.ascontiguousarray(img.transpose(2, 0, 1))
results['img'] = DC(to_tensor(img), stack=True)
for key in ['proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_labels']:
if key not in results:
continue
results[key] = DC(to_tensor(results[key]))
if 'gt_masks' in results:
results['gt_masks'] = DC(results['gt_masks'], cpu_only=True)
if 'gt_semantic_seg' in results:
results['gt_semantic_seg'] = DC(
to_tensor(results['gt_semantic_seg'][None, ...]), stack=True)
return results
def _add_default_meta_keys(self, results):
"""Add default meta keys.
We set default meta keys including `pad_shape`, `scale_factor` and
        `img_norm_cfg` to cover the case where `Resize`, `Normalize` and
        `Pad` are not applied anywhere in the pipeline.
Args:
results (dict): Result dict contains the data to convert.
Returns:
results (dict): Updated result dict contains the data to convert.
"""
img = results['img']
results.setdefault('pad_shape', img.shape)
results.setdefault('scale_factor', 1.0)
num_channels = 1 if len(img.shape) < 3 else img.shape[2]
results.setdefault(
'img_norm_cfg',
dict(
mean=np.zeros(num_channels, dtype=np.float32),
std=np.ones(num_channels, dtype=np.float32),
to_rgb=False))
return results
def __repr__(self):
return self.__class__.__name__
@PIPELINES.register_module()
class Collect(object):
"""Collect data from the loader relevant to the specific task.
This is usually the last stage of the data loader pipeline. Typically keys
is set to some subset of "img", "proposals", "gt_bboxes",
"gt_bboxes_ignore", "gt_labels", and/or "gt_masks".
The "img_meta" item is always populated. The contents of the "img_meta"
    dictionary depend on "meta_keys". By default this includes:
- "img_shape": shape of the image input to the network as a tuple \
(h, w, c). Note that images may be zero padded on the \
bottom/right if the batch tensor is larger than this shape.
- "scale_factor": a float indicating the preprocessing scale
- "flip": a boolean indicating if image flip transform was used
- "filename": path to the image file
- "ori_shape": original shape of the image as a tuple (h, w, c)
- "pad_shape": image shape after padding
- "img_norm_cfg": a dict of normalization information:
- mean - per channel mean subtraction
- std - per channel std divisor
- to_rgb - bool indicating if bgr was converted to rgb
Args:
keys (Sequence[str]): Keys of results to be collected in ``data``.
meta_keys (Sequence[str], optional): Meta keys to be converted to
``mmcv.DataContainer`` and collected in ``data[img_metas]``.
Default: ``('filename', 'ori_filename', 'ori_shape', 'img_shape',
'pad_shape', 'scale_factor', 'flip', 'flip_direction',
'img_norm_cfg')``
"""
def __init__(self,
keys,
meta_keys=('filename', 'ori_filename', 'ori_shape',
'img_shape', 'pad_shape', 'scale_factor', 'flip',
'flip_direction', 'img_norm_cfg')):
self.keys = keys
self.meta_keys = meta_keys
def __call__(self, results):
"""Call function to collect keys in results. The keys in ``meta_keys``
will be converted to :obj:mmcv.DataContainer.
Args:
results (dict): Result dict contains the data to collect.
Returns:
dict: The result dict contains the following keys
            - keys in ``self.keys``
- ``img_metas``
"""
data = {}
img_meta = {}
for key in self.meta_keys:
img_meta[key] = results[key]
data['img_metas'] = DC(img_meta, cpu_only=True)
for key in self.keys:
data[key] = results[key]
return data
def __repr__(self):
return self.__class__.__name__ + \
f'(keys={self.keys}, meta_keys={self.meta_keys})'
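# Editor's sketch (not part of the original file): ``DefaultFormatBundle``
# and ``Collect`` usually close a training pipeline after the loading and
# augmentation steps; the keys below are the usual detection fields.
def _example_train_pipeline_tail():
    """Return the formatting tail of a typical training pipeline."""
    return [
        dict(type='DefaultFormatBundle'),
        dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels']),
    ]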
@PIPELINES.register_module()
class WrapFieldsToLists(object):
"""Wrap fields of the data dictionary into lists for evaluation.
This class can be used as a last step of a test or validation
pipeline for single image evaluation or inference.
Example:
>>> test_pipeline = [
>>> dict(type='LoadImageFromFile'),
>>> dict(type='Normalize',
mean=[123.675, 116.28, 103.53],
std=[58.395, 57.12, 57.375],
to_rgb=True),
>>> dict(type='Pad', size_divisor=32),
>>> dict(type='ImageToTensor', keys=['img']),
>>> dict(type='Collect', keys=['img']),
>>> dict(type='WrapFieldsToLists')
>>> ]
"""
def __call__(self, results):
"""Call function to wrap fields into lists.
Args:
results (dict): Result dict contains the data to wrap.
Returns:
            dict: The result dict where every value is wrapped into a list.
"""
# Wrap dict fields into lists
for key, val in results.items():
results[key] = [val]
return results
def __repr__(self):
return f'{self.__class__.__name__}()'
import numpy as np
from ..builder import PIPELINES
@PIPELINES.register_module()
class InstaBoost(object):
r"""Data augmentation method in `InstaBoost: Boosting Instance
Segmentation Via Probability Map Guided Copy-Pasting
<https://arxiv.org/abs/1908.07801>`_.
Refer to https://github.com/GothicAi/Instaboost for implementation details.
"""
def __init__(self,
action_candidate=('normal', 'horizontal', 'skip'),
action_prob=(1, 0, 0),
scale=(0.8, 1.2),
dx=15,
dy=15,
theta=(-1, 1),
color_prob=0.5,
hflag=False,
aug_ratio=0.5):
try:
import instaboostfast as instaboost
except ImportError:
raise ImportError(
'Please run "pip install instaboostfast" '
'to install instaboostfast first for instaboost augmentation.')
self.cfg = instaboost.InstaBoostConfig(action_candidate, action_prob,
scale, dx, dy, theta,
color_prob, hflag)
self.aug_ratio = aug_ratio
def _load_anns(self, results):
labels = results['ann_info']['labels']
masks = results['ann_info']['masks']
bboxes = results['ann_info']['bboxes']
n = len(labels)
anns = []
for i in range(n):
label = labels[i]
bbox = bboxes[i]
mask = masks[i]
x1, y1, x2, y2 = bbox
# assert (x2 - x1) >= 1 and (y2 - y1) >= 1
bbox = [x1, y1, x2 - x1, y2 - y1]
anns.append({
'category_id': label,
'segmentation': mask,
'bbox': bbox
})
return anns
def _parse_anns(self, results, anns, img):
gt_bboxes = []
gt_labels = []
gt_masks_ann = []
for ann in anns:
x1, y1, w, h = ann['bbox']
# TODO: more essential bug need to be fixed in instaboost
if w <= 0 or h <= 0:
continue
bbox = [x1, y1, x1 + w, y1 + h]
gt_bboxes.append(bbox)
gt_labels.append(ann['category_id'])
gt_masks_ann.append(ann['segmentation'])
gt_bboxes = np.array(gt_bboxes, dtype=np.float32)
gt_labels = np.array(gt_labels, dtype=np.int64)
results['ann_info']['labels'] = gt_labels
results['ann_info']['bboxes'] = gt_bboxes
results['ann_info']['masks'] = gt_masks_ann
results['img'] = img
return results
def __call__(self, results):
img = results['img']
orig_type = img.dtype
anns = self._load_anns(results)
if np.random.choice([0, 1], p=[1 - self.aug_ratio, self.aug_ratio]):
try:
import instaboostfast as instaboost
except ImportError:
raise ImportError('Please run "pip install instaboostfast" '
'to install instaboostfast first.')
anns, img = instaboost.get_new_data(
anns, img.astype(np.uint8), self.cfg, background=None)
results = self._parse_anns(results, anns, img.astype(orig_type))
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(cfg={self.cfg}, aug_ratio={self.aug_ratio})'
return repr_str
import os.path as osp
import mmcv
import numpy as np
import pycocotools.mask as maskUtils
from mmdet.core import BitmapMasks, PolygonMasks
from ..builder import PIPELINES
@PIPELINES.register_module()
class LoadImageFromFile(object):
"""Load an image from file.
Required keys are "img_prefix" and "img_info" (a dict that must contain the
key "filename"). Added or updated keys are "filename", "img", "img_shape",
"ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`),
"scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1).
Args:
to_float32 (bool): Whether to convert the loaded image to a float32
            numpy array. If set to False, the loaded image is a uint8 array.
Defaults to False.
color_type (str): The flag argument for :func:`mmcv.imfrombytes`.
Defaults to 'color'.
file_client_args (dict): Arguments to instantiate a FileClient.
See :class:`mmcv.fileio.FileClient` for details.
Defaults to ``dict(backend='disk')``.
"""
def __init__(self,
to_float32=False,
color_type='color',
file_client_args=dict(backend='disk')):
self.to_float32 = to_float32
self.color_type = color_type
self.file_client_args = file_client_args.copy()
self.file_client = None
def __call__(self, results):
"""Call functions to load image and get image meta information.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded image and meta information.
"""
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
if results['img_prefix'] is not None:
filename = osp.join(results['img_prefix'],
results['img_info']['filename'])
else:
filename = results['img_info']['filename']
img_bytes = self.file_client.get(filename)
img = mmcv.imfrombytes(img_bytes, flag=self.color_type)
if self.to_float32:
img = img.astype(np.float32)
results['filename'] = filename
results['ori_filename'] = results['img_info']['filename']
results['img'] = img
results['img_shape'] = img.shape
results['ori_shape'] = img.shape
results['img_fields'] = ['img']
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'to_float32={self.to_float32}, '
f"color_type='{self.color_type}', "
f'file_client_args={self.file_client_args})')
return repr_str
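# Editor's sketch (not part of the original file): the minimal ``results``
# dict that ``LoadImageFromFile`` expects; the prefix and filename below are
# hypothetical.
def _example_load_image():
    """Load one image through ``LoadImageFromFile``."""
    loader = LoadImageFromFile(to_float32=True)
    results = dict(
        img_prefix='data/coco/train2017',  # hypothetical image directory
        img_info=dict(filename='000000000009.jpg'))  # hypothetical file
    return loader(results)  # adds 'img', 'img_shape', 'ori_shape', ...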
@PIPELINES.register_module()
class LoadImageFromWebcam(LoadImageFromFile):
"""Load an image from webcam.
    Similar to :obj:`LoadImageFromFile`, but the image read from the webcam
    is in ``results['img']``.
"""
def __call__(self, results):
"""Call functions to add image meta information.
Args:
results (dict): Result dict with Webcam read image in
``results['img']``.
Returns:
dict: The dict contains loaded image and meta information.
"""
img = results['img']
if self.to_float32:
img = img.astype(np.float32)
results['filename'] = None
results['ori_filename'] = None
results['img'] = img
results['img_shape'] = img.shape
results['ori_shape'] = img.shape
results['img_fields'] = ['img']
return results
@PIPELINES.register_module()
class LoadMultiChannelImageFromFiles(object):
"""Load multi-channel images from a list of separate channel files.
Required keys are "img_prefix" and "img_info" (a dict that must contain the
key "filename", which is expected to be a list of filenames).
Added or updated keys are "filename", "img", "img_shape",
"ori_shape" (same as `img_shape`), "pad_shape" (same as `img_shape`),
"scale_factor" (1.0) and "img_norm_cfg" (means=0 and stds=1).
Args:
to_float32 (bool): Whether to convert the loaded image to a float32
            numpy array. If set to False, the loaded image is a uint8 array.
Defaults to False.
color_type (str): The flag argument for :func:`mmcv.imfrombytes`.
Defaults to 'color'.
file_client_args (dict): Arguments to instantiate a FileClient.
See :class:`mmcv.fileio.FileClient` for details.
Defaults to ``dict(backend='disk')``.
"""
def __init__(self,
to_float32=False,
color_type='unchanged',
file_client_args=dict(backend='disk')):
self.to_float32 = to_float32
self.color_type = color_type
self.file_client_args = file_client_args.copy()
self.file_client = None
def __call__(self, results):
"""Call functions to load multiple images and get images meta
information.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded images and meta information.
"""
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
if results['img_prefix'] is not None:
filename = [
osp.join(results['img_prefix'], fname)
for fname in results['img_info']['filename']
]
else:
filename = results['img_info']['filename']
img = []
for name in filename:
img_bytes = self.file_client.get(name)
img.append(mmcv.imfrombytes(img_bytes, flag=self.color_type))
img = np.stack(img, axis=-1)
if self.to_float32:
img = img.astype(np.float32)
results['filename'] = filename
results['ori_filename'] = results['img_info']['filename']
results['img'] = img
results['img_shape'] = img.shape
results['ori_shape'] = img.shape
# Set initial values for default meta_keys
results['pad_shape'] = img.shape
results['scale_factor'] = 1.0
num_channels = 1 if len(img.shape) < 3 else img.shape[2]
results['img_norm_cfg'] = dict(
mean=np.zeros(num_channels, dtype=np.float32),
std=np.ones(num_channels, dtype=np.float32),
to_rgb=False)
return results
def __repr__(self):
repr_str = (f'{self.__class__.__name__}('
f'to_float32={self.to_float32}, '
f"color_type='{self.color_type}', "
f'file_client_args={self.file_client_args})')
return repr_str
@PIPELINES.register_module()
class LoadAnnotations(object):
"""Load mutiple types of annotations.
Args:
with_bbox (bool): Whether to parse and load the bbox annotation.
Default: True.
with_label (bool): Whether to parse and load the label annotation.
Default: True.
with_mask (bool): Whether to parse and load the mask annotation.
Default: False.
with_seg (bool): Whether to parse and load the semantic segmentation
annotation. Default: False.
poly2mask (bool): Whether to convert the instance masks from polygons
to bitmaps. Default: True.
file_client_args (dict): Arguments to instantiate a FileClient.
See :class:`mmcv.fileio.FileClient` for details.
Defaults to ``dict(backend='disk')``.
"""
def __init__(self,
with_bbox=True,
with_label=True,
with_mask=False,
with_seg=False,
poly2mask=True,
file_client_args=dict(backend='disk')):
self.with_bbox = with_bbox
self.with_label = with_label
self.with_mask = with_mask
self.with_seg = with_seg
self.poly2mask = poly2mask
self.file_client_args = file_client_args.copy()
self.file_client = None
def _load_bboxes(self, results):
"""Private function to load bounding box annotations.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
ann_info = results['ann_info']
results['gt_bboxes'] = ann_info['bboxes'].copy()
gt_bboxes_ignore = ann_info.get('bboxes_ignore', None)
if gt_bboxes_ignore is not None:
results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy()
results['bbox_fields'].append('gt_bboxes_ignore')
results['bbox_fields'].append('gt_bboxes')
return results
def _load_labels(self, results):
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded label annotations.
"""
results['gt_labels'] = results['ann_info']['labels'].copy()
return results
def _poly2mask(self, mask_ann, img_h, img_w):
"""Private function to convert masks represented with polygon to
bitmaps.
Args:
mask_ann (list | dict): Polygon mask annotation input.
img_h (int): The height of output mask.
img_w (int): The width of output mask.
Returns:
            numpy.ndarray: The decoded bitmap mask of shape (img_h, img_w).
"""
if isinstance(mask_ann, list):
# polygon -- a single object might consist of multiple parts
# we merge all parts into one mask rle code
rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
rle = maskUtils.merge(rles)
elif isinstance(mask_ann['counts'], list):
# uncompressed RLE
rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
else:
# rle
rle = mask_ann
mask = maskUtils.decode(rle)
return mask
def process_polygons(self, polygons):
"""Convert polygons to list of ndarray and filter invalid polygons.
Args:
polygons (list[list]): Polygons of one instance.
Returns:
list[numpy.ndarray]: Processed polygons.
"""
polygons = [np.array(p) for p in polygons]
valid_polygons = []
for polygon in polygons:
if len(polygon) % 2 == 0 and len(polygon) >= 6:
valid_polygons.append(polygon)
return valid_polygons
def _load_masks(self, results):
"""Private function to load mask annotations.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded mask annotations.
                If ``self.poly2mask`` is set ``True``, `gt_mask` will contain
                :obj:`BitmapMasks`. Otherwise, :obj:`PolygonMasks` is used.
"""
h, w = results['img_info']['height'], results['img_info']['width']
gt_masks = results['ann_info']['masks']
if self.poly2mask:
gt_masks = BitmapMasks(
[self._poly2mask(mask, h, w) for mask in gt_masks], h, w)
else:
gt_masks = PolygonMasks(
[self.process_polygons(polygons) for polygons in gt_masks], h,
w)
results['gt_masks'] = gt_masks
results['mask_fields'].append('gt_masks')
return results
def _load_semantic_seg(self, results):
"""Private function to load semantic segmentation annotations.
Args:
results (dict): Result dict from :obj:`dataset`.
Returns:
dict: The dict contains loaded semantic segmentation annotations.
"""
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
filename = osp.join(results['seg_prefix'],
results['ann_info']['seg_map'])
img_bytes = self.file_client.get(filename)
results['gt_semantic_seg'] = mmcv.imfrombytes(
img_bytes, flag='unchanged').squeeze()
results['seg_fields'].append('gt_semantic_seg')
return results
def __call__(self, results):
"""Call function to load multiple types annotations.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded bounding box, label, mask and
semantic segmentation annotations.
"""
if self.with_bbox:
results = self._load_bboxes(results)
if results is None:
return None
if self.with_label:
results = self._load_labels(results)
if self.with_mask:
results = self._load_masks(results)
if self.with_seg:
results = self._load_semantic_seg(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(with_bbox={self.with_bbox}, '
repr_str += f'with_label={self.with_label}, '
repr_str += f'with_mask={self.with_mask}, '
repr_str += f'with_seg={self.with_seg}, '
repr_str += f'poly2mask={self.poly2mask}, '
        repr_str += f'file_client_args={self.file_client_args})'
return repr_str
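# Editor's sketch (not part of the original file): illustrative
# ``LoadAnnotations`` configs for box-only and instance-segmentation
# training.
def _example_load_annotations_cfgs():
    """Return illustrative LoadAnnotations configs."""
    det_cfg = dict(type='LoadAnnotations', with_bbox=True)
    segm_cfg = dict(
        type='LoadAnnotations',
        with_bbox=True,
        with_mask=True,  # also decode instance masks
        poly2mask=True)  # store them as BitmapMasks
    return det_cfg, segm_cfg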
@PIPELINES.register_module()
class LoadProposals(object):
"""Load proposal pipeline.
Required key is "proposals". Updated keys are "proposals", "bbox_fields".
Args:
num_max_proposals (int, optional): Maximum number of proposals to load.
If not specified, all proposals will be loaded.
"""
def __init__(self, num_max_proposals=None):
self.num_max_proposals = num_max_proposals
def __call__(self, results):
"""Call function to load proposals from file.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded proposal annotations.
"""
proposals = results['proposals']
if proposals.shape[1] not in (4, 5):
raise AssertionError(
'proposals should have shapes (n, 4) or (n, 5), '
f'but found {proposals.shape}')
proposals = proposals[:, :4]
if self.num_max_proposals is not None:
proposals = proposals[:self.num_max_proposals]
if len(proposals) == 0:
proposals = np.array([[0, 0, 0, 0]], dtype=np.float32)
results['proposals'] = proposals
results['bbox_fields'].append('proposals')
return results
def __repr__(self):
return self.__class__.__name__ + \
f'(num_max_proposals={self.num_max_proposals})'
@PIPELINES.register_module()
class FilterAnnotations(object):
"""Filter invalid annotations.
Args:
min_gt_bbox_wh (tuple[int]): Minimum width and height of ground truth
boxes.
"""
def __init__(self, min_gt_bbox_wh):
# TODO: add more filter options
self.min_gt_bbox_wh = min_gt_bbox_wh
def __call__(self, results):
assert 'gt_bboxes' in results
gt_bboxes = results['gt_bboxes']
w = gt_bboxes[:, 2] - gt_bboxes[:, 0]
h = gt_bboxes[:, 3] - gt_bboxes[:, 1]
keep = (w > self.min_gt_bbox_wh[0]) & (h > self.min_gt_bbox_wh[1])
if not keep.any():
return None
else:
keys = ('gt_bboxes', 'gt_labels', 'gt_masks', 'gt_semantic_seg')
for key in keys:
if key in results:
results[key] = results[key][keep]
return results
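# Editor's sketch (not part of the original file): the filtering rule of
# ``FilterAnnotations`` applied to a toy gt_bboxes array.
def _example_filter_annotations():
    """Keep only boxes wider and taller than (2, 2) pixels."""
    gt = np.array([[0, 0, 10, 10], [0, 0, 1, 1]], dtype=np.float32)
    f = FilterAnnotations(min_gt_bbox_wh=(2, 2))
    results = f(dict(gt_bboxes=gt, gt_labels=np.array([1, 2])))
    return results  # only the first box (and its label) survives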
import warnings
import mmcv
from ..builder import PIPELINES
from .compose import Compose
@PIPELINES.register_module()
class MultiScaleFlipAug(object):
"""Test-time augmentation with multiple scales and flipping.
    An example configuration is as follows:
.. code-block::
img_scale=[(1333, 400), (1333, 800)],
flip=True,
transforms=[
dict(type='Resize', keep_ratio=True),
dict(type='RandomFlip'),
dict(type='Normalize', **img_norm_cfg),
dict(type='Pad', size_divisor=32),
dict(type='ImageToTensor', keys=['img']),
dict(type='Collect', keys=['img']),
]
    After MultiScaleFlipAug with the above configuration, the results are
    wrapped into lists of the same length as follows:
.. code-block::
dict(
img=[...],
img_shape=[...],
scale=[(1333, 400), (1333, 400), (1333, 800), (1333, 800)]
flip=[False, True, False, True]
...
)
Args:
transforms (list[dict]): Transforms to apply in each augmentation.
        img_scale (tuple | list[tuple] | None): Image scales for resizing.
scale_factor (float | list[float] | None): Scale factors for resizing.
        flip (bool): Whether to apply flip augmentation. Default: False.
flip_direction (str | list[str]): Flip augmentation directions,
options are "horizontal" and "vertical". If flip_direction is list,
multiple flip augmentations will be applied.
It has no effect when flip == False. Default: "horizontal".
"""
def __init__(self,
transforms,
img_scale=None,
scale_factor=None,
flip=False,
flip_direction='horizontal'):
self.transforms = Compose(transforms)
assert (img_scale is None) ^ (scale_factor is None), (
            'Exactly one of img_scale and scale_factor must be set')
if img_scale is not None:
self.img_scale = img_scale if isinstance(img_scale,
list) else [img_scale]
self.scale_key = 'scale'
assert mmcv.is_list_of(self.img_scale, tuple)
else:
self.img_scale = scale_factor if isinstance(
scale_factor, list) else [scale_factor]
self.scale_key = 'scale_factor'
self.flip = flip
self.flip_direction = flip_direction if isinstance(
flip_direction, list) else [flip_direction]
assert mmcv.is_list_of(self.flip_direction, str)
if not self.flip and self.flip_direction != ['horizontal']:
warnings.warn(
'flip_direction has no effect when flip is set to False')
if (self.flip
and not any([t['type'] == 'RandomFlip' for t in transforms])):
warnings.warn(
'flip has no effect when RandomFlip is not in transforms')
def __call__(self, results):
"""Call function to apply test time augment transforms on results.
Args:
results (dict): Result dict contains the data to transform.
Returns:
dict[str: list]: The augmented data, where each value is wrapped
into a list.
"""
aug_data = []
flip_args = [(False, None)]
if self.flip:
flip_args += [(True, direction)
for direction in self.flip_direction]
for scale in self.img_scale:
for flip, direction in flip_args:
_results = results.copy()
_results[self.scale_key] = scale
_results['flip'] = flip
_results['flip_direction'] = direction
data = self.transforms(_results)
aug_data.append(data)
# list of dict to dict of list
aug_data_dict = {key: [] for key in aug_data[0]}
for data in aug_data:
for key, val in data.items():
aug_data_dict[key].append(val)
return aug_data_dict
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(transforms={self.transforms}, '
repr_str += f'img_scale={self.img_scale}, flip={self.flip}, '
repr_str += f'flip_direction={self.flip_direction})'
return repr_str
import inspect
import mmcv
import numpy as np
from numpy import random
from mmdet.core import PolygonMasks
from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
from ..builder import PIPELINES
try:
from imagecorruptions import corrupt
except ImportError:
corrupt = None
try:
import albumentations
from albumentations import Compose
except ImportError:
albumentations = None
Compose = None
@PIPELINES.register_module()
class Resize(object):
"""Resize images & bbox & mask.
This transform resizes the input image to some scale. Bboxes and masks are
then resized with the same scale factor. If the input dict contains the key
"scale", then the scale in the input dict is used, otherwise the specified
scale in the init method is used. If the input dict contains the key
"scale_factor" (if MultiScaleFlipAug does not give img_scale but
scale_factor), the actual scale will be computed by image shape and
scale_factor.
`img_scale` can either be a tuple (single-scale) or a list of tuple
(multi-scale). There are 3 multiscale modes:
- ``ratio_range is not None``: randomly sample a ratio from the ratio \
range and multiply it with the image scale.
- ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \
sample a scale from the multiscale range.
- ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \
sample a scale from multiple scales.
Args:
        img_scale (tuple or list[tuple]): Image scales for resizing.
multiscale_mode (str): Either "range" or "value".
ratio_range (tuple[float]): (min_ratio, max_ratio)
keep_ratio (bool): Whether to keep the aspect ratio when resizing the
image.
        bbox_clip_border (bool, optional): Whether to clip the objects outside
            the border of the image. Defaults to True.
backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
            These two backends generate slightly different results. Defaults
to 'cv2'.
override (bool, optional): Whether to override `scale` and
`scale_factor` so as to call resize twice. Default False. If True,
            after the first resizing, the existing `scale` and `scale_factor`
will be ignored so the second resizing can be allowed.
This option is a work-around for multiple times of resize in DETR.
Defaults to False.
"""
def __init__(self,
img_scale=None,
multiscale_mode='range',
ratio_range=None,
keep_ratio=True,
bbox_clip_border=True,
backend='cv2',
override=False):
if img_scale is None:
self.img_scale = None
else:
if isinstance(img_scale, list):
self.img_scale = img_scale
else:
self.img_scale = [img_scale]
assert mmcv.is_list_of(self.img_scale, tuple)
if ratio_range is not None:
# mode 1: given a scale and a range of image ratio
assert len(self.img_scale) == 1
else:
# mode 2: given multiple scales or a range of scales
assert multiscale_mode in ['value', 'range']
self.backend = backend
self.multiscale_mode = multiscale_mode
self.ratio_range = ratio_range
self.keep_ratio = keep_ratio
# TODO: refactor the override option in Resize
self.override = override
self.bbox_clip_border = bbox_clip_border
@staticmethod
def random_select(img_scales):
"""Randomly select an img_scale from given candidates.
Args:
            img_scales (list[tuple]): Image scales for selection.
Returns:
            (tuple, int): Returns a tuple ``(img_scale, scale_idx)``, \
where ``img_scale`` is the selected image scale and \
``scale_idx`` is the selected index in the given candidates.
"""
assert mmcv.is_list_of(img_scales, tuple)
scale_idx = np.random.randint(len(img_scales))
img_scale = img_scales[scale_idx]
return img_scale, scale_idx
@staticmethod
def random_sample(img_scales):
"""Randomly sample an img_scale when ``multiscale_mode=='range'``.
Args:
            img_scales (list[tuple]): Image scale range for sampling.
                There must be two tuples in img_scales, which specify the
                lower and upper bound of image scales.
Returns:
(tuple, None): Returns a tuple ``(img_scale, None)``, where \
``img_scale`` is sampled scale and None is just a placeholder \
to be consistent with :func:`random_select`.
"""
assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
img_scale_long = [max(s) for s in img_scales]
img_scale_short = [min(s) for s in img_scales]
long_edge = np.random.randint(
min(img_scale_long),
max(img_scale_long) + 1)
short_edge = np.random.randint(
min(img_scale_short),
max(img_scale_short) + 1)
img_scale = (long_edge, short_edge)
return img_scale, None
@staticmethod
def random_sample_ratio(img_scale, ratio_range):
"""Randomly sample an img_scale when ``ratio_range`` is specified.
A ratio will be randomly sampled from the range specified by
``ratio_range``. Then it would be multiplied with ``img_scale`` to
generate sampled scale.
Args:
            img_scale (tuple): Image scale base to multiply with ratio.
ratio_range (tuple[float]): The minimum and maximum ratio to scale
the ``img_scale``.
Returns:
(tuple, None): Returns a tuple ``(scale, None)``, where \
``scale`` is sampled ratio multiplied with ``img_scale`` and \
None is just a placeholder to be consistent with \
:func:`random_select`.
"""
assert isinstance(img_scale, tuple) and len(img_scale) == 2
min_ratio, max_ratio = ratio_range
assert min_ratio <= max_ratio
ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
return scale, None
def _random_scale(self, results):
"""Randomly sample an img_scale according to ``ratio_range`` and
``multiscale_mode``.
If ``ratio_range`` is specified, a ratio will be sampled and be
multiplied with ``img_scale``.
If multiple scales are specified by ``img_scale``, a scale will be
sampled according to ``multiscale_mode``.
Otherwise, single scale will be used.
Args:
results (dict): Result dict from :obj:`dataset`.
Returns:
            dict: Two new keys ``scale`` and ``scale_idx`` are added into \
``results``, which would be used by subsequent pipelines.
"""
if self.ratio_range is not None:
scale, scale_idx = self.random_sample_ratio(
self.img_scale[0], self.ratio_range)
elif len(self.img_scale) == 1:
scale, scale_idx = self.img_scale[0], 0
elif self.multiscale_mode == 'range':
scale, scale_idx = self.random_sample(self.img_scale)
elif self.multiscale_mode == 'value':
scale, scale_idx = self.random_select(self.img_scale)
else:
raise NotImplementedError
results['scale'] = scale
results['scale_idx'] = scale_idx
def _resize_img(self, results):
"""Resize images with ``results['scale']``."""
for key in results.get('img_fields', ['img']):
if self.keep_ratio:
img, scale_factor = mmcv.imrescale(
results[key],
results['scale'],
return_scale=True,
backend=self.backend)
                # the w_scale and h_scale may have a minor difference
# a real fix should be done in the mmcv.imrescale in the future
new_h, new_w = img.shape[:2]
h, w = results[key].shape[:2]
w_scale = new_w / w
h_scale = new_h / h
else:
img, w_scale, h_scale = mmcv.imresize(
results[key],
results['scale'],
return_scale=True,
backend=self.backend)
results[key] = img
scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
dtype=np.float32)
results['img_shape'] = img.shape
# in case that there is no padding
results['pad_shape'] = img.shape
results['scale_factor'] = scale_factor
results['keep_ratio'] = self.keep_ratio
def _resize_bboxes(self, results):
"""Resize bounding boxes with ``results['scale_factor']``."""
for key in results.get('bbox_fields', []):
bboxes = results[key] * results['scale_factor']
if self.bbox_clip_border:
img_shape = results['img_shape']
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
results[key] = bboxes
def _resize_masks(self, results):
"""Resize masks with ``results['scale']``"""
for key in results.get('mask_fields', []):
if results[key] is None:
continue
if self.keep_ratio:
results[key] = results[key].rescale(results['scale'])
else:
results[key] = results[key].resize(results['img_shape'][:2])
def _resize_seg(self, results):
"""Resize semantic segmentation map with ``results['scale']``."""
for key in results.get('seg_fields', []):
if self.keep_ratio:
gt_seg = mmcv.imrescale(
results[key],
results['scale'],
interpolation='nearest',
backend=self.backend)
else:
gt_seg = mmcv.imresize(
results[key],
results['scale'],
interpolation='nearest',
backend=self.backend)
results['gt_semantic_seg'] = gt_seg
def __call__(self, results):
"""Call function to resize images, bounding boxes, masks, semantic
segmentation map.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \
'keep_ratio' keys are added into result dict.
"""
if 'scale' not in results:
if 'scale_factor' in results:
img_shape = results['img'].shape[:2]
scale_factor = results['scale_factor']
assert isinstance(scale_factor, float)
results['scale'] = tuple(
[int(x * scale_factor) for x in img_shape][::-1])
else:
self._random_scale(results)
else:
if not self.override:
assert 'scale_factor' not in results, (
'scale and scale_factor cannot be both set.')
else:
results.pop('scale')
if 'scale_factor' in results:
results.pop('scale_factor')
self._random_scale(results)
self._resize_img(results)
self._resize_bboxes(results)
self._resize_masks(results)
self._resize_seg(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(img_scale={self.img_scale}, '
repr_str += f'multiscale_mode={self.multiscale_mode}, '
repr_str += f'ratio_range={self.ratio_range}, '
repr_str += f'keep_ratio={self.keep_ratio}, '
repr_str += f'bbox_clip_border={self.bbox_clip_border})'
return repr_str
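# Editor's sketch (not part of the original file): the three multiscale modes
# of ``Resize`` expressed as configs; the scales are common detection values
# used here only for illustration.
def _example_resize_cfgs():
    """Return illustrative configs for each multiscale mode of Resize."""
    ratio = dict(type='Resize', img_scale=(1333, 800),
                 ratio_range=(0.8, 1.2), keep_ratio=True)
    value = dict(type='Resize', multiscale_mode='value',
                 img_scale=[(1333, 640), (1333, 800)], keep_ratio=True)
    rng = dict(type='Resize', multiscale_mode='range',
               img_scale=[(1333, 640), (1333, 800)], keep_ratio=True)
    return ratio, value, rng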
@PIPELINES.register_module()
class RandomFlip(object):
"""Flip the image & bbox & mask.
If the input dict contains the key "flip", then the flag will be used,
otherwise it will be randomly decided by a ratio specified in the init
method.
When random flip is enabled, ``flip_ratio``/``direction`` can either be a
float/string or tuple of float/string. There are 3 flip modes:
- ``flip_ratio`` is float, ``direction`` is string: the image will be
      ``direction``ly flipped with probability of ``flip_ratio``.
E.g., ``flip_ratio=0.5``, ``direction='horizontal'``,
then image will be horizontally flipped with probability of 0.5.
    - ``flip_ratio`` is float, ``direction`` is list of string: the image will
be ``direction[i]``ly flipped with probability of
``flip_ratio/len(direction)``.
E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``,
then image will be horizontally flipped with probability of 0.25,
vertically with probability of 0.25.
- ``flip_ratio`` is list of float, ``direction`` is list of string:
      given ``len(flip_ratio) == len(direction)``, the image will
be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``.
E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal',
'vertical']``, then image will be horizontally flipped with probability
      of 0.3 and vertically with probability of 0.5.
Args:
flip_ratio (float | list[float], optional): The flipping probability.
Default: None.
direction(str | list[str], optional): The flipping direction. Options
are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'.
If input is a list, the length must equal ``flip_ratio``. Each
element in ``flip_ratio`` indicates the flip probability of
corresponding direction.
"""
def __init__(self, flip_ratio=None, direction='horizontal'):
if isinstance(flip_ratio, list):
assert mmcv.is_list_of(flip_ratio, float)
assert 0 <= sum(flip_ratio) <= 1
elif isinstance(flip_ratio, float):
assert 0 <= flip_ratio <= 1
elif flip_ratio is None:
pass
else:
raise ValueError('flip_ratios must be None, float, '
'or list of float')
self.flip_ratio = flip_ratio
valid_directions = ['horizontal', 'vertical', 'diagonal']
if isinstance(direction, str):
assert direction in valid_directions
elif isinstance(direction, list):
assert mmcv.is_list_of(direction, str)
assert set(direction).issubset(set(valid_directions))
else:
raise ValueError('direction must be either str or list of str')
self.direction = direction
if isinstance(flip_ratio, list):
assert len(self.flip_ratio) == len(self.direction)
def bbox_flip(self, bboxes, img_shape, direction):
"""Flip bboxes horizontally.
Args:
bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k)
img_shape (tuple[int]): Image shape (height, width)
            direction (str): Flip direction. Options are 'horizontal',
                'vertical', 'diagonal'.
Returns:
numpy.ndarray: Flipped bounding boxes.
"""
assert bboxes.shape[-1] % 4 == 0
flipped = bboxes.copy()
if direction == 'horizontal':
w = img_shape[1]
flipped[..., 0::4] = w - bboxes[..., 2::4]
flipped[..., 2::4] = w - bboxes[..., 0::4]
elif direction == 'vertical':
h = img_shape[0]
flipped[..., 1::4] = h - bboxes[..., 3::4]
flipped[..., 3::4] = h - bboxes[..., 1::4]
elif direction == 'diagonal':
w = img_shape[1]
h = img_shape[0]
flipped[..., 0::4] = w - bboxes[..., 2::4]
flipped[..., 1::4] = h - bboxes[..., 3::4]
flipped[..., 2::4] = w - bboxes[..., 0::4]
flipped[..., 3::4] = h - bboxes[..., 1::4]
else:
raise ValueError(f"Invalid flipping direction '{direction}'")
return flipped
def __call__(self, results):
"""Call function to flip bounding boxes, masks, semantic segmentation
maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Flipped results, 'flip', 'flip_direction' keys are added \
into result dict.
"""
if 'flip' not in results:
if isinstance(self.direction, list):
# None means non-flip
direction_list = self.direction + [None]
else:
# None means non-flip
direction_list = [self.direction, None]
if isinstance(self.flip_ratio, list):
non_flip_ratio = 1 - sum(self.flip_ratio)
flip_ratio_list = self.flip_ratio + [non_flip_ratio]
else:
non_flip_ratio = 1 - self.flip_ratio
# exclude non-flip
single_ratio = self.flip_ratio / (len(direction_list) - 1)
flip_ratio_list = [single_ratio] * (len(direction_list) -
1) + [non_flip_ratio]
cur_dir = np.random.choice(direction_list, p=flip_ratio_list)
results['flip'] = cur_dir is not None
if 'flip_direction' not in results:
results['flip_direction'] = cur_dir
if results['flip']:
# flip image
for key in results.get('img_fields', ['img']):
results[key] = mmcv.imflip(
results[key], direction=results['flip_direction'])
# flip bboxes
for key in results.get('bbox_fields', []):
results[key] = self.bbox_flip(results[key],
results['img_shape'],
results['flip_direction'])
# flip masks
for key in results.get('mask_fields', []):
results[key] = results[key].flip(results['flip_direction'])
# flip segs
for key in results.get('seg_fields', []):
results[key] = mmcv.imflip(
results[key], direction=results['flip_direction'])
return results
def __repr__(self):
return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})'
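# Editor's sketch (not part of the original file): the three flip modes
# described in the ``RandomFlip`` docstring, written as configs.
def _example_random_flip_cfgs():
    """Return illustrative configs for the three RandomFlip modes."""
    single = dict(type='RandomFlip', flip_ratio=0.5)
    shared = dict(type='RandomFlip', flip_ratio=0.5,
                  direction=['horizontal', 'vertical'])  # 0.25 each
    paired = dict(type='RandomFlip', flip_ratio=[0.3, 0.5],
                  direction=['horizontal', 'vertical'])  # 0.3 and 0.5
    return single, shared, paired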
@PIPELINES.register_module()
class Pad(object):
"""Pad the image & mask.
There are two padding modes: (1) pad to a fixed size and (2) pad to the
minimum size that is divisible by some number.
Added keys are "pad_shape", "pad_fixed_size", "pad_size_divisor",
Args:
size (tuple, optional): Fixed padding size.
size_divisor (int, optional): The divisor of padded size.
pad_val (float, optional): Padding value, 0 by default.
"""
def __init__(self, size=None, size_divisor=None, pad_val=0):
self.size = size
self.size_divisor = size_divisor
self.pad_val = pad_val
# only one of size and size_divisor should be valid
assert size is not None or size_divisor is not None
assert size is None or size_divisor is None
def _pad_img(self, results):
"""Pad images according to ``self.size``."""
for key in results.get('img_fields', ['img']):
if self.size is not None:
padded_img = mmcv.impad(
results[key], shape=self.size, pad_val=self.pad_val)
elif self.size_divisor is not None:
padded_img = mmcv.impad_to_multiple(
results[key], self.size_divisor, pad_val=self.pad_val)
results[key] = padded_img
results['pad_shape'] = padded_img.shape
results['pad_fixed_size'] = self.size
results['pad_size_divisor'] = self.size_divisor
def _pad_masks(self, results):
"""Pad masks according to ``results['pad_shape']``."""
pad_shape = results['pad_shape'][:2]
for key in results.get('mask_fields', []):
results[key] = results[key].pad(pad_shape, pad_val=self.pad_val)
def _pad_seg(self, results):
"""Pad semantic segmentation map according to
``results['pad_shape']``."""
for key in results.get('seg_fields', []):
results[key] = mmcv.impad(
results[key], shape=results['pad_shape'][:2])
def __call__(self, results):
"""Call function to pad images, masks, semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Updated result dict.
"""
self._pad_img(results)
self._pad_masks(results)
self._pad_seg(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(size={self.size}, '
repr_str += f'size_divisor={self.size_divisor}, '
repr_str += f'pad_val={self.pad_val})'
return repr_str
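# Illustrative sketch (not part of the original file): ``size_divisor``
# pads each spatial dimension up to the next multiple, so a (100, 150, 3)
# image padded with ``size_divisor=32`` ends up (128, 160, 3):
# >>> import numpy as np
# >>> results = dict(img=np.zeros((100, 150, 3), dtype=np.uint8))
# >>> Pad(size_divisor=32)(results)['pad_shape']
# (128, 160, 3)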
@PIPELINES.register_module()
class Normalize(object):
"""Normalize the image.
Added key is "img_norm_cfg".
Args:
mean (sequence): Mean values of 3 channels.
std (sequence): Std values of 3 channels.
to_rgb (bool): Whether to convert the image from BGR to RGB.
Default: True.
"""
def __init__(self, mean, std, to_rgb=True):
self.mean = np.array(mean, dtype=np.float32)
self.std = np.array(std, dtype=np.float32)
self.to_rgb = to_rgb
def __call__(self, results):
"""Call function to normalize images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Normalized results, 'img_norm_cfg' key is added into
result dict.
"""
for key in results.get('img_fields', ['img']):
results[key] = mmcv.imnormalize(results[key], self.mean, self.std,
self.to_rgb)
results['img_norm_cfg'] = dict(
mean=self.mean, std=self.std, to_rgb=self.to_rgb)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(mean={self.mean}, std={self.std}, to_rgb={self.to_rgb})'
return repr_str
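# Illustrative sketch (not part of the original file), using the common
# ImageNet mean/std purely as assumed example values; ``imnormalize``
# converts BGR to RGB first when ``to_rgb=True``, then applies
# (img - mean) / std channel-wise:
# >>> import numpy as np
# >>> norm = Normalize(
# ...     mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375])
# >>> out = norm(dict(img=np.zeros((4, 4, 3), dtype=np.float32)))
# >>> sorted(out['img_norm_cfg'])
# ['mean', 'std', 'to_rgb']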
@PIPELINES.register_module()
class RandomCrop(object):
"""Random crop the image & bboxes & masks.
The absolute `crop_size` is sampled based on `crop_type` and `image_size`,
then the cropped results are generated.
Args:
crop_size (tuple): The relative ratio or absolute pixels of
height and width.
crop_type (str, optional): one of "relative_range", "relative",
"absolute", "absolute_range". "relative" randomly crops
(h * crop_size[0], w * crop_size[1]) part from an input of size
(h, w). "relative_range" uniformly samples relative crop size from
range [crop_size[0], 1] and [crop_size[1], 1] for height and width
respectively. "absolute" crops from an input with absolute size
(crop_size[0], crop_size[1]). "absolute_range" uniformly samples
crop_h in range [crop_size[0], min(h, crop_size[1])] and crop_w
in range [crop_size[0], min(w, crop_size[1])]. Default "absolute".
allow_negative_crop (bool, optional): Whether to allow a crop that does
not contain any bbox area. Default False.
bbox_clip_border (bool, optional): Whether clip the objects outside
the border of the image. Defaults to True.
Note:
- If the image is smaller than the absolute crop size, return the
original image.
- The keys for bboxes, labels and masks must be aligned. That is,
`gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and
`gt_bboxes_ignore` corresponds to `gt_labels_ignore` and
`gt_masks_ignore`.
- If the crop does not contain any gt-bbox region and
`allow_negative_crop` is set to False, skip this image.
"""
def __init__(self,
crop_size,
crop_type='absolute',
allow_negative_crop=False,
bbox_clip_border=True):
if crop_type not in [
'relative_range', 'relative', 'absolute', 'absolute_range'
]:
raise ValueError(f'Invalid crop_type {crop_type}.')
if crop_type in ['absolute', 'absolute_range']:
assert crop_size[0] > 0 and crop_size[1] > 0
assert isinstance(crop_size[0], int) and isinstance(
crop_size[1], int)
else:
assert 0 < crop_size[0] <= 1 and 0 < crop_size[1] <= 1
self.crop_size = crop_size
self.crop_type = crop_type
self.allow_negative_crop = allow_negative_crop
self.bbox_clip_border = bbox_clip_border
# The key correspondence from bboxes to labels and masks.
self.bbox2label = {
'gt_bboxes': 'gt_labels',
'gt_bboxes_ignore': 'gt_labels_ignore'
}
self.bbox2mask = {
'gt_bboxes': 'gt_masks',
'gt_bboxes_ignore': 'gt_masks_ignore'
}
def _crop_data(self, results, crop_size, allow_negative_crop):
"""Function to randomly crop images, bounding boxes, masks, semantic
segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
crop_size (tuple): Expected absolute size after cropping, (h, w).
allow_negative_crop (bool): Whether to allow a crop that does not
contain any bbox area. Default to False.
Returns:
dict: Randomly cropped results, 'img_shape' key in result dict is
updated according to crop size.
"""
assert crop_size[0] > 0 and crop_size[1] > 0
for key in results.get('img_fields', ['img']):
img = results[key]
margin_h = max(img.shape[0] - crop_size[0], 0)
margin_w = max(img.shape[1] - crop_size[1], 0)
offset_h = np.random.randint(0, margin_h + 1)
offset_w = np.random.randint(0, margin_w + 1)
crop_y1, crop_y2 = offset_h, offset_h + crop_size[0]
crop_x1, crop_x2 = offset_w, offset_w + crop_size[1]
# crop the image
img = img[crop_y1:crop_y2, crop_x1:crop_x2, ...]
img_shape = img.shape
results[key] = img
results['img_shape'] = img_shape
# crop bboxes accordingly and clip to the image boundary
for key in results.get('bbox_fields', []):
# e.g. gt_bboxes and gt_bboxes_ignore
bbox_offset = np.array([offset_w, offset_h, offset_w, offset_h],
dtype=np.float32)
bboxes = results[key] - bbox_offset
if self.bbox_clip_border:
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
valid_inds = (bboxes[:, 2] > bboxes[:, 0]) & (
bboxes[:, 3] > bboxes[:, 1])
# If the crop does not contain any gt-bbox area and
# allow_negative_crop is False, skip this image.
if (key == 'gt_bboxes' and not valid_inds.any()
and not allow_negative_crop):
return None
results[key] = bboxes[valid_inds, :]
# label fields. e.g. gt_labels and gt_labels_ignore
label_key = self.bbox2label.get(key)
if label_key in results:
results[label_key] = results[label_key][valid_inds]
# mask fields, e.g. gt_masks and gt_masks_ignore
mask_key = self.bbox2mask.get(key)
if mask_key in results:
results[mask_key] = results[mask_key][
valid_inds.nonzero()[0]].crop(
np.asarray([crop_x1, crop_y1, crop_x2, crop_y2]))
# crop semantic seg
for key in results.get('seg_fields', []):
results[key] = results[key][crop_y1:crop_y2, crop_x1:crop_x2]
return results
def _get_crop_size(self, image_size):
"""Randomly generates the absolute crop size based on `crop_type` and
`image_size`.
Args:
image_size (tuple): (h, w).
Returns:
crop_size (tuple): (crop_h, crop_w) in absolute pixels.
"""
h, w = image_size
if self.crop_type == 'absolute':
return (min(self.crop_size[0], h), min(self.crop_size[1], w))
elif self.crop_type == 'absolute_range':
assert self.crop_size[0] <= self.crop_size[1]
crop_h = np.random.randint(
min(h, self.crop_size[0]),
min(h, self.crop_size[1]) + 1)
crop_w = np.random.randint(
min(w, self.crop_size[0]),
min(w, self.crop_size[1]) + 1)
return crop_h, crop_w
elif self.crop_type == 'relative':
crop_h, crop_w = self.crop_size
return int(h * crop_h + 0.5), int(w * crop_w + 0.5)
elif self.crop_type == 'relative_range':
crop_size = np.asarray(self.crop_size, dtype=np.float32)
crop_h, crop_w = crop_size + np.random.rand(2) * (1 - crop_size)
return int(h * crop_h + 0.5), int(w * crop_w + 0.5)
def __call__(self, results):
"""Call function to randomly crop images, bounding boxes, masks,
semantic segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Randomly cropped results, 'img_shape' key in result dict is
updated according to crop size.
"""
image_size = results['img'].shape[:2]
crop_size = self._get_crop_size(image_size)
results = self._crop_data(results, crop_size, self.allow_negative_crop)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(crop_size={self.crop_size}, '
repr_str += f'crop_type={self.crop_type}, '
repr_str += f'allow_negative_crop={self.allow_negative_crop}, '
repr_str += f'bbox_clip_border={self.bbox_clip_border})'
return repr_str
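# Illustrative sketch (not part of the original file): the crop-size
# arithmetic of ``_get_crop_size`` for an (h, w) = (300, 400) input;
# 'relative' scales each side, while 'absolute' clips to the image size:
# >>> RandomCrop(crop_size=(0.5, 0.5), crop_type='relative')._get_crop_size(
# ...     (300, 400))
# (150, 200)
# >>> RandomCrop(crop_size=(500, 500))._get_crop_size((300, 400))
# (300, 400)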
@PIPELINES.register_module()
class SegRescale(object):
"""Rescale semantic segmentation maps.
Args:
scale_factor (float): The scale factor of the final output.
backend (str): Image rescale backend, choices are 'cv2' and 'pillow'.
These two backends generate slightly different results. Defaults
to 'cv2'.
"""
def __init__(self, scale_factor=1, backend='cv2'):
self.scale_factor = scale_factor
self.backend = backend
def __call__(self, results):
"""Call function to scale the semantic segmentation map.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with semantic segmentation map scaled.
"""
for key in results.get('seg_fields', []):
if self.scale_factor != 1:
results[key] = mmcv.imrescale(
results[key],
self.scale_factor,
interpolation='nearest',
backend=self.backend)
return results
def __repr__(self):
return self.__class__.__name__ + f'(scale_factor={self.scale_factor})'
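# Illustrative sketch (not part of the original file): with
# ``scale_factor=0.5`` a (100, 200) segmentation map is rescaled with
# nearest-neighbour interpolation to (50, 100):
# >>> import numpy as np
# >>> results = dict(
# ...     seg_fields=['gt_semantic_seg'],
# ...     gt_semantic_seg=np.zeros((100, 200), dtype=np.uint8))
# >>> SegRescale(scale_factor=0.5)(results)['gt_semantic_seg'].shape
# (50, 100)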
@PIPELINES.register_module()
class PhotoMetricDistortion(object):
"""Apply photometric distortion to image sequentially, every transformation
is applied with a probability of 0.5. The position of random contrast is in
second or second to last.
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
8. randomly swap channels
Args:
brightness_delta (int): delta of brightness.
contrast_range (tuple): range of contrast.
saturation_range (tuple): range of saturation.
hue_delta (int): delta of hue.
"""
def __init__(self,
brightness_delta=32,
contrast_range=(0.5, 1.5),
saturation_range=(0.5, 1.5),
hue_delta=18):
self.brightness_delta = brightness_delta
self.contrast_lower, self.contrast_upper = contrast_range
self.saturation_lower, self.saturation_upper = saturation_range
self.hue_delta = hue_delta
def __call__(self, results):
"""Call function to perform photometric distortion on images.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images distorted.
"""
if 'img_fields' in results:
assert results['img_fields'] == ['img'], \
'Only single img_fields is allowed'
img = results['img']
assert img.dtype == np.float32, \
'PhotoMetricDistortion needs the input image of dtype np.float32,'\
' please set "to_float32=True" in "LoadImageFromFile" pipeline'
# random brightness
if random.randint(2):
delta = random.uniform(-self.brightness_delta,
self.brightness_delta)
img += delta
# mode == 0 --> do random contrast first
# mode == 1 --> do random contrast last
mode = random.randint(2)
if mode == 1:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# convert color from BGR to HSV
img = mmcv.bgr2hsv(img)
# random saturation
if random.randint(2):
img[..., 1] *= random.uniform(self.saturation_lower,
self.saturation_upper)
# random hue
if random.randint(2):
img[..., 0] += random.uniform(-self.hue_delta, self.hue_delta)
img[..., 0][img[..., 0] > 360] -= 360
img[..., 0][img[..., 0] < 0] += 360
# convert color from HSV to BGR
img = mmcv.hsv2bgr(img)
# random contrast
if mode == 0:
if random.randint(2):
alpha = random.uniform(self.contrast_lower,
self.contrast_upper)
img *= alpha
# randomly swap channels
if random.randint(2):
img = img[..., random.permutation(3)]
results['img'] = img
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(\nbrightness_delta={self.brightness_delta},\n'
repr_str += 'contrast_range='
repr_str += f'{(self.contrast_lower, self.contrast_upper)},\n'
repr_str += 'saturation_range='
repr_str += f'{(self.saturation_lower, self.saturation_upper)},\n'
repr_str += f'hue_delta={self.hue_delta})'
return repr_str
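# Illustrative sketch (not part of the original file): the transform
# requires a float32 image (see the assertion above), so pair it with
# ``LoadImageFromFile(to_float32=True)``; each sub-step fires with p=0.5
# and the image shape is preserved throughout:
# >>> import numpy as np
# >>> distort = PhotoMetricDistortion()
# >>> img = (np.random.rand(32, 32, 3) * 255).astype(np.float32)
# >>> distort(dict(img=img))['img'].shape
# (32, 32, 3)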
@PIPELINES.register_module()
class Expand(object):
"""Random expand the image & bboxes.
Randomly place the original image on a canvas of 'ratio' x original image
size filled with mean values. The ratio is in the range of ratio_range.
Args:
mean (tuple): mean value of dataset.
to_rgb (bool): if need to convert the order of mean to align with RGB.
ratio_range (tuple): range of expand ratio.
prob (float): probability of applying this transformation
"""
def __init__(self,
mean=(0, 0, 0),
to_rgb=True,
ratio_range=(1, 4),
seg_ignore_label=None,
prob=0.5):
self.to_rgb = to_rgb
self.ratio_range = ratio_range
if to_rgb:
self.mean = mean[::-1]
else:
self.mean = mean
self.min_ratio, self.max_ratio = ratio_range
self.seg_ignore_label = seg_ignore_label
self.prob = prob
def __call__(self, results):
"""Call function to expand images, bounding boxes.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images, bounding boxes expanded
"""
if random.uniform(0, 1) > self.prob:
return results
if 'img_fields' in results:
assert results['img_fields'] == ['img'], \
'Only single img_fields is allowed'
img = results['img']
h, w, c = img.shape
ratio = random.uniform(self.min_ratio, self.max_ratio)
# speed up expand when the image is large
if np.all(self.mean == self.mean[0]):
expand_img = np.empty((int(h * ratio), int(w * ratio), c),
img.dtype)
expand_img.fill(self.mean[0])
else:
expand_img = np.full((int(h * ratio), int(w * ratio), c),
self.mean,
dtype=img.dtype)
left = int(random.uniform(0, w * ratio - w))
top = int(random.uniform(0, h * ratio - h))
expand_img[top:top + h, left:left + w] = img
results['img'] = expand_img
# expand bboxes
for key in results.get('bbox_fields', []):
results[key] = results[key] + np.tile(
(left, top), 2).astype(results[key].dtype)
# expand masks
for key in results.get('mask_fields', []):
results[key] = results[key].expand(
int(h * ratio), int(w * ratio), top, left)
# expand segs
for key in results.get('seg_fields', []):
gt_seg = results[key]
expand_gt_seg = np.full((int(h * ratio), int(w * ratio)),
self.seg_ignore_label,
dtype=gt_seg.dtype)
expand_gt_seg[top:top + h, left:left + w] = gt_seg
results[key] = expand_gt_seg
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(mean={self.mean}, to_rgb={self.to_rgb}, '
repr_str += f'ratio_range={self.ratio_range}, '
repr_str += f'seg_ignore_label={self.seg_ignore_label})'
return repr_str
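# Worked example (not part of the original file) of the expand geometry:
# with h = w = 100 and a sampled ratio of 2, the canvas is (200, 200); if
# the image is pasted at (top, left) = (40, 30), a bbox [10, 10, 20, 20]
# is shifted by np.tile((left, top), 2) to [40, 50, 50, 60].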
@PIPELINES.register_module()
class MinIoURandomCrop(object):
"""Random crop the image & bboxes, the cropped patches have minimum IoU
requirement with original image & bboxes, the IoU threshold is randomly
selected from min_ious.
Args:
min_ious (tuple): minimum IoU threshold for all intersections with
bounding boxes.
min_crop_size (float): minimum crop size (i.e. h,w := a*h, a*w,
where a >= min_crop_size).
bbox_clip_border (bool, optional): Whether clip the objects outside
the border of the image. Defaults to True.
Note:
The keys for bboxes, labels and masks should be paired. That is, \
`gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \
`gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`.
"""
def __init__(self,
min_ious=(0.1, 0.3, 0.5, 0.7, 0.9),
min_crop_size=0.3,
bbox_clip_border=True):
# 1: return ori img
self.min_ious = min_ious
self.sample_mode = (1, *min_ious, 0)
self.min_crop_size = min_crop_size
self.bbox_clip_border = bbox_clip_border
self.bbox2label = {
'gt_bboxes': 'gt_labels',
'gt_bboxes_ignore': 'gt_labels_ignore'
}
self.bbox2mask = {
'gt_bboxes': 'gt_masks',
'gt_bboxes_ignore': 'gt_masks_ignore'
}
def __call__(self, results):
"""Call function to crop images and bounding boxes with minimum IoU
constraint.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images and bounding boxes cropped, \
'img_shape' key is updated.
"""
if 'img_fields' in results:
assert results['img_fields'] == ['img'], \
'Only single img_fields is allowed'
img = results['img']
assert 'bbox_fields' in results
boxes = [results[key] for key in results['bbox_fields']]
boxes = np.concatenate(boxes, 0)
h, w, c = img.shape
while True:
mode = random.choice(self.sample_mode)
self.mode = mode
if mode == 1:
return results
min_iou = mode
for i in range(50):
new_w = random.uniform(self.min_crop_size * w, w)
new_h = random.uniform(self.min_crop_size * h, h)
# h / w in [0.5, 2]
if new_h / new_w < 0.5 or new_h / new_w > 2:
continue
# sample the patch's top-left corner uniformly inside the image
left = random.uniform(0, w - new_w)
top = random.uniform(0, h - new_h)
patch = np.array(
(int(left), int(top), int(left + new_w), int(top + new_h)))
# Line or point crop is not allowed
if patch[2] == patch[0] or patch[3] == patch[1]:
continue
overlaps = bbox_overlaps(
patch.reshape(-1, 4), boxes.reshape(-1, 4)).reshape(-1)
if len(overlaps) > 0 and overlaps.min() < min_iou:
continue
# centers of boxes should be inside the cropped img
# only adjust boxes and instance masks when the gt is not empty
if len(overlaps) > 0:
# adjust boxes
def is_center_of_bboxes_in_patch(boxes, patch):
center = (boxes[:, :2] + boxes[:, 2:]) / 2
mask = ((center[:, 0] > patch[0]) *
(center[:, 1] > patch[1]) *
(center[:, 0] < patch[2]) *
(center[:, 1] < patch[3]))
return mask
mask = is_center_of_bboxes_in_patch(boxes, patch)
if not mask.any():
continue
for key in results.get('bbox_fields', []):
boxes = results[key].copy()
mask = is_center_of_bboxes_in_patch(boxes, patch)
boxes = boxes[mask]
if self.bbox_clip_border:
boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:])
boxes[:, :2] = boxes[:, :2].clip(min=patch[:2])
boxes -= np.tile(patch[:2], 2)
results[key] = boxes
# labels
label_key = self.bbox2label.get(key)
if label_key in results:
results[label_key] = results[label_key][mask]
# mask fields
mask_key = self.bbox2mask.get(key)
if mask_key in results:
results[mask_key] = results[mask_key][
mask.nonzero()[0]].crop(patch)
# adjust the img no matter whether the gt is empty before crop
img = img[patch[1]:patch[3], patch[0]:patch[2]]
results['img'] = img
results['img_shape'] = img.shape
# seg fields
for key in results.get('seg_fields', []):
results[key] = results[key][patch[1]:patch[3],
patch[0]:patch[2]]
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(min_ious={self.min_ious}, '
repr_str += f'min_crop_size={self.min_crop_size}, '
repr_str += f'bbox_clip_border={self.bbox_clip_border})'
return repr_str
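# Worked example (not part of the original file) of the IoU gate: with
# min_iou = 0.1, a 50x50 patch at the origin and a 20x20 gt box at
# (40, 40) overlap only on a 10x10 corner, so the patch is rejected:
# >>> patch_area, box_area, inter = 50 * 50, 20 * 20, 10 * 10
# >>> inter / (patch_area + box_area - inter)  # IoU < 0.1 -> resample
# 0.03571428571428571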
@PIPELINES.register_module()
class Corrupt(object):
"""Corruption augmentation.
Corruption transforms implemented based on
`imagecorruptions <https://github.com/bethgelab/imagecorruptions>`_.
Args:
corruption (str): Corruption name.
severity (int, optional): The severity of corruption. Default: 1.
"""
def __init__(self, corruption, severity=1):
self.corruption = corruption
self.severity = severity
def __call__(self, results):
"""Call function to corrupt image.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images corrupted.
"""
if corrupt is None:
raise RuntimeError('imagecorruptions is not installed')
if 'img_fields' in results:
assert results['img_fields'] == ['img'], \
'Only single img_fields is allowed'
results['img'] = corrupt(
results['img'].astype(np.uint8),
corruption_name=self.corruption,
severity=self.severity)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(corruption={self.corruption}, '
repr_str += f'severity={self.severity})'
return repr_str
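# Illustrative sketch (not part of the original file), assuming the optional
# ``imagecorruptions`` package is installed; 'gaussian_noise' is one of its
# corruption names and is used here purely as an example:
# >>> corrupt_aug = Corrupt(corruption='gaussian_noise', severity=2)
# >>> # corrupt_aug(results) casts results['img'] to uint8 and corrupts it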
@PIPELINES.register_module()
class Albu(object):
"""Albumentation augmentation.
Adds custom transformations from the Albumentations library.
Please visit `https://albumentations.readthedocs.io`
for more information.
An example of ``transforms`` is as follows:
.. code-block::
[
dict(
type='ShiftScaleRotate',
shift_limit=0.0625,
scale_limit=0.0,
rotate_limit=0,
interpolation=1,
p=0.5),
dict(
type='RandomBrightnessContrast',
brightness_limit=[0.1, 0.3],
contrast_limit=[0.1, 0.3],
p=0.2),
dict(type='ChannelShuffle', p=0.1),
dict(
type='OneOf',
transforms=[
dict(type='Blur', blur_limit=3, p=1.0),
dict(type='MedianBlur', blur_limit=3, p=1.0)
],
p=0.1),
]
Args:
transforms (list[dict]): A list of albu transformations
bbox_params (dict): Bbox_params for albumentations `Compose`.
keymap (dict): Contains {'input key': 'albumentation-style key'}.
update_pad_shape (bool): Whether to update the 'pad_shape' key with
the shape of the augmented image.
skip_img_without_anno (bool): Whether to skip the image if no
annotations are left after augmentation.
"""
def __init__(self,
transforms,
bbox_params=None,
keymap=None,
update_pad_shape=False,
skip_img_without_anno=False):
if Compose is None:
raise RuntimeError('albumentations is not installed')
self.transforms = transforms
self.filter_lost_elements = False
self.update_pad_shape = update_pad_shape
self.skip_img_without_anno = skip_img_without_anno
# A simple workaround to remove masks without boxes
if (isinstance(bbox_params, dict) and 'label_fields' in bbox_params
and 'filter_lost_elements' in bbox_params):
self.filter_lost_elements = True
self.origin_label_fields = bbox_params['label_fields']
bbox_params['label_fields'] = ['idx_mapper']
del bbox_params['filter_lost_elements']
self.bbox_params = (
self.albu_builder(bbox_params) if bbox_params else None)
self.aug = Compose([self.albu_builder(t) for t in self.transforms],
bbox_params=self.bbox_params)
if not keymap:
self.keymap_to_albu = {
'img': 'image',
'gt_masks': 'masks',
'gt_bboxes': 'bboxes'
}
else:
self.keymap_to_albu = keymap
self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()}
def albu_builder(self, cfg):
"""Import a module from albumentations.
It inherits some of :func:`build_from_cfg` logic.
Args:
cfg (dict): Config dict. It should at least contain the key "type".
Returns:
obj: The constructed object.
"""
assert isinstance(cfg, dict) and 'type' in cfg
args = cfg.copy()
obj_type = args.pop('type')
if mmcv.is_str(obj_type):
if albumentations is None:
raise RuntimeError('albumentations is not installed')
obj_cls = getattr(albumentations, obj_type)
elif inspect.isclass(obj_type):
obj_cls = obj_type
else:
raise TypeError(
f'type must be a str or valid type, but got {type(obj_type)}')
if 'transforms' in args:
args['transforms'] = [
self.albu_builder(transform)
for transform in args['transforms']
]
return obj_cls(**args)
@staticmethod
def mapper(d, keymap):
"""Dictionary mapper. Renames keys according to keymap provided.
Args:
d (dict): old dict
keymap (dict): {'old_key':'new_key'}
Returns:
dict: new dict.
"""
updated_dict = {}
for k, v in d.items():
new_k = keymap.get(k, k)
updated_dict[new_k] = v
return updated_dict
def __call__(self, results):
# dict to albumentations format
results = self.mapper(results, self.keymap_to_albu)
# TODO: add bbox_fields
if 'bboxes' in results:
# to list of boxes
if isinstance(results['bboxes'], np.ndarray):
results['bboxes'] = [x for x in results['bboxes']]
# add pseudo-field for filtration
if self.filter_lost_elements:
results['idx_mapper'] = np.arange(len(results['bboxes']))
# TODO: Support mask structure in albu
if 'masks' in results:
if isinstance(results['masks'], PolygonMasks):
raise NotImplementedError(
'Albu only supports BitMap masks now')
ori_masks = results['masks']
if albumentations.__version__ < '0.5':
results['masks'] = results['masks'].masks
else:
results['masks'] = [mask for mask in results['masks'].masks]
results = self.aug(**results)
if 'bboxes' in results:
if isinstance(results['bboxes'], list):
results['bboxes'] = np.array(
results['bboxes'], dtype=np.float32)
results['bboxes'] = results['bboxes'].reshape(-1, 4)
# filter label_fields
if self.filter_lost_elements:
for label in self.origin_label_fields:
results[label] = np.array(
[results[label][i] for i in results['idx_mapper']])
if 'masks' in results:
results['masks'] = np.array(
[results['masks'][i] for i in results['idx_mapper']])
results['masks'] = ori_masks.__class__(
results['masks'], results['image'].shape[0],
results['image'].shape[1])
if (not len(results['idx_mapper'])
and self.skip_img_without_anno):
return None
if 'gt_labels' in results:
if isinstance(results['gt_labels'], list):
results['gt_labels'] = np.array(results['gt_labels'])
results['gt_labels'] = results['gt_labels'].astype(np.int64)
# back to the original format
results = self.mapper(results, self.keymap_back)
# update final shape
if self.update_pad_shape:
results['pad_shape'] = results['img'].shape
return results
def __repr__(self):
repr_str = self.__class__.__name__ + f'(transforms={self.transforms})'
return repr_str
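# Illustrative config sketch (not part of the original file), assuming
# albumentations is installed; ``bbox_params`` follows the albumentations
# ``BboxParams`` signature, and ``filter_lost_elements`` triggers the
# workaround documented in ``__init__``:
# >>> albu = Albu(
# ...     transforms=[dict(type='RandomBrightnessContrast', p=0.2)],
# ...     bbox_params=dict(
# ...         type='BboxParams',
# ...         format='pascal_voc',
# ...         label_fields=['gt_labels'],
# ...         filter_lost_elements=True,
# ...         min_visibility=0.1),
# ...     keymap={'img': 'image', 'gt_bboxes': 'bboxes'})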
@PIPELINES.register_module()
class RandomCenterCropPad(object):
"""Random center crop and random around padding for CornerNet.
This operation generates a randomly cropped image from the original image
and pads it simultaneously. Different from :class:`RandomCrop`, the output
shape may not strictly equal ``crop_size``: a random value is chosen
from ``ratios``, so the output shape could be larger or smaller than
``crop_size``. The padding operation also differs from :class:`Pad`;
here we pad on all four sides instead of only the right and bottom.
The relation between output image (padding image) and original image:
.. code:: text
output image
+----------------------------+
| padded area |
+------|----------------------------|----------+
| | cropped area | |
| | +---------------+ | |
| | | . center | | | original image
| | | range | | |
| | +---------------+ | |
+------|----------------------------|----------+
| padded area |
+----------------------------+
There are 5 main areas in the figure:
- output image: output image of this operation, also called the padding
image in the following instructions.
- original image: input image of this operation.
- padded area: the non-intersecting area of output image and original image.
- cropped area: the overlap of output image and original image.
- center range: a smaller area from which the random center is chosen.
The center range is computed from ``border`` and the original image's
shape so that the random center is not too close to the image border.
This operation also acts differently in train and test modes; the two
pipelines are summarized below.
Train pipeline:
1. Choose a ``random_ratio`` from ``ratios``, the shape of padding image
will be ``random_ratio * crop_size``.
2. Choose a ``random_center`` in center range.
3. Generate padding image with center matches the ``random_center``.
4. Initialize the padding image with pixel values equal to ``mean``.
5. Copy the cropped area to padding image.
6. Refine annotations.
Test pipeline:
1. Compute output shape according to ``test_pad_mode``.
2. Generate padding image with center matches the original image
center.
3. Initialize the padding image with pixel values equal to ``mean``.
4. Copy the ``cropped area`` to padding image.
Args:
crop_size (tuple | None): expected size after crop; the final size will
be computed according to the ratio. Requires (h, w) in train mode,
and None in test mode.
ratios (tuple): a ratio is randomly selected from this tuple and the
image is cropped to (crop_size[0] * ratio, crop_size[1] * ratio).
Only available in train mode.
border (int): max distance from the center-selection area to the image
border. Only available in train mode.
mean (sequence): Mean values of 3 channels.
std (sequence): Std values of 3 channels.
to_rgb (bool): Whether to convert the image from BGR to RGB.
test_mode (bool): whether to involve random variables in the transform.
In train mode, crop_size is fixed, and the center coords and ratio
are randomly selected from predefined lists. In test mode, crop_size
is the image's original shape, and the center coords and ratio are fixed.
test_pad_mode (tuple): padding method and padding shape value, only
available in test mode. Default is using 'logical_or' with
127 as padding shape value.
- 'logical_or': final_shape = input_shape | padding_shape_value
- 'size_divisor': final_shape = int(
ceil(input_shape / padding_shape_value) * padding_shape_value)
bbox_clip_border (bool, optional): Whether clip the objects outside
the border of the image. Defaults to True.
"""
def __init__(self,
crop_size=None,
ratios=(0.9, 1.0, 1.1),
border=128,
mean=None,
std=None,
to_rgb=None,
test_mode=False,
test_pad_mode=('logical_or', 127),
bbox_clip_border=True):
if test_mode:
assert crop_size is None, 'crop_size must be None in test mode'
assert ratios is None, 'ratios must be None in test mode'
assert border is None, 'border must be None in test mode'
assert isinstance(test_pad_mode, (list, tuple))
assert test_pad_mode[0] in ['logical_or', 'size_divisor']
else:
assert isinstance(crop_size, (list, tuple))
assert crop_size[0] > 0 and crop_size[1] > 0, (
'crop_size must be > 0 in train mode')
assert isinstance(ratios, (list, tuple))
assert test_pad_mode is None, (
'test_pad_mode must be None in train mode')
self.crop_size = crop_size
self.ratios = ratios
self.border = border
# We do not set default value to mean, std and to_rgb because these
# hyper-parameters are easy to forget but could affect the performance.
# Please use the same setting as Normalize for performance assurance.
assert mean is not None and std is not None and to_rgb is not None
self.to_rgb = to_rgb
self.input_mean = mean
self.input_std = std
if to_rgb:
self.mean = mean[::-1]
self.std = std[::-1]
else:
self.mean = mean
self.std = std
self.test_mode = test_mode
self.test_pad_mode = test_pad_mode
self.bbox_clip_border = bbox_clip_border
def _get_border(self, border, size):
"""Get final border for the target size.
This function generates a ``final_border`` according to image's shape.
The area between ``final_border`` and ``size - final_border`` is the
``center range``. We randomly choose center from the ``center range``
to avoid our random center is too close to original image's border.
Also ``center range`` should be larger than 0.
Args:
border (int): The initial border, default is 128.
size (int): The width or height of original image.
Returns:
int: The final border.
"""
k = 2 * border / size
i = pow(2, np.ceil(np.log2(np.ceil(k))) + (k == int(k)))
return border // i
def _filter_boxes(self, patch, boxes):
"""Check whether the center of each box is in the patch.
Args:
patch (list[int]): The cropped area, [left, top, right, bottom].
boxes (numpy array, (N x 4)): Ground truth boxes.
Returns:
mask (numpy array, (N,)): Each box is inside or outside the patch.
"""
center = (boxes[:, :2] + boxes[:, 2:]) / 2
mask = (center[:, 0] > patch[0]) * (center[:, 1] > patch[1]) * (
center[:, 0] < patch[2]) * (
center[:, 1] < patch[3])
return mask
def _crop_image_and_paste(self, image, center, size):
"""Crop image with a given center and size, then paste the cropped
image to a blank image with two centers align.
This function is equivalent to generating a blank image with ``size``
as its shape. Then cover it on the original image with two centers (
the center of blank image and the random center of original image)
aligned. The overlap area is paste from the original image and the
outside area is filled with ``mean pixel``.
Args:
image (np array, H x W x C): Original image.
center (list[int]): Target crop center coord.
size (list[int]): Target crop size. [target_h, target_w]
Returns:
cropped_img (np array, target_h x target_w x C): Cropped image.
border (np array, 4): The distance of four border of
``cropped_img`` to the original image area, [top, bottom,
left, right]
patch (list[int]): The cropped area, [left, top, right, bottom].
"""
center_y, center_x = center
target_h, target_w = size
img_h, img_w, img_c = image.shape
x0 = max(0, center_x - target_w // 2)
x1 = min(center_x + target_w // 2, img_w)
y0 = max(0, center_y - target_h // 2)
y1 = min(center_y + target_h // 2, img_h)
patch = np.array((int(x0), int(y0), int(x1), int(y1)))
left, right = center_x - x0, x1 - center_x
top, bottom = center_y - y0, y1 - center_y
cropped_center_y, cropped_center_x = target_h // 2, target_w // 2
cropped_img = np.zeros((target_h, target_w, img_c), dtype=image.dtype)
for i in range(img_c):
cropped_img[:, :, i] += self.mean[i]
y_slice = slice(cropped_center_y - top, cropped_center_y + bottom)
x_slice = slice(cropped_center_x - left, cropped_center_x + right)
cropped_img[y_slice, x_slice, :] = image[y0:y1, x0:x1, :]
border = np.array([
cropped_center_y - top, cropped_center_y + bottom,
cropped_center_x - left, cropped_center_x + right
],
dtype=np.float32)
return cropped_img, border, patch
def _train_aug(self, results):
"""Random crop and around padding the original image.
Args:
results (dict): Image information in the augmentation pipeline.
Returns:
results (dict): The updated dict.
"""
img = results['img']
h, w, c = img.shape
boxes = results['gt_bboxes']
while True:
scale = random.choice(self.ratios)
new_h = int(self.crop_size[0] * scale)
new_w = int(self.crop_size[1] * scale)
h_border = self._get_border(self.border, h)
w_border = self._get_border(self.border, w)
for i in range(50):
center_x = random.randint(low=w_border, high=w - w_border)
center_y = random.randint(low=h_border, high=h - h_border)
cropped_img, border, patch = self._crop_image_and_paste(
img, [center_y, center_x], [new_h, new_w])
mask = self._filter_boxes(patch, boxes)
# if the image has no valid bbox, any crop patch is valid.
if not mask.any() and len(boxes) > 0:
continue
results['img'] = cropped_img
results['img_shape'] = cropped_img.shape
results['pad_shape'] = cropped_img.shape
x0, y0, x1, y1 = patch
left_w, top_h = center_x - x0, center_y - y0
cropped_center_x, cropped_center_y = new_w // 2, new_h // 2
# crop bboxes accordingly and clip to the image boundary
for key in results.get('bbox_fields', []):
mask = self._filter_boxes(patch, results[key])
bboxes = results[key][mask]
bboxes[:, 0:4:2] += cropped_center_x - left_w - x0
bboxes[:, 1:4:2] += cropped_center_y - top_h - y0
if self.bbox_clip_border:
bboxes[:, 0:4:2] = np.clip(bboxes[:, 0:4:2], 0, new_w)
bboxes[:, 1:4:2] = np.clip(bboxes[:, 1:4:2], 0, new_h)
keep = (bboxes[:, 2] > bboxes[:, 0]) & (
bboxes[:, 3] > bboxes[:, 1])
bboxes = bboxes[keep]
results[key] = bboxes
if key in ['gt_bboxes']:
if 'gt_labels' in results:
labels = results['gt_labels'][mask]
labels = labels[keep]
results['gt_labels'] = labels
if 'gt_masks' in results:
raise NotImplementedError(
'RandomCenterCropPad only supports bbox.')
# crop semantic seg
for key in results.get('seg_fields', []):
raise NotImplementedError(
'RandomCenterCropPad only supports bbox.')
return results
def _test_aug(self, results):
"""Around padding the original image without cropping.
The padding mode and value are from ``test_pad_mode``.
Args:
results (dict): Image information in the augmentation pipeline.
Returns:
results (dict): The updated dict.
"""
img = results['img']
h, w, c = img.shape
results['img_shape'] = img.shape
if self.test_pad_mode[0] in ['logical_or']:
target_h = h | self.test_pad_mode[1]
target_w = w | self.test_pad_mode[1]
elif self.test_pad_mode[0] in ['size_divisor']:
divisor = self.test_pad_mode[1]
target_h = int(np.ceil(h / divisor)) * divisor
target_w = int(np.ceil(w / divisor)) * divisor
else:
raise NotImplementedError(
'RandomCenterCropPad only supports two testing pad modes: '
'logical_or and size_divisor.')
cropped_img, border, _ = self._crop_image_and_paste(
img, [h // 2, w // 2], [target_h, target_w])
results['img'] = cropped_img
results['pad_shape'] = cropped_img.shape
results['border'] = border
return results
def __call__(self, results):
img = results['img']
assert img.dtype == np.float32, (
'RandomCenterCropPad needs the input image of dtype np.float32,'
' please set "to_float32=True" in "LoadImageFromFile" pipeline')
h, w, c = img.shape
assert c == len(self.mean)
if self.test_mode:
return self._test_aug(results)
else:
return self._train_aug(results)
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(crop_size={self.crop_size}, '
repr_str += f'ratios={self.ratios}, '
repr_str += f'border={self.border}, '
repr_str += f'mean={self.input_mean}, '
repr_str += f'std={self.input_std}, '
repr_str += f'to_rgb={self.to_rgb}, '
repr_str += f'test_mode={self.test_mode}, '
repr_str += f'test_pad_mode={self.test_pad_mode}, '
repr_str += f'bbox_clip_border={self.bbox_clip_border})'
return repr_str
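# Worked example (not part of the original file) of the test-mode shape
# arithmetic: 'logical_or' with value 127 ORs the low 7 bits in, mapping
# each side to the next number of the form k * 128 - 1:
# >>> 427 | 127, 640 | 127
# (511, 767)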
@PIPELINES.register_module()
class CutOut(object):
"""CutOut operation.
Randomly drop some regions of the image, as used in
`Cutout <https://arxiv.org/abs/1708.04552>`_.
Args:
n_holes (int | tuple[int, int]): Number of regions to be dropped.
If it is given as a list, number of holes will be randomly
selected from the closed interval [`n_holes[0]`, `n_holes[1]`].
cutout_shape (tuple[int, int] | list[tuple[int, int]]): The candidate
shape of dropped regions. It can be `tuple[int, int]` to use a
fixed cutout shape, or `list[tuple[int, int]]` to randomly choose
shape from the list.
cutout_ratio (tuple[float, float] | list[tuple[float, float]]): The
candidate ratio of dropped regions. It can be `tuple[float, float]`
to use a fixed ratio or `list[tuple[float, float]]` to randomly
choose a ratio from the list. Please note that `cutout_shape`
and `cutout_ratio` cannot both be given at the same time.
fill_in (tuple[float, float, float] | tuple[int, int, int]): The value
of pixel to fill in the dropped regions. Default: (0, 0, 0).
"""
def __init__(self,
n_holes,
cutout_shape=None,
cutout_ratio=None,
fill_in=(0, 0, 0)):
assert (cutout_shape is None) ^ (cutout_ratio is None), \
'Either cutout_shape or cutout_ratio should be specified.'
assert (isinstance(cutout_shape, (list, tuple))
or isinstance(cutout_ratio, (list, tuple)))
if isinstance(n_holes, tuple):
assert len(n_holes) == 2 and 0 <= n_holes[0] < n_holes[1]
else:
n_holes = (n_holes, n_holes)
self.n_holes = n_holes
self.fill_in = fill_in
self.with_ratio = cutout_ratio is not None
self.candidates = cutout_ratio if self.with_ratio else cutout_shape
if not isinstance(self.candidates, list):
self.candidates = [self.candidates]
def __call__(self, results):
"""Call function to drop some regions of image."""
h, w, c = results['img'].shape
n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1)
for _ in range(n_holes):
x1 = np.random.randint(0, w)
y1 = np.random.randint(0, h)
index = np.random.randint(0, len(self.candidates))
if not self.with_ratio:
cutout_w, cutout_h = self.candidates[index]
else:
cutout_w = int(self.candidates[index][0] * w)
cutout_h = int(self.candidates[index][1] * h)
x2 = np.clip(x1 + cutout_w, 0, w)
y2 = np.clip(y1 + cutout_h, 0, h)
results['img'][y1:y2, x1:x2, :] = self.fill_in
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(n_holes={self.n_holes}, '
repr_str += (f'cutout_ratio={self.candidates}, ' if self.with_ratio
else f'cutout_shape={self.candidates}, ')
repr_str += f'fill_in={self.fill_in})'
return repr_str
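# Illustrative sketch (not part of the original file): one fixed-shape
# hole on a black image is filled with ``fill_in``, so the max value
# becomes 255 (the clipped hole always keeps at least one pixel):
# >>> import numpy as np
# >>> cut = CutOut(n_holes=1, cutout_shape=(10, 10), fill_in=(255, 255, 255))
# >>> int(cut(dict(img=np.zeros((64, 64, 3), dtype=np.uint8)))['img'].max())
# 255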
from .distributed_sampler import DistributedSampler
from .group_sampler import DistributedGroupSampler, GroupSampler
__all__ = ['DistributedSampler', 'DistributedGroupSampler', 'GroupSampler']
import math
import torch
from torch.utils.data import DistributedSampler as _DistributedSampler
class DistributedSampler(_DistributedSampler):
def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True):
super().__init__(dataset, num_replicas=num_replicas, rank=rank)
self.shuffle = shuffle
def __iter__(self):
# deterministically shuffle based on epoch
if self.shuffle:
g = torch.Generator()
g.manual_seed(self.epoch)
indices = torch.randperm(len(self.dataset), generator=g).tolist()
else:
indices = torch.arange(len(self.dataset)).tolist()
# add extra samples to make it evenly divisible
# in case that indices is shorter than half of total_size
indices = (indices *
math.ceil(self.total_size / len(indices)))[:self.total_size]
assert len(indices) == self.total_size
# subsample
indices = indices[self.rank:self.total_size:self.num_replicas]
assert len(indices) == self.num_samples
return iter(indices)
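# Worked example (not part of the original file) of the pad-and-shard
# arithmetic above: with len(dataset) = 10 and num_replicas = 4,
# num_samples = ceil(10 / 4) = 3 and total_size = 12, so the shuffled
# indices are tiled to length 12 and rank r keeps indices[r::4]:
# >>> import math
# >>> idx = (list(range(10)) * math.ceil(12 / 10))[:12]
# >>> idx[1:12:4]  # shard for rank 1
# [1, 5, 9]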
from __future__ import division
import math
import numpy as np
import torch
from mmcv.runner import get_dist_info
from torch.utils.data import Sampler
class GroupSampler(Sampler):
def __init__(self, dataset, samples_per_gpu=1):
assert hasattr(dataset, 'flag')
self.dataset = dataset
self.samples_per_gpu = samples_per_gpu
self.flag = dataset.flag.astype(np.int64)
self.group_sizes = np.bincount(self.flag)
self.num_samples = 0
for i, size in enumerate(self.group_sizes):
self.num_samples += int(np.ceil(
size / self.samples_per_gpu)) * self.samples_per_gpu
def __iter__(self):
indices = []
for i, size in enumerate(self.group_sizes):
if size == 0:
continue
indice = np.where(self.flag == i)[0]
assert len(indice) == size
np.random.shuffle(indice)
num_extra = int(np.ceil(size / self.samples_per_gpu)
) * self.samples_per_gpu - len(indice)
indice = np.concatenate(
[indice, np.random.choice(indice, num_extra)])
indices.append(indice)
indices = np.concatenate(indices)
indices = [
indices[i * self.samples_per_gpu:(i + 1) * self.samples_per_gpu]
for i in np.random.permutation(
range(len(indices) // self.samples_per_gpu))
]
indices = np.concatenate(indices)
indices = indices.astype(np.int64).tolist()
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
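# Worked example (not part of the original file) of the group padding in
# ``__init__``/``__iter__``: with group sizes (5, 3) and
# samples_per_gpu = 2, each group is padded to a multiple of 2, giving
# num_samples = 6 + 4 = 10, and each batch is drawn within a single group:
# >>> import numpy as np
# >>> spg = 2
# >>> sum(int(np.ceil(s / spg)) * spg for s in (5, 3))
# 10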
class DistributedGroupSampler(Sampler):
"""Sampler that restricts data loading to a subset of the dataset.
It is especially useful in conjunction with
:class:`torch.nn.parallel.DistributedDataParallel`. In such case, each
process can pass a DistributedSampler instance as a DataLoader sampler,
and load a subset of the original dataset that is exclusive to it.
.. note::
Dataset is assumed to be of constant size.
Arguments:
dataset: Dataset used for sampling.
num_replicas (optional): Number of processes participating in
distributed training.
rank (optional): Rank of the current process within num_replicas.
"""
def __init__(self,
dataset,
samples_per_gpu=1,
num_replicas=None,
rank=None):
_rank, _num_replicas = get_dist_info()
if num_replicas is None:
num_replicas = _num_replicas
if rank is None:
rank = _rank
self.dataset = dataset
self.samples_per_gpu = samples_per_gpu
self.num_replicas = num_replicas
self.rank = rank
self.epoch = 0
assert hasattr(self.dataset, 'flag')
self.flag = self.dataset.flag
self.group_sizes = np.bincount(self.flag)
self.num_samples = 0
for i, j in enumerate(self.group_sizes):
self.num_samples += int(
math.ceil(self.group_sizes[i] * 1.0 / self.samples_per_gpu /
self.num_replicas)) * self.samples_per_gpu
self.total_size = self.num_samples * self.num_replicas
def __iter__(self):
# deterministically shuffle based on epoch
g = torch.Generator()
g.manual_seed(self.epoch)
indices = []
for i, size in enumerate(self.group_sizes):
if size > 0:
indice = np.where(self.flag == i)[0]
assert len(indice) == size
# add .numpy() to avoid bug when selecting indice in parrots.
# TODO: check whether torch.randperm() can be replaced by
# numpy.random.permutation().
indice = indice[list(
torch.randperm(int(size), generator=g).numpy())].tolist()
extra = int(
math.ceil(
size * 1.0 / self.samples_per_gpu / self.num_replicas)
) * self.samples_per_gpu * self.num_replicas - len(indice)
# pad indice
tmp = indice.copy()
for _ in range(extra // size):
indice.extend(tmp)
indice.extend(tmp[:extra % size])
indices.extend(indice)
assert len(indices) == self.total_size
indices = [
indices[j] for i in list(
torch.randperm(
len(indices) // self.samples_per_gpu, generator=g))
for j in range(i * self.samples_per_gpu, (i + 1) *
self.samples_per_gpu)
]
# subsample
offset = self.num_samples * self.rank
indices = indices[offset:offset + self.num_samples]
assert len(indices) == self.num_samples
return iter(indices)
def __len__(self):
return self.num_samples
def set_epoch(self, epoch):
self.epoch = epoch
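# Worked example (not part of the original file) of the per-group padding
# in ``__iter__``: a group of size 7 with samples_per_gpu = 2 and
# num_replicas = 2 is padded so it splits evenly across replicas:
# >>> import math
# >>> size, spg, world = 7, 2, 2
# >>> math.ceil(size / spg / world) * spg * world
# 8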
import copy
import warnings
def replace_ImageToTensor(pipelines):
"""Replace the ImageToTensor transform in a data pipeline to
DefaultFormatBundle, which is normally useful in batch inference.
Args:
pipelines (list[dict]): Data pipeline configs.
Returns:
list: The new pipeline list with all ImageToTensor replaced by
DefaultFormatBundle.
Examples:
>>> pipelines = [
... dict(type='LoadImageFromFile'),
... dict(
... type='MultiScaleFlipAug',
... img_scale=(1333, 800),
... flip=False,
... transforms=[
... dict(type='Resize', keep_ratio=True),
... dict(type='RandomFlip'),
... dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]),
... dict(type='Pad', size_divisor=32),
... dict(type='ImageToTensor', keys=['img']),
... dict(type='Collect', keys=['img']),
... ])
... ]
>>> expected_pipelines = [
... dict(type='LoadImageFromFile'),
... dict(
... type='MultiScaleFlipAug',
... img_scale=(1333, 800),
... flip=False,
... transforms=[
... dict(type='Resize', keep_ratio=True),
... dict(type='RandomFlip'),
... dict(type='Normalize', mean=[0, 0, 0], std=[1, 1, 1]),
... dict(type='Pad', size_divisor=32),
... dict(type='DefaultFormatBundle'),
... dict(type='Collect', keys=['img']),
... ])
... ]
>>> assert expected_pipelines == replace_ImageToTensor(pipelines)
"""
pipelines = copy.deepcopy(pipelines)
for i, pipeline in enumerate(pipelines):
if pipeline['type'] == 'MultiScaleFlipAug':
assert 'transforms' in pipeline
pipeline['transforms'] = replace_ImageToTensor(
pipeline['transforms'])
elif pipeline['type'] == 'ImageToTensor':
warnings.warn(
'"ImageToTensor" pipeline is replaced by '
'"DefaultFormatBundle" for batch inference. It is '
'recommended to manually replace it in the test '
'data pipeline in your config file.', UserWarning)
pipelines[i] = {'type': 'DefaultFormatBundle'}
return pipelines
def get_loading_pipeline(pipeline):
"""Only keep loading image and annotations related configuration.
Args:
pipeline (list[dict]): Data pipeline configs.
Returns:
list[dict]: The new pipeline list that only keeps the configuration
related to loading images and annotations.
Examples:
>>> pipelines = [
... dict(type='LoadImageFromFile'),
... dict(type='LoadAnnotations', with_bbox=True),
... dict(type='Resize', img_scale=(1333, 800), keep_ratio=True),
... dict(type='RandomFlip', flip_ratio=0.5),
... dict(type='Normalize', **img_norm_cfg),
... dict(type='Pad', size_divisor=32),
... dict(type='DefaultFormatBundle'),
... dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels'])
... ]
>>> expected_pipelines = [
... dict(type='LoadImageFromFile'),
... dict(type='LoadAnnotations', with_bbox=True)
... ]
>>> assert expected_pipelines ==\
... get_loading_pipeline(pipelines)
"""
loading_pipeline_cfg = []
for cfg in pipeline:
if cfg['type'].startswith('Load'):
loading_pipeline_cfg.append(cfg)
assert len(loading_pipeline_cfg) == 2, \
'The data pipeline in your config file must include ' \
'loading image and annotations related pipeline.'
return loading_pipeline_cfg
from collections import OrderedDict
from mmcv.utils import print_log
from mmdet.core import eval_map, eval_recalls
from .builder import DATASETS
from .xml_style import XMLDataset
@DATASETS.register_module()
class VOCDataset(XMLDataset):
CLASSES = ('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car',
'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse',
'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train',
'tvmonitor')
def __init__(self, **kwargs):
super(VOCDataset, self).__init__(**kwargs)
if 'VOC2007' in self.img_prefix:
self.year = 2007
elif 'VOC2012' in self.img_prefix:
self.year = 2012
else:
raise ValueError('Cannot infer dataset year from img_prefix')
def evaluate(self,
results,
metric='mAP',
logger=None,
proposal_nums=(100, 300, 1000),
iou_thr=0.5,
scale_ranges=None):
"""Evaluate in VOC protocol.
Args:
results (list[list | tuple]): Testing results of the dataset.
metric (str | list[str]): Metrics to be evaluated. Options are
'mAP', 'recall'.
logger (logging.Logger | str, optional): Logger used for printing
related information during evaluation. Default: None.
proposal_nums (Sequence[int]): Proposal number used for evaluating
recalls, such as recall@100, recall@1000.
Default: (100, 300, 1000).
iou_thr (float | list[float]): IoU threshold. Default: 0.5.
scale_ranges (list[tuple], optional): Scale ranges for evaluating
mAP. If not specified, all bounding boxes would be included in
evaluation. Default: None.
Returns:
dict[str, float]: AP/recall metrics.
"""
if not isinstance(metric, str):
assert len(metric) == 1
metric = metric[0]
allowed_metrics = ['mAP', 'recall']
if metric not in allowed_metrics:
raise KeyError(f'metric {metric} is not supported')
annotations = [self.get_ann_info(i) for i in range(len(self))]
eval_results = OrderedDict()
iou_thrs = [iou_thr] if isinstance(iou_thr, float) else iou_thr
if metric == 'mAP':
assert isinstance(iou_thrs, list)
if self.year == 2007:
ds_name = 'voc07'
else:
ds_name = self.CLASSES
mean_aps = []
for iou_thr in iou_thrs:
print_log(f'\n{"-" * 15}iou_thr: {iou_thr}{"-" * 15}')
mean_ap, _ = eval_map(
results,
annotations,
scale_ranges=None,
iou_thr=iou_thr,
dataset=ds_name,
logger=logger)
mean_aps.append(mean_ap)
eval_results[f'AP{int(iou_thr * 100):02d}'] = round(mean_ap, 3)
eval_results['mAP'] = sum(mean_aps) / len(mean_aps)
elif metric == 'recall':
gt_bboxes = [ann['bboxes'] for ann in annotations]
recalls = eval_recalls(
gt_bboxes, results, proposal_nums, iou_thrs, logger=logger)
for i, num in enumerate(proposal_nums):
for j, iou in enumerate(iou_thrs):
eval_results[f'recall@{num}@{iou}'] = recalls[i, j]
if recalls.shape[1] > 1:
ar = recalls.mean(axis=1)
for i, num in enumerate(proposal_nums):
eval_results[f'AR@{num}'] = ar[i]
return eval_results
import os.path as osp
import xml.etree.ElementTree as ET
import mmcv
from .builder import DATASETS
from .xml_style import XMLDataset
@DATASETS.register_module()
class WIDERFaceDataset(XMLDataset):
"""Reader for the WIDER Face dataset in PASCAL VOC format.
Conversion scripts can be found in
https://github.com/sovrasov/wider-face-pascal-voc-annotations
"""
CLASSES = ('face', )
def __init__(self, **kwargs):
super(WIDERFaceDataset, self).__init__(**kwargs)
def load_annotations(self, ann_file):
"""Load annotation from WIDERFace XML style annotation file.
Args:
ann_file (str): Path of XML file.
Returns:
list[dict]: Annotation info from XML file.
"""
data_infos = []
img_ids = mmcv.list_from_file(ann_file)
for img_id in img_ids:
filename = f'{img_id}.jpg'
xml_path = osp.join(self.img_prefix, 'Annotations',
f'{img_id}.xml')
tree = ET.parse(xml_path)
root = tree.getroot()
size = root.find('size')
width = int(size.find('width').text)
height = int(size.find('height').text)
folder = root.find('folder').text
data_infos.append(
dict(
id=img_id,
filename=osp.join(folder, filename),
width=width,
height=height))
return data_infos
import os.path as osp
import xml.etree.ElementTree as ET
import mmcv
import numpy as np
from PIL import Image
from .builder import DATASETS
from .custom import CustomDataset
@DATASETS.register_module()
class XMLDataset(CustomDataset):
"""XML dataset for detection.
Args:
min_size (int | float, optional): The minimum size of bounding
boxes in the images. If the size of a bounding box is less than
``min_size``, it will be added to the ignored field.
"""
def __init__(self, min_size=None, **kwargs):
super(XMLDataset, self).__init__(**kwargs)
self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)}
self.min_size = min_size
def load_annotations(self, ann_file):
"""Load annotation from XML style ann_file.
Args:
ann_file (str): Path of XML file.
Returns:
list[dict]: Annotation info from XML file.
"""
data_infos = []
img_ids = mmcv.list_from_file(ann_file)
for img_id in img_ids:
filename = f'JPEGImages/{img_id}.jpg'
xml_path = osp.join(self.img_prefix, 'Annotations',
f'{img_id}.xml')
tree = ET.parse(xml_path)
root = tree.getroot()
size = root.find('size')
width = 0
height = 0
if size is not None:
width = int(size.find('width').text)
height = int(size.find('height').text)
else:
img_path = osp.join(self.img_prefix, 'JPEGImages',
'{}.jpg'.format(img_id))
img = Image.open(img_path)
width, height = img.size
data_infos.append(
dict(id=img_id, filename=filename, width=width, height=height))
return data_infos
def _filter_imgs(self, min_size=32):
"""Filter images too small or without annotation."""
valid_inds = []
for i, img_info in enumerate(self.data_infos):
if min(img_info['width'], img_info['height']) < min_size:
continue
if self.filter_empty_gt:
img_id = img_info['id']
xml_path = osp.join(self.img_prefix, 'Annotations',
f'{img_id}.xml')
tree = ET.parse(xml_path)
root = tree.getroot()
for obj in root.findall('object'):
name = obj.find('name').text
if name in self.CLASSES:
valid_inds.append(i)
break
else:
valid_inds.append(i)
return valid_inds
def get_ann_info(self, idx):
"""Get annotation from XML file by index.
Args:
idx (int): Index of data.
Returns:
dict: Annotation info of specified index.
"""
img_id = self.data_infos[idx]['id']
xml_path = osp.join(self.img_prefix, 'Annotations', f'{img_id}.xml')
tree = ET.parse(xml_path)
root = tree.getroot()
bboxes = []
labels = []
bboxes_ignore = []
labels_ignore = []
for obj in root.findall('object'):
name = obj.find('name').text
if name not in self.CLASSES:
continue
label = self.cat2label[name]
difficult = int(obj.find('difficult').text)
bnd_box = obj.find('bndbox')
# TODO: check whether it is necessary to use int
# Coordinates may be float type
bbox = [
int(float(bnd_box.find('xmin').text)),
int(float(bnd_box.find('ymin').text)),
int(float(bnd_box.find('xmax').text)),
int(float(bnd_box.find('ymax').text))
]
ignore = False
if self.min_size:
assert not self.test_mode
w = bbox[2] - bbox[0]
h = bbox[3] - bbox[1]
if w < self.min_size or h < self.min_size:
ignore = True
if difficult or ignore:
bboxes_ignore.append(bbox)
labels_ignore.append(label)
else:
bboxes.append(bbox)
labels.append(label)
if not bboxes:
bboxes = np.zeros((0, 4))
labels = np.zeros((0, ))
else:
bboxes = np.array(bboxes, ndmin=2) - 1
labels = np.array(labels)
if not bboxes_ignore:
bboxes_ignore = np.zeros((0, 4))
labels_ignore = np.zeros((0, ))
else:
bboxes_ignore = np.array(bboxes_ignore, ndmin=2) - 1
labels_ignore = np.array(labels_ignore)
ann = dict(
bboxes=bboxes.astype(np.float32),
labels=labels.astype(np.int64),
bboxes_ignore=bboxes_ignore.astype(np.float32),
labels_ignore=labels_ignore.astype(np.int64))
return ann
def get_cat_ids(self, idx):
"""Get category ids in XML file by index.
Args:
idx (int): Index of data.
Returns:
list[int]: All categories in the image of specified index.
"""
cat_ids = []
img_id = self.data_infos[idx]['id']
xml_path = osp.join(self.img_prefix, 'Annotations', f'{img_id}.xml')
tree = ET.parse(xml_path)
root = tree.getroot()
for obj in root.findall('object'):
name = obj.find('name').text
if name not in self.CLASSES:
continue
label = self.cat2label[name]
cat_ids.append(label)
return cat_ids
from .backbones import * # noqa: F401,F403
from .builder import (BACKBONES, DETECTORS, HEADS, LOSSES, NECKS,
ROI_EXTRACTORS, SHARED_HEADS, build_backbone,
build_detector, build_head, build_loss, build_neck,
build_roi_extractor, build_shared_head)
from .dense_heads import * # noqa: F401,F403
from .detectors import * # noqa: F401,F403
from .losses import * # noqa: F401,F403
from .necks import * # noqa: F401,F403
from .roi_heads import * # noqa: F401,F403
__all__ = [
'BACKBONES', 'NECKS', 'ROI_EXTRACTORS', 'SHARED_HEADS', 'HEADS', 'LOSSES',
'DETECTORS', 'build_backbone', 'build_neck', 'build_roi_extractor',
'build_shared_head', 'build_head', 'build_loss', 'build_detector'
]
import torch
import torch.nn as nn
import torch.nn.functional as F
from functools import partial
import math
import warnings
from mmdet.utils import get_root_logger
from mmcv.runner import load_checkpoint
from ..builder import BACKBONES
_DEFAULT_SCALE_CLAMP = math.log(100000.0 / 16)
def _no_grad_trunc_normal_(tensor, mean, std, a, b):
# Cut & paste from PyTorch official master until it's in a few official releases - RW
# Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
def norm_cdf(x):
# Computes standard normal cumulative distribution function
return (1. + math.erf(x / math.sqrt(2.))) / 2.
if (mean < a - 2 * std) or (mean > b + 2 * std):
warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
"The distribution of values may be incorrect.",
stacklevel=2)
with torch.no_grad():
# Values are generated by using a truncated uniform distribution and
# then using the inverse CDF for the normal distribution.
# Get upper and lower cdf values
l = norm_cdf((a - mean) / std)
u = norm_cdf((b - mean) / std)
# Uniformly fill tensor with values from [l, u], then translate to
# [2l-1, 2u-1].
tensor.uniform_(2 * l - 1, 2 * u - 1)
# Use inverse cdf transform for normal distribution to get truncated
# standard normal
tensor.erfinv_()
# Transform to proper mean, std
tensor.mul_(std * math.sqrt(2.))
tensor.add_(mean)
# Clamp to ensure it's in the proper range
tensor.clamp_(min=a, max=b)
return tensor
def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
# type: (Tensor, float, float, float, float) -> Tensor
r"""Fills the input Tensor with values drawn from a truncated
normal distribution. The values are effectively drawn from the
normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
with values outside :math:`[a, b]` redrawn until they are within
the bounds. The method used for generating the random values works
best when :math:`a \leq \text{mean} \leq b`.
Args:
tensor: an n-dimensional `torch.Tensor`
mean: the mean of the normal distribution
std: the standard deviation of the normal distribution
a: the minimum cutoff value
b: the maximum cutoff value
Examples:
>>> w = torch.empty(3, 5)
>>> nn.init.trunc_normal_(w)
"""
return _no_grad_trunc_normal_(tensor, mean, std, a, b)
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
"""
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return self.drop_path(x, self.drop_prob, self.training)
def drop_path(self, x, drop_prob: float = 0., training: bool = False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.
"""
if drop_prob == 0. or not training:
return x
keep_prob = 1 - drop_prob
shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
random_tensor.floor_() # binarize
output = x.div(keep_prob) * random_tensor
return output
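# Illustrative check of the rescaling logic: surviving samples are divided
# by keep_prob, so in expectation drop_path leaves activations unchanged.
_dp = DropPath(drop_prob=0.25)
_dp.train()
_out = _dp(torch.ones(10000, 8))
# each row is either all zeros or all 1/0.75; the overall mean stays near 1
assert abs(_out.mean().item() - 1.0) < 0.05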
class Mlp(nn.Module):
def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
super().__init__()
out_features = out_features or in_features
hidden_features = hidden_features or in_features
self.fc1 = nn.Linear(in_features, hidden_features)
self.act = act_layer()
self.fc2 = nn.Linear(hidden_features, out_features)
self.drop = nn.Dropout(drop)
def forward(self, x):
x = self.fc1(x)
x = self.act(x)
x = self.drop(x)
x = self.fc2(x)
x = self.drop(x)
return x
class Attention(nn.Module):
def __init__(self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0., proj_drop=0.):
super().__init__()
self.num_heads = num_heads
head_dim = dim // num_heads
# NOTE scale factor was wrong in my original version, can set manually to be compat with prev weights
self.scale = qk_scale or head_dim ** -0.5
self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
self.attn_drop = nn.Dropout(attn_drop)
self.proj = nn.Linear(dim, dim)
self.proj_drop = nn.Dropout(proj_drop)
def forward(self, x):
B, N, C = x.shape
qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
q, k, v = qkv[0], qkv[1], qkv[2] # make torchscript happy (cannot use tensor as tuple)
attn = (q @ k.transpose(-2, -1)) * self.scale
attn = attn.softmax(dim=-1)
attn = self.attn_drop(attn)
x = (attn @ v).transpose(1, 2).reshape(B, N, C)
x = self.proj(x)
x = self.proj_drop(x)
return x
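# Illustrative shape check: Attention maps [B, N, C] -> [B, N, C]; inside,
# q, k and v each have shape [B, num_heads, N, C // num_heads], so `dim`
# must be divisible by `num_heads`.
_attn = Attention(dim=64, num_heads=8)
assert _attn(torch.randn(2, 197, 64)).shape == (2, 197, 64)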
class Block(nn.Module):
def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
drop_path=0., act_layer=nn.GELU, norm_layer=partial(nn.LayerNorm, eps=1e-6)):
super().__init__()
self.norm1 = norm_layer(dim)
self.attn = Attention(
dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale, attn_drop=attn_drop, proj_drop=drop)
# NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
self.norm2 = norm_layer(dim)
mlp_hidden_dim = int(dim * mlp_ratio)
self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
def forward(self, x):
x = x + self.drop_path(self.attn(self.norm1(x)))
x = x + self.drop_path(self.mlp(self.norm2(x)))
return x
class ConvBlock(nn.Module):
def __init__(self, inplanes, outplanes, stride=1, res_conv=False, act_layer=nn.ReLU, groups=1,
norm_layer=partial(nn.BatchNorm2d, eps=1e-6), drop_block=None, drop_path=None):
super(ConvBlock, self).__init__()
expansion = 4
med_planes = outplanes // expansion
self.conv1 = nn.Conv2d(inplanes, med_planes, kernel_size=1, stride=1, padding=0, bias=False)
self.bn1 = norm_layer(med_planes)
self.act1 = act_layer(inplace=True)
self.conv2 = nn.Conv2d(med_planes, med_planes, kernel_size=3, stride=stride, groups=groups, padding=1, bias=False)
self.bn2 = norm_layer(med_planes)
self.act2 = act_layer(inplace=True)
self.conv3 = nn.Conv2d(med_planes, outplanes, kernel_size=1, stride=1, padding=0, bias=False)
self.bn3 = norm_layer(outplanes)
self.act3 = act_layer(inplace=True)
if res_conv:
self.residual_conv = nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=stride, padding=0, bias=False)
self.residual_bn = norm_layer(outplanes)
self.res_conv = res_conv
self.drop_block = drop_block
self.drop_path = drop_path
def zero_init_last_bn(self):
nn.init.zeros_(self.bn3.weight)
def forward(self, x, x_t=None, return_x_2=True):
residual = x
x = self.conv1(x)
x = self.bn1(x)
if self.drop_block is not None:
x = self.drop_block(x)
x = self.act1(x)
x = self.conv2(x) if x_t is None else self.conv2(x + x_t)
x = self.bn2(x)
if self.drop_block is not None:
x = self.drop_block(x)
x2 = self.act2(x)
x = self.conv3(x2)
x = self.bn3(x)
if self.drop_block is not None:
x = self.drop_block(x)
if self.drop_path is not None:
x = self.drop_path(x)
if self.res_conv:
residual = self.residual_conv(residual)
residual = self.residual_bn(residual)
x += residual
x = self.act3(x)
if return_x_2:
return x, x2
else:
return x
class FCUDown(nn.Module):
""" CNN feature maps -> Transformer patch embeddings
"""
def __init__(self, inplanes, outplanes, dw_stride, act_layer=nn.GELU,
norm_layer=partial(nn.LayerNorm, eps=1e-6)):
super(FCUDown, self).__init__()
self.dw_stride = dw_stride
self.conv_project = nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=1, padding=0)
self.sample_pooling = nn.AvgPool2d(kernel_size=dw_stride, stride=dw_stride)
self.ln = norm_layer(outplanes)
self.act = act_layer()
def forward(self, x, x_t):
x = self.conv_project(x) # [N, C, H, W]
x = self.sample_pooling(x).flatten(2).transpose(1, 2)
x = self.ln(x)
x = self.act(x)
x = torch.cat([x_t[:, 0][:, None, :], x], dim=1)
return x
class FCUUp(nn.Module):
""" Transformer patch embeddings -> CNN feature maps
"""
def __init__(self, inplanes, outplanes, up_stride, act_layer=nn.ReLU,
norm_layer=partial(nn.BatchNorm2d, eps=1e-6),):
super(FCUUp, self).__init__()
self.up_stride = up_stride
self.conv_project = nn.Conv2d(inplanes, outplanes, kernel_size=1, stride=1, padding=0)
self.bn = norm_layer(outplanes)
self.act = act_layer()
def forward(self, x, H, W):
B, _, C = x.shape
# [N, 197, 384] -> [N, 196, 384] -> [N, 384, 196] -> [N, 384, 14, 14]
x_r = x[:, 1:].transpose(1, 2).reshape(B, C, H, W)
x_r = self.act(self.bn(self.conv_project(x_r)))
return F.interpolate(x_r, size=(H * self.up_stride, W * self.up_stride))
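# Illustrative round trip through the feature coupling units, assuming a
# 56x56 CNN map, dw_stride=4 and embed_dim=384: FCUDown pools the map to
# 14*14 = 196 patch tokens and prepends the cls token taken from x_t;
# FCUUp drops the cls token and upsamples the rest back to 56x56.
_down = FCUDown(inplanes=64, outplanes=384, dw_stride=4)
_up = FCUUp(inplanes=384, outplanes=64, up_stride=4)
_fmap = torch.randn(2, 64, 56, 56)
_tokens = torch.randn(2, 197, 384)  # 196 patches + 1 cls token
assert _down(_fmap, _tokens).shape == (2, 197, 384)
assert _up(_tokens, 14, 14).shape == (2, 64, 56, 56)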
class Med_ConvBlock(nn.Module):
""" special case for Convblock with down sampling,
"""
def __init__(self, inplanes, act_layer=nn.ReLU, groups=1, norm_layer=partial(nn.BatchNorm2d, eps=1e-6),
drop_block=None, drop_path=None):
super(Med_ConvBlock, self).__init__()
expansion = 4
med_planes = inplanes // expansion
self.conv1 = nn.Conv2d(inplanes, med_planes, kernel_size=1, stride=1, padding=0, bias=False)
self.bn1 = norm_layer(med_planes)
self.act1 = act_layer(inplace=True)
self.conv2 = nn.Conv2d(med_planes, med_planes, kernel_size=3, stride=1, groups=groups, padding=1, bias=False)
self.bn2 = norm_layer(med_planes)
self.act2 = act_layer(inplace=True)
self.conv3 = nn.Conv2d(med_planes, inplanes, kernel_size=1, stride=1, padding=0, bias=False)
self.bn3 = norm_layer(inplanes)
self.act3 = act_layer(inplace=True)
self.drop_block = drop_block
self.drop_path = drop_path
def zero_init_last_bn(self):
nn.init.zeros_(self.bn3.weight)
def forward(self, x):
residual = x
x = self.conv1(x)
x = self.bn1(x)
if self.drop_block is not None:
x = self.drop_block(x)
x = self.act1(x)
x = self.conv2(x)
x = self.bn2(x)
if self.drop_block is not None:
x = self.drop_block(x)
x = self.act2(x)
x = self.conv3(x)
x = self.bn3(x)
if self.drop_block is not None:
x = self.drop_block(x)
if self.drop_path is not None:
x = self.drop_path(x)
x += residual
x = self.act3(x)
return x
class ConvTransBlock(nn.Module):
"""
Basic module for Conformer, keep feature maps for CNN block and patch embeddings for transformer encoder block
"""
def __init__(self, inplanes, outplanes, res_conv, stride, dw_stride, embed_dim, num_heads=12, mlp_ratio=4.,
qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.,
last_fusion=False, num_med_block=0, groups=1):
super(ConvTransBlock, self).__init__()
expansion = 4
self.cnn_block = ConvBlock(inplanes=inplanes, outplanes=outplanes, res_conv=res_conv, stride=stride, groups=groups)
if last_fusion:
self.fusion_block = ConvBlock(inplanes=outplanes, outplanes=outplanes, stride=2, res_conv=True, groups=groups)
else:
self.fusion_block = ConvBlock(inplanes=outplanes, outplanes=outplanes, groups=groups)
if num_med_block > 0:
self.med_block = []
for i in range(num_med_block):
self.med_block.append(Med_ConvBlock(inplanes=outplanes, groups=groups))
self.med_block = nn.ModuleList(self.med_block)
self.squeeze_block = FCUDown(inplanes=outplanes // expansion, outplanes=embed_dim, dw_stride=dw_stride)
self.expand_block = FCUUp(inplanes=embed_dim, outplanes=outplanes // expansion, up_stride=dw_stride)
self.trans_block = Block(
dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop=drop_rate, attn_drop=attn_drop_rate, drop_path=drop_path_rate)
self.dw_stride = dw_stride
self.embed_dim = embed_dim
self.num_med_block = num_med_block
self.last_fusion = last_fusion
def forward(self, x, x_t):
x, x2 = self.cnn_block(x)
_, _, H, W = x2.shape
x_st = self.squeeze_block(x2, x_t)
x_t = self.trans_block(x_st + x_t)
if self.num_med_block > 0:
for m in self.med_block:
x = m(x)
x_t_r = self.expand_block(x_t, H // self.dw_stride, W // self.dw_stride)
x = self.fusion_block(x, x_t_r, return_x_2=False)
return x, x_t
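# Illustrative walkthrough of one coupling step, assuming stage-1 sizes
# (256 CNN channels, 56x56 maps, embed_dim=384): the CNN bottleneck runs
# first, FCUDown squeezes its mid feature map into the token sequence,
# the transformer Block updates x_t, and FCUUp expands the tokens back
# into a map that the fusion ConvBlock adds to the CNN path.
_blk = ConvTransBlock(inplanes=256, outplanes=256, res_conv=False, stride=1,
                      dw_stride=4, embed_dim=384, num_heads=6)
_x, _x_t = _blk(torch.randn(2, 256, 56, 56), torch.randn(2, 197, 384))
assert _x.shape == (2, 256, 56, 56) and _x_t.shape == (2, 197, 384)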
@BACKBONES.register_module()
class Conformer(nn.Module):
""" Vision Transformer with support for patch or hybrid CNN input stage
"""
def __init__(self, patch_size=16, in_chans=3, num_classes=1000, base_channel=64, channel_ratio=4, num_med_block=0,
embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0.,
attn_drop_rate=0., drop_path_rate=0., norm_eval=True, frozen_stages=1, return_cls_token=False):
# Transformer
super().__init__()
self.num_classes = num_classes
self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
self.return_cls_token = return_cls_token
self.norm_eval = norm_eval
self.frozen_stages = frozen_stages
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
self.trans_dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
# Classifiers
if self.return_cls_token:
self.trans_norm = nn.LayerNorm(embed_dim)
self.trans_cls_head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
# self.pooling = nn.AdaptiveAvgPool2d(1)
# self.conv_cls_head = nn.Linear(1024, num_classes)
# Stem stage: compute feature maps with a conv block (copied from resnet.py)
self.conv1 = nn.Conv2d(in_chans, 64, kernel_size=7, stride=2, padding=3, bias=False) # 1 / 2 [112, 112]
self.bn1 = nn.BatchNorm2d(64)
self.act1 = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) # 1 / 4 [56, 56]
# 1 stage
stage_1_channel = int(base_channel * channel_ratio)
trans_dw_stride = patch_size // 4
self.conv_1 = ConvBlock(inplanes=64, outplanes=stage_1_channel, res_conv=True, stride=1)
self.trans_patch_conv = nn.Conv2d(64, embed_dim, kernel_size=trans_dw_stride, stride=trans_dw_stride, padding=0)
self.trans_1 = Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias,
qk_scale=qk_scale, drop=drop_rate, attn_drop=attn_drop_rate, drop_path=self.trans_dpr[0],
)
# 2~4 stage
init_stage = 2
fin_stage = depth // 3 + 1
for i in range(init_stage, fin_stage):
self.add_module('conv_trans_' + str(i),
ConvTransBlock(
stage_1_channel, stage_1_channel, False, 1, dw_stride=trans_dw_stride,
embed_dim=embed_dim,
num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop_rate=drop_rate, attn_drop_rate=attn_drop_rate,
drop_path_rate=self.trans_dpr[i - 1],
num_med_block=num_med_block
)
)
stage_2_channel = int(base_channel * channel_ratio * 2)
# 5~8 stage
init_stage = fin_stage # 5
fin_stage = fin_stage + depth // 3 # 9
for i in range(init_stage, fin_stage):
s = 2 if i == init_stage else 1
in_channel = stage_1_channel if i == init_stage else stage_2_channel
res_conv = (i == init_stage)
self.add_module('conv_trans_' + str(i),
ConvTransBlock(
in_channel, stage_2_channel, res_conv, s, dw_stride=trans_dw_stride // 2,
embed_dim=embed_dim,
num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop_rate=drop_rate, attn_drop_rate=attn_drop_rate,
drop_path_rate=self.trans_dpr[i - 1],
num_med_block=num_med_block
)
)
stage_3_channel = int(base_channel * channel_ratio * 2 * 2)
# 9~12 stage
init_stage = fin_stage # 9
fin_stage = fin_stage + depth // 3 # 13
for i in range(init_stage, fin_stage):
s = 2 if i == init_stage else 1
in_channel = stage_2_channel if i == init_stage else stage_3_channel
res_conv = (i == init_stage)
last_fusion = (i == depth)
self.add_module('conv_trans_' + str(i),
ConvTransBlock(
in_channel, stage_3_channel, res_conv, s, dw_stride=trans_dw_stride // 4,
embed_dim=embed_dim,
num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
drop_rate=drop_rate, attn_drop_rate=attn_drop_rate,
drop_path_rate=self.trans_dpr[i - 1],
num_med_block=num_med_block, last_fusion=last_fusion
)
)
self.fin_stage = fin_stage
trunc_normal_(self.cls_token, std=.02)
def _init_weights(self, m):
if isinstance(m, nn.Linear):
trunc_normal_(m.weight, std=.02)
if isinstance(m, nn.Linear) and m.bias is not None:
nn.init.constant_(m.bias, 0)
elif isinstance(m, nn.LayerNorm):
nn.init.constant_(m.bias, 0)
nn.init.constant_(m.weight, 1.0)
elif isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, nn.BatchNorm2d):
nn.init.constant_(m.weight, 1.)
nn.init.constant_(m.bias, 0.)
def init_weights(self, pretrained=None):
if isinstance(pretrained, str):
logger = get_root_logger()
load_checkpoint(self, pretrained, strict=False, logger=logger, map_location='cpu')
elif pretrained is None:
self.apply(self._init_weights)
else:
raise TypeError('pretrained must be a str or None')
@torch.jit.ignore
def no_weight_decay(self):
return {'cls_token',}
def forward(self, x):
output = []
B = x.shape[0]
cls_tokens = self.cls_token.expand(B, -1, -1)
# stem
x_base = self.maxpool(self.act1(self.bn1(self.conv1(x))))
# stage 1: [N, 64, 56, 56] -> [N, stage_1_channel, 56, 56]
x = self.conv_1(x_base, return_x_2=False)
x_t = self.trans_patch_conv(x_base).flatten(2).transpose(1, 2)
x_t = torch.cat([cls_tokens, x_t], dim=1)
x_t = self.trans_1(x_t)
# 2 ~ final
for i in range(2, self.fin_stage):
x, x_t = getattr(self, 'conv_trans_' + str(i))(x, x_t)
if i in [4, 8, 11, 12]:
output.append(x)
if self.return_cls_token:
return tuple(output), self.trans_cls_head(self.trans_norm(x_t[:, [0,]]))
else:
return tuple(output)
def _freeze_stages(self):
if self.frozen_stages >= 0:
self.bn1.eval()
for m in [self.conv1, self.bn1]:
for param in m.parameters():
param.requires_grad = False
def freeze_bn(self, m):
if isinstance(m, nn.BatchNorm2d):
m.eval()
def train(self, mode=True):
"""Convert the model into training mode while keep normalization layer
freezed."""
super(Conformer, self).train(mode)
self._freeze_stages()
if mode and self.norm_eval:
self.apply(self.freeze_bn)
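# Illustrative forward pass with the default configuration (patch_size=16,
# base_channel=64, channel_ratio=4, depth=12): the CNN branch produces
# 256/512/1024-channel maps, collected after blocks 4, 8, 11 and 12, i.e.
# at strides 4, 8, 16 and 32 for a 224x224 input.
_model = Conformer()
_model.init_weights()
_model.eval()
with torch.no_grad():
    _outs = _model(torch.randn(1, 3, 224, 224))
assert [tuple(o.shape) for o in _outs] == [
    (1, 256, 56, 56), (1, 512, 28, 28), (1, 1024, 14, 14), (1, 1024, 7, 7)]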
from .darknet import Darknet
from .detectors_resnet import DetectoRS_ResNet
from .detectors_resnext import DetectoRS_ResNeXt
from .hourglass import HourglassNet
from .hrnet import HRNet
from .regnet import RegNet
from .res2net import Res2Net
from .resnest import ResNeSt
from .resnet import ResNet, ResNetV1d
from .resnext import ResNeXt
from .ssd_vgg import SSDVGG
from .trident_resnet import TridentResNet
from .Conformer import Conformer
__all__ = [
'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SSDVGG', 'HRNet', 'Res2Net',
'HourglassNet', 'DetectoRS_ResNet', 'DetectoRS_ResNeXt', 'Darknet',
'ResNeSt', 'TridentResNet', 'Conformer'
]
# Copyright (c) 2019 Western Digital Corporation or its affiliates.
import logging
import torch.nn as nn
from mmcv.cnn import ConvModule, constant_init, kaiming_init
from mmcv.runner import load_checkpoint
from torch.nn.modules.batchnorm import _BatchNorm
from ..builder import BACKBONES
class ResBlock(nn.Module):
"""The basic residual block used in Darknet. Each ResBlock consists of two
ConvModules and the input is added to the final output. Each ConvModule is
composed of Conv, BN, and LeakyReLU. In YoloV3 paper, the first convLayer
has half of the number of the filters as much as the second convLayer. The
first convLayer has filter size of 1x1 and the second one has the filter
size of 3x3.
Args:
in_channels (int): The input channels. Must be even.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN', requires_grad=True)
act_cfg (dict): Config dict for activation layer.
Default: dict(type='LeakyReLU', negative_slope=0.1).
"""
def __init__(self,
in_channels,
conv_cfg=None,
norm_cfg=dict(type='BN', requires_grad=True),
act_cfg=dict(type='LeakyReLU', negative_slope=0.1)):
super(ResBlock, self).__init__()
assert in_channels % 2 == 0 # ensure the in_channels is even
half_in_channels = in_channels // 2
# shortcut
cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
self.conv1 = ConvModule(in_channels, half_in_channels, 1, **cfg)
self.conv2 = ConvModule(
half_in_channels, in_channels, 3, padding=1, **cfg)
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.conv2(out)
out = out + residual
return out
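# Illustrative sanity check (requires mmcv's ConvModule): a ResBlock is
# shape-preserving; it squeezes to in_channels // 2 with a 1x1 conv, then
# expands back with a 3x3 conv before the residual addition.
import torch
_block = ResBlock(in_channels=64)
assert _block(torch.randn(1, 64, 52, 52)).shape == (1, 64, 52, 52)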
@BACKBONES.register_module()
class Darknet(nn.Module):
"""Darknet backbone.
Args:
depth (int): Depth of Darknet. Currently only support 53.
out_indices (Sequence[int]): Output from which stages.
frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
-1 means not freezing any parameters. Default: -1.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN', requires_grad=True)
act_cfg (dict): Config dict for activation layer.
Default: dict(type='LeakyReLU', negative_slope=0.1).
norm_eval (bool): Whether to set norm layers to eval mode, namely,
freeze running stats (mean and var). Note: Effect on Batch Norm
and its variants only.
Example:
>>> from mmdet.models import Darknet
>>> import torch
>>> self = Darknet(depth=53)
>>> self.eval()
>>> inputs = torch.rand(1, 3, 416, 416)
>>> level_outputs = self.forward(inputs)
>>> for level_out in level_outputs:
... print(tuple(level_out.shape))
...
(1, 256, 52, 52)
(1, 512, 26, 26)
(1, 1024, 13, 13)
"""
# Dict(depth: (layers, channels))
arch_settings = {
53: ((1, 2, 8, 8, 4), ((32, 64), (64, 128), (128, 256), (256, 512),
(512, 1024)))
}
def __init__(self,
depth=53,
out_indices=(3, 4, 5),
frozen_stages=-1,
conv_cfg=None,
norm_cfg=dict(type='BN', requires_grad=True),
act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
norm_eval=True):
super(Darknet, self).__init__()
if depth not in self.arch_settings:
raise KeyError(f'invalid depth {depth} for darknet')
self.depth = depth
self.out_indices = out_indices
self.frozen_stages = frozen_stages
self.layers, self.channels = self.arch_settings[depth]
cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
self.conv1 = ConvModule(3, 32, 3, padding=1, **cfg)
self.cr_blocks = ['conv1']
for i, n_layers in enumerate(self.layers):
layer_name = f'conv_res_block{i + 1}'
in_c, out_c = self.channels[i]
self.add_module(
layer_name,
self.make_conv_res_block(in_c, out_c, n_layers, **cfg))
self.cr_blocks.append(layer_name)
self.norm_eval = norm_eval
def forward(self, x):
outs = []
for i, layer_name in enumerate(self.cr_blocks):
cr_block = getattr(self, layer_name)
x = cr_block(x)
if i in self.out_indices:
outs.append(x)
return tuple(outs)
def init_weights(self, pretrained=None):
if isinstance(pretrained, str):
logger = logging.getLogger()
load_checkpoint(self, pretrained, strict=False, logger=logger)
elif pretrained is None:
for m in self.modules():
if isinstance(m, nn.Conv2d):
kaiming_init(m)
elif isinstance(m, (_BatchNorm, nn.GroupNorm)):
constant_init(m, 1)
else:
raise TypeError('pretrained must be a str or None')
def _freeze_stages(self):
if self.frozen_stages >= 0:
for i in range(self.frozen_stages):
m = getattr(self, self.cr_blocks[i])
m.eval()
for param in m.parameters():
param.requires_grad = False
def train(self, mode=True):
super(Darknet, self).train(mode)
self._freeze_stages()
if mode and self.norm_eval:
for m in self.modules():
if isinstance(m, _BatchNorm):
m.eval()
@staticmethod
def make_conv_res_block(in_channels,
out_channels,
res_repeat,
conv_cfg=None,
norm_cfg=dict(type='BN', requires_grad=True),
act_cfg=dict(type='LeakyReLU',
negative_slope=0.1)):
"""In Darknet backbone, ConvLayer is usually followed by ResBlock. This
function will make that. The Conv layers always have 3x3 filters with
stride=2. The number of the filters in Conv layer is the same as the
out channels of the ResBlock.
Args:
in_channels (int): The number of input channels.
out_channels (int): The number of output channels.
res_repeat (int): The number of ResBlocks.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN', requires_grad=True)
act_cfg (dict): Config dict for activation layer.
Default: dict(type='LeakyReLU', negative_slope=0.1).
"""
cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
model = nn.Sequential()
model.add_module(
'conv',
ConvModule(
in_channels, out_channels, 3, stride=2, padding=1, **cfg))
for idx in range(res_repeat):
model.add_module('res{}'.format(idx),
ResBlock(out_channels, **cfg))
return model
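# Illustrative usage (requires mmcv): each conv-res block halves the
# spatial size with its stride-2 conv, which is how Darknet-53 reaches
# the 52/26/13 grids shown in the class docstring for a 416x416 input.
import torch
_stage = Darknet.make_conv_res_block(32, 64, res_repeat=1)
assert _stage(torch.randn(1, 32, 416, 416)).shape == (1, 64, 208, 208)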