Commit bf491463 authored by limm

add v0.19.1 release

parent e17f5ea2
{"0": ["n01440764", "tench"], "1": ["n01443537", "goldfish"], "2": ["n01484850", "great_white_shark"], "3": ["n01491361", "tiger_shark"], "4": ["n01494475", "hammerhead"], "5": ["n01496331", "electric_ray"], "6": ["n01498041", "stingray"], "7": ["n01514668", "cock"], "8": ["n01514859", "hen"], "9": ["n01518878", "ostrich"], "10": ["n01530575", "brambling"], "11": ["n01531178", "goldfinch"], "12": ["n01532829", "house_finch"], "13": ["n01534433", "junco"], "14": ["n01537544", "indigo_bunting"], "15": ["n01558993", "robin"], "16": ["n01560419", "bulbul"], "17": ["n01580077", "jay"], "18": ["n01582220", "magpie"], "19": ["n01592084", "chickadee"], "20": ["n01601694", "water_ouzel"], "21": ["n01608432", "kite"], "22": ["n01614925", "bald_eagle"], "23": ["n01616318", "vulture"], "24": ["n01622779", "great_grey_owl"], "25": ["n01629819", "European_fire_salamander"], "26": ["n01630670", "common_newt"], "27": ["n01631663", "eft"], "28": ["n01632458", "spotted_salamander"], "29": ["n01632777", "axolotl"], "30": ["n01641577", "bullfrog"], "31": ["n01644373", "tree_frog"], "32": ["n01644900", "tailed_frog"], "33": ["n01664065", "loggerhead"], "34": ["n01665541", "leatherback_turtle"], "35": ["n01667114", "mud_turtle"], "36": ["n01667778", "terrapin"], "37": ["n01669191", "box_turtle"], "38": ["n01675722", "banded_gecko"], "39": ["n01677366", "common_iguana"], "40": ["n01682714", "American_chameleon"], "41": ["n01685808", "whiptail"], "42": ["n01687978", "agama"], "43": ["n01688243", "frilled_lizard"], "44": ["n01689811", "alligator_lizard"], "45": ["n01692333", "Gila_monster"], "46": ["n01693334", "green_lizard"], "47": ["n01694178", "African_chameleon"], "48": ["n01695060", "Komodo_dragon"], "49": ["n01697457", "African_crocodile"], "50": ["n01698640", "American_alligator"], "51": ["n01704323", "triceratops"], "52": ["n01728572", "thunder_snake"], "53": ["n01728920", "ringneck_snake"], "54": ["n01729322", "hognose_snake"], "55": ["n01729977", "green_snake"], "56": ["n01734418", "king_snake"], "57": ["n01735189", "garter_snake"], "58": ["n01737021", "water_snake"], "59": ["n01739381", "vine_snake"], "60": ["n01740131", "night_snake"], "61": ["n01742172", "boa_constrictor"], "62": ["n01744401", "rock_python"], "63": ["n01748264", "Indian_cobra"], "64": ["n01749939", "green_mamba"], "65": ["n01751748", "sea_snake"], "66": ["n01753488", "horned_viper"], "67": ["n01755581", "diamondback"], "68": ["n01756291", "sidewinder"], "69": ["n01768244", "trilobite"], "70": ["n01770081", "harvestman"], "71": ["n01770393", "scorpion"], "72": ["n01773157", "black_and_gold_garden_spider"], "73": ["n01773549", "barn_spider"], "74": ["n01773797", "garden_spider"], "75": ["n01774384", "black_widow"], "76": ["n01774750", "tarantula"], "77": ["n01775062", "wolf_spider"], "78": ["n01776313", "tick"], "79": ["n01784675", "centipede"], "80": ["n01795545", "black_grouse"], "81": ["n01796340", "ptarmigan"], "82": ["n01797886", "ruffed_grouse"], "83": ["n01798484", "prairie_chicken"], "84": ["n01806143", "peacock"], "85": ["n01806567", "quail"], "86": ["n01807496", "partridge"], "87": ["n01817953", "African_grey"], "88": ["n01818515", "macaw"], "89": ["n01819313", "sulphur-crested_cockatoo"], "90": ["n01820546", "lorikeet"], "91": ["n01824575", "coucal"], "92": ["n01828970", "bee_eater"], "93": ["n01829413", "hornbill"], "94": ["n01833805", "hummingbird"], "95": ["n01843065", "jacamar"], "96": ["n01843383", "toucan"], "97": ["n01847000", "drake"], "98": ["n01855032", "red-breasted_merganser"], "99": ["n01855672", "goose"], 
"100": ["n01860187", "black_swan"], "101": ["n01871265", "tusker"], "102": ["n01872401", "echidna"], "103": ["n01873310", "platypus"], "104": ["n01877812", "wallaby"], "105": ["n01882714", "koala"], "106": ["n01883070", "wombat"], "107": ["n01910747", "jellyfish"], "108": ["n01914609", "sea_anemone"], "109": ["n01917289", "brain_coral"], "110": ["n01924916", "flatworm"], "111": ["n01930112", "nematode"], "112": ["n01943899", "conch"], "113": ["n01944390", "snail"], "114": ["n01945685", "slug"], "115": ["n01950731", "sea_slug"], "116": ["n01955084", "chiton"], "117": ["n01968897", "chambered_nautilus"], "118": ["n01978287", "Dungeness_crab"], "119": ["n01978455", "rock_crab"], "120": ["n01980166", "fiddler_crab"], "121": ["n01981276", "king_crab"], "122": ["n01983481", "American_lobster"], "123": ["n01984695", "spiny_lobster"], "124": ["n01985128", "crayfish"], "125": ["n01986214", "hermit_crab"], "126": ["n01990800", "isopod"], "127": ["n02002556", "white_stork"], "128": ["n02002724", "black_stork"], "129": ["n02006656", "spoonbill"], "130": ["n02007558", "flamingo"], "131": ["n02009229", "little_blue_heron"], "132": ["n02009912", "American_egret"], "133": ["n02011460", "bittern"], "134": ["n02012849", "crane"], "135": ["n02013706", "limpkin"], "136": ["n02017213", "European_gallinule"], "137": ["n02018207", "American_coot"], "138": ["n02018795", "bustard"], "139": ["n02025239", "ruddy_turnstone"], "140": ["n02027492", "red-backed_sandpiper"], "141": ["n02028035", "redshank"], "142": ["n02033041", "dowitcher"], "143": ["n02037110", "oystercatcher"], "144": ["n02051845", "pelican"], "145": ["n02056570", "king_penguin"], "146": ["n02058221", "albatross"], "147": ["n02066245", "grey_whale"], "148": ["n02071294", "killer_whale"], "149": ["n02074367", "dugong"], "150": ["n02077923", "sea_lion"], "151": ["n02085620", "Chihuahua"], "152": ["n02085782", "Japanese_spaniel"], "153": ["n02085936", "Maltese_dog"], "154": ["n02086079", "Pekinese"], "155": ["n02086240", "Shih-Tzu"], "156": ["n02086646", "Blenheim_spaniel"], "157": ["n02086910", "papillon"], "158": ["n02087046", "toy_terrier"], "159": ["n02087394", "Rhodesian_ridgeback"], "160": ["n02088094", "Afghan_hound"], "161": ["n02088238", "basset"], "162": ["n02088364", "beagle"], "163": ["n02088466", "bloodhound"], "164": ["n02088632", "bluetick"], "165": ["n02089078", "black-and-tan_coonhound"], "166": ["n02089867", "Walker_hound"], "167": ["n02089973", "English_foxhound"], "168": ["n02090379", "redbone"], "169": ["n02090622", "borzoi"], "170": ["n02090721", "Irish_wolfhound"], "171": ["n02091032", "Italian_greyhound"], "172": ["n02091134", "whippet"], "173": ["n02091244", "Ibizan_hound"], "174": ["n02091467", "Norwegian_elkhound"], "175": ["n02091635", "otterhound"], "176": ["n02091831", "Saluki"], "177": ["n02092002", "Scottish_deerhound"], "178": ["n02092339", "Weimaraner"], "179": ["n02093256", "Staffordshire_bullterrier"], "180": ["n02093428", "American_Staffordshire_terrier"], "181": ["n02093647", "Bedlington_terrier"], "182": ["n02093754", "Border_terrier"], "183": ["n02093859", "Kerry_blue_terrier"], "184": ["n02093991", "Irish_terrier"], "185": ["n02094114", "Norfolk_terrier"], "186": ["n02094258", "Norwich_terrier"], "187": ["n02094433", "Yorkshire_terrier"], "188": ["n02095314", "wire-haired_fox_terrier"], "189": ["n02095570", "Lakeland_terrier"], "190": ["n02095889", "Sealyham_terrier"], "191": ["n02096051", "Airedale"], "192": ["n02096177", "cairn"], "193": ["n02096294", "Australian_terrier"], "194": ["n02096437", 
"Dandie_Dinmont"], "195": ["n02096585", "Boston_bull"], "196": ["n02097047", "miniature_schnauzer"], "197": ["n02097130", "giant_schnauzer"], "198": ["n02097209", "standard_schnauzer"], "199": ["n02097298", "Scotch_terrier"], "200": ["n02097474", "Tibetan_terrier"], "201": ["n02097658", "silky_terrier"], "202": ["n02098105", "soft-coated_wheaten_terrier"], "203": ["n02098286", "West_Highland_white_terrier"], "204": ["n02098413", "Lhasa"], "205": ["n02099267", "flat-coated_retriever"], "206": ["n02099429", "curly-coated_retriever"], "207": ["n02099601", "golden_retriever"], "208": ["n02099712", "Labrador_retriever"], "209": ["n02099849", "Chesapeake_Bay_retriever"], "210": ["n02100236", "German_short-haired_pointer"], "211": ["n02100583", "vizsla"], "212": ["n02100735", "English_setter"], "213": ["n02100877", "Irish_setter"], "214": ["n02101006", "Gordon_setter"], "215": ["n02101388", "Brittany_spaniel"], "216": ["n02101556", "clumber"], "217": ["n02102040", "English_springer"], "218": ["n02102177", "Welsh_springer_spaniel"], "219": ["n02102318", "cocker_spaniel"], "220": ["n02102480", "Sussex_spaniel"], "221": ["n02102973", "Irish_water_spaniel"], "222": ["n02104029", "kuvasz"], "223": ["n02104365", "schipperke"], "224": ["n02105056", "groenendael"], "225": ["n02105162", "malinois"], "226": ["n02105251", "briard"], "227": ["n02105412", "kelpie"], "228": ["n02105505", "komondor"], "229": ["n02105641", "Old_English_sheepdog"], "230": ["n02105855", "Shetland_sheepdog"], "231": ["n02106030", "collie"], "232": ["n02106166", "Border_collie"], "233": ["n02106382", "Bouvier_des_Flandres"], "234": ["n02106550", "Rottweiler"], "235": ["n02106662", "German_shepherd"], "236": ["n02107142", "Doberman"], "237": ["n02107312", "miniature_pinscher"], "238": ["n02107574", "Greater_Swiss_Mountain_dog"], "239": ["n02107683", "Bernese_mountain_dog"], "240": ["n02107908", "Appenzeller"], "241": ["n02108000", "EntleBucher"], "242": ["n02108089", "boxer"], "243": ["n02108422", "bull_mastiff"], "244": ["n02108551", "Tibetan_mastiff"], "245": ["n02108915", "French_bulldog"], "246": ["n02109047", "Great_Dane"], "247": ["n02109525", "Saint_Bernard"], "248": ["n02109961", "Eskimo_dog"], "249": ["n02110063", "malamute"], "250": ["n02110185", "Siberian_husky"], "251": ["n02110341", "dalmatian"], "252": ["n02110627", "affenpinscher"], "253": ["n02110806", "basenji"], "254": ["n02110958", "pug"], "255": ["n02111129", "Leonberg"], "256": ["n02111277", "Newfoundland"], "257": ["n02111500", "Great_Pyrenees"], "258": ["n02111889", "Samoyed"], "259": ["n02112018", "Pomeranian"], "260": ["n02112137", "chow"], "261": ["n02112350", "keeshond"], "262": ["n02112706", "Brabancon_griffon"], "263": ["n02113023", "Pembroke"], "264": ["n02113186", "Cardigan"], "265": ["n02113624", "toy_poodle"], "266": ["n02113712", "miniature_poodle"], "267": ["n02113799", "standard_poodle"], "268": ["n02113978", "Mexican_hairless"], "269": ["n02114367", "timber_wolf"], "270": ["n02114548", "white_wolf"], "271": ["n02114712", "red_wolf"], "272": ["n02114855", "coyote"], "273": ["n02115641", "dingo"], "274": ["n02115913", "dhole"], "275": ["n02116738", "African_hunting_dog"], "276": ["n02117135", "hyena"], "277": ["n02119022", "red_fox"], "278": ["n02119789", "kit_fox"], "279": ["n02120079", "Arctic_fox"], "280": ["n02120505", "grey_fox"], "281": ["n02123045", "tabby"], "282": ["n02123159", "tiger_cat"], "283": ["n02123394", "Persian_cat"], "284": ["n02123597", "Siamese_cat"], "285": ["n02124075", "Egyptian_cat"], "286": ["n02125311", "cougar"], "287": 
["n02127052", "lynx"], "288": ["n02128385", "leopard"], "289": ["n02128757", "snow_leopard"], "290": ["n02128925", "jaguar"], "291": ["n02129165", "lion"], "292": ["n02129604", "tiger"], "293": ["n02130308", "cheetah"], "294": ["n02132136", "brown_bear"], "295": ["n02133161", "American_black_bear"], "296": ["n02134084", "ice_bear"], "297": ["n02134418", "sloth_bear"], "298": ["n02137549", "mongoose"], "299": ["n02138441", "meerkat"], "300": ["n02165105", "tiger_beetle"], "301": ["n02165456", "ladybug"], "302": ["n02167151", "ground_beetle"], "303": ["n02168699", "long-horned_beetle"], "304": ["n02169497", "leaf_beetle"], "305": ["n02172182", "dung_beetle"], "306": ["n02174001", "rhinoceros_beetle"], "307": ["n02177972", "weevil"], "308": ["n02190166", "fly"], "309": ["n02206856", "bee"], "310": ["n02219486", "ant"], "311": ["n02226429", "grasshopper"], "312": ["n02229544", "cricket"], "313": ["n02231487", "walking_stick"], "314": ["n02233338", "cockroach"], "315": ["n02236044", "mantis"], "316": ["n02256656", "cicada"], "317": ["n02259212", "leafhopper"], "318": ["n02264363", "lacewing"], "319": ["n02268443", "dragonfly"], "320": ["n02268853", "damselfly"], "321": ["n02276258", "admiral"], "322": ["n02277742", "ringlet"], "323": ["n02279972", "monarch"], "324": ["n02280649", "cabbage_butterfly"], "325": ["n02281406", "sulphur_butterfly"], "326": ["n02281787", "lycaenid"], "327": ["n02317335", "starfish"], "328": ["n02319095", "sea_urchin"], "329": ["n02321529", "sea_cucumber"], "330": ["n02325366", "wood_rabbit"], "331": ["n02326432", "hare"], "332": ["n02328150", "Angora"], "333": ["n02342885", "hamster"], "334": ["n02346627", "porcupine"], "335": ["n02356798", "fox_squirrel"], "336": ["n02361337", "marmot"], "337": ["n02363005", "beaver"], "338": ["n02364673", "guinea_pig"], "339": ["n02389026", "sorrel"], "340": ["n02391049", "zebra"], "341": ["n02395406", "hog"], "342": ["n02396427", "wild_boar"], "343": ["n02397096", "warthog"], "344": ["n02398521", "hippopotamus"], "345": ["n02403003", "ox"], "346": ["n02408429", "water_buffalo"], "347": ["n02410509", "bison"], "348": ["n02412080", "ram"], "349": ["n02415577", "bighorn"], "350": ["n02417914", "ibex"], "351": ["n02422106", "hartebeest"], "352": ["n02422699", "impala"], "353": ["n02423022", "gazelle"], "354": ["n02437312", "Arabian_camel"], "355": ["n02437616", "llama"], "356": ["n02441942", "weasel"], "357": ["n02442845", "mink"], "358": ["n02443114", "polecat"], "359": ["n02443484", "black-footed_ferret"], "360": ["n02444819", "otter"], "361": ["n02445715", "skunk"], "362": ["n02447366", "badger"], "363": ["n02454379", "armadillo"], "364": ["n02457408", "three-toed_sloth"], "365": ["n02480495", "orangutan"], "366": ["n02480855", "gorilla"], "367": ["n02481823", "chimpanzee"], "368": ["n02483362", "gibbon"], "369": ["n02483708", "siamang"], "370": ["n02484975", "guenon"], "371": ["n02486261", "patas"], "372": ["n02486410", "baboon"], "373": ["n02487347", "macaque"], "374": ["n02488291", "langur"], "375": ["n02488702", "colobus"], "376": ["n02489166", "proboscis_monkey"], "377": ["n02490219", "marmoset"], "378": ["n02492035", "capuchin"], "379": ["n02492660", "howler_monkey"], "380": ["n02493509", "titi"], "381": ["n02493793", "spider_monkey"], "382": ["n02494079", "squirrel_monkey"], "383": ["n02497673", "Madagascar_cat"], "384": ["n02500267", "indri"], "385": ["n02504013", "Indian_elephant"], "386": ["n02504458", "African_elephant"], "387": ["n02509815", "lesser_panda"], "388": ["n02510455", "giant_panda"], "389": ["n02514041", 
"barracouta"], "390": ["n02526121", "eel"], "391": ["n02536864", "coho"], "392": ["n02606052", "rock_beauty"], "393": ["n02607072", "anemone_fish"], "394": ["n02640242", "sturgeon"], "395": ["n02641379", "gar"], "396": ["n02643566", "lionfish"], "397": ["n02655020", "puffer"], "398": ["n02666196", "abacus"], "399": ["n02667093", "abaya"], "400": ["n02669723", "academic_gown"], "401": ["n02672831", "accordion"], "402": ["n02676566", "acoustic_guitar"], "403": ["n02687172", "aircraft_carrier"], "404": ["n02690373", "airliner"], "405": ["n02692877", "airship"], "406": ["n02699494", "altar"], "407": ["n02701002", "ambulance"], "408": ["n02704792", "amphibian"], "409": ["n02708093", "analog_clock"], "410": ["n02727426", "apiary"], "411": ["n02730930", "apron"], "412": ["n02747177", "ashcan"], "413": ["n02749479", "assault_rifle"], "414": ["n02769748", "backpack"], "415": ["n02776631", "bakery"], "416": ["n02777292", "balance_beam"], "417": ["n02782093", "balloon"], "418": ["n02783161", "ballpoint"], "419": ["n02786058", "Band_Aid"], "420": ["n02787622", "banjo"], "421": ["n02788148", "bannister"], "422": ["n02790996", "barbell"], "423": ["n02791124", "barber_chair"], "424": ["n02791270", "barbershop"], "425": ["n02793495", "barn"], "426": ["n02794156", "barometer"], "427": ["n02795169", "barrel"], "428": ["n02797295", "barrow"], "429": ["n02799071", "baseball"], "430": ["n02802426", "basketball"], "431": ["n02804414", "bassinet"], "432": ["n02804610", "bassoon"], "433": ["n02807133", "bathing_cap"], "434": ["n02808304", "bath_towel"], "435": ["n02808440", "bathtub"], "436": ["n02814533", "beach_wagon"], "437": ["n02814860", "beacon"], "438": ["n02815834", "beaker"], "439": ["n02817516", "bearskin"], "440": ["n02823428", "beer_bottle"], "441": ["n02823750", "beer_glass"], "442": ["n02825657", "bell_cote"], "443": ["n02834397", "bib"], "444": ["n02835271", "bicycle-built-for-two"], "445": ["n02837789", "bikini"], "446": ["n02840245", "binder"], "447": ["n02841315", "binoculars"], "448": ["n02843684", "birdhouse"], "449": ["n02859443", "boathouse"], "450": ["n02860847", "bobsled"], "451": ["n02865351", "bolo_tie"], "452": ["n02869837", "bonnet"], "453": ["n02870880", "bookcase"], "454": ["n02871525", "bookshop"], "455": ["n02877765", "bottlecap"], "456": ["n02879718", "bow"], "457": ["n02883205", "bow_tie"], "458": ["n02892201", "brass"], "459": ["n02892767", "brassiere"], "460": ["n02894605", "breakwater"], "461": ["n02895154", "breastplate"], "462": ["n02906734", "broom"], "463": ["n02909870", "bucket"], "464": ["n02910353", "buckle"], "465": ["n02916936", "bulletproof_vest"], "466": ["n02917067", "bullet_train"], "467": ["n02927161", "butcher_shop"], "468": ["n02930766", "cab"], "469": ["n02939185", "caldron"], "470": ["n02948072", "candle"], "471": ["n02950826", "cannon"], "472": ["n02951358", "canoe"], "473": ["n02951585", "can_opener"], "474": ["n02963159", "cardigan"], "475": ["n02965783", "car_mirror"], "476": ["n02966193", "carousel"], "477": ["n02966687", "carpenter's_kit"], "478": ["n02971356", "carton"], "479": ["n02974003", "car_wheel"], "480": ["n02977058", "cash_machine"], "481": ["n02978881", "cassette"], "482": ["n02979186", "cassette_player"], "483": ["n02980441", "castle"], "484": ["n02981792", "catamaran"], "485": ["n02988304", "CD_player"], "486": ["n02992211", "cello"], "487": ["n02992529", "cellular_telephone"], "488": ["n02999410", "chain"], "489": ["n03000134", "chainlink_fence"], "490": ["n03000247", "chain_mail"], "491": ["n03000684", "chain_saw"], "492": ["n03014705", 
"chest"], "493": ["n03016953", "chiffonier"], "494": ["n03017168", "chime"], "495": ["n03018349", "china_cabinet"], "496": ["n03026506", "Christmas_stocking"], "497": ["n03028079", "church"], "498": ["n03032252", "cinema"], "499": ["n03041632", "cleaver"], "500": ["n03042490", "cliff_dwelling"], "501": ["n03045698", "cloak"], "502": ["n03047690", "clog"], "503": ["n03062245", "cocktail_shaker"], "504": ["n03063599", "coffee_mug"], "505": ["n03063689", "coffeepot"], "506": ["n03065424", "coil"], "507": ["n03075370", "combination_lock"], "508": ["n03085013", "computer_keyboard"], "509": ["n03089624", "confectionery"], "510": ["n03095699", "container_ship"], "511": ["n03100240", "convertible"], "512": ["n03109150", "corkscrew"], "513": ["n03110669", "cornet"], "514": ["n03124043", "cowboy_boot"], "515": ["n03124170", "cowboy_hat"], "516": ["n03125729", "cradle"], "517": ["n03126707", "crane"], "518": ["n03127747", "crash_helmet"], "519": ["n03127925", "crate"], "520": ["n03131574", "crib"], "521": ["n03133878", "Crock_Pot"], "522": ["n03134739", "croquet_ball"], "523": ["n03141823", "crutch"], "524": ["n03146219", "cuirass"], "525": ["n03160309", "dam"], "526": ["n03179701", "desk"], "527": ["n03180011", "desktop_computer"], "528": ["n03187595", "dial_telephone"], "529": ["n03188531", "diaper"], "530": ["n03196217", "digital_clock"], "531": ["n03197337", "digital_watch"], "532": ["n03201208", "dining_table"], "533": ["n03207743", "dishrag"], "534": ["n03207941", "dishwasher"], "535": ["n03208938", "disk_brake"], "536": ["n03216828", "dock"], "537": ["n03218198", "dogsled"], "538": ["n03220513", "dome"], "539": ["n03223299", "doormat"], "540": ["n03240683", "drilling_platform"], "541": ["n03249569", "drum"], "542": ["n03250847", "drumstick"], "543": ["n03255030", "dumbbell"], "544": ["n03259280", "Dutch_oven"], "545": ["n03271574", "electric_fan"], "546": ["n03272010", "electric_guitar"], "547": ["n03272562", "electric_locomotive"], "548": ["n03290653", "entertainment_center"], "549": ["n03291819", "envelope"], "550": ["n03297495", "espresso_maker"], "551": ["n03314780", "face_powder"], "552": ["n03325584", "feather_boa"], "553": ["n03337140", "file"], "554": ["n03344393", "fireboat"], "555": ["n03345487", "fire_engine"], "556": ["n03347037", "fire_screen"], "557": ["n03355925", "flagpole"], "558": ["n03372029", "flute"], "559": ["n03376595", "folding_chair"], "560": ["n03379051", "football_helmet"], "561": ["n03384352", "forklift"], "562": ["n03388043", "fountain"], "563": ["n03388183", "fountain_pen"], "564": ["n03388549", "four-poster"], "565": ["n03393912", "freight_car"], "566": ["n03394916", "French_horn"], "567": ["n03400231", "frying_pan"], "568": ["n03404251", "fur_coat"], "569": ["n03417042", "garbage_truck"], "570": ["n03424325", "gasmask"], "571": ["n03425413", "gas_pump"], "572": ["n03443371", "goblet"], "573": ["n03444034", "go-kart"], "574": ["n03445777", "golf_ball"], "575": ["n03445924", "golfcart"], "576": ["n03447447", "gondola"], "577": ["n03447721", "gong"], "578": ["n03450230", "gown"], "579": ["n03452741", "grand_piano"], "580": ["n03457902", "greenhouse"], "581": ["n03459775", "grille"], "582": ["n03461385", "grocery_store"], "583": ["n03467068", "guillotine"], "584": ["n03476684", "hair_slide"], "585": ["n03476991", "hair_spray"], "586": ["n03478589", "half_track"], "587": ["n03481172", "hammer"], "588": ["n03482405", "hamper"], "589": ["n03483316", "hand_blower"], "590": ["n03485407", "hand-held_computer"], "591": ["n03485794", "handkerchief"], "592": ["n03492542", 
"hard_disc"], "593": ["n03494278", "harmonica"], "594": ["n03495258", "harp"], "595": ["n03496892", "harvester"], "596": ["n03498962", "hatchet"], "597": ["n03527444", "holster"], "598": ["n03529860", "home_theater"], "599": ["n03530642", "honeycomb"], "600": ["n03532672", "hook"], "601": ["n03534580", "hoopskirt"], "602": ["n03535780", "horizontal_bar"], "603": ["n03538406", "horse_cart"], "604": ["n03544143", "hourglass"], "605": ["n03584254", "iPod"], "606": ["n03584829", "iron"], "607": ["n03590841", "jack-o'-lantern"], "608": ["n03594734", "jean"], "609": ["n03594945", "jeep"], "610": ["n03595614", "jersey"], "611": ["n03598930", "jigsaw_puzzle"], "612": ["n03599486", "jinrikisha"], "613": ["n03602883", "joystick"], "614": ["n03617480", "kimono"], "615": ["n03623198", "knee_pad"], "616": ["n03627232", "knot"], "617": ["n03630383", "lab_coat"], "618": ["n03633091", "ladle"], "619": ["n03637318", "lampshade"], "620": ["n03642806", "laptop"], "621": ["n03649909", "lawn_mower"], "622": ["n03657121", "lens_cap"], "623": ["n03658185", "letter_opener"], "624": ["n03661043", "library"], "625": ["n03662601", "lifeboat"], "626": ["n03666591", "lighter"], "627": ["n03670208", "limousine"], "628": ["n03673027", "liner"], "629": ["n03676483", "lipstick"], "630": ["n03680355", "Loafer"], "631": ["n03690938", "lotion"], "632": ["n03691459", "loudspeaker"], "633": ["n03692522", "loupe"], "634": ["n03697007", "lumbermill"], "635": ["n03706229", "magnetic_compass"], "636": ["n03709823", "mailbag"], "637": ["n03710193", "mailbox"], "638": ["n03710637", "maillot"], "639": ["n03710721", "maillot"], "640": ["n03717622", "manhole_cover"], "641": ["n03720891", "maraca"], "642": ["n03721384", "marimba"], "643": ["n03724870", "mask"], "644": ["n03729826", "matchstick"], "645": ["n03733131", "maypole"], "646": ["n03733281", "maze"], "647": ["n03733805", "measuring_cup"], "648": ["n03742115", "medicine_chest"], "649": ["n03743016", "megalith"], "650": ["n03759954", "microphone"], "651": ["n03761084", "microwave"], "652": ["n03763968", "military_uniform"], "653": ["n03764736", "milk_can"], "654": ["n03769881", "minibus"], "655": ["n03770439", "miniskirt"], "656": ["n03770679", "minivan"], "657": ["n03773504", "missile"], "658": ["n03775071", "mitten"], "659": ["n03775546", "mixing_bowl"], "660": ["n03776460", "mobile_home"], "661": ["n03777568", "Model_T"], "662": ["n03777754", "modem"], "663": ["n03781244", "monastery"], "664": ["n03782006", "monitor"], "665": ["n03785016", "moped"], "666": ["n03786901", "mortar"], "667": ["n03787032", "mortarboard"], "668": ["n03788195", "mosque"], "669": ["n03788365", "mosquito_net"], "670": ["n03791053", "motor_scooter"], "671": ["n03792782", "mountain_bike"], "672": ["n03792972", "mountain_tent"], "673": ["n03793489", "mouse"], "674": ["n03794056", "mousetrap"], "675": ["n03796401", "moving_van"], "676": ["n03803284", "muzzle"], "677": ["n03804744", "nail"], "678": ["n03814639", "neck_brace"], "679": ["n03814906", "necklace"], "680": ["n03825788", "nipple"], "681": ["n03832673", "notebook"], "682": ["n03837869", "obelisk"], "683": ["n03838899", "oboe"], "684": ["n03840681", "ocarina"], "685": ["n03841143", "odometer"], "686": ["n03843555", "oil_filter"], "687": ["n03854065", "organ"], "688": ["n03857828", "oscilloscope"], "689": ["n03866082", "overskirt"], "690": ["n03868242", "oxcart"], "691": ["n03868863", "oxygen_mask"], "692": ["n03871628", "packet"], "693": ["n03873416", "paddle"], "694": ["n03874293", "paddlewheel"], "695": ["n03874599", "padlock"], "696": 
["n03876231", "paintbrush"], "697": ["n03877472", "pajama"], "698": ["n03877845", "palace"], "699": ["n03884397", "panpipe"], "700": ["n03887697", "paper_towel"], "701": ["n03888257", "parachute"], "702": ["n03888605", "parallel_bars"], "703": ["n03891251", "park_bench"], "704": ["n03891332", "parking_meter"], "705": ["n03895866", "passenger_car"], "706": ["n03899768", "patio"], "707": ["n03902125", "pay-phone"], "708": ["n03903868", "pedestal"], "709": ["n03908618", "pencil_box"], "710": ["n03908714", "pencil_sharpener"], "711": ["n03916031", "perfume"], "712": ["n03920288", "Petri_dish"], "713": ["n03924679", "photocopier"], "714": ["n03929660", "pick"], "715": ["n03929855", "pickelhaube"], "716": ["n03930313", "picket_fence"], "717": ["n03930630", "pickup"], "718": ["n03933933", "pier"], "719": ["n03935335", "piggy_bank"], "720": ["n03937543", "pill_bottle"], "721": ["n03938244", "pillow"], "722": ["n03942813", "ping-pong_ball"], "723": ["n03944341", "pinwheel"], "724": ["n03947888", "pirate"], "725": ["n03950228", "pitcher"], "726": ["n03954731", "plane"], "727": ["n03956157", "planetarium"], "728": ["n03958227", "plastic_bag"], "729": ["n03961711", "plate_rack"], "730": ["n03967562", "plow"], "731": ["n03970156", "plunger"], "732": ["n03976467", "Polaroid_camera"], "733": ["n03976657", "pole"], "734": ["n03977966", "police_van"], "735": ["n03980874", "poncho"], "736": ["n03982430", "pool_table"], "737": ["n03983396", "pop_bottle"], "738": ["n03991062", "pot"], "739": ["n03992509", "potter's_wheel"], "740": ["n03995372", "power_drill"], "741": ["n03998194", "prayer_rug"], "742": ["n04004767", "printer"], "743": ["n04005630", "prison"], "744": ["n04008634", "projectile"], "745": ["n04009552", "projector"], "746": ["n04019541", "puck"], "747": ["n04023962", "punching_bag"], "748": ["n04026417", "purse"], "749": ["n04033901", "quill"], "750": ["n04033995", "quilt"], "751": ["n04037443", "racer"], "752": ["n04039381", "racket"], "753": ["n04040759", "radiator"], "754": ["n04041544", "radio"], "755": ["n04044716", "radio_telescope"], "756": ["n04049303", "rain_barrel"], "757": ["n04065272", "recreational_vehicle"], "758": ["n04067472", "reel"], "759": ["n04069434", "reflex_camera"], "760": ["n04070727", "refrigerator"], "761": ["n04074963", "remote_control"], "762": ["n04081281", "restaurant"], "763": ["n04086273", "revolver"], "764": ["n04090263", "rifle"], "765": ["n04099969", "rocking_chair"], "766": ["n04111531", "rotisserie"], "767": ["n04116512", "rubber_eraser"], "768": ["n04118538", "rugby_ball"], "769": ["n04118776", "rule"], "770": ["n04120489", "running_shoe"], "771": ["n04125021", "safe"], "772": ["n04127249", "safety_pin"], "773": ["n04131690", "saltshaker"], "774": ["n04133789", "sandal"], "775": ["n04136333", "sarong"], "776": ["n04141076", "sax"], "777": ["n04141327", "scabbard"], "778": ["n04141975", "scale"], "779": ["n04146614", "school_bus"], "780": ["n04147183", "schooner"], "781": ["n04149813", "scoreboard"], "782": ["n04152593", "screen"], "783": ["n04153751", "screw"], "784": ["n04154565", "screwdriver"], "785": ["n04162706", "seat_belt"], "786": ["n04179913", "sewing_machine"], "787": ["n04192698", "shield"], "788": ["n04200800", "shoe_shop"], "789": ["n04201297", "shoji"], "790": ["n04204238", "shopping_basket"], "791": ["n04204347", "shopping_cart"], "792": ["n04208210", "shovel"], "793": ["n04209133", "shower_cap"], "794": ["n04209239", "shower_curtain"], "795": ["n04228054", "ski"], "796": ["n04229816", "ski_mask"], "797": ["n04235860", "sleeping_bag"], "798": 
["n04238763", "slide_rule"], "799": ["n04239074", "sliding_door"], "800": ["n04243546", "slot"], "801": ["n04251144", "snorkel"], "802": ["n04252077", "snowmobile"], "803": ["n04252225", "snowplow"], "804": ["n04254120", "soap_dispenser"], "805": ["n04254680", "soccer_ball"], "806": ["n04254777", "sock"], "807": ["n04258138", "solar_dish"], "808": ["n04259630", "sombrero"], "809": ["n04263257", "soup_bowl"], "810": ["n04264628", "space_bar"], "811": ["n04265275", "space_heater"], "812": ["n04266014", "space_shuttle"], "813": ["n04270147", "spatula"], "814": ["n04273569", "speedboat"], "815": ["n04275548", "spider_web"], "816": ["n04277352", "spindle"], "817": ["n04285008", "sports_car"], "818": ["n04286575", "spotlight"], "819": ["n04296562", "stage"], "820": ["n04310018", "steam_locomotive"], "821": ["n04311004", "steel_arch_bridge"], "822": ["n04311174", "steel_drum"], "823": ["n04317175", "stethoscope"], "824": ["n04325704", "stole"], "825": ["n04326547", "stone_wall"], "826": ["n04328186", "stopwatch"], "827": ["n04330267", "stove"], "828": ["n04332243", "strainer"], "829": ["n04335435", "streetcar"], "830": ["n04336792", "stretcher"], "831": ["n04344873", "studio_couch"], "832": ["n04346328", "stupa"], "833": ["n04347754", "submarine"], "834": ["n04350905", "suit"], "835": ["n04355338", "sundial"], "836": ["n04355933", "sunglass"], "837": ["n04356056", "sunglasses"], "838": ["n04357314", "sunscreen"], "839": ["n04366367", "suspension_bridge"], "840": ["n04367480", "swab"], "841": ["n04370456", "sweatshirt"], "842": ["n04371430", "swimming_trunks"], "843": ["n04371774", "swing"], "844": ["n04372370", "switch"], "845": ["n04376876", "syringe"], "846": ["n04380533", "table_lamp"], "847": ["n04389033", "tank"], "848": ["n04392985", "tape_player"], "849": ["n04398044", "teapot"], "850": ["n04399382", "teddy"], "851": ["n04404412", "television"], "852": ["n04409515", "tennis_ball"], "853": ["n04417672", "thatch"], "854": ["n04418357", "theater_curtain"], "855": ["n04423845", "thimble"], "856": ["n04428191", "thresher"], "857": ["n04429376", "throne"], "858": ["n04435653", "tile_roof"], "859": ["n04442312", "toaster"], "860": ["n04443257", "tobacco_shop"], "861": ["n04447861", "toilet_seat"], "862": ["n04456115", "torch"], "863": ["n04458633", "totem_pole"], "864": ["n04461696", "tow_truck"], "865": ["n04462240", "toyshop"], "866": ["n04465501", "tractor"], "867": ["n04467665", "trailer_truck"], "868": ["n04476259", "tray"], "869": ["n04479046", "trench_coat"], "870": ["n04482393", "tricycle"], "871": ["n04483307", "trimaran"], "872": ["n04485082", "tripod"], "873": ["n04486054", "triumphal_arch"], "874": ["n04487081", "trolleybus"], "875": ["n04487394", "trombone"], "876": ["n04493381", "tub"], "877": ["n04501370", "turnstile"], "878": ["n04505470", "typewriter_keyboard"], "879": ["n04507155", "umbrella"], "880": ["n04509417", "unicycle"], "881": ["n04515003", "upright"], "882": ["n04517823", "vacuum"], "883": ["n04522168", "vase"], "884": ["n04523525", "vault"], "885": ["n04525038", "velvet"], "886": ["n04525305", "vending_machine"], "887": ["n04532106", "vestment"], "888": ["n04532670", "viaduct"], "889": ["n04536866", "violin"], "890": ["n04540053", "volleyball"], "891": ["n04542943", "waffle_iron"], "892": ["n04548280", "wall_clock"], "893": ["n04548362", "wallet"], "894": ["n04550184", "wardrobe"], "895": ["n04552348", "warplane"], "896": ["n04553703", "washbasin"], "897": ["n04554684", "washer"], "898": ["n04557648", "water_bottle"], "899": ["n04560804", "water_jug"], "900": 
["n04562935", "water_tower"], "901": ["n04579145", "whiskey_jug"], "902": ["n04579432", "whistle"], "903": ["n04584207", "wig"], "904": ["n04589890", "window_screen"], "905": ["n04590129", "window_shade"], "906": ["n04591157", "Windsor_tie"], "907": ["n04591713", "wine_bottle"], "908": ["n04592741", "wing"], "909": ["n04596742", "wok"], "910": ["n04597913", "wooden_spoon"], "911": ["n04599235", "wool"], "912": ["n04604644", "worm_fence"], "913": ["n04606251", "wreck"], "914": ["n04612504", "yawl"], "915": ["n04613696", "yurt"], "916": ["n06359193", "web_site"], "917": ["n06596364", "comic_book"], "918": ["n06785654", "crossword_puzzle"], "919": ["n06794110", "street_sign"], "920": ["n06874185", "traffic_light"], "921": ["n07248320", "book_jacket"], "922": ["n07565083", "menu"], "923": ["n07579787", "plate"], "924": ["n07583066", "guacamole"], "925": ["n07584110", "consomme"], "926": ["n07590611", "hot_pot"], "927": ["n07613480", "trifle"], "928": ["n07614500", "ice_cream"], "929": ["n07615774", "ice_lolly"], "930": ["n07684084", "French_loaf"], "931": ["n07693725", "bagel"], "932": ["n07695742", "pretzel"], "933": ["n07697313", "cheeseburger"], "934": ["n07697537", "hotdog"], "935": ["n07711569", "mashed_potato"], "936": ["n07714571", "head_cabbage"], "937": ["n07714990", "broccoli"], "938": ["n07715103", "cauliflower"], "939": ["n07716358", "zucchini"], "940": ["n07716906", "spaghetti_squash"], "941": ["n07717410", "acorn_squash"], "942": ["n07717556", "butternut_squash"], "943": ["n07718472", "cucumber"], "944": ["n07718747", "artichoke"], "945": ["n07720875", "bell_pepper"], "946": ["n07730033", "cardoon"], "947": ["n07734744", "mushroom"], "948": ["n07742313", "Granny_Smith"], "949": ["n07745940", "strawberry"], "950": ["n07747607", "orange"], "951": ["n07749582", "lemon"], "952": ["n07753113", "fig"], "953": ["n07753275", "pineapple"], "954": ["n07753592", "banana"], "955": ["n07754684", "jackfruit"], "956": ["n07760859", "custard_apple"], "957": ["n07768694", "pomegranate"], "958": ["n07802026", "hay"], "959": ["n07831146", "carbonara"], "960": ["n07836838", "chocolate_sauce"], "961": ["n07860988", "dough"], "962": ["n07871810", "meat_loaf"], "963": ["n07873807", "pizza"], "964": ["n07875152", "potpie"], "965": ["n07880968", "burrito"], "966": ["n07892512", "red_wine"], "967": ["n07920052", "espresso"], "968": ["n07930864", "cup"], "969": ["n07932039", "eggnog"], "970": ["n09193705", "alp"], "971": ["n09229709", "bubble"], "972": ["n09246464", "cliff"], "973": ["n09256479", "coral_reef"], "974": ["n09288635", "geyser"], "975": ["n09332890", "lakeside"], "976": ["n09399592", "promontory"], "977": ["n09421951", "sandbar"], "978": ["n09428293", "seashore"], "979": ["n09468604", "valley"], "980": ["n09472597", "volcano"], "981": ["n09835506", "ballplayer"], "982": ["n10148035", "groom"], "983": ["n10565667", "scuba_diver"], "984": ["n11879895", "rapeseed"], "985": ["n11939491", "daisy"], "986": ["n12057211", "yellow_lady's_slipper"], "987": ["n12144580", "corn"], "988": ["n12267677", "acorn"], "989": ["n12620546", "hip"], "990": ["n12768682", "buckeye"], "991": ["n12985857", "coral_fungus"], "992": ["n12998815", "agaric"], "993": ["n13037406", "gyromitra"], "994": ["n13040303", "stinkhorn"], "995": ["n13044778", "earthstar"], "996": ["n13052670", "hen-of-the-woods"], "997": ["n13054560", "bolete"], "998": ["n13133613", "ear"], "999": ["n15075141", "toilet_tissue"]}
\ No newline at end of file
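
The JSON above is an ImageNet class-index table: each string key "0" through "999" maps to a pair of [WordNet synset ID, human-readable label], which is typically used to decode a classifier's predicted class index into a name. A minimal lookup sketch in Python, assuming the file is saved locally as imagenet_class_index.json (a hypothetical filename, since the actual path is not shown on this page):

import json

# Load the class-index table (assumed local filename; adjust to the real path in the repo).
with open("imagenet_class_index.json", "r", encoding="utf-8") as f:
    class_index = json.load(f)

def lookup(idx: int) -> tuple[str, str]:
    """Return (wordnet_id, label) for a 0-based ImageNet class index (0..999)."""
    wnid, label = class_index[str(idx)]  # keys are stored as strings in the JSON
    return wnid, label

print(lookup(0))    # ('n01440764', 'tench')
print(lookup(999))  # ('n15075141', 'toilet_tissue')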
{"0": ["n01440764", "tench"], "1": ["n01443537", "goldfish"], "2": ["n01484850", "great_white_shark"], "3": ["n01491361", "tiger_shark"], "4": ["n01494475", "hammerhead"], "5": ["n01496331", "electric_ray"], "6": ["n01498041", "stingray"], "7": ["n01514668", "cock"], "8": ["n01514859", "hen"], "9": ["n01518878", "ostrich"], "10": ["n01530575", "brambling"], "11": ["n01531178", "goldfinch"], "12": ["n01532829", "house_finch"], "13": ["n01534433", "junco"], "14": ["n01537544", "indigo_bunting"], "15": ["n01558993", "robin"], "16": ["n01560419", "bulbul"], "17": ["n01580077", "jay"], "18": ["n01582220", "magpie"], "19": ["n01592084", "chickadee"], "20": ["n01601694", "water_ouzel"], "21": ["n01608432", "kite"], "22": ["n01614925", "bald_eagle"], "23": ["n01616318", "vulture"], "24": ["n01622779", "great_grey_owl"], "25": ["n01629819", "European_fire_salamander"], "26": ["n01630670", "common_newt"], "27": ["n01631663", "eft"], "28": ["n01632458", "spotted_salamander"], "29": ["n01632777", "axolotl"], "30": ["n01641577", "bullfrog"], "31": ["n01644373", "tree_frog"], "32": ["n01644900", "tailed_frog"], "33": ["n01664065", "loggerhead"], "34": ["n01665541", "leatherback_turtle"], "35": ["n01667114", "mud_turtle"], "36": ["n01667778", "terrapin"], "37": ["n01669191", "box_turtle"], "38": ["n01675722", "banded_gecko"], "39": ["n01677366", "common_iguana"], "40": ["n01682714", "American_chameleon"], "41": ["n01685808", "whiptail"], "42": ["n01687978", "agama"], "43": ["n01688243", "frilled_lizard"], "44": ["n01689811", "alligator_lizard"], "45": ["n01692333", "Gila_monster"], "46": ["n01693334", "green_lizard"], "47": ["n01694178", "African_chameleon"], "48": ["n01695060", "Komodo_dragon"], "49": ["n01697457", "African_crocodile"], "50": ["n01698640", "American_alligator"], "51": ["n01704323", "triceratops"], "52": ["n01728572", "thunder_snake"], "53": ["n01728920", "ringneck_snake"], "54": ["n01729322", "hognose_snake"], "55": ["n01729977", "green_snake"], "56": ["n01734418", "king_snake"], "57": ["n01735189", "garter_snake"], "58": ["n01737021", "water_snake"], "59": ["n01739381", "vine_snake"], "60": ["n01740131", "night_snake"], "61": ["n01742172", "boa_constrictor"], "62": ["n01744401", "rock_python"], "63": ["n01748264", "Indian_cobra"], "64": ["n01749939", "green_mamba"], "65": ["n01751748", "sea_snake"], "66": ["n01753488", "horned_viper"], "67": ["n01755581", "diamondback"], "68": ["n01756291", "sidewinder"], "69": ["n01768244", "trilobite"], "70": ["n01770081", "harvestman"], "71": ["n01770393", "scorpion"], "72": ["n01773157", "black_and_gold_garden_spider"], "73": ["n01773549", "barn_spider"], "74": ["n01773797", "garden_spider"], "75": ["n01774384", "black_widow"], "76": ["n01774750", "tarantula"], "77": ["n01775062", "wolf_spider"], "78": ["n01776313", "tick"], "79": ["n01784675", "centipede"], "80": ["n01795545", "black_grouse"], "81": ["n01796340", "ptarmigan"], "82": ["n01797886", "ruffed_grouse"], "83": ["n01798484", "prairie_chicken"], "84": ["n01806143", "peacock"], "85": ["n01806567", "quail"], "86": ["n01807496", "partridge"], "87": ["n01817953", "African_grey"], "88": ["n01818515", "macaw"], "89": ["n01819313", "sulphur-crested_cockatoo"], "90": ["n01820546", "lorikeet"], "91": ["n01824575", "coucal"], "92": ["n01828970", "bee_eater"], "93": ["n01829413", "hornbill"], "94": ["n01833805", "hummingbird"], "95": ["n01843065", "jacamar"], "96": ["n01843383", "toucan"], "97": ["n01847000", "drake"], "98": ["n01855032", "red-breasted_merganser"], "99": ["n01855672", "goose"], 
"100": ["n01860187", "black_swan"], "101": ["n01871265", "tusker"], "102": ["n01872401", "echidna"], "103": ["n01873310", "platypus"], "104": ["n01877812", "wallaby"], "105": ["n01882714", "koala"], "106": ["n01883070", "wombat"], "107": ["n01910747", "jellyfish"], "108": ["n01914609", "sea_anemone"], "109": ["n01917289", "brain_coral"], "110": ["n01924916", "flatworm"], "111": ["n01930112", "nematode"], "112": ["n01943899", "conch"], "113": ["n01944390", "snail"], "114": ["n01945685", "slug"], "115": ["n01950731", "sea_slug"], "116": ["n01955084", "chiton"], "117": ["n01968897", "chambered_nautilus"], "118": ["n01978287", "Dungeness_crab"], "119": ["n01978455", "rock_crab"], "120": ["n01980166", "fiddler_crab"], "121": ["n01981276", "king_crab"], "122": ["n01983481", "American_lobster"], "123": ["n01984695", "spiny_lobster"], "124": ["n01985128", "crayfish"], "125": ["n01986214", "hermit_crab"], "126": ["n01990800", "isopod"], "127": ["n02002556", "white_stork"], "128": ["n02002724", "black_stork"], "129": ["n02006656", "spoonbill"], "130": ["n02007558", "flamingo"], "131": ["n02009229", "little_blue_heron"], "132": ["n02009912", "American_egret"], "133": ["n02011460", "bittern"], "134": ["n02012849", "crane"], "135": ["n02013706", "limpkin"], "136": ["n02017213", "European_gallinule"], "137": ["n02018207", "American_coot"], "138": ["n02018795", "bustard"], "139": ["n02025239", "ruddy_turnstone"], "140": ["n02027492", "red-backed_sandpiper"], "141": ["n02028035", "redshank"], "142": ["n02033041", "dowitcher"], "143": ["n02037110", "oystercatcher"], "144": ["n02051845", "pelican"], "145": ["n02056570", "king_penguin"], "146": ["n02058221", "albatross"], "147": ["n02066245", "grey_whale"], "148": ["n02071294", "killer_whale"], "149": ["n02074367", "dugong"], "150": ["n02077923", "sea_lion"], "151": ["n02085620", "Chihuahua"], "152": ["n02085782", "Japanese_spaniel"], "153": ["n02085936", "Maltese_dog"], "154": ["n02086079", "Pekinese"], "155": ["n02086240", "Shih-Tzu"], "156": ["n02086646", "Blenheim_spaniel"], "157": ["n02086910", "papillon"], "158": ["n02087046", "toy_terrier"], "159": ["n02087394", "Rhodesian_ridgeback"], "160": ["n02088094", "Afghan_hound"], "161": ["n02088238", "basset"], "162": ["n02088364", "beagle"], "163": ["n02088466", "bloodhound"], "164": ["n02088632", "bluetick"], "165": ["n02089078", "black-and-tan_coonhound"], "166": ["n02089867", "Walker_hound"], "167": ["n02089973", "English_foxhound"], "168": ["n02090379", "redbone"], "169": ["n02090622", "borzoi"], "170": ["n02090721", "Irish_wolfhound"], "171": ["n02091032", "Italian_greyhound"], "172": ["n02091134", "whippet"], "173": ["n02091244", "Ibizan_hound"], "174": ["n02091467", "Norwegian_elkhound"], "175": ["n02091635", "otterhound"], "176": ["n02091831", "Saluki"], "177": ["n02092002", "Scottish_deerhound"], "178": ["n02092339", "Weimaraner"], "179": ["n02093256", "Staffordshire_bullterrier"], "180": ["n02093428", "American_Staffordshire_terrier"], "181": ["n02093647", "Bedlington_terrier"], "182": ["n02093754", "Border_terrier"], "183": ["n02093859", "Kerry_blue_terrier"], "184": ["n02093991", "Irish_terrier"], "185": ["n02094114", "Norfolk_terrier"], "186": ["n02094258", "Norwich_terrier"], "187": ["n02094433", "Yorkshire_terrier"], "188": ["n02095314", "wire-haired_fox_terrier"], "189": ["n02095570", "Lakeland_terrier"], "190": ["n02095889", "Sealyham_terrier"], "191": ["n02096051", "Airedale"], "192": ["n02096177", "cairn"], "193": ["n02096294", "Australian_terrier"], "194": ["n02096437", 
"Dandie_Dinmont"], "195": ["n02096585", "Boston_bull"], "196": ["n02097047", "miniature_schnauzer"], "197": ["n02097130", "giant_schnauzer"], "198": ["n02097209", "standard_schnauzer"], "199": ["n02097298", "Scotch_terrier"], "200": ["n02097474", "Tibetan_terrier"], "201": ["n02097658", "silky_terrier"], "202": ["n02098105", "soft-coated_wheaten_terrier"], "203": ["n02098286", "West_Highland_white_terrier"], "204": ["n02098413", "Lhasa"], "205": ["n02099267", "flat-coated_retriever"], "206": ["n02099429", "curly-coated_retriever"], "207": ["n02099601", "golden_retriever"], "208": ["n02099712", "Labrador_retriever"], "209": ["n02099849", "Chesapeake_Bay_retriever"], "210": ["n02100236", "German_short-haired_pointer"], "211": ["n02100583", "vizsla"], "212": ["n02100735", "English_setter"], "213": ["n02100877", "Irish_setter"], "214": ["n02101006", "Gordon_setter"], "215": ["n02101388", "Brittany_spaniel"], "216": ["n02101556", "clumber"], "217": ["n02102040", "English_springer"], "218": ["n02102177", "Welsh_springer_spaniel"], "219": ["n02102318", "cocker_spaniel"], "220": ["n02102480", "Sussex_spaniel"], "221": ["n02102973", "Irish_water_spaniel"], "222": ["n02104029", "kuvasz"], "223": ["n02104365", "schipperke"], "224": ["n02105056", "groenendael"], "225": ["n02105162", "malinois"], "226": ["n02105251", "briard"], "227": ["n02105412", "kelpie"], "228": ["n02105505", "komondor"], "229": ["n02105641", "Old_English_sheepdog"], "230": ["n02105855", "Shetland_sheepdog"], "231": ["n02106030", "collie"], "232": ["n02106166", "Border_collie"], "233": ["n02106382", "Bouvier_des_Flandres"], "234": ["n02106550", "Rottweiler"], "235": ["n02106662", "German_shepherd"], "236": ["n02107142", "Doberman"], "237": ["n02107312", "miniature_pinscher"], "238": ["n02107574", "Greater_Swiss_Mountain_dog"], "239": ["n02107683", "Bernese_mountain_dog"], "240": ["n02107908", "Appenzeller"], "241": ["n02108000", "EntleBucher"], "242": ["n02108089", "boxer"], "243": ["n02108422", "bull_mastiff"], "244": ["n02108551", "Tibetan_mastiff"], "245": ["n02108915", "French_bulldog"], "246": ["n02109047", "Great_Dane"], "247": ["n02109525", "Saint_Bernard"], "248": ["n02109961", "Eskimo_dog"], "249": ["n02110063", "malamute"], "250": ["n02110185", "Siberian_husky"], "251": ["n02110341", "dalmatian"], "252": ["n02110627", "affenpinscher"], "253": ["n02110806", "basenji"], "254": ["n02110958", "pug"], "255": ["n02111129", "Leonberg"], "256": ["n02111277", "Newfoundland"], "257": ["n02111500", "Great_Pyrenees"], "258": ["n02111889", "Samoyed"], "259": ["n02112018", "Pomeranian"], "260": ["n02112137", "chow"], "261": ["n02112350", "keeshond"], "262": ["n02112706", "Brabancon_griffon"], "263": ["n02113023", "Pembroke"], "264": ["n02113186", "Cardigan"], "265": ["n02113624", "toy_poodle"], "266": ["n02113712", "miniature_poodle"], "267": ["n02113799", "standard_poodle"], "268": ["n02113978", "Mexican_hairless"], "269": ["n02114367", "timber_wolf"], "270": ["n02114548", "white_wolf"], "271": ["n02114712", "red_wolf"], "272": ["n02114855", "coyote"], "273": ["n02115641", "dingo"], "274": ["n02115913", "dhole"], "275": ["n02116738", "African_hunting_dog"], "276": ["n02117135", "hyena"], "277": ["n02119022", "red_fox"], "278": ["n02119789", "kit_fox"], "279": ["n02120079", "Arctic_fox"], "280": ["n02120505", "grey_fox"], "281": ["n02123045", "tabby"], "282": ["n02123159", "tiger_cat"], "283": ["n02123394", "Persian_cat"], "284": ["n02123597", "Siamese_cat"], "285": ["n02124075", "Egyptian_cat"], "286": ["n02125311", "cougar"], "287": 
["n02127052", "lynx"], "288": ["n02128385", "leopard"], "289": ["n02128757", "snow_leopard"], "290": ["n02128925", "jaguar"], "291": ["n02129165", "lion"], "292": ["n02129604", "tiger"], "293": ["n02130308", "cheetah"], "294": ["n02132136", "brown_bear"], "295": ["n02133161", "American_black_bear"], "296": ["n02134084", "ice_bear"], "297": ["n02134418", "sloth_bear"], "298": ["n02137549", "mongoose"], "299": ["n02138441", "meerkat"], "300": ["n02165105", "tiger_beetle"], "301": ["n02165456", "ladybug"], "302": ["n02167151", "ground_beetle"], "303": ["n02168699", "long-horned_beetle"], "304": ["n02169497", "leaf_beetle"], "305": ["n02172182", "dung_beetle"], "306": ["n02174001", "rhinoceros_beetle"], "307": ["n02177972", "weevil"], "308": ["n02190166", "fly"], "309": ["n02206856", "bee"], "310": ["n02219486", "ant"], "311": ["n02226429", "grasshopper"], "312": ["n02229544", "cricket"], "313": ["n02231487", "walking_stick"], "314": ["n02233338", "cockroach"], "315": ["n02236044", "mantis"], "316": ["n02256656", "cicada"], "317": ["n02259212", "leafhopper"], "318": ["n02264363", "lacewing"], "319": ["n02268443", "dragonfly"], "320": ["n02268853", "damselfly"], "321": ["n02276258", "admiral"], "322": ["n02277742", "ringlet"], "323": ["n02279972", "monarch"], "324": ["n02280649", "cabbage_butterfly"], "325": ["n02281406", "sulphur_butterfly"], "326": ["n02281787", "lycaenid"], "327": ["n02317335", "starfish"], "328": ["n02319095", "sea_urchin"], "329": ["n02321529", "sea_cucumber"], "330": ["n02325366", "wood_rabbit"], "331": ["n02326432", "hare"], "332": ["n02328150", "Angora"], "333": ["n02342885", "hamster"], "334": ["n02346627", "porcupine"], "335": ["n02356798", "fox_squirrel"], "336": ["n02361337", "marmot"], "337": ["n02363005", "beaver"], "338": ["n02364673", "guinea_pig"], "339": ["n02389026", "sorrel"], "340": ["n02391049", "zebra"], "341": ["n02395406", "hog"], "342": ["n02396427", "wild_boar"], "343": ["n02397096", "warthog"], "344": ["n02398521", "hippopotamus"], "345": ["n02403003", "ox"], "346": ["n02408429", "water_buffalo"], "347": ["n02410509", "bison"], "348": ["n02412080", "ram"], "349": ["n02415577", "bighorn"], "350": ["n02417914", "ibex"], "351": ["n02422106", "hartebeest"], "352": ["n02422699", "impala"], "353": ["n02423022", "gazelle"], "354": ["n02437312", "Arabian_camel"], "355": ["n02437616", "llama"], "356": ["n02441942", "weasel"], "357": ["n02442845", "mink"], "358": ["n02443114", "polecat"], "359": ["n02443484", "black-footed_ferret"], "360": ["n02444819", "otter"], "361": ["n02445715", "skunk"], "362": ["n02447366", "badger"], "363": ["n02454379", "armadillo"], "364": ["n02457408", "three-toed_sloth"], "365": ["n02480495", "orangutan"], "366": ["n02480855", "gorilla"], "367": ["n02481823", "chimpanzee"], "368": ["n02483362", "gibbon"], "369": ["n02483708", "siamang"], "370": ["n02484975", "guenon"], "371": ["n02486261", "patas"], "372": ["n02486410", "baboon"], "373": ["n02487347", "macaque"], "374": ["n02488291", "langur"], "375": ["n02488702", "colobus"], "376": ["n02489166", "proboscis_monkey"], "377": ["n02490219", "marmoset"], "378": ["n02492035", "capuchin"], "379": ["n02492660", "howler_monkey"], "380": ["n02493509", "titi"], "381": ["n02493793", "spider_monkey"], "382": ["n02494079", "squirrel_monkey"], "383": ["n02497673", "Madagascar_cat"], "384": ["n02500267", "indri"], "385": ["n02504013", "Indian_elephant"], "386": ["n02504458", "African_elephant"], "387": ["n02509815", "lesser_panda"], "388": ["n02510455", "giant_panda"], "389": ["n02514041", 
"barracouta"], "390": ["n02526121", "eel"], "391": ["n02536864", "coho"], "392": ["n02606052", "rock_beauty"], "393": ["n02607072", "anemone_fish"], "394": ["n02640242", "sturgeon"], "395": ["n02641379", "gar"], "396": ["n02643566", "lionfish"], "397": ["n02655020", "puffer"], "398": ["n02666196", "abacus"], "399": ["n02667093", "abaya"], "400": ["n02669723", "academic_gown"], "401": ["n02672831", "accordion"], "402": ["n02676566", "acoustic_guitar"], "403": ["n02687172", "aircraft_carrier"], "404": ["n02690373", "airliner"], "405": ["n02692877", "airship"], "406": ["n02699494", "altar"], "407": ["n02701002", "ambulance"], "408": ["n02704792", "amphibian"], "409": ["n02708093", "analog_clock"], "410": ["n02727426", "apiary"], "411": ["n02730930", "apron"], "412": ["n02747177", "ashcan"], "413": ["n02749479", "assault_rifle"], "414": ["n02769748", "backpack"], "415": ["n02776631", "bakery"], "416": ["n02777292", "balance_beam"], "417": ["n02782093", "balloon"], "418": ["n02783161", "ballpoint"], "419": ["n02786058", "Band_Aid"], "420": ["n02787622", "banjo"], "421": ["n02788148", "bannister"], "422": ["n02790996", "barbell"], "423": ["n02791124", "barber_chair"], "424": ["n02791270", "barbershop"], "425": ["n02793495", "barn"], "426": ["n02794156", "barometer"], "427": ["n02795169", "barrel"], "428": ["n02797295", "barrow"], "429": ["n02799071", "baseball"], "430": ["n02802426", "basketball"], "431": ["n02804414", "bassinet"], "432": ["n02804610", "bassoon"], "433": ["n02807133", "bathing_cap"], "434": ["n02808304", "bath_towel"], "435": ["n02808440", "bathtub"], "436": ["n02814533", "beach_wagon"], "437": ["n02814860", "beacon"], "438": ["n02815834", "beaker"], "439": ["n02817516", "bearskin"], "440": ["n02823428", "beer_bottle"], "441": ["n02823750", "beer_glass"], "442": ["n02825657", "bell_cote"], "443": ["n02834397", "bib"], "444": ["n02835271", "bicycle-built-for-two"], "445": ["n02837789", "bikini"], "446": ["n02840245", "binder"], "447": ["n02841315", "binoculars"], "448": ["n02843684", "birdhouse"], "449": ["n02859443", "boathouse"], "450": ["n02860847", "bobsled"], "451": ["n02865351", "bolo_tie"], "452": ["n02869837", "bonnet"], "453": ["n02870880", "bookcase"], "454": ["n02871525", "bookshop"], "455": ["n02877765", "bottlecap"], "456": ["n02879718", "bow"], "457": ["n02883205", "bow_tie"], "458": ["n02892201", "brass"], "459": ["n02892767", "brassiere"], "460": ["n02894605", "breakwater"], "461": ["n02895154", "breastplate"], "462": ["n02906734", "broom"], "463": ["n02909870", "bucket"], "464": ["n02910353", "buckle"], "465": ["n02916936", "bulletproof_vest"], "466": ["n02917067", "bullet_train"], "467": ["n02927161", "butcher_shop"], "468": ["n02930766", "cab"], "469": ["n02939185", "caldron"], "470": ["n02948072", "candle"], "471": ["n02950826", "cannon"], "472": ["n02951358", "canoe"], "473": ["n02951585", "can_opener"], "474": ["n02963159", "cardigan"], "475": ["n02965783", "car_mirror"], "476": ["n02966193", "carousel"], "477": ["n02966687", "carpenter's_kit"], "478": ["n02971356", "carton"], "479": ["n02974003", "car_wheel"], "480": ["n02977058", "cash_machine"], "481": ["n02978881", "cassette"], "482": ["n02979186", "cassette_player"], "483": ["n02980441", "castle"], "484": ["n02981792", "catamaran"], "485": ["n02988304", "CD_player"], "486": ["n02992211", "cello"], "487": ["n02992529", "cellular_telephone"], "488": ["n02999410", "chain"], "489": ["n03000134", "chainlink_fence"], "490": ["n03000247", "chain_mail"], "491": ["n03000684", "chain_saw"], "492": ["n03014705", 
"chest"], "493": ["n03016953", "chiffonier"], "494": ["n03017168", "chime"], "495": ["n03018349", "china_cabinet"], "496": ["n03026506", "Christmas_stocking"], "497": ["n03028079", "church"], "498": ["n03032252", "cinema"], "499": ["n03041632", "cleaver"], "500": ["n03042490", "cliff_dwelling"], "501": ["n03045698", "cloak"], "502": ["n03047690", "clog"], "503": ["n03062245", "cocktail_shaker"], "504": ["n03063599", "coffee_mug"], "505": ["n03063689", "coffeepot"], "506": ["n03065424", "coil"], "507": ["n03075370", "combination_lock"], "508": ["n03085013", "computer_keyboard"], "509": ["n03089624", "confectionery"], "510": ["n03095699", "container_ship"], "511": ["n03100240", "convertible"], "512": ["n03109150", "corkscrew"], "513": ["n03110669", "cornet"], "514": ["n03124043", "cowboy_boot"], "515": ["n03124170", "cowboy_hat"], "516": ["n03125729", "cradle"], "517": ["n03126707", "crane"], "518": ["n03127747", "crash_helmet"], "519": ["n03127925", "crate"], "520": ["n03131574", "crib"], "521": ["n03133878", "Crock_Pot"], "522": ["n03134739", "croquet_ball"], "523": ["n03141823", "crutch"], "524": ["n03146219", "cuirass"], "525": ["n03160309", "dam"], "526": ["n03179701", "desk"], "527": ["n03180011", "desktop_computer"], "528": ["n03187595", "dial_telephone"], "529": ["n03188531", "diaper"], "530": ["n03196217", "digital_clock"], "531": ["n03197337", "digital_watch"], "532": ["n03201208", "dining_table"], "533": ["n03207743", "dishrag"], "534": ["n03207941", "dishwasher"], "535": ["n03208938", "disk_brake"], "536": ["n03216828", "dock"], "537": ["n03218198", "dogsled"], "538": ["n03220513", "dome"], "539": ["n03223299", "doormat"], "540": ["n03240683", "drilling_platform"], "541": ["n03249569", "drum"], "542": ["n03250847", "drumstick"], "543": ["n03255030", "dumbbell"], "544": ["n03259280", "Dutch_oven"], "545": ["n03271574", "electric_fan"], "546": ["n03272010", "electric_guitar"], "547": ["n03272562", "electric_locomotive"], "548": ["n03290653", "entertainment_center"], "549": ["n03291819", "envelope"], "550": ["n03297495", "espresso_maker"], "551": ["n03314780", "face_powder"], "552": ["n03325584", "feather_boa"], "553": ["n03337140", "file"], "554": ["n03344393", "fireboat"], "555": ["n03345487", "fire_engine"], "556": ["n03347037", "fire_screen"], "557": ["n03355925", "flagpole"], "558": ["n03372029", "flute"], "559": ["n03376595", "folding_chair"], "560": ["n03379051", "football_helmet"], "561": ["n03384352", "forklift"], "562": ["n03388043", "fountain"], "563": ["n03388183", "fountain_pen"], "564": ["n03388549", "four-poster"], "565": ["n03393912", "freight_car"], "566": ["n03394916", "French_horn"], "567": ["n03400231", "frying_pan"], "568": ["n03404251", "fur_coat"], "569": ["n03417042", "garbage_truck"], "570": ["n03424325", "gasmask"], "571": ["n03425413", "gas_pump"], "572": ["n03443371", "goblet"], "573": ["n03444034", "go-kart"], "574": ["n03445777", "golf_ball"], "575": ["n03445924", "golfcart"], "576": ["n03447447", "gondola"], "577": ["n03447721", "gong"], "578": ["n03450230", "gown"], "579": ["n03452741", "grand_piano"], "580": ["n03457902", "greenhouse"], "581": ["n03459775", "grille"], "582": ["n03461385", "grocery_store"], "583": ["n03467068", "guillotine"], "584": ["n03476684", "hair_slide"], "585": ["n03476991", "hair_spray"], "586": ["n03478589", "half_track"], "587": ["n03481172", "hammer"], "588": ["n03482405", "hamper"], "589": ["n03483316", "hand_blower"], "590": ["n03485407", "hand-held_computer"], "591": ["n03485794", "handkerchief"], "592": ["n03492542", 
"hard_disc"], "593": ["n03494278", "harmonica"], "594": ["n03495258", "harp"], "595": ["n03496892", "harvester"], "596": ["n03498962", "hatchet"], "597": ["n03527444", "holster"], "598": ["n03529860", "home_theater"], "599": ["n03530642", "honeycomb"], "600": ["n03532672", "hook"], "601": ["n03534580", "hoopskirt"], "602": ["n03535780", "horizontal_bar"], "603": ["n03538406", "horse_cart"], "604": ["n03544143", "hourglass"], "605": ["n03584254", "iPod"], "606": ["n03584829", "iron"], "607": ["n03590841", "jack-o'-lantern"], "608": ["n03594734", "jean"], "609": ["n03594945", "jeep"], "610": ["n03595614", "jersey"], "611": ["n03598930", "jigsaw_puzzle"], "612": ["n03599486", "jinrikisha"], "613": ["n03602883", "joystick"], "614": ["n03617480", "kimono"], "615": ["n03623198", "knee_pad"], "616": ["n03627232", "knot"], "617": ["n03630383", "lab_coat"], "618": ["n03633091", "ladle"], "619": ["n03637318", "lampshade"], "620": ["n03642806", "laptop"], "621": ["n03649909", "lawn_mower"], "622": ["n03657121", "lens_cap"], "623": ["n03658185", "letter_opener"], "624": ["n03661043", "library"], "625": ["n03662601", "lifeboat"], "626": ["n03666591", "lighter"], "627": ["n03670208", "limousine"], "628": ["n03673027", "liner"], "629": ["n03676483", "lipstick"], "630": ["n03680355", "Loafer"], "631": ["n03690938", "lotion"], "632": ["n03691459", "loudspeaker"], "633": ["n03692522", "loupe"], "634": ["n03697007", "lumbermill"], "635": ["n03706229", "magnetic_compass"], "636": ["n03709823", "mailbag"], "637": ["n03710193", "mailbox"], "638": ["n03710637", "maillot"], "639": ["n03710721", "maillot"], "640": ["n03717622", "manhole_cover"], "641": ["n03720891", "maraca"], "642": ["n03721384", "marimba"], "643": ["n03724870", "mask"], "644": ["n03729826", "matchstick"], "645": ["n03733131", "maypole"], "646": ["n03733281", "maze"], "647": ["n03733805", "measuring_cup"], "648": ["n03742115", "medicine_chest"], "649": ["n03743016", "megalith"], "650": ["n03759954", "microphone"], "651": ["n03761084", "microwave"], "652": ["n03763968", "military_uniform"], "653": ["n03764736", "milk_can"], "654": ["n03769881", "minibus"], "655": ["n03770439", "miniskirt"], "656": ["n03770679", "minivan"], "657": ["n03773504", "missile"], "658": ["n03775071", "mitten"], "659": ["n03775546", "mixing_bowl"], "660": ["n03776460", "mobile_home"], "661": ["n03777568", "Model_T"], "662": ["n03777754", "modem"], "663": ["n03781244", "monastery"], "664": ["n03782006", "monitor"], "665": ["n03785016", "moped"], "666": ["n03786901", "mortar"], "667": ["n03787032", "mortarboard"], "668": ["n03788195", "mosque"], "669": ["n03788365", "mosquito_net"], "670": ["n03791053", "motor_scooter"], "671": ["n03792782", "mountain_bike"], "672": ["n03792972", "mountain_tent"], "673": ["n03793489", "mouse"], "674": ["n03794056", "mousetrap"], "675": ["n03796401", "moving_van"], "676": ["n03803284", "muzzle"], "677": ["n03804744", "nail"], "678": ["n03814639", "neck_brace"], "679": ["n03814906", "necklace"], "680": ["n03825788", "nipple"], "681": ["n03832673", "notebook"], "682": ["n03837869", "obelisk"], "683": ["n03838899", "oboe"], "684": ["n03840681", "ocarina"], "685": ["n03841143", "odometer"], "686": ["n03843555", "oil_filter"], "687": ["n03854065", "organ"], "688": ["n03857828", "oscilloscope"], "689": ["n03866082", "overskirt"], "690": ["n03868242", "oxcart"], "691": ["n03868863", "oxygen_mask"], "692": ["n03871628", "packet"], "693": ["n03873416", "paddle"], "694": ["n03874293", "paddlewheel"], "695": ["n03874599", "padlock"], "696": 
["n03876231", "paintbrush"], "697": ["n03877472", "pajama"], "698": ["n03877845", "palace"], "699": ["n03884397", "panpipe"], "700": ["n03887697", "paper_towel"], "701": ["n03888257", "parachute"], "702": ["n03888605", "parallel_bars"], "703": ["n03891251", "park_bench"], "704": ["n03891332", "parking_meter"], "705": ["n03895866", "passenger_car"], "706": ["n03899768", "patio"], "707": ["n03902125", "pay-phone"], "708": ["n03903868", "pedestal"], "709": ["n03908618", "pencil_box"], "710": ["n03908714", "pencil_sharpener"], "711": ["n03916031", "perfume"], "712": ["n03920288", "Petri_dish"], "713": ["n03924679", "photocopier"], "714": ["n03929660", "pick"], "715": ["n03929855", "pickelhaube"], "716": ["n03930313", "picket_fence"], "717": ["n03930630", "pickup"], "718": ["n03933933", "pier"], "719": ["n03935335", "piggy_bank"], "720": ["n03937543", "pill_bottle"], "721": ["n03938244", "pillow"], "722": ["n03942813", "ping-pong_ball"], "723": ["n03944341", "pinwheel"], "724": ["n03947888", "pirate"], "725": ["n03950228", "pitcher"], "726": ["n03954731", "plane"], "727": ["n03956157", "planetarium"], "728": ["n03958227", "plastic_bag"], "729": ["n03961711", "plate_rack"], "730": ["n03967562", "plow"], "731": ["n03970156", "plunger"], "732": ["n03976467", "Polaroid_camera"], "733": ["n03976657", "pole"], "734": ["n03977966", "police_van"], "735": ["n03980874", "poncho"], "736": ["n03982430", "pool_table"], "737": ["n03983396", "pop_bottle"], "738": ["n03991062", "pot"], "739": ["n03992509", "potter's_wheel"], "740": ["n03995372", "power_drill"], "741": ["n03998194", "prayer_rug"], "742": ["n04004767", "printer"], "743": ["n04005630", "prison"], "744": ["n04008634", "projectile"], "745": ["n04009552", "projector"], "746": ["n04019541", "puck"], "747": ["n04023962", "punching_bag"], "748": ["n04026417", "purse"], "749": ["n04033901", "quill"], "750": ["n04033995", "quilt"], "751": ["n04037443", "racer"], "752": ["n04039381", "racket"], "753": ["n04040759", "radiator"], "754": ["n04041544", "radio"], "755": ["n04044716", "radio_telescope"], "756": ["n04049303", "rain_barrel"], "757": ["n04065272", "recreational_vehicle"], "758": ["n04067472", "reel"], "759": ["n04069434", "reflex_camera"], "760": ["n04070727", "refrigerator"], "761": ["n04074963", "remote_control"], "762": ["n04081281", "restaurant"], "763": ["n04086273", "revolver"], "764": ["n04090263", "rifle"], "765": ["n04099969", "rocking_chair"], "766": ["n04111531", "rotisserie"], "767": ["n04116512", "rubber_eraser"], "768": ["n04118538", "rugby_ball"], "769": ["n04118776", "rule"], "770": ["n04120489", "running_shoe"], "771": ["n04125021", "safe"], "772": ["n04127249", "safety_pin"], "773": ["n04131690", "saltshaker"], "774": ["n04133789", "sandal"], "775": ["n04136333", "sarong"], "776": ["n04141076", "sax"], "777": ["n04141327", "scabbard"], "778": ["n04141975", "scale"], "779": ["n04146614", "school_bus"], "780": ["n04147183", "schooner"], "781": ["n04149813", "scoreboard"], "782": ["n04152593", "screen"], "783": ["n04153751", "screw"], "784": ["n04154565", "screwdriver"], "785": ["n04162706", "seat_belt"], "786": ["n04179913", "sewing_machine"], "787": ["n04192698", "shield"], "788": ["n04200800", "shoe_shop"], "789": ["n04201297", "shoji"], "790": ["n04204238", "shopping_basket"], "791": ["n04204347", "shopping_cart"], "792": ["n04208210", "shovel"], "793": ["n04209133", "shower_cap"], "794": ["n04209239", "shower_curtain"], "795": ["n04228054", "ski"], "796": ["n04229816", "ski_mask"], "797": ["n04235860", "sleeping_bag"], "798": 
["n04238763", "slide_rule"], "799": ["n04239074", "sliding_door"], "800": ["n04243546", "slot"], "801": ["n04251144", "snorkel"], "802": ["n04252077", "snowmobile"], "803": ["n04252225", "snowplow"], "804": ["n04254120", "soap_dispenser"], "805": ["n04254680", "soccer_ball"], "806": ["n04254777", "sock"], "807": ["n04258138", "solar_dish"], "808": ["n04259630", "sombrero"], "809": ["n04263257", "soup_bowl"], "810": ["n04264628", "space_bar"], "811": ["n04265275", "space_heater"], "812": ["n04266014", "space_shuttle"], "813": ["n04270147", "spatula"], "814": ["n04273569", "speedboat"], "815": ["n04275548", "spider_web"], "816": ["n04277352", "spindle"], "817": ["n04285008", "sports_car"], "818": ["n04286575", "spotlight"], "819": ["n04296562", "stage"], "820": ["n04310018", "steam_locomotive"], "821": ["n04311004", "steel_arch_bridge"], "822": ["n04311174", "steel_drum"], "823": ["n04317175", "stethoscope"], "824": ["n04325704", "stole"], "825": ["n04326547", "stone_wall"], "826": ["n04328186", "stopwatch"], "827": ["n04330267", "stove"], "828": ["n04332243", "strainer"], "829": ["n04335435", "streetcar"], "830": ["n04336792", "stretcher"], "831": ["n04344873", "studio_couch"], "832": ["n04346328", "stupa"], "833": ["n04347754", "submarine"], "834": ["n04350905", "suit"], "835": ["n04355338", "sundial"], "836": ["n04355933", "sunglass"], "837": ["n04356056", "sunglasses"], "838": ["n04357314", "sunscreen"], "839": ["n04366367", "suspension_bridge"], "840": ["n04367480", "swab"], "841": ["n04370456", "sweatshirt"], "842": ["n04371430", "swimming_trunks"], "843": ["n04371774", "swing"], "844": ["n04372370", "switch"], "845": ["n04376876", "syringe"], "846": ["n04380533", "table_lamp"], "847": ["n04389033", "tank"], "848": ["n04392985", "tape_player"], "849": ["n04398044", "teapot"], "850": ["n04399382", "teddy"], "851": ["n04404412", "television"], "852": ["n04409515", "tennis_ball"], "853": ["n04417672", "thatch"], "854": ["n04418357", "theater_curtain"], "855": ["n04423845", "thimble"], "856": ["n04428191", "thresher"], "857": ["n04429376", "throne"], "858": ["n04435653", "tile_roof"], "859": ["n04442312", "toaster"], "860": ["n04443257", "tobacco_shop"], "861": ["n04447861", "toilet_seat"], "862": ["n04456115", "torch"], "863": ["n04458633", "totem_pole"], "864": ["n04461696", "tow_truck"], "865": ["n04462240", "toyshop"], "866": ["n04465501", "tractor"], "867": ["n04467665", "trailer_truck"], "868": ["n04476259", "tray"], "869": ["n04479046", "trench_coat"], "870": ["n04482393", "tricycle"], "871": ["n04483307", "trimaran"], "872": ["n04485082", "tripod"], "873": ["n04486054", "triumphal_arch"], "874": ["n04487081", "trolleybus"], "875": ["n04487394", "trombone"], "876": ["n04493381", "tub"], "877": ["n04501370", "turnstile"], "878": ["n04505470", "typewriter_keyboard"], "879": ["n04507155", "umbrella"], "880": ["n04509417", "unicycle"], "881": ["n04515003", "upright"], "882": ["n04517823", "vacuum"], "883": ["n04522168", "vase"], "884": ["n04523525", "vault"], "885": ["n04525038", "velvet"], "886": ["n04525305", "vending_machine"], "887": ["n04532106", "vestment"], "888": ["n04532670", "viaduct"], "889": ["n04536866", "violin"], "890": ["n04540053", "volleyball"], "891": ["n04542943", "waffle_iron"], "892": ["n04548280", "wall_clock"], "893": ["n04548362", "wallet"], "894": ["n04550184", "wardrobe"], "895": ["n04552348", "warplane"], "896": ["n04553703", "washbasin"], "897": ["n04554684", "washer"], "898": ["n04557648", "water_bottle"], "899": ["n04560804", "water_jug"], "900": 
["n04562935", "water_tower"], "901": ["n04579145", "whiskey_jug"], "902": ["n04579432", "whistle"], "903": ["n04584207", "wig"], "904": ["n04589890", "window_screen"], "905": ["n04590129", "window_shade"], "906": ["n04591157", "Windsor_tie"], "907": ["n04591713", "wine_bottle"], "908": ["n04592741", "wing"], "909": ["n04596742", "wok"], "910": ["n04597913", "wooden_spoon"], "911": ["n04599235", "wool"], "912": ["n04604644", "worm_fence"], "913": ["n04606251", "wreck"], "914": ["n04612504", "yawl"], "915": ["n04613696", "yurt"], "916": ["n06359193", "web_site"], "917": ["n06596364", "comic_book"], "918": ["n06785654", "crossword_puzzle"], "919": ["n06794110", "street_sign"], "920": ["n06874185", "traffic_light"], "921": ["n07248320", "book_jacket"], "922": ["n07565083", "menu"], "923": ["n07579787", "plate"], "924": ["n07583066", "guacamole"], "925": ["n07584110", "consomme"], "926": ["n07590611", "hot_pot"], "927": ["n07613480", "trifle"], "928": ["n07614500", "ice_cream"], "929": ["n07615774", "ice_lolly"], "930": ["n07684084", "French_loaf"], "931": ["n07693725", "bagel"], "932": ["n07695742", "pretzel"], "933": ["n07697313", "cheeseburger"], "934": ["n07697537", "hotdog"], "935": ["n07711569", "mashed_potato"], "936": ["n07714571", "head_cabbage"], "937": ["n07714990", "broccoli"], "938": ["n07715103", "cauliflower"], "939": ["n07716358", "zucchini"], "940": ["n07716906", "spaghetti_squash"], "941": ["n07717410", "acorn_squash"], "942": ["n07717556", "butternut_squash"], "943": ["n07718472", "cucumber"], "944": ["n07718747", "artichoke"], "945": ["n07720875", "bell_pepper"], "946": ["n07730033", "cardoon"], "947": ["n07734744", "mushroom"], "948": ["n07742313", "Granny_Smith"], "949": ["n07745940", "strawberry"], "950": ["n07747607", "orange"], "951": ["n07749582", "lemon"], "952": ["n07753113", "fig"], "953": ["n07753275", "pineapple"], "954": ["n07753592", "banana"], "955": ["n07754684", "jackfruit"], "956": ["n07760859", "custard_apple"], "957": ["n07768694", "pomegranate"], "958": ["n07802026", "hay"], "959": ["n07831146", "carbonara"], "960": ["n07836838", "chocolate_sauce"], "961": ["n07860988", "dough"], "962": ["n07871810", "meat_loaf"], "963": ["n07873807", "pizza"], "964": ["n07875152", "potpie"], "965": ["n07880968", "burrito"], "966": ["n07892512", "red_wine"], "967": ["n07920052", "espresso"], "968": ["n07930864", "cup"], "969": ["n07932039", "eggnog"], "970": ["n09193705", "alp"], "971": ["n09229709", "bubble"], "972": ["n09246464", "cliff"], "973": ["n09256479", "coral_reef"], "974": ["n09288635", "geyser"], "975": ["n09332890", "lakeside"], "976": ["n09399592", "promontory"], "977": ["n09421951", "sandbar"], "978": ["n09428293", "seashore"], "979": ["n09468604", "valley"], "980": ["n09472597", "volcano"], "981": ["n09835506", "ballplayer"], "982": ["n10148035", "groom"], "983": ["n10565667", "scuba_diver"], "984": ["n11879895", "rapeseed"], "985": ["n11939491", "daisy"], "986": ["n12057211", "yellow_lady's_slipper"], "987": ["n12144580", "corn"], "988": ["n12267677", "acorn"], "989": ["n12620546", "hip"], "990": ["n12768682", "buckeye"], "991": ["n12985857", "coral_fungus"], "992": ["n12998815", "agaric"], "993": ["n13037406", "gyromitra"], "994": ["n13040303", "stinkhorn"], "995": ["n13044778", "earthstar"], "996": ["n13052670", "hen-of-the-woods"], "997": ["n13054560", "bolete"], "998": ["n13133613", "ear"], "999": ["n15075141", "toilet_tissue"]}
"""
=====================================================
Optical Flow: Predicting movement with the RAFT model
=====================================================
.. note::
Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_optical_flow.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_others_plot_optical_flow.py>` to download the full example code.
Optical flow is the task of predicting movement between two images, usually two
consecutive frames of a video. Optical flow models take two images as input, and
predict a flow: the flow indicates the displacement of every single pixel in the
first image, and maps it to its corresponding pixel in the second image. Flows
are (2, H, W)-dimensional tensors, where the first axis corresponds to the
predicted horizontal and vertical displacements.
The following example illustrates how torchvision can be used to predict flows
using our implementation of the RAFT model. We will also see how to convert the
predicted flows to RGB images for visualization.
"""
import numpy as np
import torch
import matplotlib.pyplot as plt
import torchvision.transforms.functional as F
plt.rcParams["savefig.bbox"] = "tight"
# sphinx_gallery_thumbnail_number = 2
def plot(imgs, **imshow_kwargs):
if not isinstance(imgs[0], list):
# Make a 2d grid even if there's just 1 row
imgs = [imgs]
num_rows = len(imgs)
num_cols = len(imgs[0])
_, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False)
for row_idx, row in enumerate(imgs):
for col_idx, img in enumerate(row):
ax = axs[row_idx, col_idx]
img = F.to_pil_image(img.to("cpu"))
ax.imshow(np.asarray(img), **imshow_kwargs)
ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
plt.tight_layout()
# %%
# Reading Videos Using Torchvision
# --------------------------------
# We will first read a video using :func:`~torchvision.io.read_video`.
# Alternatively one can use the new :class:`~torchvision.io.VideoReader` API (if
# torchvision is built from source).
# The video we will use here is free to use from `pexels.com
# <https://www.pexels.com/video/a-man-playing-a-game-of-basketball-5192157/>`_,
# credits go to `Pavel Danilyuk <https://www.pexels.com/@pavel-danilyuk>`_.
import tempfile
from pathlib import Path
from urllib.request import urlretrieve
video_url = "https://download.pytorch.org/tutorial/pexelscom_pavel_danilyuk_basketball_hd.mp4"
video_path = Path(tempfile.mkdtemp()) / "basketball.mp4"
_ = urlretrieve(video_url, video_path)
# %%
# :func:`~torchvision.io.read_video` returns the video frames, audio frames and
# the metadata associated with the video. In our case, we only need the video
# frames.
#
# Here we will just make 2 predictions between 2 pre-selected pairs of frames,
# namely frames (100, 101) and (150, 151). Each of these pairs corresponds to a
# single model input.
from torchvision.io import read_video
frames, _, _ = read_video(str(video_path), output_format="TCHW")
img1_batch = torch.stack([frames[100], frames[150]])
img2_batch = torch.stack([frames[101], frames[151]])
plot(img1_batch)
# %%
# The RAFT model accepts RGB images. We first get the frames from
# :func:`~torchvision.io.read_video` and resize them to ensure their dimensions
# are divisible by 8. Note that we explicitly use ``antialias=False``, because
# this is how those models were trained. Then we use the transforms bundled into
# the weights in order to preprocess the input and rescale its values to the
# required ``[-1, 1]`` interval.
from torchvision.models.optical_flow import Raft_Large_Weights
weights = Raft_Large_Weights.DEFAULT
transforms = weights.transforms()
def preprocess(img1_batch, img2_batch):
img1_batch = F.resize(img1_batch, size=[520, 960], antialias=False)
img2_batch = F.resize(img2_batch, size=[520, 960], antialias=False)
return transforms(img1_batch, img2_batch)
img1_batch, img2_batch = preprocess(img1_batch, img2_batch)
print(f"shape = {img1_batch.shape}, dtype = {img1_batch.dtype}")
# %%
# Estimating Optical flow using RAFT
# ----------------------------------
# We will use our RAFT implementation from
# :func:`~torchvision.models.optical_flow.raft_large`, which follows the same
# architecture as the one described in the `original paper <https://arxiv.org/abs/2003.12039>`_.
# We also provide the :func:`~torchvision.models.optical_flow.raft_small` model
# builder, which is smaller and faster to run, sacrificing a bit of accuracy.
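#
# A minimal sketch of swapping in the smaller variant (not run here; the
# corresponding weights enum is ``Raft_Small_Weights``):
#
# from torchvision.models.optical_flow import raft_small, Raft_Small_Weights
# small_model = raft_small(weights=Raft_Small_Weights.DEFAULT, progress=False)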
from torchvision.models.optical_flow import raft_large
# If you can, run this example on a GPU, it will be a lot faster.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = raft_large(weights=Raft_Large_Weights.DEFAULT, progress=False).to(device)
model = model.eval()
list_of_flows = model(img1_batch.to(device), img2_batch.to(device))
print(f"type = {type(list_of_flows)}")
print(f"length = {len(list_of_flows)} = number of iterations of the model")
# %%
# The RAFT model outputs lists of predicted flows where each entry is a
# (N, 2, H, W) batch of predicted flows that corresponds to a given "iteration"
# in the model. For more details on the iterative nature of the model, please
# refer to the `original paper <https://arxiv.org/abs/2003.12039>`_. Here, we
# are only interested in the final predicted flows (they are the most accurate
# ones), so we will just retrieve the last item in the list.
#
# As described above, a flow is a tensor with dimensions (2, H, W) (or (N, 2, H,
# W) for batches of flows) where each entry corresponds to the horizontal and
# vertical displacement of each pixel from the first image to the second image.
# Note that the predicted flows are in "pixel" units: they are not normalized
# w.r.t. the dimensions of the images.
predicted_flows = list_of_flows[-1]
print(f"dtype = {predicted_flows.dtype}")
print(f"shape = {predicted_flows.shape} = (N, 2, H, W)")
print(f"min = {predicted_flows.min()}, max = {predicted_flows.max()}")
# %%
# Visualizing predicted flows
# ---------------------------
# Torchvision provides the :func:`~torchvision.utils.flow_to_image` utility to
# convert a flow into an RGB image. It also supports batches of flows.
# each "direction" in the flow will be mapped to a given RGB color. In the
# images below, pixels with similar colors are assumed by the model to be moving
# in similar directions. The model is properly able to predict the movement of
# the ball and the player. Note in particular the different predicted direction
# of the ball in the first image (going to the left) and in the second image
# (going up).
from torchvision.utils import flow_to_image
flow_imgs = flow_to_image(predicted_flows)
# The images have been mapped into [-1, 1] but for plotting we want them in [0, 1]
img1_batch = [(img1 + 1) / 2 for img1 in img1_batch]
grid = [[img1, flow_img] for (img1, flow_img) in zip(img1_batch, flow_imgs)]
plot(grid)
# %%
# Bonus: Creating GIFs of predicted flows
# ---------------------------------------
# In the example above we have only shown the predicted flows of 2 pairs of
# frames. A fun way to apply the Optical Flow models is to run the model on an
# entire video, and create a new video from all the predicted flows. Below is a
# snippet that can get you started with this. We comment out the code, because
# this example is being rendered on a machine without a GPU, and it would take
# too long to run it.
# from torchvision.io import write_jpeg
# for i, (img1, img2) in enumerate(zip(frames, frames[1:])):
# # Note: it would be faster to predict batches of flows instead of individual flows
# img1, img2 = preprocess(img1, img2)
# list_of_flows = model(img1.to(device), img2.to(device))
# predicted_flow = list_of_flows[-1][0]
# flow_img = flow_to_image(predicted_flow).to("cpu")
# output_folder = "/tmp/" # Update this to the folder of your choice
# write_jpeg(flow_img, output_folder + f"predicted_flow_{i}.jpg")
# %%
# Once the .jpg flow images are saved, you can convert them into a video or a
# GIF using ffmpeg with e.g.:
#
# ffmpeg -f image2 -framerate 30 -i predicted_flow_%d.jpg -loop -1 flow.gif
"""
=====================================
Repurposing masks into bounding boxes
=====================================
.. note::
Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_repurposing_annotations.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_others_plot_repurposing_annotations.py>` to download the full example code.
The following example illustrates the operations available in
the :ref:`torchvision.ops <ops>` module for repurposing
segmentation masks into object localization annotations for different tasks
(e.g. transforming masks used by instance and panoptic segmentation
methods into bounding boxes used by object detection methods).
"""
# sphinx_gallery_thumbnail_path = "../../gallery/assets/repurposing_annotations_thumbnail.png"
import os
import numpy as np
import torch
import matplotlib.pyplot as plt
import torchvision.transforms.functional as F
ASSETS_DIRECTORY = "../assets"
plt.rcParams["savefig.bbox"] = "tight"
def show(imgs):
if not isinstance(imgs, list):
imgs = [imgs]
fix, axs = plt.subplots(ncols=len(imgs), squeeze=False)
for i, img in enumerate(imgs):
img = img.detach()
img = F.to_pil_image(img)
axs[0, i].imshow(np.asarray(img))
axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
# %%
# Masks
# -----
# In tasks like instance and panoptic segmentation, masks are commonly defined, and are defined by this package,
# as a multi-dimensional array (e.g. a NumPy array or a PyTorch tensor) with the following shape:
#
# (num_objects, height, width)
#
# Where num_objects is the number of annotated objects in the image, and each (height, width) slice corresponds to exactly
# one object. For example, if your input image has dimensions 224 x 224 and contains four annotated objects, your masks
# annotation has the following shape:
#
# (4, 224, 224).
#
# A nice property of masks is that they can be easily repurposed to be used in methods to solve a variety of object
# localization tasks.
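# %%
# As a tiny illustration (dummy data, not taken from the dataset used below),
# four annotated objects on a 224 x 224 image would be stored as:
example_masks = torch.zeros((4, 224, 224), dtype=torch.bool)
print(example_masks.shape)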
# %%
# Converting Masks to Bounding Boxes
# -----------------------------------------------
# For example, the :func:`~torchvision.ops.masks_to_boxes` operation can be used to
# transform masks into bounding boxes that can be
# used as input to detection models such as FasterRCNN and RetinaNet.
# We will take images and masks from the `Penn-Fudan Dataset <https://www.cis.upenn.edu/~jshi/ped_html/>`_.
from torchvision.io import read_image
img_path = os.path.join(ASSETS_DIRECTORY, "FudanPed00054.png")
mask_path = os.path.join(ASSETS_DIRECTORY, "FudanPed00054_mask.png")
img = read_image(img_path)
mask = read_image(mask_path)
# %%
# Here the masks are read from a PNG image. Each object is encoded with a
# different pixel value, with 0 being the background.
# Notice that the spatial dimensions of the image and the mask match.
print(mask.size())
print(img.size())
print(mask)
# %%
# We get the unique colors, as these would be the object ids.
obj_ids = torch.unique(mask)
# first id is the background, so remove it.
obj_ids = obj_ids[1:]
# split the color-encoded mask into a set of boolean masks.
# Note that this snippet would work as well if the masks were float values instead of ints.
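# (Broadcasting compares the (1, H, W) mask against the (num_objects, 1, 1) ids,
# yielding a (num_objects, H, W) boolean tensor.)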
masks = mask == obj_ids[:, None, None]
# %%
# Now the masks are a boolean tensor.
# The first dimension (in this case 3) denotes the number of instances: there are 3 people in the image.
# The other two dimensions are height and width, which are equal to the dimensions of the image.
# For each instance, the boolean tensor indicates whether a particular pixel
# belongs to that instance's segmentation mask.
print(masks.size())
print(masks)
# %%
# Let us visualize an image and plot its corresponding segmentation masks.
# We will use :func:`~torchvision.utils.draw_segmentation_masks` to draw them.
from torchvision.utils import draw_segmentation_masks
drawn_masks = []
for mask in masks:
drawn_masks.append(draw_segmentation_masks(img, mask, alpha=0.8, colors="blue"))
show(drawn_masks)
# %%
# To convert the boolean masks into bounding boxes,
# we will use :func:`~torchvision.ops.masks_to_boxes` from the torchvision.ops module.
# It returns the boxes in ``(xmin, ymin, xmax, ymax)`` format.
from torchvision.ops import masks_to_boxes
boxes = masks_to_boxes(masks)
print(boxes.size())
print(boxes)
# %%
# As the shape denotes, there are 3 boxes, each in ``(xmin, ymin, xmax, ymax)`` format.
# These can be visualized very easily with the :func:`~torchvision.utils.draw_bounding_boxes` utility
# provided in :ref:`torchvision.utils <utils>`.
from torchvision.utils import draw_bounding_boxes
drawn_boxes = draw_bounding_boxes(img, boxes, colors="red")
show(drawn_boxes)
# %%
# These boxes can now directly be used by detection models in torchvision.
# Here is a demo with a Faster R-CNN model loaded from
# :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`
from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
model = fasterrcnn_resnet50_fpn(weights=weights, progress=False)
print(img.size())
transforms = weights.transforms()
img = transforms(img)
target = {}
target["boxes"] = boxes
target["labels"] = labels = torch.ones((masks.size(0),), dtype=torch.int64)
detection_outputs = model(img.unsqueeze(0), [target])
# %%
# Converting Segmentation Dataset to Detection Dataset
# ----------------------------------------------------
#
# With this utility it becomes very simple to convert a segmentation dataset to a detection dataset,
# which means we can now use a segmentation dataset to train a detection model.
# One can similarly convert a panoptic dataset to a detection dataset.
# Here is an example where we re-purpose the dataset from the
# `Penn-Fudan Detection Tutorial <https://pytorch.org/tutorials/intermediate/torchvision_tutorial.html>`_.
class SegmentationToDetectionDataset(torch.utils.data.Dataset):
def __init__(self, root, transforms):
self.root = root
self.transforms = transforms
# load all image files, sorting them to
# ensure that they are aligned
self.imgs = list(sorted(os.listdir(os.path.join(root, "PNGImages"))))
self.masks = list(sorted(os.listdir(os.path.join(root, "PedMasks"))))
def __getitem__(self, idx):
# load images and masks
img_path = os.path.join(self.root, "PNGImages", self.imgs[idx])
mask_path = os.path.join(self.root, "PedMasks", self.masks[idx])
img = read_image(img_path)
mask = read_image(mask_path)
img = F.convert_image_dtype(img, dtype=torch.float)
mask = F.convert_image_dtype(mask, dtype=torch.float)
# We get the unique colors, as these would be the object ids.
obj_ids = torch.unique(mask)
# first id is the background, so remove it.
obj_ids = obj_ids[1:]
# split the color-encoded mask into a set of boolean masks.
masks = mask == obj_ids[:, None, None]
boxes = masks_to_boxes(masks)
# there is only one class
labels = torch.ones((masks.shape[0],), dtype=torch.int64)
target = {}
target["boxes"] = boxes
target["labels"] = labels
if self.transforms is not None:
img, target = self.transforms(img, target)
return img, target
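# %%
# A minimal usage sketch (commented out because the ``PennFudanPed`` folder has
# to be downloaded separately, as in the linked tutorial):
#
# dataset = SegmentationToDetectionDataset("PennFudanPed", transforms=None)
# img, target = dataset[0]
# print(target["boxes"].shape, target["labels"])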
"""
=========================
Tensor transforms and JIT
=========================
This example illustrates various features that are now supported by the
:ref:`image transformations <transforms>` on Tensor images. In particular, we
show how image transforms can be performed on GPU, and how one can also script
them using JIT compilation.
Prior to v0.8.0, transforms in torchvision have traditionally been PIL-centric
and presented multiple limitations due to that. Now, since v0.8.0, transforms
implementations are Tensor and PIL compatible and we can achieve the following
new features:
- transform multi-band torch tensor images (with more than 3-4 channels)
- torchscript transforms together with your model for deployment
- support for GPU acceleration
- batched transformation such as for videos
- read and decode data directly as torch tensor with torchscript support (for PNG and JPEG image formats)
===================
Torchscript support
===================
.. note::
These features are only possible with **Tensor** images.
Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_scripted_tensor_transforms.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_others_plot_scripted_tensor_transforms.py>` to download the full example code.
This example illustrates `torchscript
<https://pytorch.org/docs/stable/jit.html>`_ support of the torchvision
:ref:`transforms <transforms>` on Tensor images.
"""
# %%
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchvision.transforms as T
from torchvision.io import read_image
import torch.nn as nn
import torchvision.transforms as v1
from torchvision.io import read_image
plt.rcParams["savefig.bbox"] = 'tight'
torch.manual_seed(1)
# If you're trying to run this on Colab, you can download the assets and the
# helpers from https://github.com/pytorch/vision/tree/main/gallery/
import sys
sys.path += ["../transforms"]
from helpers import plot
ASSETS_PATH = Path('../assets')
def show(imgs):
fix, axs = plt.subplots(ncols=len(imgs), squeeze=False)
for i, img in enumerate(imgs):
img = T.ToPILImage()(img.to('cpu'))
axs[0, i].imshow(np.asarray(img))
axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
# %%
# Most transforms support torchscript. For composing transforms, we use
# :class:`torch.nn.Sequential` instead of
# :class:`~torchvision.transforms.v2.Compose`:
####################################
# The :func:`~torchvision.io.read_image` function allows us to read an image and
# directly load it as a tensor.
dog1 = read_image(str(Path('assets') / 'dog1.jpg'))
dog2 = read_image(str(Path('assets') / 'dog2.jpg'))
show([dog1, dog2])
####################################
# Transforming images on GPU
# --------------------------
# Most transforms natively support tensors on top of PIL images (to visualize
# the effect of the transforms, you may refer to
# :ref:`sphx_glr_auto_examples_plot_transforms.py`).
# Using tensor images, we can run the transforms on GPUs if CUDA is available!
import torch.nn as nn
dog1 = read_image(str(ASSETS_PATH / 'dog1.jpg'))
dog2 = read_image(str(ASSETS_PATH / 'dog2.jpg'))
transforms = torch.nn.Sequential(
T.RandomCrop(224),
T.RandomHorizontalFlip(p=0.3),
v1.RandomCrop(224),
v1.RandomHorizontalFlip(p=0.3),
)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
dog1 = dog1.to(device)
dog2 = dog2.to(device)
scripted_transforms = torch.jit.script(transforms)
transformed_dog1 = transforms(dog1)
transformed_dog2 = transforms(dog2)
show([transformed_dog1, transformed_dog2])
plot([dog1, scripted_transforms(dog1), dog2, scripted_transforms(dog2)])
####################################
# Scriptable transforms for easier deployment via torchscript
# -----------------------------------------------------------
# We now show how to combine image transformations and a model forward pass,
# while using ``torch.jit.script`` to obtain a single scripted module.
# %%
# .. warning::
#
# Above we have used transforms from the ``torchvision.transforms``
# namespace, i.e. the "v1" transforms. The v2 transforms from the
# ``torchvision.transforms.v2`` namespace are the :ref:`recommended
# <v1_or_v2>` way to use transforms in your code.
#
# The v2 transforms also support torchscript, but if you call
# ``torch.jit.script()`` on a v2 **class** transform, you'll actually end up
# with its (scripted) v1 equivalent. This may lead to slightly different
# results between the scripted and eager executions due to implementation
# differences between v1 and v2.
#
# If you really need torchscript support for the v2 transforms, **we
# recommend scripting the functionals** from the
# ``torchvision.transforms.v2.functional`` namespace to avoid surprises.
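#
# A minimal sketch of that recommendation (assuming a torchvision build that
# ships the v2 API; not run in this example):
#
# import torchvision.transforms.v2.functional as F_v2
# scripted_resize = torch.jit.script(F_v2.resize)
# out = scripted_resize(dog1, size=[224, 224])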
#
# Below we now show how to combine image transformations and a model forward
# pass, while using ``torch.jit.script`` to obtain a single scripted module.
#
# Let's define a ``Predictor`` module that transforms the input tensor and then
# applies an ImageNet model on it.
from torchvision.models import resnet18
from torchvision.models import resnet18, ResNet18_Weights
class Predictor(nn.Module):
def __init__(self):
super().__init__()
self.resnet18 = resnet18(pretrained=True, progress=False).eval()
self.transforms = nn.Sequential(
T.Resize([256, ]), # We use single int value inside a list due to torchscript type restrictions
T.CenterCrop(224),
T.ConvertImageDtype(torch.float),
T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
)
weights = ResNet18_Weights.DEFAULT
self.resnet18 = resnet18(weights=weights, progress=False).eval()
self.transforms = weights.transforms(antialias=True)
def forward(self, x: torch.Tensor) -> torch.Tensor:
with torch.no_grad():
@@ -107,10 +94,12 @@ class Predictor(nn.Module):
return y_pred.argmax(dim=1)
####################################
# %%
# Now, let's define scripted and non-scripted instances of ``Predictor`` and
# apply it on multiple tensor images of the same size
device = "cuda" if torch.cuda.is_available() else "cpu"
predictor = Predictor().to(device)
scripted_predictor = torch.jit.script(predictor).to(device)
@@ -119,21 +108,21 @@ batch = torch.stack([dog1, dog2]).to(device)
res = predictor(batch)
res_scripted = scripted_predictor(batch)
####################################
# %%
# We can verify that the predictions of the scripted and non-scripted models are
# the same:
import json
with open(Path('assets') / 'imagenet_class_index.json', 'r') as labels_file:
with open(Path('../assets') / 'imagenet_class_index.json') as labels_file:
labels = json.load(labels_file)
for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)):
assert pred == pred_scripted
print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}")
####################################
# Since the model is scripted, it can be easily dumped on disk an re-used
# %%
# Since the model is scripted, it can be easily dumped on disk and re-used
import tempfile
@@ -143,3 +132,5 @@ with tempfile.NamedTemporaryFile() as f:
dumped_scripted_predictor = torch.jit.load(f.name)
res_scripted_dumped = dumped_scripted_predictor(batch)
assert (res_scripted_dumped == res_scripted).all()
# %%
"""
=========
Video API
=========
.. note::
Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_video_api.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_others_plot_video_api.py>` to download the full example code.
This example illustrates some of the APIs that torchvision offers for
videos, together with the examples on how to build datasets and more.
"""
# %%
# 1. Introduction: building a new video object and examining the properties
# -------------------------------------------------------------------------
# First we select a video to test the object out. For the sake of argument
# we're using one from the kinetics400 dataset.
# To create it, we need to define the path and the stream we want to use.
# %%
# Chosen video statistics:
#
# - WUzgd7C1pWA.mp4
# - source:
# - kinetics-400
# - video:
# - H-264
# - MPEG-4 AVC (part 10) (avc1)
# - fps: 29.97
# - audio:
# - MPEG AAC audio (mp4a)
# - sample rate: 48 kHz
#
import torch
import torchvision
from torchvision.datasets.utils import download_url
torchvision.set_video_backend("video_reader")
# Download the sample video
download_url(
"https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true",
".",
"WUzgd7C1pWA.mp4"
)
video_path = "./WUzgd7C1pWA.mp4"
# %%
# Streams are defined in a similar fashion as torch devices. We encode them as strings in a form
# of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int.
# The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered.
# Firstly, let's get the metadata for our particular video:
stream = "video"
video = torchvision.io.VideoReader(video_path, stream)
video.get_metadata()
# %%
# Here we can see that the video has two streams - a video and an audio stream.
# Currently available stream types include ['video', 'audio'].
# Each descriptor consists of two parts: the stream type (e.g. 'video') and a unique stream id
# (which is determined by the video encoding).
# In this way, if the video container contains multiple streams of the same type,
# users can access the one they want.
# If only the stream type is passed, the decoder auto-detects the first stream of that type and returns it.
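# %%
# For example, to explicitly request the first video stream we could pass the full
# descriptor (a small illustrative sketch; it is equivalent to what the
# auto-detection resolves to here):
#
# video = torchvision.io.VideoReader(video_path, "video:0")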
# %%
# Let's read all the frames from the video stream. By default, the return value of
# ``next(video_reader)`` is a dict containing the following fields.
#
# The return fields are:
#
# - ``data``: containing a torch.tensor
# - ``pts``: containing a float timestamp of this particular frame
metadata = video.get_metadata()
video.set_current_stream("audio")
frames = [] # we are going to save the frames here.
ptss = [] # pts is a presentation timestamp in seconds (float) of each frame
for frame in video:
frames.append(frame['data'])
ptss.append(frame['pts'])
print("PTS for first five frames ", ptss[:5])
print("Total number of frames: ", len(frames))
approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0]
print("Approx total number of datapoints we can expect: ", approx_nf)
print("Read data size: ", frames[0].size(0) * len(frames))
# %%
# But what if we only want to read a certain time segment of the video?
# That can be done easily using the combination of our ``seek`` function, and the fact that each call
# to next returns the presentation timestamp of the returned frame in seconds.
#
# Given that our implementation relies on python iterators,
# we can leverage itertools to simplify the process and make it more pythonic.
#
# For example, if we wanted to read ten frames starting from the second second:
import itertools
video.set_current_stream("video")
frames = [] # we are going to save the frames here.
# We seek to the second second of the video and use islice to get 10 frames from there
for frame, pts in itertools.islice(video.seek(2), 10):
frames.append(frame)
print("Total number of frames: ", len(frames))
# %%
# Or if we wanted to read from the 2nd to the 5th second:
# we seek to the second second of the video,
# then we use itertools.takewhile to get the
# correct number of frames:
video.set_current_stream("video")
frames = [] # we are going to save the frames here.
video = video.seek(2)
for frame in itertools.takewhile(lambda x: x['pts'] <= 5, video):
frames.append(frame['data'])
print("Total number of frames: ", len(frames))
approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0]
print("We can expect approx: ", approx_nf)
print("Tensor size: ", frames[0].size())
# %%
# 2. Building a sample read_video function
# ----------------------------------------------------------------------------------------
# We can utilize the methods above to build the read video function that follows
# the same API as the existing ``read_video`` function.
def example_read_video(video_object, start=0, end=None, read_video=True, read_audio=True):
if end is None:
end = float("inf")
if end < start:
raise ValueError(
"end time should be larger than start time, got "
f"start time={start} and end time={end}"
)
video_frames = torch.empty(0)
video_pts = []
if read_video:
video_object.set_current_stream("video")
frames = []
for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)):
frames.append(frame['data'])
video_pts.append(frame['pts'])
if len(frames) > 0:
video_frames = torch.stack(frames, 0)
audio_frames = torch.empty(0)
audio_pts = []
if read_audio:
video_object.set_current_stream("audio")
frames = []
for frame in itertools.takewhile(lambda x: x['pts'] <= end, video_object.seek(start)):
frames.append(frame['data'])
audio_pts.append(frame['pts'])
if len(frames) > 0:
audio_frames = torch.cat(frames, 0)
return video_frames, audio_frames, (video_pts, audio_pts), video_object.get_metadata()
# Total number of frames should be 327 for video and 523264 datapoints for audio
vf, af, info, meta = example_read_video(video)
print(vf.size(), af.size())
# %%
# 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400)
# -------------------------------------------------------------------------------------------------------
# Cool, so now we can use the same principle to make a sample dataset.
# We suggest trying out an iterable dataset for this purpose.
# Here, we are going to build an example dataset that reads 10 randomly selected frames of video.
# %%
# Make sample dataset
import os
os.makedirs("./dataset", exist_ok=True)
os.makedirs("./dataset/1", exist_ok=True)
os.makedirs("./dataset/2", exist_ok=True)
# %%
# Download the videos
from torchvision.datasets.utils import download_url
download_url(
"https://github.com/pytorch/vision/blob/main/test/assets/videos/WUzgd7C1pWA.mp4?raw=true",
"./dataset/1", "WUzgd7C1pWA.mp4"
)
download_url(
"https://github.com/pytorch/vision/blob/main/test/assets/videos/RATRACE_wave_f_nm_np1_fr_goo_37.avi?raw=true",
"./dataset/1",
"RATRACE_wave_f_nm_np1_fr_goo_37.avi"
)
download_url(
"https://github.com/pytorch/vision/blob/main/test/assets/videos/SOX5yA1l24A.mp4?raw=true",
"./dataset/2",
"SOX5yA1l24A.mp4"
)
download_url(
"https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g23_c01.avi?raw=true",
"./dataset/2",
"v_SoccerJuggling_g23_c01.avi"
)
download_url(
"https://github.com/pytorch/vision/blob/main/test/assets/videos/v_SoccerJuggling_g24_c01.avi?raw=true",
"./dataset/2",
"v_SoccerJuggling_g24_c01.avi"
)
# %%
# Housekeeping and utilities
import os
import random
from torchvision.datasets.folder import make_dataset
from torchvision import transforms as t
def _find_classes(dir):
classes = [d.name for d in os.scandir(dir) if d.is_dir()]
classes.sort()
class_to_idx = {cls_name: i for i, cls_name in enumerate(classes)}
return classes, class_to_idx
def get_samples(root, extensions=(".mp4", ".avi")):
_, class_to_idx = _find_classes(root)
return make_dataset(root, class_to_idx, extensions=extensions)
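# %%
# As a quick peek (an addition for illustration), ``get_samples`` returns a list
# of ``(path, class_index)`` tuples:
print(get_samples("./dataset")[:2])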
# %%
# We are going to define the dataset and some basic arguments.
# We assume the structure of the FolderDataset, and add the following parameters:
#
# - ``clip_len``: length of a clip in frames
# - ``frame_transform``: transform for every frame individually
# - ``video_transform``: transform on a video sequence
#
# .. note::
# We actually add an epoch size, as using the :func:`~torch.utils.data.IterableDataset`
# class allows us to naturally oversample clips or images from each video if needed.
class RandomDataset(torch.utils.data.IterableDataset):
def __init__(self, root, epoch_size=None, frame_transform=None, video_transform=None, clip_len=16):
super().__init__()
self.samples = get_samples(root)
# Allow for temporal jittering
if epoch_size is None:
epoch_size = len(self.samples)
self.epoch_size = epoch_size
self.clip_len = clip_len
self.frame_transform = frame_transform
self.video_transform = video_transform
def __iter__(self):
for i in range(self.epoch_size):
# Get random sample
path, target = random.choice(self.samples)
# Get video object
vid = torchvision.io.VideoReader(path, "video")
metadata = vid.get_metadata()
video_frames = [] # video frame buffer
# Seek and return frames
max_seek = metadata["video"]['duration'][0] - (self.clip_len / metadata["video"]['fps'][0])
start = random.uniform(0., max_seek)
for frame in itertools.islice(vid.seek(start), self.clip_len):
video_frames.append(self.frame_transform(frame['data']))
current_pts = frame['pts']
# Stack it into a tensor
video = torch.stack(video_frames, 0)
if self.video_transform:
video = self.video_transform(video)
output = {
'path': path,
'video': video,
'target': target,
'start': start,
'end': current_pts}
yield output
# %%
# Given a path of videos in a folder structure, i.e.:
#
# - dataset
# - class 1
# - file 0
# - file 1
# - ...
# - class 2
# - file 0
# - file 1
# - ...
# - ...
#
# We can generate a dataloader and test the dataset.
transforms = [t.Resize((112, 112))]
frame_transform = t.Compose(transforms)
dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform)
# %%
from torch.utils.data import DataLoader
loader = DataLoader(dataset, batch_size=12)
data = {"video": [], 'start': [], 'end': [], 'tensorsize': []}
for batch in loader:
for i in range(len(batch['path'])):
data['video'].append(batch['path'][i])
data['start'].append(batch['start'][i].item())
data['end'].append(batch['end'][i].item())
data['tensorsize'].append(batch['video'][i].size())
print(data)
# %%
# 4. Data Visualization
# ----------------------------------
# Example of visualized video
import matplotlib.pyplot as plt
plt.figure(figsize=(12, 12))
for i in range(16):
plt.subplot(4, 4, i + 1)
plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0))
plt.axis("off")
# %%
# Clean up the video and dataset:
import os
import shutil
os.remove("./WUzgd7C1pWA.mp4")
shutil.rmtree("./dataset")
@@ -3,10 +3,15 @@
Visualization utilities
=======================
.. note::
Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_visualization_utils.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_others_plot_visualization_utils.py>` to download the full example code.
This example illustrates some of the utilities that torchvision offers for
visualizing images, bounding boxes, and segmentation masks.
visualizing images, bounding boxes, segmentation masks and keypoints.
"""
# sphinx_gallery_thumbnail_path = "../../gallery/assets/visualization_utils_thumbnail2.png"
import torch
import numpy as np
@@ -21,7 +26,7 @@ plt.rcParams["savefig.bbox"] = 'tight'
def show(imgs):
if not isinstance(imgs, list):
imgs = [imgs]
fix, axs = plt.subplots(ncols=len(imgs), squeeze=False)
fig, axs = plt.subplots(ncols=len(imgs), squeeze=False)
for i, img in enumerate(imgs):
img = img.detach()
img = F.to_pil_image(img)
@@ -29,7 +34,7 @@ def show(imgs):
axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])
####################################
# %%
# Visualizing a grid of images
# ----------------------------
# The :func:`~torchvision.utils.make_grid` function can be used to create a
@@ -40,13 +45,14 @@ from torchvision.utils import make_grid
from torchvision.io import read_image
from pathlib import Path
dog1_int = read_image(str(Path('assets') / 'dog1.jpg'))
dog2_int = read_image(str(Path('assets') / 'dog2.jpg'))
dog1_int = read_image(str(Path('../assets') / 'dog1.jpg'))
dog2_int = read_image(str(Path('../assets') / 'dog2.jpg'))
dog_list = [dog1_int, dog2_int]
grid = make_grid([dog1_int, dog2_int, dog1_int, dog2_int])
grid = make_grid(dog_list)
show(grid)
####################################
# %%
# Visualizing bounding boxes
# --------------------------
# We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an
@@ -62,41 +68,39 @@ result = draw_bounding_boxes(dog1_int, boxes, colors=colors, width=5)
show(result)
#####################################
# %%
# Naturally, we can also plot bounding boxes produced by torchvision detection
# models. Here is demo with a Faster R-CNN model loaded from
# models. Here is a demo with a Faster R-CNN model loaded from
# :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`
# model. You can also try using a RetinaNet with
# :func:`~torchvision.models.detection.retinanet_resnet50_fpn`, an SSDlite with
# :func:`~torchvision.models.detection.ssdlite320_mobilenet_v3_large` or an SSD with
# :func:`~torchvision.models.detection.ssd300_vgg16`. For more details
# on the output of such models, you may refer to :ref:`instance_seg_output`.
# model. For more details on the output of such models, you may
# refer to :ref:`instance_seg_output`.
from torchvision.models.detection import fasterrcnn_resnet50_fpn, FasterRCNN_ResNet50_FPN_Weights
from torchvision.models.detection import fasterrcnn_resnet50_fpn
from torchvision.transforms.functional import convert_image_dtype
weights = FasterRCNN_ResNet50_FPN_Weights.DEFAULT
transforms = weights.transforms()
batch_int = torch.stack([dog1_int, dog2_int])
batch = convert_image_dtype(batch_int, dtype=torch.float)
images = [transforms(d) for d in dog_list]
model = fasterrcnn_resnet50_fpn(pretrained=True, progress=False)
model = fasterrcnn_resnet50_fpn(weights=weights, progress=False)
model = model.eval()
outputs = model(batch)
outputs = model(images)
print(outputs)
#####################################
# %%
# Let's plot the boxes detected by our model. We will only plot the boxes with a
# score greater than a given threshold.
score_threshold = .8
dogs_with_boxes = [
draw_bounding_boxes(dog_int, boxes=output['boxes'][output['scores'] > score_threshold], width=4)
for dog_int, output in zip(batch_int, outputs)
for dog_int, output in zip(dog_list, outputs)
]
show(dogs_with_boxes)
#####################################
# %%
# Visualizing segmentation masks
# ------------------------------
# The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to
@@ -110,26 +114,22 @@ show(dogs_with_boxes)
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# We will see how to use it with torchvision's FCN Resnet-50, loaded with
# :func:`~torchvision.models.segmentation.fcn_resnet50`. You can also try using
# DeepLabv3 (:func:`~torchvision.models.segmentation.deeplabv3_resnet50`) or
# lraspp mobilenet models
# (:func:`~torchvision.models.segmentation.lraspp_mobilenet_v3_large`).
#
# Let's start by looking at the ouput of the model. Remember that in general,
# images must be normalized before they're passed to a semantic segmentation
# model.
# :func:`~torchvision.models.segmentation.fcn_resnet50`. Let's start by looking
# at the output of the model.
from torchvision.models.segmentation import fcn_resnet50
from torchvision.models.segmentation import fcn_resnet50, FCN_ResNet50_Weights
weights = FCN_ResNet50_Weights.DEFAULT
transforms = weights.transforms(resize_size=None)
model = fcn_resnet50(pretrained=True, progress=False)
model = fcn_resnet50(weights=weights, progress=False)
model = model.eval()
normalized_batch = F.normalize(batch, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
output = model(normalized_batch)['out']
batch = torch.stack([transforms(d) for d in dog_list])
output = model(batch)['out']
print(output.shape, output.min().item(), output.max().item())
#####################################
# %%
# As we can see above, the output of the segmentation model is a tensor of shape
# ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score, and
# we can normalize them into ``[0, 1]`` by using a softmax. After the softmax,
@@ -139,24 +139,19 @@ print(output.shape, output.min().item(), output.max().item())
# Let's plot the masks that have been detected for the dog class and for the
# boat class:
sem_classes = [
'__background__', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
]
sem_class_to_idx = {cls: idx for (idx, cls) in enumerate(sem_classes)}
sem_class_to_idx = {cls: idx for (idx, cls) in enumerate(weights.meta["categories"])}
normalized_masks = torch.nn.functional.softmax(output, dim=1)
dog_and_boat_masks = [
normalized_masks[img_idx, sem_class_to_idx[cls]]
for img_idx in range(batch.shape[0])
for img_idx in range(len(dog_list))
for cls in ('dog', 'boat')
]
show(dog_and_boat_masks)
#####################################
# %%
# As expected, the model is confident about the dog class, but not so much for
# the boat class.
#
@@ -171,7 +166,7 @@ print(f"shape = {boolean_dog_masks.shape}, dtype = {boolean_dog_masks.dtype}")
show([m.float() for m in boolean_dog_masks])
#####################################
# %%
# The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you
# can read it as the following query: "For which pixels is 'dog' the most likely
# class?"
@@ -189,15 +184,15 @@ from torchvision.utils import draw_segmentation_masks
dogs_with_masks = [
draw_segmentation_masks(img, masks=mask, alpha=0.7)
for img, mask in zip(batch_int, boolean_dog_masks)
for img, mask in zip(dog_list, boolean_dog_masks)
]
show(dogs_with_masks)
#####################################
# %%
# We can plot more than one mask per image! Remember that the model returned as
# many masks as there are classes. Let's ask the same query as above, but this
# time for *all* classes, not just the dog class: "For each pixel and each class
# C, is class C the most most likely class?"
# C, is class C the most likely class?"
#
# This one is a bit more involved, so we'll first show how to do it with a
# single image, and then we'll generalize to the batch
@@ -213,7 +208,7 @@ print(f"dog1_all_classes_masks = {dog1_all_classes_masks.shape}, dtype = {dog1_a
dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.6)
show(dog_with_all_masks)
#####################################
# %%
# We can see in the image above that only 2 masks were drawn: the mask for the
# background and the mask for the dog. This is because the model thinks that
# only these 2 classes are the most likely ones across all the pixels. If the
@@ -235,12 +230,12 @@ all_classes_masks = all_classes_masks.swapaxes(0, 1)
dogs_with_masks = [
draw_segmentation_masks(img, masks=mask, alpha=.6)
for img, mask in zip(batch_int, all_classes_masks)
for img, mask in zip(dog_list, all_classes_masks)
]
show(dogs_with_masks)
#####################################
# %%
# .. _instance_seg_output:
#
# Instance segmentation models
@@ -261,14 +256,20 @@ show(dogs_with_masks)
# of them may not have masks, like
# :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn`.
from torchvision.models.detection import maskrcnn_resnet50_fpn
model = maskrcnn_resnet50_fpn(pretrained=True, progress=False)
from torchvision.models.detection import maskrcnn_resnet50_fpn, MaskRCNN_ResNet50_FPN_Weights
weights = MaskRCNN_ResNet50_FPN_Weights.DEFAULT
transforms = weights.transforms()
images = [transforms(d) for d in dog_list]
model = maskrcnn_resnet50_fpn(weights=weights, progress=False)
model = model.eval()
output = model(batch)
output = model(images)
print(output)
#####################################
# %%
# Let's break this down. For each image in the batch, the model outputs some
# detections (or instances). The number of detections varies for each input
# image. Each instance is described by its bounding box, its label, its score
@@ -291,33 +292,16 @@ dog1_masks = dog1_output['masks']
print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, "
f"min = {dog1_masks.min()}, max = {dog1_masks.max()}")
#####################################
# Here the masks corresponds to probabilities indicating, for each pixel, how
# %%
# Here the masks correspond to probabilities indicating, for each pixel, how
# likely it is to belong to the predicted label of that instance. Those
# predicted labels correspond to the 'labels' element in the same output dict.
# Let's see which labels were predicted for the instances of the first image.
inst_classes = [
'__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]
inst_class_to_idx = {cls: idx for (idx, cls) in enumerate(inst_classes)}
print("For the first dog, the following instances were detected:")
print([inst_classes[label] for label in dog1_output['labels']])
print([weights.meta["categories"][label] for label in dog1_output['labels']])
#####################################
# %%
# Interestingly, the model detects two persons in the image. Let's go ahead and
# plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks`
# expects boolean masks, we need to convert those probabilities into boolean
@@ -335,15 +319,15 @@ dog1_bool_masks = dog1_bool_masks.squeeze(1)
show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.9))
#####################################
# %%
# The model seems to have properly detected the dog, but it also confused trees
# with people. Looking more closely at the scores will help us plotting more
# with people. Looking more closely at the scores will help us plot more
# relevant masks:
print(dog1_output['scores'])
#####################################
# Clearly the model is less confident about the dog detection than it is about
# %%
# Clearly the model is more confident about the dog detection than it is about
# the people detections. That's good news. When plotting the masks, we can ask
# for only those that have a good score. Let's use a score threshold of .75
# here, and also plot the masks of the second dog.
@@ -357,11 +341,182 @@ boolean_masks = [
dogs_with_masks = [
draw_segmentation_masks(img, mask.squeeze(1))
for img, mask in zip(batch_int, boolean_masks)
for img, mask in zip(dog_list, boolean_masks)
]
show(dogs_with_masks)
#####################################
# %%
# The two 'people' masks in the first image were not selected because they have
# a lower score than the score threshold. Similarly in the second image, the
# a lower score than the score threshold. Similarly, in the second image, the
# instance with class 15 (which corresponds to 'bench') was not selected.
# %%
# .. _keypoint_output:
#
# Visualizing keypoints
# ------------------------------
# The :func:`~torchvision.utils.draw_keypoints` function can be used to
# draw keypoints on images. We will see how to use it with
# torchvision's KeypointRCNN loaded with :func:`~torchvision.models.detection.keypointrcnn_resnet50_fpn`.
# We will first have a look at the output of the model.
#
from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights
from torchvision.io import read_image
person_int = read_image(str(Path("../assets") / "person1.jpg"))
weights = KeypointRCNN_ResNet50_FPN_Weights.DEFAULT
transforms = weights.transforms()
person_float = transforms(person_int)
model = keypointrcnn_resnet50_fpn(weights=weights, progress=False)
model = model.eval()
outputs = model([person_float])
print(outputs)
# %%
# As we see, the output contains a list of dictionaries.
# The output list is of length batch_size.
# We currently have just a single image, so the length of the list is 1.
# Each entry in the list corresponds to an input image,
# and it is a dict with keys `boxes`, `labels`, `scores`, `keypoints` and `keypoint_scores`.
# Each value associated with those keys has `num_instances` elements in it.
# In our case above, there are 2 instances detected in the image.
kpts = outputs[0]['keypoints']
scores = outputs[0]['scores']
print(kpts)
print(scores)
# %%
# The KeypointRCNN model detects that there are two instances in the image.
# If you plot the boxes by using :func:`~draw_bounding_boxes`
# you would recognize that they are the person and the surfboard
# (a sketch of this is shown after the next code block).
# If we look at the scores, we will realize that the model is much more confident about the person than about the surfboard.
# We can now set a confidence threshold and plot only the instances we are confident enough about.
# Let us set a threshold of 0.75 and select the keypoints corresponding to the person.
detect_threshold = 0.75
idx = torch.where(scores > detect_threshold)
keypoints = kpts[idx]
print(keypoints)
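# %%
# As mentioned above, we can double check that these two instances are indeed the
# person and the surfboard by drawing their boxes. This is a hedged sketch, not
# part of the original example; it only reuses the model output from above.
from torchvision.utils import draw_bounding_boxes

boxes_vis = draw_bounding_boxes(
    person_int,
    outputs[0]['boxes'],
    labels=[f"{s.item():.2f}" for s in scores],  # annotate each box with its score
    colors="red",
    width=3,
)
show(boxes_vis)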
# %%
# Great, now we have the keypoints corresponding to the person.
# Each keypoint is represented by x, y coordinates and the visibility.
# We can now use the :func:`~torchvision.utils.draw_keypoints` function to draw keypoints.
# Note that the utility expects uint8 images.
from torchvision.utils import draw_keypoints
res = draw_keypoints(person_int, keypoints, colors="blue", radius=3)
show(res)
# %%
# As we see, the keypoints appear as colored circles over the image.
# The COCO keypoints for a person are ordered and represent the following list.
coco_keypoints = [
"nose", "left_eye", "right_eye", "left_ear", "right_ear",
"left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
"left_wrist", "right_wrist", "left_hip", "right_hip",
"left_knee", "right_knee", "left_ankle", "right_ankle",
]
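# %%
# As a quick, hedged check (not part of the original example), we can pair each
# keypoint name with the coordinates detected for the person above:
for name, (x, y, vis) in zip(coco_keypoints, keypoints[0]):
    print(f"{name:>15}: ({x.item():.1f}, {y.item():.1f})")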
# %%
# What if we are interested in joining the keypoints?
# This is especially useful for creating pose detection or action recognition pipelines.
# We can join the keypoints easily using the `connectivity` parameter.
# A closer look reveals that we need to join the points in the following
# order to construct a human skeleton.
#
# nose -> left_eye -> left_ear. (0, 1), (1, 3)
#
# nose -> right_eye -> right_ear. (0, 2), (2, 4)
#
# nose -> left_shoulder -> left_elbow -> left_wrist. (0, 5), (5, 7), (7, 9)
#
# nose -> right_shoulder -> right_elbow -> right_wrist. (0, 6), (6, 8), (8, 10)
#
# left_shoulder -> left_hip -> left_knee -> left_ankle. (5, 11), (11, 13), (13, 15)
#
# right_shoulder -> right_hip -> right_knee -> right_ankle. (6, 12), (12, 14), (14, 16)
#
# We will create a list containing these keypoint ids to be connected.
connect_skeleton = [
(0, 1), (0, 2), (1, 3), (2, 4), (0, 5), (0, 6), (5, 7), (6, 8),
(7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), (13, 15), (14, 16)
]
# %%
# We pass the above list to the connectivity parameter to connect the keypoints.
#
res = draw_keypoints(person_int, keypoints, connectivity=connect_skeleton, colors="blue", radius=4, width=3)
show(res)
# %%
# That looks pretty good.
#
# .. _draw_keypoints_with_visibility:
#
# Drawing Keypoints with Visibility
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Let's have a look at the results that another keypoint prediction module produced, and show the connectivity:
prediction = torch.tensor(
[[[208.0176, 214.2409, 1.0000],
[000.0000, 000.0000, 0.0000],
[197.8246, 210.6392, 1.0000],
[000.0000, 000.0000, 0.0000],
[178.6378, 217.8425, 1.0000],
[221.2086, 253.8591, 1.0000],
[160.6502, 269.4662, 1.0000],
[243.9929, 304.2822, 1.0000],
[138.4654, 328.8935, 1.0000],
[277.5698, 340.8990, 1.0000],
[153.4551, 374.5145, 1.0000],
[000.0000, 000.0000, 0.0000],
[226.0053, 370.3125, 1.0000],
[221.8081, 455.5516, 1.0000],
[273.9723, 448.9486, 1.0000],
[193.6275, 546.1933, 1.0000],
[273.3727, 545.5930, 1.0000]]]
)
res = draw_keypoints(person_int, prediction, connectivity=connect_skeleton, colors="blue", radius=4, width=3)
show(res)
# %%
# What happened there?
# The model, which predicted the new keypoints,
# can't detect the three points that are hidden on the upper left body of the skateboarder.
# More precisely, the model predicted that `(x, y, vis) = (0, 0, 0)` for the left_eye, left_ear, and left_hip.
# So we definitely don't want to display those keypoints and connections, and you don't have to.
# Looking at the parameters of :func:`~torchvision.utils.draw_keypoints`,
# we can see that we can pass a visibility tensor as an additional argument.
# Given the model's prediction, the visibility is the third keypoint dimension; we just need to extract it.
# Let's split the ``prediction`` into the keypoint coordinates and their respective visibility,
# and pass both of them as arguments to :func:`~torchvision.utils.draw_keypoints`.
coordinates, visibility = prediction.split([2, 1], dim=-1)
visibility = visibility.bool()
res = draw_keypoints(
person_int, coordinates, visibility=visibility, connectivity=connect_skeleton, colors="blue", radius=4, width=3
)
show(res)
# %%
# We can see that the undetected keypoints are not drawn and the connections involving invisible keypoints were skipped.
# This can reduce the noise on images with multiple detections, or in cases like ours,
# when the keypoint-prediction model missed some detections.
# Most torch keypoint-prediction models return the visibility for every prediction, ready for you to use it.
# The :func:`~torchvision.models.detection.keypointrcnn_resnet50_fpn` model,
# which we used in the first case, does so too.
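# %%
# As a hedged sketch (not part of the original example), the same split works on
# the KeypointRCNN output from earlier, since its keypoints are also laid out as
# ``(x, y, vis)``:
kpt_coords, kpt_vis = kpts[idx].split([2, 1], dim=-1)
res = draw_keypoints(
    person_int, kpt_coords, visibility=kpt_vis.bool(),
    connectivity=connect_skeleton, colors="blue", radius=4, width=3
)
show(res)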
.. _transforms_gallery:
Transforms
----------
import matplotlib.pyplot as plt
import torch
from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F
def plot(imgs, row_title=None, **imshow_kwargs):
    if not isinstance(imgs[0], list):
        # Make a 2d grid even if there's just 1 row
        imgs = [imgs]

    num_rows = len(imgs)
    num_cols = len(imgs[0])
    _, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False)
    for row_idx, row in enumerate(imgs):
        for col_idx, img in enumerate(row):
            boxes = None
            masks = None
            if isinstance(img, tuple):
                img, target = img
                if isinstance(target, dict):
                    boxes = target.get("boxes")
                    masks = target.get("masks")
                elif isinstance(target, tv_tensors.BoundingBoxes):
                    boxes = target
                else:
                    raise ValueError(f"Unexpected target type: {type(target)}")
            img = F.to_image(img)
            if img.dtype.is_floating_point and img.min() < 0:
                # Poor man's re-normalization for the colors to be OK-ish. This
                # is useful for images coming out of Normalize()
                img -= img.min()
                img /= img.max()

            img = F.to_dtype(img, torch.uint8, scale=True)
            if boxes is not None:
                img = draw_bounding_boxes(img, boxes, colors="yellow", width=3)
            if masks is not None:
                img = draw_segmentation_masks(img, masks.to(torch.bool), colors=["green"] * masks.shape[0], alpha=.65)

            ax = axs[row_idx, col_idx]
            ax.imshow(img.permute(1, 2, 0).numpy(), **imshow_kwargs)
            ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

    if row_title is not None:
        for row_idx in range(num_rows):
            axs[row_idx, 0].set(ylabel=row_title[row_idx])

    plt.tight_layout()
"""
===================================
How to write your own v2 transforms
===================================
.. note::
Try on `collab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_custom_transforms.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_custom_transforms.py>` to download the full example code.
This guide explains how to write transforms that are compatible with the
torchvision transforms V2 API.
"""
# %%
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2
# %%
# Just create a ``nn.Module`` and override the ``forward`` method
# ===============================================================
#
# In most cases, this is all you're going to need, as long as you already know
# the structure of the input that your transform will expect. For example if
# you're just doing image classification, your transform will typically accept a
# single image as input, or a ``(img, label)`` input. So you can just hard-code
# your ``forward`` method to accept just that, e.g.
#
# .. code:: python
#
#     class MyCustomTransform(torch.nn.Module):
#         def forward(self, img, label):
#             # Do some transformations
#             return new_img, new_label
#
# .. note::
#
# This means that if you have a custom transform that is already compatible
# with the V1 transforms (those in ``torchvision.transforms``), it will
# still work with the V2 transforms without any change!
#
# We will illustrate this more completely below with a typical detection case,
# where our samples are just images, bounding boxes and labels:
class MyCustomTransform(torch.nn.Module):
    def forward(self, img, bboxes, label):  # we assume inputs are always structured like this
        print(
            f"I'm transforming an image of shape {img.shape} "
            f"with bboxes = {bboxes}\n{label = }"
        )
        # Do some transformations. Here, we're just passing through the input
        return img, bboxes, label


transforms = v2.Compose([
    MyCustomTransform(),
    v2.RandomResizedCrop((224, 224), antialias=True),
    v2.RandomHorizontalFlip(p=1),
    v2.Normalize(mean=[0, 0, 0], std=[1, 1, 1])
])
H, W = 256, 256
img = torch.rand(3, H, W)
bboxes = tv_tensors.BoundingBoxes(
torch.tensor([[0, 10, 10, 20], [50, 50, 70, 70]]),
format="XYXY",
canvas_size=(H, W)
)
label = 3
out_img, out_bboxes, out_label = transforms(img, bboxes, label)
# %%
print(f"Output image shape: {out_img.shape}\nout_bboxes = {out_bboxes}\n{out_label = }")
# %%
# .. note::
# While working with TVTensor classes in your code, make sure to
# familiarize yourself with this section:
# :ref:`tv_tensor_unwrapping_behaviour`
#
# Supporting arbitrary input structures
# =====================================
#
# In the section above, we have assumed that you already know the structure of
# your inputs and that you're OK with hard-coding this expected structure in
# your code. If you want your custom transforms to be as flexible as possible,
# this can be a bit limiting.
#
# A key feature of the builtin Torchvision V2 transforms is that they can accept
# arbitrary input structure and return the same structure as output (with
# transformed entries). For example, transforms can accept a single image, or a
# tuple of ``(img, label)``, or an arbitrary nested dictionary as input:
structured_input = {
"img": img,
"annotations": (bboxes, label),
"something_that_will_be_ignored": (1, "hello")
}
structured_output = v2.RandomHorizontalFlip(p=1)(structured_input)
assert isinstance(structured_output, dict)
assert structured_output["something_that_will_be_ignored"] == (1, "hello")
print(f"The transformed bboxes are:\n{structured_output['annotations'][0]}")
# %%
# If you want to reproduce this behavior in your own transform, we invite you to
# look at our `code
# <https://github.com/pytorch/vision/blob/main/torchvision/transforms/v2/_transform.py>`_
# and adapt it to your needs.
#
# In brief, the core logic is to unpack the input into a flat list using `pytree
# <https://github.com/pytorch/pytorch/blob/main/torch/utils/_pytree.py>`_, and
# then transform only the entries that can be transformed (the decision is made
# based on the **class** of the entries, as all TVTensors are
# tensor-subclasses) plus some custom logic that is out of scope here - check the
# code for details. The (potentially transformed) entries are then repacked and
# returned, in the same structure as the input (a toy sketch of this idea is
# shown at the end of this section).
#
# We do not provide public dev-facing tools to achieve that at this time, but if
# this is something that would be valuable to you, please let us know by opening
# an issue on our `GitHub repo <https://github.com/pytorch/vision/issues>`_.
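# %%
# For illustration only, here is a toy sketch of that flatten-transform-repack
# idea. It is **not** the actual torchvision implementation; it relies on the
# private ``torch.utils._pytree`` module linked above and only handles a single,
# hard-coded operation (horizontally flipping ``Image`` entries):
from torch.utils import _pytree as pytree
from torchvision.transforms.v2 import functional as TVF


def hflip_images_only(sample):
    # Unpack the arbitrarily nested input into a flat list of leaves
    flat, spec = pytree.tree_flatten(sample)
    # Transform only the entries we know how to handle, based on their class
    flat = [TVF.hflip(x) if isinstance(x, tv_tensors.Image) else x for x in flat]
    # Repack the (possibly transformed) leaves into the original structure
    return pytree.tree_unflatten(flat, spec)


flipped = hflip_images_only({"img": tv_tensors.Image(img), "label": label})
print(f"{type(flipped) = }, {flipped['img'].shape = }")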
"""
====================================
How to write your own TVTensor class
====================================
.. note::
Try on `collab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_custom_tv_tensors.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_custom_tv_tensors.py>` to download the full example code.
This guide is intended for advanced users and downstream library maintainers. We explain how to
write your own TVTensor class, and how to make it compatible with the built-in
Torchvision v2 transforms. Before continuing, make sure you have read
:ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py`.
"""
# %%
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2
# %%
# We will create a very simple class that just inherits from the base
# :class:`~torchvision.tv_tensors.TVTensor` class. It will be enough to cover
# what you need to know to implement your more elaborate use-cases. If you need
# to create a class that carries meta-data, take a look at how the
# :class:`~torchvision.tv_tensors.BoundingBoxes` class is `implemented
# <https://github.com/pytorch/vision/blob/main/torchvision/tv_tensors/_bounding_box.py>`_.
class MyTVTensor(tv_tensors.TVTensor):
    pass
my_dp = MyTVTensor([1, 2, 3])
my_dp
# %%
# Now that we have defined our custom TVTensor class, we want it to be
# compatible with the built-in torchvision transforms, and the functional API.
# For that, we need to implement a kernel which performs the core of the
# transformation, and then "hook" it to the functional that we want to support
# via :func:`~torchvision.transforms.v2.functional.register_kernel`.
#
# We illustrate this process below: we create a kernel for the "horizontal flip"
# operation of our MyTVTensor class, and register it to the functional API.
from torchvision.transforms.v2 import functional as F
@F.register_kernel(functional="hflip", tv_tensor_cls=MyTVTensor)
def hflip_my_tv_tensor(my_dp, *args, **kwargs):
    print("Flipping!")
    out = my_dp.flip(-1)
    return tv_tensors.wrap(out, like=my_dp)
# %%
# To understand why :func:`~torchvision.tv_tensors.wrap` is used, see
# :ref:`tv_tensor_unwrapping_behaviour`. Ignore the ``*args, **kwargs`` for now,
# we will explain it below in :ref:`param_forwarding`.
#
# .. note::
#
# In our call to ``register_kernel`` above we used a string
# ``functional="hflip"`` to refer to the functional we want to hook into. We
# could also have used the functional *itself*, i.e.
# ``@register_kernel(functional=F.hflip, ...)``.
#
# Now that we have registered our kernel, we can call the functional API on a
# ``MyTVTensor`` instance:
my_dp = MyTVTensor(torch.rand(3, 256, 256))
_ = F.hflip(my_dp)
# %%
# And we can also use the
# :class:`~torchvision.transforms.v2.RandomHorizontalFlip` transform, since it relies on :func:`~torchvision.transforms.v2.functional.hflip` internally:
t = v2.RandomHorizontalFlip(p=1)
_ = t(my_dp)
# %%
# .. note::
#
# We cannot register a kernel for a transform class, we can only register a
# kernel for a **functional**. The reason we can't register a transform
# class is because one transform may internally rely on more than one
# functional, so in general we can't register a single kernel for a given
# class.
#
# .. _param_forwarding:
#
# Parameter forwarding, and ensuring future compatibility of your kernels
# -----------------------------------------------------------------------
#
# The functional API that you're hooking into is public and therefore
# **backward** compatible: we guarantee that the parameters of these functionals
# won't be removed or renamed without a proper deprecation cycle. However, we
# don't guarantee **forward** compatibility, and we may add new parameters in
# the future.
#
# Imagine that in a future version, Torchvision adds a new ``inplace`` parameter
# to its :func:`~torchvision.transforms.v2.functional.hflip` functional. If you
# already defined and registered your own kernel as
def hflip_my_tv_tensor(my_dp):  # noqa
    print("Flipping!")
    out = my_dp.flip(-1)
    return tv_tensors.wrap(out, like=my_dp)
# %%
# then calling ``F.hflip(my_dp)`` will **fail**, because ``hflip`` will try to
# pass the new ``inplace`` parameter to your kernel, but your kernel doesn't
# accept it.
#
# For this reason, we recommend always defining your kernels with
# ``*args, **kwargs`` in their signature, as done above. This way, your kernel
# will be able to accept any new parameter that we may add in the future.
# (Technically, adding ``**kwargs`` only should be enough.)
"""
===========================
How to use CutMix and MixUp
===========================
.. note::
Try on `collab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_cutmix_mixup.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_cutmix_mixup.py>` to download the full example code.
:class:`~torchvision.transforms.v2.CutMix` and
:class:`~torchvision.transforms.v2.MixUp` are popular augmentation strategies
that can improve classification accuracy.
These transforms are slightly different from the rest of the Torchvision
transforms, because they expect
**batches** of samples as input, not individual images. In this example we'll
explain how to use them: after the ``DataLoader``, or as part of a collation
function.
"""
# %%
import torch
from torchvision.datasets import FakeData
from torchvision.transforms import v2
NUM_CLASSES = 100
# %%
# Pre-processing pipeline
# -----------------------
#
# We'll use a simple but typical image classification pipeline:
preproc = v2.Compose([
v2.PILToTensor(),
v2.RandomResizedCrop(size=(224, 224), antialias=True),
v2.RandomHorizontalFlip(p=0.5),
v2.ToDtype(torch.float32, scale=True), # to float32 in [0, 1]
v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), # typically from ImageNet
])
dataset = FakeData(size=1000, num_classes=NUM_CLASSES, transform=preproc)
img, label = dataset[0]
print(f"{type(img) = }, {img.dtype = }, {img.shape = }, {label = }")
# %%
#
# One important thing to note is that neither CutMix nor MixUp are part of this
# pre-processing pipeline. We'll add them a bit later once we define the
# DataLoader. Just as a refresher, this is what the DataLoader and training loop
# would look like if we weren't using CutMix or MixUp:
from torch.utils.data import DataLoader
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
for images, labels in dataloader:
    print(f"{images.shape = }, {labels.shape = }")
    print(labels.dtype)
    # <rest of the training loop here>
    break
# %%
# Where to use MixUp and CutMix
# -----------------------------
#
# After the DataLoader
# ^^^^^^^^^^^^^^^^^^^^
#
# Now let's add CutMix and MixUp. The simplest way to do this is right after the
# DataLoader: the DataLoader has already batched the images and labels for us,
# and this is exactly what these transforms expect as input:
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)
cutmix = v2.CutMix(num_classes=NUM_CLASSES)
mixup = v2.MixUp(num_classes=NUM_CLASSES)
cutmix_or_mixup = v2.RandomChoice([cutmix, mixup])
for images, labels in dataloader:
    print(f"Before CutMix/MixUp: {images.shape = }, {labels.shape = }")
    images, labels = cutmix_or_mixup(images, labels)
    print(f"After CutMix/MixUp: {images.shape = }, {labels.shape = }")
    # <rest of the training loop here>
    break
# %%
#
# Note how the labels were also transformed: we went from a batched label of
# shape (batch_size,) to a tensor of shape (batch_size, num_classes). The
# transformed labels can still be passed as-is to a loss function like
# :func:`torch.nn.functional.cross_entropy`.
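# As a quick, hedged sketch (not in the original example): ``labels`` from the
# loop above are soft targets of shape (batch_size, num_classes), and the random
# logits below stand in for a real ``model(images)`` call:
logits = torch.randn(labels.shape[0], NUM_CLASSES)
loss = torch.nn.functional.cross_entropy(logits, labels)  # accepts soft targets
print(f"{loss = }")
# %%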
#
# As part of the collation function
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# Passing the transforms after the DataLoader is the simplest way to use CutMix
# and MixUp, but one disadvantage is that it does not take advantage of the
# DataLoader multi-processing. For that, we can pass those transforms as part of
# the collation function (refer to the `PyTorch docs
# <https://pytorch.org/docs/stable/data.html#dataloader-collate-fn>`_ to learn
# more about collation).
from torch.utils.data import default_collate
def collate_fn(batch):
    return cutmix_or_mixup(*default_collate(batch))
dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2, collate_fn=collate_fn)
for images, labels in dataloader:
    print(f"{images.shape = }, {labels.shape = }")
    # No need to call cutmix_or_mixup, it's already been called as part of the DataLoader!
    # <rest of the training loop here>
    break
# %%
# Non-standard input format
# -------------------------
#
# So far we've used a typical sample structure where we pass ``(images,
# labels)`` as inputs. MixUp and CutMix will magically work by default with most
# common sample structures: tuples where the second parameter is a tensor label,
# or dict with a "label[s]" key. Look at the documentation of the
# ``labels_getter`` parameter for more details.
#
# If your samples have a different structure, you can still use CutMix and MixUp
# by passing a callable to the ``labels_getter`` parameter. For example:
batch = {
"imgs": torch.rand(4, 3, 224, 224),
"target": {
"classes": torch.randint(0, NUM_CLASSES, size=(4,)),
"some_other_key": "this is going to be passed-through"
}
}
def labels_getter(batch):
    return batch["target"]["classes"]
out = v2.CutMix(num_classes=NUM_CLASSES, labels_getter=labels_getter)(batch)
print(f"{out['imgs'].shape = }, {out['target']['classes'].shape = }")
"""
===============================================================
Transforms v2: End-to-end object detection/segmentation example
===============================================================
.. note::
Try on `collab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_transforms_e2e.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_transforms_e2e.py>` to download the full example code.
Object detection and segmentation tasks are natively supported:
``torchvision.transforms.v2`` enables jointly transforming images, videos,
bounding boxes, and masks.
This example showcases an end-to-end instance segmentation training case using
Torchvision utils from ``torchvision.datasets``, ``torchvision.models`` and
``torchvision.transforms.v2``. Everything covered here can be applied similarly
to object detection or semantic segmentation tasks.
"""
# %%
import pathlib
import torch
import torch.utils.data
from torchvision import models, datasets, tv_tensors
from torchvision.transforms import v2
torch.manual_seed(0)
# This loads fake data for illustration purposes of this example. In practice, you'll have
# to replace this with the proper data.
# If you're trying to run that on collab, you can download the assets and the
# helpers from https://github.com/pytorch/vision/tree/main/gallery/
ROOT = pathlib.Path("../assets") / "coco"
IMAGES_PATH = str(ROOT / "images")
ANNOTATIONS_PATH = str(ROOT / "instances.json")
from helpers import plot
# %%
# Dataset preparation
# -------------------
#
# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently
# returns.
dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH)
sample = dataset[0]
img, target = sample
print(f"{type(img) = }\n{type(target) = }\n{type(target[0]) = }\n{target[0].keys() = }")
# %%
# Torchvision datasets preserve the data structure and types as intended by the
# dataset authors. So by default, the output structure may not always be
# compatible with the models or the transforms.
#
# To overcome that, we can use the
# :func:`~torchvision.datasets.wrap_dataset_for_transforms_v2` function. For
# :class:`~torchvision.datasets.CocoDetection`, this changes the target
# structure to a single dictionary of lists:
dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=("boxes", "labels", "masks"))
sample = dataset[0]
img, target = sample
print(f"{type(img) = }\n{type(target) = }\n{target.keys() = }")
print(f"{type(target['boxes']) = }\n{type(target['labels']) = }\n{type(target['masks']) = }")
# %%
# We used the ``target_keys`` parameter to specify the kind of output we're
# interested in. Our dataset now returns a target which is a dict where the values
# are :ref:`TVTensors <what_are_tv_tensors>` (all are :class:`torch.Tensor`
# subclasses). We dropped all unnecessary keys from the previous output, but
# if you need any of the original keys, e.g. "image_id", you can still ask for
# it.
#
# .. note::
#
# If you just want to do detection, you don't need and shouldn't pass
# "masks" in ``target_keys``: if masks are present in the sample, they will
# be transformed, slowing down your transformations unnecessarily.
#
# As a baseline, let's have a look at a sample without transformations:
plot([dataset[0], dataset[1]])
# %%
# Transforms
# ----------
#
# Let's now define our pre-processing transforms. All the transforms know how
# to handle images, bounding boxes and masks when relevant.
#
# Transforms are typically passed as the ``transforms`` parameter of the
# dataset so that they can leverage multi-processing from the
# :class:`torch.utils.data.DataLoader`.
transforms = v2.Compose(
[
v2.ToImage(),
v2.RandomPhotometricDistort(p=1),
v2.RandomZoomOut(fill={tv_tensors.Image: (123, 117, 104), "others": 0}),
v2.RandomIoUCrop(),
v2.RandomHorizontalFlip(p=1),
v2.SanitizeBoundingBoxes(),
v2.ToDtype(torch.float32, scale=True),
]
)
dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH, transforms=transforms)
dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=["boxes", "labels", "masks"])
# %%
# A few things are worth noting here:
#
# - We're converting the PIL image into a
# :class:`~torchvision.tv_tensors.Image` object. This isn't strictly
# necessary, but relying on Tensors (here: a Tensor subclass) will
# :ref:`generally be faster <transforms_perf>`.
# - We are calling :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` to
# make sure we remove degenerate bounding boxes, as well as their
# corresponding labels and masks.
# :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` should be placed
# at least once at the end of a detection pipeline; it is particularly
# critical if :class:`~torchvision.transforms.v2.RandomIoUCrop` was used.
#
# Let's look at how a sample appears with our augmentation pipeline in place:
# sphinx_gallery_thumbnail_number = 2
plot([dataset[0], dataset[1]])
# %%
# We can see that the colors of the images were distorted, and that the images were zoomed in or out and flipped.
# The bounding boxes and the masks were transformed accordingly. And without any further ado, we can start training.
#
# Data loading and training loop
# ------------------------------
#
# Below we're using Mask R-CNN, which is an instance segmentation model, but
# everything we've covered in this tutorial also applies to object detection and
# semantic segmentation tasks.
data_loader = torch.utils.data.DataLoader(
dataset,
batch_size=2,
# We need a custom collation function here, since the object detection
# models expect a sequence of images and target dictionaries. The default
# collation function tries to torch.stack() the individual elements,
# which fails in general for object detection, because the number of bounding
# boxes varies between the images of the same batch.
collate_fn=lambda batch: tuple(zip(*batch)),
)
model = models.get_model("maskrcnn_resnet50_fpn_v2", weights=None, weights_backbone=None).train()
for imgs, targets in data_loader:
    loss_dict = model(imgs, targets)
    # Put your training logic here

    print(f"{[img.shape for img in imgs] = }")
    print(f"{[type(target) for target in targets] = }")
    for name, loss_val in loss_dict.items():
        print(f"{name:<20}{loss_val:.3f}")
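# %%
# As a hedged sketch (not part of the original example), the "training logic"
# placeholder above would typically combine the returned losses and take an
# optimizer step, along these lines:
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
total_loss = sum(loss_dict.values())  # loss_dict from the last batch above
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
print(f"{total_loss.item() = :.3f}")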
# %%
# Training References
# -------------------
#
# From there, you can check out the `torchvision references
# <https://github.com/pytorch/vision/tree/main/references>`_ where you'll find
# the actual training scripts we use to train our models.
#
# **Disclaimer** The code in our references is more complex than what you'll
# need for your own use-cases: this is because we're supporting different
# backends (PIL, tensors, TVTensors) and different transforms namespaces (v1 and
# v2). So don't be afraid to simplify and only keep what you need.
"""
==================================
Getting started with transforms v2
==================================
.. note::
Try on `collab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_transforms_getting_started.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_transforms_getting_started.py>` to download the full example code.
This example illustrates all of what you need to know to get started with the
new :mod:`torchvision.transforms.v2` API. We'll cover simple tasks like
image classification, and more advanced ones like object detection /
segmentation.
"""
# %%
# First, a bit of setup
from pathlib import Path
import torch
import matplotlib.pyplot as plt
plt.rcParams["savefig.bbox"] = 'tight'
from torchvision.transforms import v2
from torchvision.io import read_image
torch.manual_seed(1)
# If you're trying to run that on collab, you can download the assets and the
# helpers from https://github.com/pytorch/vision/tree/main/gallery/
from helpers import plot
img = read_image(str(Path('../assets') / 'astronaut.jpg'))
print(f"{type(img) = }, {img.dtype = }, {img.shape = }")
# %%
# The basics
# ----------
#
# The Torchvision transforms behave like a regular :class:`torch.nn.Module` (in
# fact, most of them are): instantiate a transform, pass an input, get a
# transformed output:
transform = v2.RandomCrop(size=(224, 224))
out = transform(img)
plot([img, out])
# %%
# I just want to do image classification
# --------------------------------------
#
# If you just care about image classification, things are very simple. A basic
# classification pipeline may look like this:
transforms = v2.Compose([
v2.RandomResizedCrop(size=(224, 224), antialias=True),
v2.RandomHorizontalFlip(p=0.5),
v2.ToDtype(torch.float32, scale=True),
v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])
out = transforms(img)
plot([img, out])
# %%
# Such a transformation pipeline is typically passed as the ``transform`` argument
# to the :ref:`Datasets <datasets>`, e.g. ``ImageNet(...,
# transform=transforms)``.
#
# That's pretty much all there is. From there, read through our :ref:`main docs
# <transforms>` to learn more about recommended practices and conventions, or
# explore more :ref:`examples <transforms_gallery>` e.g. how to use augmentation
# transforms like :ref:`CutMix and MixUp
# <sphx_glr_auto_examples_transforms_plot_cutmix_mixup.py>`.
#
# .. note::
#
# If you're already relying on the ``torchvision.transforms`` v1 API,
# we recommend you :ref:`switch to the new v2 transforms<v1_or_v2>`. It's
# very easy: the v2 transforms are fully compatible with the v1 API, so you
# only need to change the import!
#
# Detection, Segmentation, Videos
# -------------------------------
#
# The new Torchvision transforms in the ``torchvision.transforms.v2`` namespace
# support tasks beyond image classification: they can also transform bounding
# boxes, segmentation / detection masks, or videos.
#
# Let's briefly look at a detection example with bounding boxes.
from torchvision import tv_tensors  # we'll describe this a bit later, bear with us
boxes = tv_tensors.BoundingBoxes(
[
[15, 10, 370, 510],
[275, 340, 510, 510],
[130, 345, 210, 425]
],
format="XYXY", canvas_size=img.shape[-2:])
transforms = v2.Compose([
v2.RandomResizedCrop(size=(224, 224), antialias=True),
v2.RandomPhotometricDistort(p=1),
v2.RandomHorizontalFlip(p=1),
])
out_img, out_boxes = transforms(img, boxes)
print(type(boxes), type(out_boxes))
plot([(img, boxes), (out_img, out_boxes)])
# %%
#
# The example above focuses on object detection. But if we had masks
# (:class:`torchvision.tv_tensors.Mask`) for object segmentation or semantic
# segmentation, or videos (:class:`torchvision.tv_tensors.Video`), we could have
# passed them to the transforms in exactly the same way.
#
# By now you likely have a few questions: what are these TVTensors, how do we
# use them, and what is the expected input/output of those transforms? We'll
# answer these in the next sections.
# %%
#
# .. _what_are_tv_tensors:
#
# What are TVTensors?
# --------------------
#
# TVTensors are :class:`torch.Tensor` subclasses. The available TVTensors are
# :class:`~torchvision.tv_tensors.Image`,
# :class:`~torchvision.tv_tensors.BoundingBoxes`,
# :class:`~torchvision.tv_tensors.Mask`, and
# :class:`~torchvision.tv_tensors.Video`.
#
# TVTensors look and feel just like regular tensors - they **are** tensors.
# Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()``
# or any ``torch.*`` operator will also work on a TVTensor:
img_dp = tv_tensors.Image(torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8))
print(f"{isinstance(img_dp, torch.Tensor) = }")
print(f"{img_dp.dtype = }, {img_dp.shape = }, {img_dp.sum() = }")
# %%
# These TVTensor classes are at the core of the transforms: in order to
# transform a given input, the transforms first look at the **class** of the
# object, and dispatch to the appropriate implementation accordingly.
#
# You don't need to know much more about TVTensors at this point, but advanced
# users who want to learn more can refer to
# :ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py`.
#
# What do I pass as input?
# ------------------------
#
# Above, we've seen two examples: one where we passed a single image as input
# i.e. ``out = transforms(img)``, and one where we passed both an image and
# bounding boxes, i.e. ``out_img, out_boxes = transforms(img, boxes)``.
#
# In fact, transforms support **arbitrary input structures**. The input can be a
# single image, a tuple, an arbitrarily nested dictionary... pretty much
# anything. The same structure will be returned as output. Below, we use the
# same detection transforms, but pass a tuple (image, target_dict) as input and
# we're getting the same structure as output:
target = {
"boxes": boxes,
"labels": torch.arange(boxes.shape[0]),
"this_is_ignored": ("arbitrary", {"structure": "!"})
}
# Re-using the transforms and definitions from above.
out_img, out_target = transforms(img, target)
# sphinx_gallery_thumbnail_number = 4
plot([(img, target["boxes"]), (out_img, out_target["boxes"])])
print(f"{out_target['this_is_ignored']}")
# %%
# We passed a tuple so we get a tuple back, and the second element is the
# transformed target dict. Transforms don't really care about the structure of
# the input; as mentioned above, they only care about the **type** of the
# objects and transform them accordingly.
#
# *Foreign* objects like strings or ints are simply passed-through. This can be
# useful e.g. if you want to associate a path with every single sample when
# debugging!
#
# .. _passthrough_heuristic:
#
# .. note::
#
# **Disclaimer** This note is slightly advanced and can be safely skipped on
# a first read.
#
# Pure :class:`torch.Tensor` objects are, in general, treated as images (or
# as videos for video-specific transforms). Indeed, you may have noticed
# that in the code above we haven't used the
# :class:`~torchvision.tv_tensors.Image` class at all, and yet our images
# got transformed properly. Transforms use the following logic to
# determine whether a pure Tensor should be treated as an image (or video),
# or just ignored:
#
# * If there is an :class:`~torchvision.tv_tensors.Image`,
# :class:`~torchvision.tv_tensors.Video`,
# or :class:`PIL.Image.Image` instance in the input, all other pure
# tensors are passed-through.
# * If there is no :class:`~torchvision.tv_tensors.Image` or
# :class:`~torchvision.tv_tensors.Video` instance, only the first pure
# :class:`torch.Tensor` will be transformed as image or video, while all
# others will be passed-through. Here "first" means "first in a depth-wise
# traversal".
#
# This is what happened in the detection example above: the first pure
# tensor was the image so it got transformed properly, and all other pure
# tensor instances like the ``labels`` were passed-through (although labels
# can still be transformed by some transforms like
# :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`!).
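# A quick, hedged sketch of that heuristic (not part of the original text): with
# no ``Image``, ``Video`` or PIL instance in the input, only the first pure
# tensor is treated as an image, and the rest is passed-through untouched:
plain_img = torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8)
something_else = torch.arange(10)
out = v2.Resize(size=(32, 32))({"a": plain_img, "b": something_else})
print(f"{out['a'].shape = }, {out['b'].shape = }")  # only 'a' was resized
# %%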
#
# .. _transforms_datasets_intercompatibility:
#
# Transforms and Datasets intercompatibility
# ------------------------------------------
#
# Roughly speaking, the output of the datasets must correspond to the input of
# the transforms. How to do that depends on whether you're using the torchvision
# :ref:`built-in datasets <datasets>`, or your own custom datasets.
#
# Using built-in datasets
# ^^^^^^^^^^^^^^^^^^^^^^^
#
# If you're just doing image classification, you don't need to do anything. Just
# use ``transform`` argument of the dataset e.g. ``ImageNet(...,
# transform=transforms)`` and you're good to go.
#
# Torchvision also supports datasets for object detection or segmentation like
# :class:`torchvision.datasets.CocoDetection`. Those datasets predate
# the existence of the :mod:`torchvision.transforms.v2` module and of the
# TVTensors, so they don't return TVTensors out of the box.
#
# An easy way to force those datasets to return TVTensors and to make them
# compatible with v2 transforms is to use the
# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2` function:
#
# .. code-block:: python
#
# from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2
#
# dataset = CocoDetection(..., transforms=my_transforms)
# dataset = wrap_dataset_for_transforms_v2(dataset)
# # Now the dataset returns TVTensors!
#
# Using your own datasets
# ^^^^^^^^^^^^^^^^^^^^^^^
#
# If you have a custom dataset, then you'll need to convert your objects into
# the appropriate TVTensor classes. Creating TVTensor instances is very easy,
# refer to :ref:`tv_tensor_creation` for more details.
#
# There are two main places where you can implement that conversion logic:
#
# - At the end of the dataset's ``__getitem__`` method, before returning the
#   sample (or by sub-classing the dataset).
# - As the very first step of your transforms pipeline
#
# Either way, the logic will depend on your specific dataset; a hedged sketch of
# the first option follows below.
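# %%
# For illustration only, here is a hedged sketch of the first option. The dataset
# below is hypothetical (it just stores plain tensors in memory); the point is
# simply that ``__getitem__`` wraps its outputs into TVTensors before returning
# the sample:
import torch.utils.data


class MyDetectionDataset(torch.utils.data.Dataset):
    def __init__(self, images, boxes_per_image, transforms=None):
        self.images = images                    # list of (3, H, W) uint8 tensors
        self.boxes_per_image = boxes_per_image  # list of (N, 4) XYXY box tensors
        self.transforms = transforms

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        image = tv_tensors.Image(self.images[idx])
        target = {
            "boxes": tv_tensors.BoundingBoxes(
                self.boxes_per_image[idx], format="XYXY", canvas_size=image.shape[-2:]
            ),
            "labels": torch.ones(len(self.boxes_per_image[idx]), dtype=torch.int64),
        }
        if self.transforms is not None:
            image, target = self.transforms(image, target)
        return image, target


toy_dataset = MyDetectionDataset(
    images=[torch.randint(0, 256, (3, 100, 100), dtype=torch.uint8)],
    boxes_per_image=[torch.tensor([[10, 10, 50, 50]])],
    transforms=v2.RandomHorizontalFlip(p=1),
)
print(toy_dataset[0][1]["boxes"])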
"""
==========================
Illustration of transforms
==========================
.. note::
Try on `collab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_transforms_illustrations.ipynb>`_
or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_transforms_illustrations.py>` to download the full example code.
This example illustrates some of the various transforms available in :ref:`the
torchvision.transforms.v2 module <transforms>`.
"""
# %%
# sphinx_gallery_thumbnail_path = "../../gallery/assets/transforms_thumbnail.png"
from PIL import Image
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import torch
from torchvision.transforms import v2
plt.rcParams["savefig.bbox"] = 'tight'
# if you change the seed, make sure that the randomly-applied transforms
# properly show that the image can be both transformed and *not* transformed!
torch.manual_seed(0)
# If you're trying to run that on collab, you can download the assets and the
# helpers from https://github.com/pytorch/vision/tree/main/gallery/
from helpers import plot
orig_img = Image.open(Path('../assets') / 'astronaut.jpg')
# %%
# Geometric Transforms
# --------------------
# Geometric image transformation refers to the process of altering the geometric properties of an image,
# such as its shape, size, orientation, or position.
# It involves applying mathematical operations to the image pixels or coordinates to achieve the desired transformation.
#
# Pad
# ~~~
# The :class:`~torchvision.transforms.Pad` transform
# (see also :func:`~torchvision.transforms.functional.pad`)
# pads all image borders with some pixel values.
padded_imgs = [v2.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)]
plot([orig_img] + padded_imgs)
# %%
# Resize
# ~~~~~~
# The :class:`~torchvision.transforms.Resize` transform
# (see also :func:`~torchvision.transforms.functional.resize`)
# resizes an image.
resized_imgs = [v2.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)]
plot([orig_img] + resized_imgs)
# %%
# CenterCrop
# ~~~~~~~~~~
# The :class:`~torchvision.transforms.CenterCrop` transform
# (see also :func:`~torchvision.transforms.functional.center_crop`)
# crops the given image at the center.
center_crops = [v2.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)]
plot([orig_img] + center_crops)
# %%
# FiveCrop
# ~~~~~~~~
# The :class:`~torchvision.transforms.FiveCrop` transform
# (see also :func:`~torchvision.transforms.functional.five_crop`)
# crops the given image into four corners and the central crop.
(top_left, top_right, bottom_left, bottom_right, center) = v2.FiveCrop(size=(100, 100))(orig_img)
plot([orig_img] + [top_left, top_right, bottom_left, bottom_right, center])
# %%
# RandomPerspective
# ~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomPerspective` transform
# (see also :func:`~torchvision.transforms.functional.perspective`)
# performs random perspective transform on an image.
perspective_transformer = v2.RandomPerspective(distortion_scale=0.6, p=1.0)
perspective_imgs = [perspective_transformer(orig_img) for _ in range(4)]
plot([orig_img] + perspective_imgs)
# %%
# RandomRotation
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomRotation` transform
# (see also :func:`~torchvision.transforms.functional.rotate`)
# rotates an image with random angle.
rotater = v2.RandomRotation(degrees=(0, 180))
rotated_imgs = [rotater(orig_img) for _ in range(4)]
plot([orig_img] + rotated_imgs)
# %%
# RandomAffine
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAffine` transform
# (see also :func:`~torchvision.transforms.functional.affine`)
# performs random affine transform on an image.
affine_transformer = v2.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75))
affine_imgs = [affine_transformer(orig_img) for _ in range(4)]
plot([orig_img] + affine_imgs)
# %%
# ElasticTransform
# ~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.ElasticTransform` transform
# (see also :func:`~torchvision.transforms.functional.elastic_transform`)
# randomly transforms the morphology of objects in images and produces a
# see-through-water-like effect.
elastic_transformer = v2.ElasticTransform(alpha=250.0)
transformed_imgs = [elastic_transformer(orig_img) for _ in range(2)]
plot([orig_img] + transformed_imgs)
# %%
# RandomCrop
# ~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomCrop` transform
# (see also :func:`~torchvision.transforms.functional.crop`)
# crops an image at a random location.
cropper = v2.RandomCrop(size=(128, 128))
crops = [cropper(orig_img) for _ in range(4)]
plot([orig_img] + crops)
# %%
# RandomResizedCrop
# ~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomResizedCrop` transform
# (see also :func:`~torchvision.transforms.functional.resized_crop`)
# crops an image at a random location, and then resizes the crop to a given
# size.
resize_cropper = v2.RandomResizedCrop(size=(32, 32))
resized_crops = [resize_cropper(orig_img) for _ in range(4)]
plot([orig_img] + resized_crops)
# %%
# Photometric Transforms
# ----------------------
# Photometric image transformation refers to the process of modifying the photometric properties of an image,
# such as its brightness, contrast, color, or tone.
# These transformations are applied to change the visual appearance of an image
# while preserving its geometric structure.
#
# Except :class:`~torchvision.transforms.Grayscale`, the following transforms are random,
# which means that the same transform
# instance will produce different results each time it transforms a given image.
#
# Grayscale
# ~~~~~~~~~
# The :class:`~torchvision.transforms.Grayscale` transform
# (see also :func:`~torchvision.transforms.functional.to_grayscale`)
# converts an image to grayscale.
gray_img = v2.Grayscale()(orig_img)
plot([orig_img, gray_img], cmap='gray')
# %%
# ColorJitter
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.ColorJitter` transform
# randomly changes the brightness, contrast, saturation, hue, and other properties of an image.
jitter = v2.ColorJitter(brightness=.5, hue=.3)
jittered_imgs = [jitter(orig_img) for _ in range(4)]
plot([orig_img] + jittered_imgs)
# %%
# GaussianBlur
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.GaussianBlur` transform
# (see also :func:`~torchvision.transforms.functional.gaussian_blur`)
# performs a Gaussian blur transform on an image.
blurrer = v2.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5.))
blurred_imgs = [blurrer(orig_img) for _ in range(4)]
plot([orig_img] + blurred_imgs)
# %%
# RandomInvert
# ~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomInvert` transform
# (see also :func:`~torchvision.transforms.functional.invert`)
# randomly inverts the colors of the given image.
inverter = v2.RandomInvert()
inverted_imgs = [inverter(orig_img) for _ in range(4)]
plot([orig_img] + inverted_imgs)
# %%
# RandomPosterize
# ~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomPosterize` transform
# (see also :func:`~torchvision.transforms.functional.posterize`)
# randomly posterizes the image by reducing the number of bits
# of each color channel.
posterizer = v2.RandomPosterize(bits=2)
posterized_imgs = [posterizer(orig_img) for _ in range(4)]
plot([orig_img] + posterized_imgs)
# %%
# RandomSolarize
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomSolarize` transform
# (see also :func:`~torchvision.transforms.functional.solarize`)
# randomly solarizes the image by inverting all pixel values above
# the threshold.
solarizer = v2.RandomSolarize(threshold=192.0)
solarized_imgs = [solarizer(orig_img) for _ in range(4)]
plot([orig_img] + solarized_imgs)
# %%
# RandomAdjustSharpness
# ~~~~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAdjustSharpness` transform
# (see also :func:`~torchvision.transforms.functional.adjust_sharpness`)
# randomly adjusts the sharpness of the given image.
sharpness_adjuster = v2.RandomAdjustSharpness(sharpness_factor=2)
sharpened_imgs = [sharpness_adjuster(orig_img) for _ in range(4)]
plot([orig_img] + sharpened_imgs)
# %%
# RandomAutocontrast
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomAutocontrast` transform
# (see also :func:`~torchvision.transforms.functional.autocontrast`)
# randomly applies autocontrast to the given image.
autocontraster = v2.RandomAutocontrast()
autocontrasted_imgs = [autocontraster(orig_img) for _ in range(4)]
plot([orig_img] + autocontrasted_imgs)
# %%
# RandomEqualize
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomEqualize` transform
# (see also :func:`~torchvision.transforms.functional.equalize`)
# randomly equalizes the histogram of the given image.
equalizer = v2.RandomEqualize()
equalized_imgs = [equalizer(orig_img) for _ in range(4)]
plot([orig_img] + equalized_imgs)
# %%
# JPEG
# ~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.v2.JPEG` transform
# (see also :func:`~torchvision.transforms.v2.functional.jpeg`)
# applies JPEG compression to the given image with a random
# degree of compression.
jpeg = v2.JPEG((5, 50))
jpeg_imgs = [jpeg(orig_img) for _ in range(4)]
plot([orig_img] + jpeg_imgs)
# %%
# Augmentation Transforms
# -----------------------
# The following transforms are combinations of multiple transforms,
# either geometric or photometric, or both.
#
# AutoAugment
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.AutoAugment` transform
# automatically augments data based on a given auto-augmentation policy.
# See :class:`~torchvision.transforms.AutoAugmentPolicy` for the available policies.
policies = [v2.AutoAugmentPolicy.CIFAR10, v2.AutoAugmentPolicy.IMAGENET, v2.AutoAugmentPolicy.SVHN]
augmenters = [v2.AutoAugment(policy) for policy in policies]
imgs = [
[augmenter(orig_img) for _ in range(4)]
for augmenter in augmenters
]
row_title = [str(policy).split('.')[-1] for policy in policies]
plot([[orig_img] + row for row in imgs], row_title=row_title)
# %%
# RandAugment
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandAugment` is an alternate version of AutoAugment.
augmenter = v2.RandAugment()
imgs = [augmenter(orig_img) for _ in range(4)]
plot([orig_img] + imgs)
# %%
# TrivialAugmentWide
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.TrivialAugmentWide` is an alternate implementation of AutoAugment.
# However, instead of transforming an image multiple times, it transforms an image only once
# using a random transform from a given list, with a random strength.
augmenter = v2.TrivialAugmentWide()
imgs = [augmenter(orig_img) for _ in range(4)]
plot([orig_img] + imgs)
# %%
# AugMix
# ~~~~~~
# The :class:`~torchvision.transforms.AugMix` transform interpolates between augmented versions of an image.
augmenter = v2.AugMix()
imgs = [augmenter(orig_img) for _ in range(4)]
plot([orig_img] + imgs)
# %%
# Randomly-applied Transforms
# ---------------------------
#
# The following transforms are randomly-applied given a probability ``p``. That is, given ``p = 0.5``,
# there is a 50% chance to return the original image, and a 50% chance to return the transformed image,
# even when called with the same transform instance!
#
# RandomHorizontalFlip
# ~~~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomHorizontalFlip` transform
# (see also :func:`~torchvision.transforms.functional.hflip`)
# performs horizontal flip of an image, with a given probability.
hflipper = v2.RandomHorizontalFlip(p=0.5)
transformed_imgs = [hflipper(orig_img) for _ in range(4)]
plot([orig_img] + transformed_imgs)
# %%
# RandomVerticalFlip
# ~~~~~~~~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomVerticalFlip` transform
# (see also :func:`~torchvision.transforms.functional.vflip`)
# performs vertical flip of an image, with a given probability.
vflipper = v2.RandomVerticalFlip(p=0.5)
transformed_imgs = [vflipper(orig_img) for _ in range(4)]
plot([orig_img] + transformed_imgs)
# %%
# RandomApply
# ~~~~~~~~~~~
# The :class:`~torchvision.transforms.RandomApply` transform
# randomly applies a list of transforms, with a given probability.
applier = v2.RandomApply(transforms=[v2.RandomCrop(size=(64, 64))], p=0.5)
transformed_imgs = [applier(orig_img) for _ in range(4)]
plot([orig_img] + transformed_imgs)
"""
=============
TVTensors FAQ
=============
.. note::
    Try on `Colab <https://colab.research.google.com/github/pytorch/vision/blob/gh-pages/main/_generated_ipynb_notebooks/plot_tv_tensors.ipynb>`_
    or :ref:`go to the end <sphx_glr_download_auto_examples_transforms_plot_tv_tensors.py>` to download the full example code.

TVTensors are Tensor subclasses introduced together with
``torchvision.transforms.v2``. This example showcases what these TVTensors are
and how they behave.

.. warning::

    **Intended Audience** Unless you're writing your own transforms or your own TVTensors, you
    probably do not need to read this guide. This is a fairly low-level topic
    that most users will not need to worry about: you do not need to understand
    the internals of TVTensors to efficiently rely on
    ``torchvision.transforms.v2``. It may however be useful for advanced users
    trying to implement their own datasets, transforms, or work directly with
    the TVTensors.
"""
# %%
import PIL.Image
import torch
from torchvision import tv_tensors
# %%
# What are TVTensors?
# -------------------
#
# TVTensors are zero-copy tensor subclasses:
tensor = torch.rand(3, 256, 256)
image = tv_tensors.Image(tensor)
assert isinstance(image, torch.Tensor)
assert image.data_ptr() == tensor.data_ptr()
# %%
# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function
# for the input data.
#
# :mod:`torchvision.tv_tensors` supports four types of TVTensors:
#
# * :class:`~torchvision.tv_tensors.Image`
# * :class:`~torchvision.tv_tensors.Video`
# * :class:`~torchvision.tv_tensors.BoundingBoxes`
# * :class:`~torchvision.tv_tensors.Mask`
#
# What can I do with a TVTensor?
# ------------------------------
#
# TVTensors look and feel just like regular tensors - they **are** tensors.
# Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()`` or
# any ``torch.*`` operator will also work on TVTensors. See
# :ref:`tv_tensor_unwrapping_behaviour` for a few gotchas.
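# %%
# For instance (a quick illustration reusing the ``image`` created above), plain
# tensor operations simply work. Note that, as explained in
# :ref:`tv_tensor_unwrapping_behaviour`, the results below are pure tensors
# rather than Images:
print(image.sum())
print(image.flip(-1).shape)
print(torch.stack([image, image]).shape)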
# %%
# .. _tv_tensor_creation:
#
# How do I construct a TVTensor?
# ------------------------------
#
# Using the constructor
# ^^^^^^^^^^^^^^^^^^^^^
#
# Each TVTensor class takes any tensor-like data that can be turned into a :class:`~torch.Tensor`
image = tv_tensors.Image([[[[0, 1], [1, 0]]]])
print(image)
# %%
# Similar to other PyTorch creation ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad``
# parameters.
float_image = tv_tensors.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True)
print(float_image)
# %%
# In addition, :class:`~torchvision.tv_tensors.Image` and :class:`~torchvision.tv_tensors.Mask` can also take a
# :class:`PIL.Image.Image` directly:
image = tv_tensors.Image(PIL.Image.open("../assets/astronaut.jpg"))
print(image.shape, image.dtype)
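# %%
# The other TVTensor kinds are constructed the same way. As a small sketch
# (the shapes below are arbitrary examples chosen for illustration), a
# :class:`~torchvision.tv_tensors.Video` and a :class:`~torchvision.tv_tensors.Mask`
# can be built from plain tensors:
video = tv_tensors.Video(torch.randint(0, 256, (4, 3, 32, 32), dtype=torch.uint8))  # (T, C, H, W)
mask = tv_tensors.Mask(torch.zeros((32, 32), dtype=torch.bool))
print(video.shape, video.dtype)
print(mask.shape, mask.dtype)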
# %%
# Some TVTensors require additional metadata to be passed in order to be constructed. For example,
# :class:`~torchvision.tv_tensors.BoundingBoxes` requires the coordinate format as well as the size of the
# corresponding image (``canvas_size``) alongside the actual values. These
# metadata are required to properly transform the bounding boxes.
bboxes = tv_tensors.BoundingBoxes(
    [[17, 16, 344, 495], [0, 10, 0, 10]],
    format=tv_tensors.BoundingBoxFormat.XYXY,
    canvas_size=image.shape[-2:]
)
print(bboxes)
# %%
# Using ``tv_tensors.wrap()``
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# You can also use the :func:`~torchvision.tv_tensors.wrap` function to wrap a tensor object
# into a TVTensor. This is useful when you already have an object of the
# desired type, which typically happens when writing transforms: you just want
# to wrap the output like the input.
new_bboxes = torch.tensor([0, 20, 30, 40])
new_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes)
assert isinstance(new_bboxes, tv_tensors.BoundingBoxes)
assert new_bboxes.canvas_size == bboxes.canvas_size
# %%
# The metadata of ``new_bboxes`` is the same as ``bboxes``, but you could pass
# it as a parameter to override it.
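# For example, a minimal sketch of overriding the ``canvas_size`` metadata while
# wrapping (the value below is arbitrary, chosen only for illustration):
other_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes, canvas_size=(100, 100))
print(other_bboxes.canvas_size)

# %%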
#
# .. _tv_tensor_unwrapping_behaviour:
#
# I had a TVTensor but now I have a Tensor. Help!
# -----------------------------------------------
#
# By default, operations on :class:`~torchvision.tv_tensors.TVTensor` objects
# will return a pure Tensor:
assert isinstance(bboxes, tv_tensors.BoundingBoxes)
# Shift bboxes by 3 pixels in both H and W
new_bboxes = bboxes + 3
assert isinstance(new_bboxes, torch.Tensor)
assert not isinstance(new_bboxes, tv_tensors.BoundingBoxes)
# %%
# .. note::
#
#     This behavior only affects native ``torch`` operations. If you are using
#     the built-in ``torchvision`` transforms or functionals, you will always get
#     as output the same type that you passed as input (pure ``Tensor`` or
#     ``TVTensor``).
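#
# For example (a small sketch; the built-in ``v2`` transforms accept a lone
# TVTensor such as ``bboxes`` as input):
from torchvision.transforms import v2

flipped_bboxes = v2.RandomHorizontalFlip(p=1.0)(bboxes)
assert isinstance(flipped_bboxes, tv_tensors.BoundingBoxes)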
# %%
# But I want a TVTensor back!
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# You can re-wrap a pure tensor into a TVTensor by just calling the TVTensor
# constructor, or by using the :func:`~torchvision.tv_tensors.wrap` function
# (see more details above in :ref:`tv_tensor_creation`):
new_bboxes = bboxes + 3
new_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes)
assert isinstance(new_bboxes, tv_tensors.BoundingBoxes)
# %%
# Alternatively, you can use the :func:`~torchvision.tv_tensors.set_return_type`
# as a global config setting for the whole program, or as a context manager
# (read its docs to learn more about caveats):
with tv_tensors.set_return_type("TVTensor"):
    new_bboxes = bboxes + 3
assert isinstance(new_bboxes, tv_tensors.BoundingBoxes)
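# %%
# ``set_return_type`` can also be called once, e.g. at program start-up, to set
# the behaviour globally. A minimal sketch (resetting to the default afterwards
# so the rest of this example is unaffected):
tv_tensors.set_return_type("TVTensor")
assert isinstance(bboxes + 3, tv_tensors.BoundingBoxes)
tv_tensors.set_return_type("Tensor")  # back to the default "unwrapping" behaviour
assert not isinstance(bboxes + 3, tv_tensors.BoundingBoxes)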
# %%
# Why is this happening?
# ^^^^^^^^^^^^^^^^^^^^^^
#
# **For performance reasons**. :class:`~torchvision.tv_tensors.TVTensor`
# classes are Tensor subclasses, so any operation involving a
# :class:`~torchvision.tv_tensors.TVTensor` object will go through the
# `__torch_function__
# <https://pytorch.org/docs/stable/notes/extending.html#extending-torch>`_
# protocol. This induces a small overhead, which we want to avoid when possible.
# This doesn't matter for built-in ``torchvision`` transforms because we can
# avoid the overhead there, but it could be a problem in your model's
# ``forward``.
#
# **The alternative isn't much better anyway.** For every operation where
# preserving the :class:`~torchvision.tv_tensors.TVTensor` type makes
# sense, there are just as many operations where returning a pure Tensor is
# preferable: for example, is ``img.sum()`` still an :class:`~torchvision.tv_tensors.Image`?
# If we were to preserve :class:`~torchvision.tv_tensors.TVTensor` types all
# the way, even the model's logits or the output of the loss function would end up
# being of type :class:`~torchvision.tv_tensors.Image`, and surely that's not
# desirable.
#
# .. note::
#
#     This behaviour is something we're actively seeking feedback on. If you find this surprising or if you
#     have any suggestions on how to better support your use-cases, please reach out to us via this issue:
#     https://github.com/pytorch/vision/issues/7319
#
# Exceptions
# ^^^^^^^^^^
#
# There are a few exceptions to this "unwrapping" rule:
# :meth:`~torch.Tensor.clone`, :meth:`~torch.Tensor.to`,
# :meth:`~torch.Tensor.detach`, and :meth:`~torch.Tensor.requires_grad_` retain
# the TVTensor type.
#
# Inplace operations on TVTensors like ``obj.add_()`` will preserve the type of
# ``obj``. However, the **returned** value of inplace operations will be a pure
# tensor:
image = tv_tensors.Image([[[0, 1], [1, 0]]])
new_image = image.add_(1).mul_(2)
# image got transformed in-place and is still a TVTensor Image, but new_image
# is a Tensor. They share the same underlying data and they're equal, just
# different classes.
assert isinstance(image, tv_tensors.Image)
print(image)
assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, tv_tensors.Image)
assert (new_image == image).all()
assert new_image.data_ptr() == image.data_ptr()
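# %%
# Finally, a quick check of the exceptions listed above (a small sketch using a
# fresh floating-point Image, since ``requires_grad_`` needs a floating dtype):
float_img = tv_tensors.Image(torch.rand(3, 8, 8))
assert isinstance(float_img.clone(), tv_tensors.Image)
assert isinstance(float_img.to(torch.float64), tv_tensors.Image)
assert isinstance(float_img.detach(), tv_tensors.Image)
assert isinstance(float_img.requires_grad_(), tv_tensors.Image)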