Unverified Commit 2ec0e847 authored by Nicolas Hug, committed by GitHub

New schema for metrics in weights meta-data (#6047)

* Classif models

* Detection

* Segmentation

* quantization

* Video

* optical flow

* tests

* Fix docs

* Fix Video dataset

* Consistency for RAFT dataset names

* use ImageNet-1K

* Use COCO-val2017-VOC-labels for segmentation

* formatting
parent 2a35dde3
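In short, this commit replaces the flat "metrics" dict in each weight's meta with a "_metrics" dict keyed by dataset name. A minimal before/after sketch (not part of the commit itself; the values are copied from the AlexNet entry in this diff):

# Old schema: metrics were a flat dict and the evaluation dataset was implicit.
old_meta = {
    "metrics": {
        "acc@1": 56.522,
        "acc@5": 79.066,
    },
}

# New schema: "_metrics" is keyed by dataset name, so a single weight entry
# can report results on several datasets (as the RAFT weights below do).
new_meta = {
    "_metrics": {
        "ImageNet-1K": {
            "acc@1": 56.522,
            "acc@5": 79.066,
        },
    },
}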
......@@ -334,25 +334,22 @@ def inject_weight_metadata(app, what, name, obj, options, lines):
lines.append("")
for field in obj:
lines += [f"**{str(field)}**:", ""]
table = []
# the `meta` dict contains another embedded `metrics` dict. To
# simplify the table generation below, we create the
# `meta_with_metrics` dict, where the metrics dict has been "flattened"
meta = copy(field.meta)
metrics = meta.pop("metrics", {})
meta_with_metrics = dict(meta, **metrics)
lines += [meta_with_metrics.pop("_docs")]
lines += [f"**{str(field)}**:", ""]
lines += [meta.pop("_docs")]
if field == obj.DEFAULT:
lines += [f"Also available as ``{obj.__name__}.DEFAULT``."]
lines += [""]
for k, v in meta_with_metrics.items():
table = []
metrics = meta.pop("_metrics")
for dataset, dataset_metrics in metrics.items():
for metric_name, metric_value in dataset_metrics.items():
table.append((f"{metric_name} (on {dataset})", str(metric_value)))
for k, v in meta.items():
if k in {"recipe", "license"}:
v = f"`link <{v}>`__"
elif k == "min_size":
......@@ -374,7 +371,7 @@ def inject_weight_metadata(app, what, name, obj, options, lines):
lines.append("")
def generate_weights_table(module, table_name, metrics, include_patterns=None, exclude_patterns=None):
def generate_weights_table(module, table_name, metrics, dataset, include_patterns=None, exclude_patterns=None):
weights_endswith = "_QuantizedWeights" if module.__name__.split(".")[-1] == "quantization" else "_Weights"
weight_enums = [getattr(module, name) for name in dir(module) if name.endswith(weights_endswith)]
weights = [w for weight_enum in weight_enums for w in weight_enum]
......@@ -391,7 +388,7 @@ def generate_weights_table(module, table_name, metrics, include_patterns=None, e
content = [
(
f":class:`{w} <{type(w).__name__}>`",
*(w.meta["metrics"][metric] for metric in metrics_keys),
*(w.meta["_metrics"][dataset][metric] for metric in metrics_keys),
f"{w.meta['num_params']/1e6:.1f}M",
f"`link <{w.meta['recipe']}>`__",
)
......@@ -408,29 +405,45 @@ def generate_weights_table(module, table_name, metrics, include_patterns=None, e
table_file.write(f"{textwrap.indent(table, ' ' * 4)}\n\n")
generate_weights_table(module=M, table_name="classification", metrics=[("acc@1", "Acc@1"), ("acc@5", "Acc@5")])
generate_weights_table(
module=M.quantization, table_name="classification_quant", metrics=[("acc@1", "Acc@1"), ("acc@5", "Acc@5")]
module=M, table_name="classification", metrics=[("acc@1", "Acc@1"), ("acc@5", "Acc@5")], dataset="ImageNet-1K"
)
generate_weights_table(
module=M.quantization,
table_name="classification_quant",
metrics=[("acc@1", "Acc@1"), ("acc@5", "Acc@5")],
dataset="ImageNet-1K",
)
generate_weights_table(
module=M.detection, table_name="detection", metrics=[("box_map", "Box MAP")], exclude_patterns=["Mask", "Keypoint"]
module=M.detection,
table_name="detection",
metrics=[("box_map", "Box MAP")],
exclude_patterns=["Mask", "Keypoint"],
dataset="COCO-val2017",
)
generate_weights_table(
module=M.detection,
table_name="instance_segmentation",
metrics=[("box_map", "Box MAP"), ("mask_map", "Mask MAP")],
dataset="COCO-val2017",
include_patterns=["Mask"],
)
generate_weights_table(
module=M.detection,
table_name="detection_keypoint",
metrics=[("box_map", "Box MAP"), ("kp_map", "Keypoint MAP")],
dataset="COCO-val2017",
include_patterns=["Keypoint"],
)
generate_weights_table(
module=M.segmentation, table_name="segmentation", metrics=[("miou", "Mean IoU"), ("pixel_acc", "pixelwise Acc")]
module=M.segmentation,
table_name="segmentation",
metrics=[("miou", "Mean IoU"), ("pixel_acc", "pixelwise Acc")],
dataset="COCO-val2017-VOC-labels",
)
generate_weights_table(
module=M.video, table_name="video", metrics=[("acc@1", "Acc@1"), ("acc@5", "Acc@5")], dataset="Kinetics-400"
)
generate_weights_table(module=M.video, table_name="video", metrics=[("acc@1", "Acc@1"), ("acc@5", "Acc@5")])
def setup(app):
......
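For the per-weight documentation, the updated inject_weight_metadata loop above flattens the nested "_metrics" dict into table rows labeled with the dataset each number was measured on. A small self-contained sketch of that flattening, using an assumed Mask R-CNN-style meta dict:

# Assumed example input; the real code iterates over each weight's meta at doc-build time.
meta = {
    "_metrics": {
        "COCO-val2017": {"box_map": 37.9, "mask_map": 34.6},
    },
}

table = []
metrics = meta.pop("_metrics")
for dataset, dataset_metrics in metrics.items():
    for metric_name, metric_value in dataset_metrics.items():
        # Each metric row is labeled with the dataset it was measured on.
        table.append((f"{metric_name} (on {dataset})", str(metric_value)))

print(table)
# [('box_map (on COCO-val2017)', '37.9'), ('mask_map (on COCO-val2017)', '34.6')]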
......@@ -85,7 +85,7 @@ def test_schema_meta_validation(model_fn):
"categories",
"keypoint_names",
"license",
"metrics",
"_metrics",
"min_size",
"num_params",
"recipe",
......@@ -93,19 +93,23 @@ def test_schema_meta_validation(model_fn):
"_docs",
}
# mandatory fields for each computer vision task
classification_fields = {"categories", ("metrics", "acc@1"), ("metrics", "acc@5")}
classification_fields = {"categories", ("_metrics", "ImageNet-1K", "acc@1"), ("_metrics", "ImageNet-1K", "acc@5")}
defaults = {
"all": {"metrics", "min_size", "num_params", "recipe", "_docs"},
"all": {"_metrics", "min_size", "num_params", "recipe", "_docs"},
"models": classification_fields,
"detection": {"categories", ("metrics", "box_map")},
"detection": {"categories", ("_metrics", "COCO-val2017", "box_map")},
"quantization": classification_fields | {"backend", "unquantized"},
"segmentation": {"categories", ("metrics", "miou"), ("metrics", "pixel_acc")},
"video": classification_fields,
"segmentation": {
"categories",
("_metrics", "COCO-val2017-VOC-labels", "miou"),
("_metrics", "COCO-val2017-VOC-labels", "pixel_acc"),
},
"video": {"categories", ("_metrics", "Kinetics-400", "acc@1"), ("_metrics", "Kinetics-400", "acc@5")},
"optical_flow": set(),
}
model_name = model_fn.__name__
module_name = model_fn.__module__.split(".")[-2]
fields = defaults["all"] | defaults[module_name]
expected_fields = defaults["all"] | defaults[module_name]
weights_enum = _get_model_weights(model_fn)
if len(weights_enum) == 0:
......@@ -115,7 +119,13 @@ def test_schema_meta_validation(model_fn):
incorrect_params = []
bad_names = []
for w in weights_enum:
missing_fields = fields - (set(w.meta.keys()) | set(("metrics", x) for x in w.meta.get("metrics", {}).keys()))
actual_fields = set(w.meta.keys())
actual_fields |= set(
("_metrics", dataset, metric_key)
for dataset in w.meta.get("_metrics", {}).keys()
for metric_key in w.meta.get("_metrics", {}).get(dataset, {}).keys()
)
missing_fields = expected_fields - actual_fields
unsupported_fields = set(w.meta.keys()) - permitted_fields
if missing_fields or unsupported_fields:
problematic_weights[w] = {"missing": missing_fields, "unsupported": unsupported_fields}
......
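The test now validates the nested schema by expanding each weight's "_metrics" into ("_metrics", dataset, metric) triples before comparing against the per-task expected fields. A reduced sketch of that check, with an assumed classification-style meta dict:

# Assumed example data; the real test runs this over every weights enum.
meta = {
    "categories": ["tench", "goldfish"],  # hypothetical shortened category list
    "_metrics": {"ImageNet-1K": {"acc@1": 56.522, "acc@5": 79.066}},
}

expected_fields = {
    "categories",
    ("_metrics", "ImageNet-1K", "acc@1"),
    ("_metrics", "ImageNet-1K", "acc@5"),
}

# Expand the nested "_metrics" dict into ("_metrics", dataset, metric) triples
# so it can be compared against the expected field set.
actual_fields = set(meta.keys())
actual_fields |= {
    ("_metrics", dataset, metric_key)
    for dataset, dataset_metrics in meta.get("_metrics", {}).items()
    for metric_key in dataset_metrics
}

missing_fields = expected_fields - actual_fields
assert not missing_fields  # passes for this example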
......@@ -61,9 +61,11 @@ class AlexNet_Weights(WeightsEnum):
"min_size": (63, 63),
"categories": _IMAGENET_CATEGORIES,
"recipe": "https://github.com/pytorch/vision/tree/main/references/classification#alexnet-and-vgg",
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 56.522,
"acc@5": 79.066,
}
},
"_docs": """
These weights reproduce closely the results of the paper using a simplified training recipe.
......
......@@ -222,9 +222,11 @@ class ConvNeXt_Tiny_Weights(WeightsEnum):
meta={
**_COMMON_META,
"num_params": 28589128,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 82.520,
"acc@5": 96.146,
}
},
},
)
......@@ -238,9 +240,11 @@ class ConvNeXt_Small_Weights(WeightsEnum):
meta={
**_COMMON_META,
"num_params": 50223688,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 83.616,
"acc@5": 96.650,
}
},
},
)
......@@ -254,9 +258,11 @@ class ConvNeXt_Base_Weights(WeightsEnum):
meta={
**_COMMON_META,
"num_params": 88591464,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 84.062,
"acc@5": 96.870,
}
},
},
)
......@@ -270,9 +276,11 @@ class ConvNeXt_Large_Weights(WeightsEnum):
meta={
**_COMMON_META,
"num_params": 197767336,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 84.414,
"acc@5": 96.976,
}
},
},
)
......
......@@ -272,9 +272,11 @@ class DenseNet121_Weights(WeightsEnum):
meta={
**_COMMON_META,
"num_params": 7978856,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 74.434,
"acc@5": 91.972,
}
},
},
)
......@@ -288,9 +290,11 @@ class DenseNet161_Weights(WeightsEnum):
meta={
**_COMMON_META,
"num_params": 28681000,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 77.138,
"acc@5": 93.560,
}
},
},
)
......@@ -304,9 +308,11 @@ class DenseNet169_Weights(WeightsEnum):
meta={
**_COMMON_META,
"num_params": 14149480,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 75.600,
"acc@5": 92.806,
}
},
},
)
......@@ -320,9 +326,11 @@ class DenseNet201_Weights(WeightsEnum):
meta={
**_COMMON_META,
"num_params": 20013928,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 76.896,
"acc@5": 93.370,
}
},
},
)
......
......@@ -383,8 +383,10 @@ class FasterRCNN_ResNet50_FPN_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 41755286,
"recipe": "https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-resnet-50-fpn",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 37.0,
}
},
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
......@@ -400,8 +402,10 @@ class FasterRCNN_ResNet50_FPN_V2_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 43712278,
"recipe": "https://github.com/pytorch/vision/pull/5763",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 46.7,
}
},
"_docs": """These weights were produced using an enhanced training recipe to boost the model accuracy.""",
},
......@@ -417,8 +421,10 @@ class FasterRCNN_MobileNet_V3_Large_FPN_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 19386354,
"recipe": "https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-mobilenetv3-large-fpn",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 32.8,
}
},
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
......@@ -434,8 +440,10 @@ class FasterRCNN_MobileNet_V3_Large_320_FPN_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 19386354,
"recipe": "https://github.com/pytorch/vision/tree/main/references/detection#faster-r-cnn-mobilenetv3-large-320-fpn",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 22.8,
}
},
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
......
......@@ -658,8 +658,10 @@ class FCOS_ResNet50_FPN_Weights(WeightsEnum):
"categories": _COCO_CATEGORIES,
"min_size": (1, 1),
"recipe": "https://github.com/pytorch/vision/tree/main/references/detection#fcos-resnet-50-fpn",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 39.2,
}
},
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
......
......@@ -322,9 +322,11 @@ class KeypointRCNN_ResNet50_FPN_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 59137258,
"recipe": "https://github.com/pytorch/vision/issues/1606",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 50.6,
"kp_map": 61.1,
}
},
"_docs": """
These weights were produced by following a similar training recipe as on the paper but use a checkpoint
......@@ -339,9 +341,11 @@ class KeypointRCNN_ResNet50_FPN_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 59137258,
"recipe": "https://github.com/pytorch/vision/tree/main/references/detection#keypoint-r-cnn",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 54.6,
"kp_map": 65.0,
}
},
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
......
......@@ -364,9 +364,11 @@ class MaskRCNN_ResNet50_FPN_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 44401393,
"recipe": "https://github.com/pytorch/vision/tree/main/references/detection#mask-r-cnn",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 37.9,
"mask_map": 34.6,
}
},
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
......@@ -382,9 +384,11 @@ class MaskRCNN_ResNet50_FPN_V2_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 46359409,
"recipe": "https://github.com/pytorch/vision/pull/5773",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 47.4,
"mask_map": 41.8,
}
},
"_docs": """These weights were produced using an enhanced training recipe to boost the model accuracy.""",
},
......
......@@ -687,8 +687,10 @@ class RetinaNet_ResNet50_FPN_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 34014999,
"recipe": "https://github.com/pytorch/vision/tree/main/references/detection#retinanet",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 36.4,
}
},
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
......@@ -704,8 +706,10 @@ class RetinaNet_ResNet50_FPN_V2_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 38198935,
"recipe": "https://github.com/pytorch/vision/pull/5756",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 41.5,
}
},
"_docs": """These weights were produced using an enhanced training recipe to boost the model accuracy.""",
},
......
......@@ -34,8 +34,10 @@ class SSD300_VGG16_Weights(WeightsEnum):
"categories": _COCO_CATEGORIES,
"min_size": (1, 1),
"recipe": "https://github.com/pytorch/vision/tree/main/references/detection#ssd300-vgg16",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 25.1,
}
},
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
......
......@@ -193,8 +193,10 @@ class SSDLite320_MobileNet_V3_Large_Weights(WeightsEnum):
"categories": _COCO_CATEGORIES,
"min_size": (1, 1),
"recipe": "https://github.com/pytorch/vision/tree/main/references/detection#ssdlite320-mobilenetv3-large",
"metrics": {
"_metrics": {
"COCO-val2017": {
"box_map": 21.3,
}
},
"_docs": """These weights were produced by following a similar training recipe as on the paper.""",
},
......
......@@ -458,9 +458,11 @@ class EfficientNet_B0_Weights(WeightsEnum):
meta={
**_COMMON_META_V1,
"num_params": 5288548,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 77.692,
"acc@5": 93.532,
}
},
"_docs": """These weights are ported from the original paper.""",
},
......@@ -478,9 +480,11 @@ class EfficientNet_B1_Weights(WeightsEnum):
meta={
**_COMMON_META_V1,
"num_params": 7794184,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 78.642,
"acc@5": 94.186,
}
},
"_docs": """These weights are ported from the original paper.""",
},
......@@ -494,9 +498,11 @@ class EfficientNet_B1_Weights(WeightsEnum):
**_COMMON_META_V1,
"num_params": 7794184,
"recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-lr-wd-crop-tuning",
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 79.838,
"acc@5": 94.934,
}
},
"_docs": """
These weights improve upon the results of the original paper by using a modified version of TorchVision's
......@@ -518,9 +524,11 @@ class EfficientNet_B2_Weights(WeightsEnum):
meta={
**_COMMON_META_V1,
"num_params": 9109994,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 80.608,
"acc@5": 95.310,
}
},
"_docs": """These weights are ported from the original paper.""",
},
......@@ -538,9 +546,11 @@ class EfficientNet_B3_Weights(WeightsEnum):
meta={
**_COMMON_META_V1,
"num_params": 12233232,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 82.008,
"acc@5": 96.054,
}
},
"_docs": """These weights are ported from the original paper.""",
},
......@@ -558,9 +568,11 @@ class EfficientNet_B4_Weights(WeightsEnum):
meta={
**_COMMON_META_V1,
"num_params": 19341616,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 83.384,
"acc@5": 96.594,
}
},
"_docs": """These weights are ported from the original paper.""",
},
......@@ -578,9 +590,11 @@ class EfficientNet_B5_Weights(WeightsEnum):
meta={
**_COMMON_META_V1,
"num_params": 30389784,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 83.444,
"acc@5": 96.628,
}
},
"_docs": """These weights are ported from the original paper.""",
},
......@@ -598,9 +612,11 @@ class EfficientNet_B6_Weights(WeightsEnum):
meta={
**_COMMON_META_V1,
"num_params": 43040704,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 84.008,
"acc@5": 96.916,
}
},
"_docs": """These weights are ported from the original paper.""",
},
......@@ -618,9 +634,11 @@ class EfficientNet_B7_Weights(WeightsEnum):
meta={
**_COMMON_META_V1,
"num_params": 66347960,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 84.122,
"acc@5": 96.908,
}
},
"_docs": """These weights are ported from the original paper.""",
},
......@@ -640,9 +658,11 @@ class EfficientNet_V2_S_Weights(WeightsEnum):
meta={
**_COMMON_META_V2,
"num_params": 21458488,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 84.228,
"acc@5": 96.878,
}
},
"_docs": """
These weights improve upon the results of the original paper by using a modified version of TorchVision's
......@@ -666,9 +686,11 @@ class EfficientNet_V2_M_Weights(WeightsEnum):
meta={
**_COMMON_META_V2,
"num_params": 54139356,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 85.112,
"acc@5": 97.156,
}
},
"_docs": """
These weights improve upon the results of the original paper by using a modified version of TorchVision's
......@@ -695,9 +717,11 @@ class EfficientNet_V2_L_Weights(WeightsEnum):
meta={
**_COMMON_META_V2,
"num_params": 118515272,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 85.808,
"acc@5": 97.788,
}
},
"_docs": """These weights are ported from the original paper.""",
},
......
......@@ -284,9 +284,11 @@ class GoogLeNet_Weights(WeightsEnum):
"min_size": (15, 15),
"categories": _IMAGENET_CATEGORIES,
"recipe": "https://github.com/pytorch/vision/tree/main/references/classification#googlenet",
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 69.778,
"acc@5": 89.530,
}
},
"_docs": """These weights are ported from the original paper.""",
},
......
......@@ -416,9 +416,11 @@ class Inception_V3_Weights(WeightsEnum):
"min_size": (75, 75),
"categories": _IMAGENET_CATEGORIES,
"recipe": "https://github.com/pytorch/vision/tree/main/references/classification#inception-v3",
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 77.294,
"acc@5": 93.450,
}
},
"_docs": """These weights are ported from the original paper.""",
},
......
......@@ -225,9 +225,11 @@ class MNASNet0_5_Weights(WeightsEnum):
meta={
**_COMMON_META,
"num_params": 2218512,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 67.734,
"acc@5": 87.490,
}
},
"_docs": """These weights reproduce closely the results of the paper.""",
},
......@@ -243,9 +245,11 @@ class MNASNet0_75_Weights(WeightsEnum):
**_COMMON_META,
"recipe": "https://github.com/pytorch/vision/pull/6019",
"num_params": 3170208,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 71.180,
"acc@5": 90.496,
}
},
"_docs": """
These weights were trained from scratch by using TorchVision's `new training recipe
......@@ -263,9 +267,11 @@ class MNASNet1_0_Weights(WeightsEnum):
meta={
**_COMMON_META,
"num_params": 4383312,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 73.456,
"acc@5": 91.510,
}
},
"_docs": """These weights reproduce closely the results of the paper.""",
},
......@@ -281,9 +287,11 @@ class MNASNet1_3_Weights(WeightsEnum):
**_COMMON_META,
"recipe": "https://github.com/pytorch/vision/pull/6019",
"num_params": 6282256,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 76.506,
"acc@5": 93.522,
}
},
"_docs": """
These weights were trained from scratch by using TorchVision's `new training recipe
......
......@@ -208,9 +208,11 @@ class MobileNet_V2_Weights(WeightsEnum):
meta={
**_COMMON_META,
"recipe": "https://github.com/pytorch/vision/tree/main/references/classification#mobilenetv2",
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 71.878,
"acc@5": 90.286,
}
},
"_docs": """These weights reproduce closely the results of the paper using a simple training recipe.""",
},
......@@ -221,9 +223,11 @@ class MobileNet_V2_Weights(WeightsEnum):
meta={
**_COMMON_META,
"recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-reg-tuning",
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 72.154,
"acc@5": 90.822,
}
},
"_docs": """
These weights improve upon the results of the original paper by using a modified version of TorchVision's
......
......@@ -317,9 +317,11 @@ class MobileNet_V3_Large_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 5483032,
"recipe": "https://github.com/pytorch/vision/tree/main/references/classification#mobilenetv3-large--small",
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 74.042,
"acc@5": 91.340,
}
},
"_docs": """These weights were trained from scratch by using a simple training recipe.""",
},
......@@ -331,9 +333,11 @@ class MobileNet_V3_Large_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 5483032,
"recipe": "https://github.com/pytorch/vision/issues/3995#new-recipe-with-reg-tuning",
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 75.274,
"acc@5": 92.566,
}
},
"_docs": """
These weights improve marginally upon the results of the original paper by using a modified version of
......@@ -353,9 +357,11 @@ class MobileNet_V3_Small_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 2542856,
"recipe": "https://github.com/pytorch/vision/tree/main/references/classification#mobilenetv3-large--small",
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 67.668,
"acc@5": 87.402,
}
},
"_docs": """
These weights improve upon the results of the original paper by using a simple training recipe.
......
......@@ -525,11 +525,10 @@ class Raft_Large_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 5257536,
"recipe": "https://github.com/princeton-vl/RAFT",
"metrics": {
"sintel_train_cleanpass_epe": 1.4411,
"sintel_train_finalpass_epe": 2.7894,
"kitti_train_per_image_epe": 5.0172,
"kitti_train_fl_all": 17.4506,
"_metrics": {
"Sintel-Train-Cleanpass": {"epe": 1.4411},
"Sintel-Train-Finalpass": {"epe": 2.7894},
"Kitti-Train": {"per_image_epe": 5.0172, "fl_all": 17.4506},
},
"_docs": """These weights were ported from the original paper. They are trained on Chairs + Things.""",
},
......@@ -542,11 +541,10 @@ class Raft_Large_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 5257536,
"recipe": "https://github.com/pytorch/vision/tree/main/references/optical_flow",
"metrics": {
"sintel_train_cleanpass_epe": 1.3822,
"sintel_train_finalpass_epe": 2.7161,
"kitti_train_per_image_epe": 4.5118,
"kitti_train_fl_all": 16.0679,
"_metrics": {
"Sintel-Train-Cleanpass": {"epe": 1.3822},
"Sintel-Train-Finalpass": {"epe": 2.7161},
"Kitti-Train": {"per_image_epe": 4.5118, "fl_all": 16.0679},
},
"_docs": """These weights were trained from scratch on Chairs + Things.""",
},
......@@ -560,9 +558,9 @@ class Raft_Large_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 5257536,
"recipe": "https://github.com/princeton-vl/RAFT",
"metrics": {
"sintel_test_cleanpass_epe": 1.94,
"sintel_test_finalpass_epe": 3.18,
"_metrics": {
"Sintel-Test-Cleanpass": {"epe": 1.94},
"Sintel-Test-Finalpass": {"epe": 3.18},
},
"_docs": """
These weights were ported from the original paper. They are trained on Chairs + Things and fine-tuned on
......@@ -578,9 +576,9 @@ class Raft_Large_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 5257536,
"recipe": "https://github.com/pytorch/vision/tree/main/references/optical_flow",
"metrics": {
"sintel_test_cleanpass_epe": 1.819,
"sintel_test_finalpass_epe": 3.067,
"_metrics": {
"Sintel-Test-Cleanpass": {"epe": 1.819},
"Sintel-Test-Finalpass": {"epe": 3.067},
},
"_docs": """
These weights were trained from scratch on Chairs + Things and fine-tuned on Sintel (C+T+S+K+H).
......@@ -596,8 +594,8 @@ class Raft_Large_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 5257536,
"recipe": "https://github.com/princeton-vl/RAFT",
"metrics": {
"kitti_test_fl_all": 5.10,
"_metrics": {
"Kitti-Test": {"fl_all": 5.10},
},
"_docs": """
These weights were ported from the original paper. They are trained on Chairs + Things, fine-tuned on
......@@ -613,8 +611,8 @@ class Raft_Large_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 5257536,
"recipe": "https://github.com/pytorch/vision/tree/main/references/optical_flow",
"metrics": {
"kitti_test_fl_all": 5.19,
"_metrics": {
"Kitti-Test": {"fl_all": 5.19},
},
"_docs": """
These weights were trained from scratch on Chairs + Things, fine-tuned on Sintel and then on Kitti.
......@@ -634,11 +632,10 @@ class Raft_Small_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 990162,
"recipe": "https://github.com/princeton-vl/RAFT",
"metrics": {
"sintel_train_cleanpass_epe": 2.1231,
"sintel_train_finalpass_epe": 3.2790,
"kitti_train_per_image_epe": 7.6557,
"kitti_train_fl_all": 25.2801,
"_metrics": {
"Sintel-Train-Cleanpass": {"epe": 2.1231},
"Sintel-Train-Finalpass": {"epe": 3.2790},
"Kitti-Train": {"per_image_epe": 7.6557, "fl_all": 25.2801},
},
"_docs": """These weights were ported from the original paper. They are trained on Chairs + Things.""",
},
......@@ -650,11 +647,10 @@ class Raft_Small_Weights(WeightsEnum):
**_COMMON_META,
"num_params": 990162,
"recipe": "https://github.com/pytorch/vision/tree/main/references/optical_flow",
"metrics": {
"sintel_train_cleanpass_epe": 1.9901,
"sintel_train_finalpass_epe": 3.2831,
"kitti_train_per_image_epe": 7.5978,
"kitti_train_fl_all": 25.2369,
"_metrics": {
"Sintel-Train-Cleanpass": {"epe": 1.9901},
"Sintel-Train-Finalpass": {"epe": 3.2831},
"Kitti-Train": {"per_image_epe": 7.5978, "fl_all": 25.2369},
},
"_docs": """These weights were trained from scratch on Chairs + Things.""",
},
......
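The RAFT entries above are where the dataset-keyed schema pays off most: the old flat keys encoded the dataset inside each metric name, while the new schema groups metrics under consistent dataset names shared across weights. A minimal sketch of the renaming, with values copied from the C_T_V1 weights above:

# Old flat schema: the dataset was baked into every metric key.
old_metrics = {
    "sintel_train_cleanpass_epe": 1.4411,
    "kitti_train_fl_all": 17.4506,
}

# New schema: one sub-dict per dataset, with consistent dataset names
# ("Sintel-Train-Cleanpass", "Kitti-Train", ...) reused across weight entries.
new_metrics = {
    "Sintel-Train-Cleanpass": {"epe": 1.4411},
    "Kitti-Train": {"fl_all": 17.4506},
}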
......@@ -117,9 +117,11 @@ class GoogLeNet_QuantizedWeights(WeightsEnum):
"backend": "fbgemm",
"recipe": "https://github.com/pytorch/vision/tree/main/references/classification#post-training-quantized-models",
"unquantized": GoogLeNet_Weights.IMAGENET1K_V1,
"metrics": {
"_metrics": {
"ImageNet-1K": {
"acc@1": 69.826,
"acc@5": 89.404,
}
},
"_docs": """
These weights were produced by doing Post Training Quantization (eager mode) on top of the unquantized
......