OpenDAS / vision · Commits

Commit c6722307 (unverified)
Revert vit_h_14 as it breaks our CI (#5259)
Authored by Vasilis Vryniotis on Jan 23, 2022; committed by GitHub on Jan 23, 2022
Parent: 4bf6c6e4
Showing 5 changed files with 0 additions and 47 deletions (+0 −47):
docs/source/models.rst (+0 −2)
hubconf.py (+0 −1)
test/expect/ModelTester.test_vit_h_14_expect.pkl (+0 −0)
torchvision/models/vision_transformer.py (+0 −21)
torchvision/prototype/models/vision_transformer.py (+0 −23)
docs/source/models.rst — view file @ c6722307
@@ -89,7 +89,6 @@ You can construct a model with random weights by calling its constructor:
     vit_b_32 = models.vit_b_32()
     vit_l_16 = models.vit_l_16()
     vit_l_32 = models.vit_l_32()
-    vit_h_14 = models.vit_h_14()
 
 We provide pre-trained models, using the PyTorch :mod:`torch.utils.model_zoo`.
 These can be constructed by passing ``pretrained=True``:
@@ -464,7 +463,6 @@ VisionTransformer
     vit_b_32
     vit_l_16
     vit_l_32
-    vit_h_14
 
 Quantized Models
 ----------------
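For context, the constructor patterns documented in the first hunk still apply to the ViT variants that survive the revert. A minimal sketch of both usages, assuming a torchvision build from around this commit (the boolean ``pretrained`` flag predates the multi-weight API):

```python
import torchvision.models as models

# Build a Vision Transformer with randomly initialized weights.
vit_l_16 = models.vit_l_16()

# Build one with pre-trained weights fetched through torch.utils.model_zoo,
# as the docs describe (assumes weights are published for this variant).
vit_l_16_pretrained = models.vit_l_16(pretrained=True)

# After this commit, models.vit_h_14 no longer exists, so calling it
# raises an AttributeError instead of constructing the model.
```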
hubconf.py — view file @ c6722307
@@ -63,5 +63,4 @@ from torchvision.models.vision_transformer import (
     vit_b_32,
     vit_l_16,
     vit_l_32,
-    vit_h_14,
 )
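Since hubconf.py is the entry-point file that ``torch.hub`` reads, dropping this re-export also removes the model from hub discovery. A small sketch of the effect; the ``pytorch/vision`` repository reference is an assumption about how this mirror is consumed:

```python
import torch

# Entry points come from hubconf.py, so the surviving ViT builders
# remain loadable through torch.hub.
model = torch.hub.load("pytorch/vision", "vit_b_16")

# After this commit, "vit_h_14" is gone from the entry-point list:
print("vit_h_14" in torch.hub.list("pytorch/vision"))  # False
```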
test/expect/ModelTester.test_vit_h_14_expect.pkl — deleted (100644 → 0), view file @ 4bf6c6e4
File suppressed by a .gitattributes entry or the file's encoding is unsupported.
torchvision/models/vision_transformer.py — view file @ c6722307
@@ -15,7 +15,6 @@ __all__ = [
     "vit_b_32",
     "vit_l_16",
     "vit_l_32",
-    "vit_h_14",
 ]
 
 model_urls = {
@@ -357,26 +356,6 @@ def vit_l_32(pretrained: bool = False, progress: bool = True, **kwargs: Any) ->
     )
 
 
-def vit_h_14(pretrained: bool = False, progress: bool = True, **kwargs: Any) -> VisionTransformer:
-    """
-    Constructs a vit_h_14 architecture from
-    `"An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale" <https://arxiv.org/abs/2010.11929>`_.
-
-    NOTE: Pretrained weights are not available for this model.
-    """
-    return _vision_transformer(
-        arch="vit_h_14",
-        patch_size=14,
-        num_layers=32,
-        num_heads=16,
-        hidden_dim=1280,
-        mlp_dim=5120,
-        pretrained=pretrained,
-        progress=progress,
-        **kwargs,
-    )
-
-
 def interpolate_embeddings(
     image_size: int,
     patch_size: int,
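The reverted builder only forwarded fixed hyper-parameters to the shared ``_vision_transformer`` helper, so the ViT-H/14 architecture itself remains constructible from the public ``VisionTransformer`` class. A sketch using the exact values from the deleted function; ``image_size=224`` is an assumption (the repository default) rather than something this diff states:

```python
from torchvision.models.vision_transformer import VisionTransformer

# Recreate the configuration that the reverted vit_h_14() builder
# passed to _vision_transformer.
vit_h_14 = VisionTransformer(
    image_size=224,  # assumed default; not part of the reverted code
    patch_size=14,
    num_layers=32,
    num_heads=16,
    hidden_dim=1280,
    mlp_dim=5120,
)
```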
torchvision/prototype/models/vision_transformer.py — view file @ c6722307
@@ -19,12 +19,10 @@ __all__ = [
     "ViT_B_32_Weights",
     "ViT_L_16_Weights",
     "ViT_L_32_Weights",
-    "ViT_H_14_Weights",
     "vit_b_16",
     "vit_b_32",
     "vit_l_16",
     "vit_l_32",
-    "vit_h_14",
 ]
@@ -105,11 +103,6 @@ class ViT_L_32_Weights(WeightsEnum):
     default = ImageNet1K_V1
 
 
-class ViT_H_14_Weights(WeightsEnum):
-    # Weights are not available yet.
-    pass
-
-
 def _vision_transformer(
     patch_size: int,
     num_layers: int,
@@ -203,19 +196,3 @@ def vit_l_32(*, weights: Optional[ViT_L_32_Weights] = None, progress: bool = Tru
...
@@ -203,19 +196,3 @@ def vit_l_32(*, weights: Optional[ViT_L_32_Weights] = None, progress: bool = Tru
progress
=
progress
,
progress
=
progress
,
**
kwargs
,
**
kwargs
,
)
)
@
handle_legacy_interface
(
weights
=
(
"pretrained"
,
None
))
def
vit_h_14
(
*
,
weights
:
Optional
[
ViT_H_14_Weights
]
=
None
,
progress
:
bool
=
True
,
**
kwargs
:
Any
)
->
VisionTransformer
:
weights
=
ViT_H_14_Weights
.
verify
(
weights
)
return
_vision_transformer
(
patch_size
=
14
,
num_layers
=
32
,
num_heads
=
16
,
hidden_dim
=
1280
,
mlp_dim
=
5120
,
weights
=
weights
,
progress
=
progress
,
**
kwargs
,
)
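The prototype builders follow the multi-weight convention that the removed ``vit_h_14`` also used: an explicit ``weights`` enum member is verified by the builder, while ``@handle_legacy_interface`` maps the old ``pretrained`` flag onto it. A sketch of that calling pattern using ``vit_l_32``, whose ``ImageNet1K_V1`` default member is visible in the hunk above:

```python
from torchvision.prototype.models.vision_transformer import (
    ViT_L_32_Weights,
    vit_l_32,
)

# New-style API: pass an explicit weights enum member (verified internally
# via ViT_L_32_Weights.verify).
model = vit_l_32(weights=ViT_L_32_Weights.ImageNet1K_V1)

# weights=None builds the architecture with random initialization; this is
# all the removed vit_h_14 could do, since ViT_H_14_Weights had no members.
untrained = vit_l_32(weights=None)

# The legacy flag still works through @handle_legacy_interface, which maps
# pretrained=True onto the enum's default member:
# legacy = vit_l_32(pretrained=True)
```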