Unverified Commit 29f38f17 authored by Philip Meier, committed by GitHub

Download model weights in parallel for prototype CI (#4772)

* enable caching of model weights for prototype CI

* syntax

* syntax

* make cache dir dynamic

* increase verbosity

* fix

* use larger CI machine

* revert debug output

* [DEBUG] test env var usage in save_cache

* retry

* use checksum for caching

* remove env vars because expansion is not working

* syntax

* cleanup

* base caching on model-urls

* relax regex

* cleanup skips

* cleanup

* fix skipping logic

* improve step name

* benchmark without caching

* benchmark with external download

* debug

* fix manual download location

* debug again

* download weights in the background

* try parallel download

* add missing import

* use correct decorator

* up resource_class

* fix wording

* enable stdout passthrough to see download during test

* remove linebreak

* move checkout up

* cleanup

* debug failing test

* temp fix

* fix

* cleanup

* fix regex

* remove explicit install of numpy
parent cca16993
.circleci/config.yml
@@ -263,14 +263,23 @@ jobs:
prototype_test:
docker:
- image: circleci/python:3.7
resource_class: xlarge
steps:
- run:
name: Install torch
command: pip install --user --progress-bar=off --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
command: |
pip install --user --progress-bar=off --pre torch -f https://download.pytorch.org/whl/nightly/cpu/torch_nightly.html
- run:
name: Install prototype dependencies
command: pip install --user --progress-bar=off git+https://github.com/pytorch/data.git
- checkout
- run:
name: Download model weights
background: true
command: |
sudo apt update -qy && sudo apt install -qy parallel wget
python scripts/collect_model_urls.py torchvision/prototype/models \
| parallel -j0 wget --no-verbose -P ~/.cache/torch/hub/checkpoints {}
- run:
name: Install torchvision
command: pip install --user --progress-bar off --no-build-isolation .
@@ -279,6 +288,8 @@ jobs:
command: pip install --user --progress-bar=off pytest pytest-mock scipy iopath
- run:
name: Run tests
environment:
PYTORCH_TEST_WITH_PROTOTYPE: 1
command: pytest --junitxml=test-results/junit.xml -v --durations 20 test/test_prototype_*.py
- store_test_results:
path: test-results
......
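The background download step writes into ~/.cache/torch/hub/checkpoints, which is exactly where torch.hub looks before fetching weights over the network. A minimal sketch of that resolution (illustrative only, not part of the commit):

import os
import torch

# torch.hub caches downloads under <torch.hub.get_dir()>/checkpoints, which
# defaults to ~/.cache/torch/hub/checkpoints -- the same directory the CI step
# above pre-populates with wget.
checkpoints = os.path.join(torch.hub.get_dir(), "checkpoints")
print(checkpoints)

# torch.hub.load_state_dict_from_url(url) reuses <checkpoints>/<basename of url>
# when the file already exists, so the model tests never re-download the weights.
if os.path.isdir(checkpoints):
    print(sorted(os.listdir(checkpoints)))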
scripts/collect_model_urls.py
import pathlib
import re
import sys
MODEL_URL_PATTERN = re.compile(r"https://download[.]pytorch[.]org/models/.*?[.]pth")
def main(root):
model_urls = set()
for path in pathlib.Path(root).glob("**/*"):
if path.name.startswith("_") or not path.suffix == ".py":
continue
with open(path, "r") as file:
for line in file:
model_urls.update(MODEL_URL_PATTERN.findall(line))
print("\n".join(sorted(model_urls)))
if __name__ == "__main__":
main(sys.argv[1])
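For a local setup without GNU parallel or wget, roughly the same effect can be approximated in plain Python. This is an illustrative sketch only, not part of the commit; the stdin-piping convention and the file name download_weights.py are assumptions:

import os
import pathlib
import sys
import urllib.request
from concurrent.futures import ThreadPoolExecutor

CHECKPOINT_DIR = pathlib.Path.home() / ".cache" / "torch" / "hub" / "checkpoints"

def download(url):
    # Mirror `wget -P ~/.cache/torch/hub/checkpoints`: keep the original file
    # name and skip files that are already cached.
    CHECKPOINT_DIR.mkdir(parents=True, exist_ok=True)
    target = CHECKPOINT_DIR / os.path.basename(url)
    if not target.exists():
        urllib.request.urlretrieve(url, target)
    return target

if __name__ == "__main__":
    # e.g. python scripts/collect_model_urls.py torchvision/prototype/models | python download_weights.py
    urls = [line.strip() for line in sys.stdin if line.strip()]
    with ThreadPoolExecutor(max_workers=8) as pool:
        list(pool.map(download, urls))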
test/common_utils.py
@@ -4,8 +4,10 @@ import os
import random
import shutil
import tempfile
from distutils.util import strtobool
import numpy as np
import pytest
import torch
from PIL import Image
from torchvision import io
@@ -13,9 +15,18 @@ from torchvision import io
import __main__ # noqa: 401
IN_CIRCLE_CI = os.getenv("CIRCLECI", False) == "true"
IN_RE_WORKER = os.environ.get("INSIDE_RE_WORKER") is not None
IN_FBCODE = os.environ.get("IN_FBCODE_TORCHVISION") == "1"
def get_bool_env_var(name, *, exist_ok=False, default=False):
value = os.getenv(name)
if value is None:
return default
if exist_ok:
return True
return bool(strtobool(value))
IN_CIRCLE_CI = get_bool_env_var("CIRCLECI")
IN_RE_WORKER = get_bool_env_var("INSIDE_RE_WORKER", exist_ok=True)
IN_FBCODE = get_bool_env_var("IN_FBCODE_TORCHVISION")
CUDA_NOT_AVAILABLE_MSG = "CUDA device not available"
CIRCLECI_GPU_NO_CUDA_MSG = "We're in a CircleCI GPU machine, and this test doesn't need cuda."
@@ -202,3 +213,7 @@ def _test_fn_on_batch(batch_tensors, fn, scripted_fn_atol=1e-8, **fn_kwargs):
# scriptable function test
s_transformed_batch = scripted_fn(batch_tensors, **fn_kwargs)
torch.testing.assert_close(transformed_batch, s_transformed_batch, rtol=1e-5, atol=scripted_fn_atol)
def run_on_env_var(name, *, skip_reason=None, exist_ok=False, default=False):
return pytest.mark.skipif(not get_bool_env_var(name, exist_ok=exist_ok, default=default), reason=skip_reason)
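A quick illustration of the semantics of get_bool_env_var (the helper is restated so the snippet is self-contained; the variable values are assumed, not taken from the diff): a set variable is parsed with strtobool, exist_ok=True only checks that the variable exists, and an unset variable falls back to default.

import os
from distutils.util import strtobool

def get_bool_env_var(name, *, exist_ok=False, default=False):
    value = os.getenv(name)
    if value is None:
        return default
    if exist_ok:
        return True
    return bool(strtobool(value))

os.environ["CIRCLECI"] = "true"
assert get_bool_env_var("CIRCLECI") is True        # strtobool("true") -> 1
os.environ["CIRCLECI"] = "0"
assert get_bool_env_var("CIRCLECI") is False       # strtobool("0") -> 0
os.environ["INSIDE_RE_WORKER"] = ""                # set, but empty
assert get_bool_env_var("INSIDE_RE_WORKER", exist_ok=True) is True
assert get_bool_env_var("SOME_UNSET_VARIABLE") is False  # falls back to `default`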
test/test_prototype_models.py
import importlib
import os
import pytest
import test_models as TM
import torch
from common_utils import cpu_and_gpu
from common_utils import cpu_and_gpu, run_on_env_var
from torchvision.prototype import models
run_if_test_with_prototype = run_on_env_var(
"PYTORCH_TEST_WITH_PROTOTYPE",
skip_reason="Prototype tests are disabled by default. Set PYTORCH_TEST_WITH_PROTOTYPE=1 to run them.",
)
def _get_original_model(model_fn):
original_module_name = model_fn.__module__.replace(".prototype", "")
@@ -48,34 +52,34 @@ def test_get_weight(model_fn, name, weight):
@pytest.mark.parametrize("model_fn", TM.get_models_from_module(models))
@pytest.mark.parametrize("dev", cpu_and_gpu())
@pytest.mark.skipif(os.getenv("PYTORCH_TEST_WITH_PROTOTYPE", "0") == "0", reason="Prototype code tests are disabled")
@run_if_test_with_prototype
def test_classification_model(model_fn, dev):
TM.test_classification_model(model_fn, dev)
@pytest.mark.parametrize("model_fn", TM.get_models_from_module(models.detection))
@pytest.mark.parametrize("dev", cpu_and_gpu())
@pytest.mark.skipif(os.getenv("PYTORCH_TEST_WITH_PROTOTYPE", "0") == "0", reason="Prototype code tests are disabled")
@run_if_test_with_prototype
def test_detection_model(model_fn, dev):
TM.test_detection_model(model_fn, dev)
@pytest.mark.parametrize("model_fn", TM.get_models_from_module(models.quantization))
@pytest.mark.skipif(os.getenv("PYTORCH_TEST_WITH_PROTOTYPE", "0") == "0", reason="Prototype code tests are disabled")
@run_if_test_with_prototype
def test_quantized_classification_model(model_fn):
TM.test_quantized_classification_model(model_fn)
@pytest.mark.parametrize("model_fn", TM.get_models_from_module(models.segmentation))
@pytest.mark.parametrize("dev", cpu_and_gpu())
@pytest.mark.skipif(os.getenv("PYTORCH_TEST_WITH_PROTOTYPE", "0") == "0", reason="Prototype code tests are disabled")
@run_if_test_with_prototype
def test_segmentation_model(model_fn, dev):
TM.test_segmentation_model(model_fn, dev)
@pytest.mark.parametrize("model_fn", TM.get_models_from_module(models.video))
@pytest.mark.parametrize("dev", cpu_and_gpu())
@pytest.mark.skipif(os.getenv("PYTORCH_TEST_WITH_PROTOTYPE", "0") == "0", reason="Prototype code tests are disabled")
@run_if_test_with_prototype
def test_video_model(model_fn, dev):
TM.test_video_model(model_fn, dev)
@@ -89,7 +93,7 @@ def test_video_model(model_fn, dev):
+ get_models_with_module_names(models.video),
)
@pytest.mark.parametrize("dev", cpu_and_gpu())
@pytest.mark.skipif(os.getenv("PYTORCH_TEST_WITH_PROTOTYPE", "0") == "0", reason="Prototype code tests are disabled")
@run_if_test_with_prototype
def test_old_vs_new_factory(model_fn, module_name, dev):
defaults = {
"models": {
......
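Because of the run_if_test_with_prototype marker, a local run needs PYTORCH_TEST_WITH_PROTOTYPE set, mirroring the environment: block in the CI config above. One illustrative way to do that from Python (the exact invocation is an assumption, not part of the commit):

import os
import pytest

# Set the variable before pytest imports the test module, so the skipif
# condition produced by run_if_test_with_prototype evaluates to False.
os.environ["PYTORCH_TEST_WITH_PROTOTYPE"] = "1"
raise SystemExit(pytest.main(["-v", "--durations", "20", "test/test_prototype_models.py"]))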