diff --git a/.circleci/.gitignore b/.circleci/.gitignore deleted file mode 100644 index 485dee64bcfb48793379b200a1afd14e85a8aaf4..0000000000000000000000000000000000000000 --- a/.circleci/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.idea diff --git a/.circleci/build_docs/commit_docs.sh b/.circleci/build_docs/commit_docs.sh deleted file mode 100755 index 04e3538fefc8ca179c7d678f0c88efba433525de..0000000000000000000000000000000000000000 --- a/.circleci/build_docs/commit_docs.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -set -ex - - -if [ "$2" == "" ]; then - echo call as "$0" "" "" - echo where src is the root of the built documentation git checkout and - echo branch should be "main" or "1.7" or so - exit 1 -fi - -src=$1 -target=$2 - -echo "committing docs from ${src} to ${target}" - -pushd "${src}" -git checkout gh-pages -mkdir -p ./"${target}" -rm -rf ./"${target}"/* -cp -r "${src}/docs/build/html/"* ./"$target" -if [ "${target}" == "main" ]; then - mkdir -p ./_static - rm -rf ./_static/* - cp -r "${src}/docs/build/html/_static/"* ./_static - git add --all ./_static || true -fi -git add --all ./"${target}" || true -git config user.email "soumith+bot@pytorch.org" -git config user.name "pytorchbot" -# If there aren't changes, don't make a commit; push is no-op -git commit -m "auto-generating sphinx docs" || true -git remote add https https://github.com/pytorch/vision.git -git push -u https gh-pages diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 7e8c399bcf99398fcee028123214338f60569fa9..0000000000000000000000000000000000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,3078 +0,0 @@ -version: 2.1 - -# How to test the Linux jobs: -# - Install CircleCI local CLI: https://circleci.com/docs/2.0/local-cli/ -# - circleci config process .circleci/config.yml > gen.yml && circleci local execute -c gen.yml --job binary_linux_wheel_py3.7 -# - Replace binary_linux_wheel_py3.7 with the name of the job you want to test. -# Job names are 'name:' key. - -executors: - windows-cpu: - machine: - resource_class: windows.xlarge - image: windows-server-2019-vs2019:stable - shell: bash.exe - - windows-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -commands: - checkout_merge: - description: "checkout merge branch" - steps: - - checkout -# - run: -# name: Checkout merge branch -# command: | -# set -ex -# BRANCH=$(git rev-parse --abbrev-ref HEAD) -# if [[ "$BRANCH" != "main" ]]; then -# git fetch --force origin ${CIRCLE_BRANCH}/merge:merged/${CIRCLE_BRANCH} -# git checkout "merged/$CIRCLE_BRANCH" -# fi - designate_upload_channel: - description: "inserts the correct upload channel into ${BASH_ENV}" - steps: - - run: - name: adding UPLOAD_CHANNEL to BASH_ENV - command: | - our_upload_channel=test - echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV} - - brew_update: - description: "Update Homebrew and install base formulae" - steps: - - run: - name: Update Homebrew - no_output_timeout: "10m" - command: | - set -ex - - # Update repositories manually. - # Running `brew update` produces a comparison between the - # current checkout and the updated checkout, which takes a - # very long time because the existing checkout is 2y old. - for path in $(find /usr/local/Homebrew -type d -name .git) - do - cd $path/.. 
- git fetch --depth=1 origin - git reset --hard origin/master - done - - export HOMEBREW_NO_AUTO_UPDATE=1 - - # Install expect and moreutils so that we can call `unbuffer` and `ts`. - # moreutils installs a `parallel` executable by default, which conflicts - # with the executable from the GNU `parallel`, so we must unlink GNU - # `parallel` first, and relink it afterwards. - brew install coreutils - brew unlink parallel - brew install moreutils - brew link parallel --overwrite - brew install expect - - brew_install: - description: "Install Homebrew formulae" - parameters: - formulae: - type: string - default: "" - steps: - - run: - name: Install << parameters.formulae >> - no_output_timeout: "10m" - command: | - set -ex - export HOMEBREW_NO_AUTO_UPDATE=1 - brew install << parameters.formulae >> - - run_brew_for_ios_build: - steps: - - brew_update - - brew_install: - formulae: libtool - - apt_install: - parameters: - args: - type: string - descr: - type: string - default: "" - update: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> apt install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: | - <<# parameters.update >> sudo apt update -qy <> - sudo apt install << parameters.args >> - - pip_install: - parameters: - args: - type: string - descr: - type: string - default: "" - user: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> pip install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: > - pip install - <<# parameters.user >> --user <> - --progress-bar=off - << parameters.args >> - - install_torchvision: - parameters: - editable: - type: boolean - default: true - steps: - - pip_install: - args: --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_test.html - descr: Install PyTorch from nightly releases - - pip_install: - args: --no-build-isolation <<# parameters.editable >> --editable <> . - descr: Install torchvision <<# parameters.editable >> in editable mode <> - - # Most of the test suite is handled by the `unittest` jobs, with completely different workflow and setup. - # This command can be used if only a selection of tests need to be run, for ad-hoc files. 
- run_tests_selective: - parameters: - file_or_dir: - type: string - steps: - - run: - name: Install test utilities - command: pip install --progress-bar=off pytest pytest-mock - - run: - name: Run tests - command: pytest --junitxml=test-results/junit.xml -v --durations 20 <> - - store_test_results: - path: test-results - - download_model_weights: - parameters: - extract_roots: - type: string - default: "torchvision/models" - background: - type: boolean - default: true - steps: - - apt_install: - args: parallel wget - descr: Install download utilitites - - run: - name: Download model weights - background: << parameters.background >> - command: | - mkdir -p ~/.cache/torch/hub/checkpoints - python scripts/collect_model_urls.py << parameters.extract_roots >> \ - | parallel -j0 'wget --no-verbose -O ~/.cache/torch/hub/checkpoints/`basename {}` {}\?source=ci' - -binary_common: &binary_common - parameters: - # Edit these defaults to do a release - build_version: - description: "version number of release binary; by default, build a nightly" - type: string - default: "0.14.1" - pytorch_version: - description: "PyTorch version to build against; by default, use a nightly" - type: string - default: "1.13.1" - # Don't edit these - python_version: - description: "Python version to build against (e.g., 3.7)" - type: string - cu_version: - description: "CUDA version to build against, in CU format (e.g., cpu or cu100)" - type: string - default: "cpu" - unicode_abi: - description: "Python 2.7 wheel only: whether or not we are cp27mu (default: no)" - type: string - default: "" - wheel_docker_image: - description: "Wheel only: what docker image to use" - type: string - default: "" - conda_docker_image: - description: "Conda only: what docker image to use" - type: string - default: "pytorch/conda-builder:cpu" - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - MACOSX_DEPLOYMENT_TARGET: 10.9 - -torchvision_ios_params: &torchvision_ios_params - parameters: - build_environment: - type: string - default: "" - ios_arch: - type: string - default: "" - ios_platform: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - IOS_ARCH: << parameters.ios_arch >> - IOS_PLATFORM: << parameters.ios_platform >> - -torchvision_android_params: &torchvision_android_params - parameters: - build_environment: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - -smoke_test_common: &smoke_test_common - <<: *binary_common - docker: - - image: torchvision/smoke_test:latest - -jobs: - circleci_consistency: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: jinja2 pyyaml - - run: - name: Check CircleCI config consistency - command: | - python .circleci/regenerate.py - git diff --exit-code || (echo ".circleci/config.yml not in sync with config.yml.in! 
Run .circleci/regenerate.py to update config"; exit 1) - - lint_python_and_config: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: pre-commit - descr: Install lint utilities - - run: - name: Install pre-commit hooks - command: pre-commit install-hooks - - run: - name: Lint Python code and config files - command: pre-commit run --all-files - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - lint_c: - docker: - - image: cimg/python:3.7 - steps: - - apt_install: - args: libtinfo5 - descr: Install additional system libraries - - checkout - - run: - name: Install lint utilities - command: | - curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o clang-format - chmod +x clang-format - sudo mv clang-format /opt/clang-format - - run: - name: Lint C code - command: ./.circleci/unittest/linux/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable /opt/clang-format - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - type_check_python: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision: - editable: true - - pip_install: - args: mypy - descr: Install Python type check utilities - - run: - name: Check Python types statically - command: mypy --install-types --non-interactive --config-file mypy.ini - - unittest_torchhub: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - run_tests_selective: - file_or_dir: test/test_hub.py - - unittest_onnx: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - pip_install: - args: onnx onnxruntime - descr: Install ONNX - - run_tests_selective: - file_or_dir: test/test_onnx.py - - unittest_extended: - docker: - - image: cimg/python:3.7 - resource_class: xlarge - steps: - - checkout - - download_model_weights - - install_torchvision - - run: - name: Enable extended tests - command: echo 'export PYTORCH_TEST_WITH_EXTENDED=1' >> $BASH_ENV - - run_tests_selective: - file_or_dir: test/test_extended_*.py - - binary_linux_wheel: - <<: *binary_common - docker: - - image: << parameters.wheel_docker_image >> - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 30m - command: | - set -ex - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_linux_conda: - <<: *binary_common - docker: - - image: "<< parameters.conda_docker_image >>" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 30m - command: | - set -ex - packaging/build_conda.sh - - store_artifacts: - path: /opt/conda/conda-bld/linux-64 - - persist_to_workspace: - root: /opt/conda/conda-bld/linux-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_conda: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 30m - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate base - conda install -yq conda-build "conda-package-handling!=1.5.0" - packaging/build_conda.sh - rm 
/C/tools/miniconda3/conda-bld/win-64/vs${VC_YEAR}*.tar.bz2 - - store_artifacts: - path: C:/tools/miniconda3/conda-bld/win-64 - - persist_to_workspace: - root: C:/tools/miniconda3/conda-bld/win-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_wheel: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build wheel packages - no_output_timeout: 30m - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_macos_wheel: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - # Cannot easily deduplicate this as source'ing activate - # will set environment variables which we need to propagate - # to build_wheel.sh - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_ios_build: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_build.sh" - cat "$script" - source "$script" - - persist_to_workspace: - root: /Users/distiller/workspace/ - paths: ios - - binary_ios_upload: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_upload.sh" - cat "$script" - source "$script" - - binary_android_build: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_build.sh" - cat "$script" - source "$script" - - store_artifacts: - path: ~/workspace/artifacts - - binary_android_upload: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_upload.sh" - cat "$script" - source "$script" - - binary_macos_conda: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build - packaging/build_conda.sh - - store_artifacts: - path: /Users/distiller/miniconda3/conda-bld/osx-64 - - persist_to_workspace: - root: /Users/distiller/miniconda3/conda-bld/osx-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - # Requires org-member context - binary_conda_upload: - docker: - - image: 
continuumio/miniconda - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - command: | - # Prevent credential from leaking - conda install -yq anaconda-client - set -x - anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload ~/workspace/*.tar.bz2 -u "pytorch-${UPLOAD_CHANNEL}" --label main --no-progress --force - - # Requires org-member context - binary_wheel_upload: - parameters: - subfolder: - description: "What whl subfolder to upload to, e.g., blank or cu100/ (trailing slash is important)" - type: string - docker: - - image: cimg/python:3.7 - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - checkout - - pip_install: - args: awscli - - run: - command: | - export PATH="$HOME/.local/bin:$PATH" - # Prevent credential from leaking - set +x - export AWS_ACCESS_KEY_ID="${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}" - export AWS_SECRET_ACCESS_KEY="${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}" - set -x - for pkg in ~/workspace/*.whl; do - aws s3 cp "$pkg" "s3://pytorch/whl/${UPLOAD_CHANNEL}/<< parameters.subfolder >>" --acl public-read - done - - smoke_test_linux_conda: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_linux_pip: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/test/torch_test.html - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_docker_image_build: - machine: - image: ubuntu-2004:202104-01 - resource_class: large - environment: - image_name: torchvision/smoke_test - steps: - - checkout - - designate_upload_channel - - run: - name: Build and push Docker image - no_output_timeout: "1h" - command: | - set +x - echo "${DOCKER_HUB_TOKEN}" | docker login --username "${DOCKER_HUB_USERNAME}" --password-stdin - set -x - cd .circleci/smoke_test/docker && docker build . 
-t ${image_name}:${CIRCLE_WORKFLOW_ID} - docker tag ${image_name}:${CIRCLE_WORKFLOW_ID} ${image_name}:latest - docker push ${image_name}:${CIRCLE_WORKFLOW_ID} - docker push ${image_name}:latest - - smoke_test_win_conda: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda env remove -n python${PYTHON_VERSION} || true - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_win_pip: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/test/torch_test.html - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - unittest_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cpu" - resource_class: 2xlarge+ - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - - key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.medium - environment: - image_name: "pytorch/manylinux-cuda116" - CU_VERSION: << parameters.cu_version >> - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: docker run -e PYTHON_VERSION -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - - key: env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - # Here we create an envlist file that contains some env variables that we want the docker container to be aware of. - # Normally, the CIRCLECI variable is set and available on all CI workflows: https://circleci.com/docs/2.0/env-vars/#built-in-environment-variables. - # They're avaiable in all the other workflows (OSX and Windows). - # But here, we're running the unittest_linux_gpu workflows in a docker container, where those variables aren't accessible. - # So instead we dump the variables we need in env.list and we pass that file when invoking "docker run". - name: export CIRCLECI env var - command: echo "CIRCLECI=true" >> ./env.list - - run: - name: Install torchvision - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD -e UPLOAD_CHANNEL -e CU_VERSION "${image_name}" .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: docker run --env-file ./env.list -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post Process - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - - key: env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - environment: - CUDA_VERSION: "11.6" - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - - key: env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install CUDA - command: packaging/windows/internal/cuda_install.bat - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - resource_class: large - steps: - - checkout - - designate_upload_channel - - run: - name: Install wget - command: HOMEBREW_NO_AUTO_UPDATE=1 brew install wget - # Disable brew auto update which is very slow - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - - key: env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - cmake_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cpu" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: packaging/build_cmake.sh - - cmake_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.small - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Build torchvision C++ distribution and test - no_output_timeout: 30m - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -e UPLOAD_CHANNEL -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> packaging/build_cmake.sh - - cmake_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" 
- steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build cmake - packaging/build_cmake.sh - - cmake_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/build_cmake.sh - - cmake_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_cmake.sh - - build_docs: - <<: *binary_common - docker: - - image: cimg/python:3.7 - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - checkout - - download_model_weights - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - designate_upload_channel - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Build docs - command: | - set -ex - # turn v1.12.0rc3 into 1.12.0 - tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9.]*\).*/\1/') - VERSION=${tag:-main} - eval "$(./conda/bin/conda shell.bash hook)" - conda activate ./env - pushd docs - pip install --progress-bar=off -r requirements.txt - make html - popd - - persist_to_workspace: - root: ./ - paths: - - "*" - - store_artifacts: - path: ./docs/build/html - destination: docs - - upload_docs: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda100" - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - run: - name: Generate netrc - command: | - # set credentials for https pushing - # requires the org-member context - cat > ~/.netrc \< gen.yml && circleci local execute -c gen.yml --job binary_linux_wheel_py3.7 -# - Replace binary_linux_wheel_py3.7 with the name of the job you want to test. -# Job names are 'name:' key. - -executors: - windows-cpu: - machine: - resource_class: windows.xlarge - image: windows-server-2019-vs2019:stable - shell: bash.exe - - windows-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -commands: - checkout_merge: - description: "checkout merge branch" - steps: - - checkout -# - run: -# name: Checkout merge branch -# command: | -# set -ex -# BRANCH=$(git rev-parse --abbrev-ref HEAD) -# if [[ "$BRANCH" != "main" ]]; then -# git fetch --force origin ${CIRCLE_BRANCH}/merge:merged/${CIRCLE_BRANCH} -# git checkout "merged/$CIRCLE_BRANCH" -# fi - designate_upload_channel: - description: "inserts the correct upload channel into ${BASH_ENV}" - steps: - - run: - name: adding UPLOAD_CHANNEL to BASH_ENV - command: | - our_upload_channel=test - echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV} - - brew_update: - description: "Update Homebrew and install base formulae" - steps: - - run: - name: Update Homebrew - no_output_timeout: "10m" - command: | - set -ex - - # Update repositories manually. - # Running `brew update` produces a comparison between the - # current checkout and the updated checkout, which takes a - # very long time because the existing checkout is 2y old. 
- for path in $(find /usr/local/Homebrew -type d -name .git) - do - cd $path/.. - git fetch --depth=1 origin - git reset --hard origin/master - done - - export HOMEBREW_NO_AUTO_UPDATE=1 - - # Install expect and moreutils so that we can call `unbuffer` and `ts`. - # moreutils installs a `parallel` executable by default, which conflicts - # with the executable from the GNU `parallel`, so we must unlink GNU - # `parallel` first, and relink it afterwards. - brew install coreutils - brew unlink parallel - brew install moreutils - brew link parallel --overwrite - brew install expect - - brew_install: - description: "Install Homebrew formulae" - parameters: - formulae: - type: string - default: "" - steps: - - run: - name: Install << parameters.formulae >> - no_output_timeout: "10m" - command: | - set -ex - export HOMEBREW_NO_AUTO_UPDATE=1 - brew install << parameters.formulae >> - - run_brew_for_ios_build: - steps: - - brew_update - - brew_install: - formulae: libtool - - apt_install: - parameters: - args: - type: string - descr: - type: string - default: "" - update: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> apt install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: | - <<# parameters.update >> sudo apt update -qy <> - sudo apt install << parameters.args >> - - pip_install: - parameters: - args: - type: string - descr: - type: string - default: "" - user: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> pip install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: > - pip install - <<# parameters.user >> --user <> - --progress-bar=off - << parameters.args >> - - install_torchvision: - parameters: - editable: - type: boolean - default: true - steps: - - pip_install: - args: --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_test.html - descr: Install PyTorch from nightly releases - - pip_install: - args: --no-build-isolation <<# parameters.editable >> --editable <> . - descr: Install torchvision <<# parameters.editable >> in editable mode <> - - # Most of the test suite is handled by the `unittest` jobs, with completely different workflow and setup. - # This command can be used if only a selection of tests need to be run, for ad-hoc files. 
- run_tests_selective: - parameters: - file_or_dir: - type: string - steps: - - run: - name: Install test utilities - command: pip install --progress-bar=off pytest pytest-mock - - run: - name: Run tests - command: pytest --junitxml=test-results/junit.xml -v --durations 20 <> - - store_test_results: - path: test-results - - download_model_weights: - parameters: - extract_roots: - type: string - default: "torchvision/models" - background: - type: boolean - default: true - steps: - - apt_install: - args: parallel wget - descr: Install download utilitites - - run: - name: Download model weights - background: << parameters.background >> - command: | - mkdir -p ~/.cache/torch/hub/checkpoints - python scripts/collect_model_urls.py << parameters.extract_roots >> \ - | parallel -j0 'wget --no-verbose -O ~/.cache/torch/hub/checkpoints/`basename {}` {}\?source=ci' - -binary_common: &binary_common - parameters: - # Edit these defaults to do a release - build_version: - description: "version number of release binary; by default, build a nightly" - type: string - default: "0.14.1" - pytorch_version: - description: "PyTorch version to build against; by default, use a nightly" - type: string - default: "1.13.1" - # Don't edit these - python_version: - description: "Python version to build against (e.g., 3.7)" - type: string - cu_version: - description: "CUDA version to build against, in CU format (e.g., cpu or cu100)" - type: string - default: "cpu" - unicode_abi: - description: "Python 2.7 wheel only: whether or not we are cp27mu (default: no)" - type: string - default: "" - wheel_docker_image: - description: "Wheel only: what docker image to use" - type: string - default: "" - conda_docker_image: - description: "Conda only: what docker image to use" - type: string - default: "pytorch/conda-builder:cpu" - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - MACOSX_DEPLOYMENT_TARGET: 10.9 - -torchvision_ios_params: &torchvision_ios_params - parameters: - build_environment: - type: string - default: "" - ios_arch: - type: string - default: "" - ios_platform: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - IOS_ARCH: << parameters.ios_arch >> - IOS_PLATFORM: << parameters.ios_platform >> - -torchvision_android_params: &torchvision_android_params - parameters: - build_environment: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - -smoke_test_common: &smoke_test_common - <<: *binary_common - docker: - - image: torchvision/smoke_test:latest - -jobs: - circleci_consistency: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: jinja2 pyyaml - - run: - name: Check CircleCI config consistency - command: | - python .circleci/regenerate.py - git diff --exit-code || (echo ".circleci/config.yml not in sync with config.yml.in! 
Run .circleci/regenerate.py to update config"; exit 1) - - lint_python_and_config: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: pre-commit - descr: Install lint utilities - - run: - name: Install pre-commit hooks - command: pre-commit install-hooks - - run: - name: Lint Python code and config files - command: pre-commit run --all-files - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - lint_c: - docker: - - image: cimg/python:3.7 - steps: - - apt_install: - args: libtinfo5 - descr: Install additional system libraries - - checkout - - run: - name: Install lint utilities - command: | - curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o clang-format - chmod +x clang-format - sudo mv clang-format /opt/clang-format - - run: - name: Lint C code - command: ./.circleci/unittest/linux/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable /opt/clang-format - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - type_check_python: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision: - editable: true - - pip_install: - args: mypy - descr: Install Python type check utilities - - run: - name: Check Python types statically - command: mypy --install-types --non-interactive --config-file mypy.ini - - unittest_torchhub: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - run_tests_selective: - file_or_dir: test/test_hub.py - - unittest_onnx: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - pip_install: - args: onnx onnxruntime - descr: Install ONNX - - run_tests_selective: - file_or_dir: test/test_onnx.py - - unittest_extended: - docker: - - image: cimg/python:3.7 - resource_class: xlarge - steps: - - checkout - - download_model_weights - - install_torchvision - - run: - name: Enable extended tests - command: echo 'export PYTORCH_TEST_WITH_EXTENDED=1' >> $BASH_ENV - - run_tests_selective: - file_or_dir: test/test_extended_*.py - - binary_linux_wheel: - <<: *binary_common - docker: - - image: << parameters.wheel_docker_image >> - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 30m - command: | - set -ex - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_linux_conda: - <<: *binary_common - docker: - - image: "<< parameters.conda_docker_image >>" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 30m - command: | - set -ex - packaging/build_conda.sh - - store_artifacts: - path: /opt/conda/conda-bld/linux-64 - - persist_to_workspace: - root: /opt/conda/conda-bld/linux-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_conda: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 30m - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate base - conda install -yq conda-build "conda-package-handling!=1.5.0" - packaging/build_conda.sh - rm 
/C/tools/miniconda3/conda-bld/win-64/vs${VC_YEAR}*.tar.bz2 - - store_artifacts: - path: C:/tools/miniconda3/conda-bld/win-64 - - persist_to_workspace: - root: C:/tools/miniconda3/conda-bld/win-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_wheel: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build wheel packages - no_output_timeout: 30m - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_macos_wheel: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - # Cannot easily deduplicate this as source'ing activate - # will set environment variables which we need to propagate - # to build_wheel.sh - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_ios_build: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_build.sh" - cat "$script" - source "$script" - - persist_to_workspace: - root: /Users/distiller/workspace/ - paths: ios - - binary_ios_upload: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_upload.sh" - cat "$script" - source "$script" - - binary_android_build: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_build.sh" - cat "$script" - source "$script" - - store_artifacts: - path: ~/workspace/artifacts - - binary_android_upload: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_upload.sh" - cat "$script" - source "$script" - - binary_macos_conda: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build - packaging/build_conda.sh - - store_artifacts: - path: /Users/distiller/miniconda3/conda-bld/osx-64 - - persist_to_workspace: - root: /Users/distiller/miniconda3/conda-bld/osx-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - # Requires org-member context - binary_conda_upload: - docker: - - image: 
continuumio/miniconda - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - command: | - # Prevent credential from leaking - conda install -yq anaconda-client - set -x - anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload ~/workspace/*.tar.bz2 -u "pytorch-${UPLOAD_CHANNEL}" --label main --no-progress --force - - # Requires org-member context - binary_wheel_upload: - parameters: - subfolder: - description: "What whl subfolder to upload to, e.g., blank or cu100/ (trailing slash is important)" - type: string - docker: - - image: cimg/python:3.7 - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - checkout - - pip_install: - args: awscli - - run: - command: | - export PATH="$HOME/.local/bin:$PATH" - # Prevent credential from leaking - set +x - export AWS_ACCESS_KEY_ID="${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}" - export AWS_SECRET_ACCESS_KEY="${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}" - set -x - for pkg in ~/workspace/*.whl; do - aws s3 cp "$pkg" "s3://pytorch/whl/${UPLOAD_CHANNEL}/<< parameters.subfolder >>" --acl public-read - done - - smoke_test_linux_conda: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_linux_pip: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/test/torch_test.html - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_docker_image_build: - machine: - image: ubuntu-2004:202104-01 - resource_class: large - environment: - image_name: torchvision/smoke_test - steps: - - checkout - - designate_upload_channel - - run: - name: Build and push Docker image - no_output_timeout: "1h" - command: | - set +x - echo "${DOCKER_HUB_TOKEN}" | docker login --username "${DOCKER_HUB_USERNAME}" --password-stdin - set -x - cd .circleci/smoke_test/docker && docker build . 
-t ${image_name}:${CIRCLE_WORKFLOW_ID} - docker tag ${image_name}:${CIRCLE_WORKFLOW_ID} ${image_name}:latest - docker push ${image_name}:${CIRCLE_WORKFLOW_ID} - docker push ${image_name}:latest - - smoke_test_win_conda: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda env remove -n python${PYTHON_VERSION} || true - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_win_pip: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/test/torch_test.html - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - unittest_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cpu" - resource_class: 2xlarge+ - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.medium - environment: - image_name: "pytorch/manylinux-cuda116" - CU_VERSION: << parameters.cu_version >> - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: docker run -e PYTHON_VERSION -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - # Here we create an envlist file that contains some env variables that we want the docker container to be aware of. - # Normally, the CIRCLECI variable is set and available on all CI workflows: https://circleci.com/docs/2.0/env-vars/#built-in-environment-variables. - # They're avaiable in all the other workflows (OSX and Windows). - # But here, we're running the unittest_linux_gpu workflows in a docker container, where those variables aren't accessible. - # So instead we dump the variables we need in env.list and we pass that file when invoking "docker run". - name: export CIRCLECI env var - command: echo "CIRCLECI=true" >> ./env.list - - run: - name: Install torchvision - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD -e UPLOAD_CHANNEL -e CU_VERSION "${image_name}" .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: docker run --env-file ./env.list -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post Process - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - environment: - CUDA_VERSION: "11.6" - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install CUDA - command: packaging/windows/internal/cuda_install.bat - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - resource_class: large - steps: - - checkout - - designate_upload_channel - - run: - name: Install wget - command: HOMEBREW_NO_AUTO_UPDATE=1 brew install wget - # Disable brew auto update which is very slow - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - cmake_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cpu" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: packaging/build_cmake.sh - - cmake_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.small - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Build torchvision C++ distribution and test - no_output_timeout: 30m - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -e UPLOAD_CHANNEL -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> 
packaging/build_cmake.sh - - cmake_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build cmake - packaging/build_cmake.sh - - cmake_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/build_cmake.sh - - cmake_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_cmake.sh - - build_docs: - <<: *binary_common - docker: - - image: cimg/python:3.7 - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - checkout - - download_model_weights - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - designate_upload_channel - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Build docs - command: | - set -ex - # turn v1.12.0rc3 into 1.12.0 - tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9.]*\).*/\1/') - VERSION=${tag:-main} - eval "$(./conda/bin/conda shell.bash hook)" - conda activate ./env - pushd docs - pip install --progress-bar=off -r requirements.txt - make html - popd - - persist_to_workspace: - root: ./ - paths: - - "*" - - store_artifacts: - path: ./docs/build/html - destination: docs - - upload_docs: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda100" - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - run: - name: Generate netrc - command: | - # set credentials for https pushing - # requires the org-member context - cat > ~/.netrc \<> ~/.bashrc -CMD [ "/bin/bash"] diff --git a/.circleci/unittest/android/scripts/binary_android_build.sh b/.circleci/unittest/android/scripts/binary_android_build.sh deleted file mode 100644 index 0d8c0d47d8a624bcf4cf4c43492f2a92d97b771f..0000000000000000000000000000000000000000 --- a/.circleci/unittest/android/scripts/binary_android_build.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "DIR: $(pwd)" -echo "ANDROID_HOME=${ANDROID_HOME}" -echo "ANDROID_NDK_HOME=${ANDROID_NDK_HOME}" -echo "JAVA_HOME=${JAVA_HOME}" - -WORKSPACE=/home/circleci/workspace -VISION_ANDROID=/home/circleci/project/android - -. /home/circleci/project/.circleci/unittest/android/scripts/install_gradle.sh - -GRADLE_LOCAL_PROPERTIES=${VISION_ANDROID}/local.properties -rm -f $GRADLE_LOCAL_PROPERTIES - -echo "sdk.dir=${ANDROID_HOME}" >> $GRADLE_LOCAL_PROPERTIES -echo "ndk.dir=${ANDROID_NDK_HOME}" >> $GRADLE_LOCAL_PROPERTIES - -echo "GRADLE_PATH $GRADLE_PATH" -echo "GRADLE_HOME $GRADLE_HOME" - -${GRADLE_PATH} --scan --stacktrace --debug --no-daemon -p ${VISION_ANDROID} assemble || true - -mkdir -p ~/workspace/artifacts -find . -type f -name *aar -print | xargs tar cfvz ~/workspace/artifacts/artifacts-aars.tgz -find . 
-type f -name *apk -print | xargs tar cfvz ~/workspace/artifacts/artifacts-apks.tgz diff --git a/.circleci/unittest/android/scripts/binary_android_upload.sh b/.circleci/unittest/android/scripts/binary_android_upload.sh deleted file mode 100644 index 1472a877d9001c6f24d1a26da26284dcc73bc27c..0000000000000000000000000000000000000000 --- a/.circleci/unittest/android/scripts/binary_android_upload.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "DIR: $(pwd)" -echo "ANDROID_HOME=${ANDROID_HOME}" -echo "ANDROID_NDK_HOME=${ANDROID_NDK_HOME}" -echo "JAVA_HOME=${JAVA_HOME}" - -WORKSPACE=/home/circleci/workspace -VISION_ANDROID=/home/circleci/project/android - -. /home/circleci/project/.circleci/unittest/android/scripts/install_gradle.sh - -GRADLE_LOCAL_PROPERTIES=${VISION_ANDROID}/local.properties -rm -f $GRADLE_LOCAL_PROPERTIES -GRADLE_PROPERTIES=/home/circleci/project/android/gradle.properties - -echo "sdk.dir=${ANDROID_HOME}" >> $GRADLE_LOCAL_PROPERTIES -echo "ndk.dir=${ANDROID_NDK_HOME}" >> $GRADLE_LOCAL_PROPERTIES - -echo "SONATYPE_NEXUS_USERNAME=${SONATYPE_NEXUS_USERNAME}" >> $GRADLE_PROPERTIES -echo "mavenCentralRepositoryUsername=${SONATYPE_NEXUS_USERNAME}" >> $GRADLE_PROPERTIES -echo "SONATYPE_NEXUS_PASSWORD=${SONATYPE_NEXUS_PASSWORD}" >> $GRADLE_PROPERTIES -echo "mavenCentralRepositoryPassword=${SONATYPE_NEXUS_PASSWORD}" >> $GRADLE_PROPERTIES - -echo "signing.keyId=${ANDROID_SIGN_KEY}" >> $GRADLE_PROPERTIES -echo "signing.password=${ANDROID_SIGN_PASS}" >> $GRADLE_PROPERTIES - -cat /home/circleci/project/android/gradle.properties | grep VERSION - -${GRADLE_PATH} --scan --stacktrace --debug --no-daemon -p ${VISION_ANDROID} ops:uploadArchives - -mkdir -p ~/workspace/artifacts -find . -type f -name *aar -print | xargs tar cfvz ~/workspace/artifacts/artifacts-aars.tgz diff --git a/.circleci/unittest/android/scripts/install_gradle.sh b/.circleci/unittest/android/scripts/install_gradle.sh deleted file mode 100755 index 5f803abfa949d95ec3d742f678ad4471b77c9854..0000000000000000000000000000000000000000 --- a/.circleci/unittest/android/scripts/install_gradle.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -ex - -_https_amazon_aws=https://ossci-android.s3.amazonaws.com -GRADLE_VERSION=6.8.3 - -_gradle_home=/opt/gradle -sudo rm -rf $gradle_home -sudo mkdir -p $_gradle_home - -curl --silent --output /tmp/gradle.zip --retry 3 $_https_amazon_aws/gradle-${GRADLE_VERSION}-bin.zip - -sudo unzip -q /tmp/gradle.zip -d $_gradle_home -rm /tmp/gradle.zip - -sudo chmod -R 777 $_gradle_home - -export GRADLE_HOME=$_gradle_home/gradle-$GRADLE_VERSION -export GRADLE_PATH=${GRADLE_HOME}/bin/gradle diff --git a/.circleci/unittest/ios/scripts/binary_ios_build.sh b/.circleci/unittest/ios/scripts/binary_ios_build.sh deleted file mode 100755 index e2ad7b0c55faa836d9cadfceca964490833d5391..0000000000000000000000000000000000000000 --- a/.circleci/unittest/ios/scripts/binary_ios_build.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "" -echo "DIR: $(pwd)" -WORKSPACE=/Users/distiller/workspace -PROJ_ROOT_IOS=/Users/distiller/project/ios -PYTORCH_IOS_NIGHTLY_NAME=libtorch_ios_nightly_build.zip -export TCLLIBPATH="/usr/local/lib" - -# install conda -curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -chmod +x ~/conda.sh -/bin/bash ~/conda.sh -b -p ~/anaconda -export PATH="~/anaconda/bin:${PATH}" -source ~/anaconda/bin/activate - -# install dependencies -conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi 
requests typing_extensions wget --yes -conda install -c conda-forge valgrind --yes -export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - -# sync submodules -cd ${PROJ_ROOT_IOS} -git submodule sync -git submodule update --init --recursive - -# download pytorch-iOS nightly build and unzip it -mkdir -p ${PROJ_ROOT_IOS}/lib -mkdir -p ${PROJ_ROOT_IOS}/build -mkdir -p ${PROJ_ROOT_IOS}/pytorch -TORCH_ROOT="${PROJ_ROOT_IOS}/pytorch" - -cd ${TORCH_ROOT} -wget https://ossci-ios-build.s3.amazonaws.com/${PYTORCH_IOS_NIGHTLY_NAME} -mkdir -p ./build_ios -unzip -d ./build_ios ./${PYTORCH_IOS_NIGHTLY_NAME} - -LIBTORCH_HEADER_ROOT="${TORCH_ROOT}/build_ios/install/include" -cd ${PROJ_ROOT_IOS} -IOS_ARCH=${IOS_ARCH} LIBTORCH_HEADER_ROOT=${LIBTORCH_HEADER_ROOT} ./build_ios.sh -rm -rf ${TORCH_ROOT} - -# store the binary -DEST_DIR=${WORKSPACE}/ios/${IOS_ARCH} -mkdir -p ${DEST_DIR} -cp ${PROJ_ROOT_IOS}/lib/*.a ${DEST_DIR} diff --git a/.circleci/unittest/ios/scripts/binary_ios_upload.sh b/.circleci/unittest/ios/scripts/binary_ios_upload.sh deleted file mode 100644 index ce56388e5da417a4b240b5c0389fef8439cb2510..0000000000000000000000000000000000000000 --- a/.circleci/unittest/ios/scripts/binary_ios_upload.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "" -echo "DIR: $(pwd)" - -WORKSPACE=/Users/distiller/workspace -PROJ_ROOT=/Users/distiller/project -ARTIFACTS_DIR=${WORKSPACE}/ios -ls ${ARTIFACTS_DIR} -ZIP_DIR=${WORKSPACE}/zip -mkdir -p ${ZIP_DIR}/install/lib - -# build a FAT bianry -cd ${ZIP_DIR}/install/lib -libs=("${ARTIFACTS_DIR}/x86_64/libtorchvision_ops.a" "${ARTIFACTS_DIR}/arm64/libtorchvision_ops.a") -lipo -create "${libs[@]}" -o ${ZIP_DIR}/install/lib/libtorchvision_ops.a -lipo -i ${ZIP_DIR}/install/lib/*.a - -# copy the license -cp ${PROJ_ROOT}/LICENSE ${ZIP_DIR}/ -# zip the library -ZIPFILE=libtorchvision_ops_ios_nightly_build.zip -cd ${ZIP_DIR} -#for testing -touch version.txt -echo $(date +%s) > version.txt -zip -r ${ZIPFILE} install version.txt LICENSE - -# upload to aws -# Install conda then 'conda install' awscli -curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -chmod +x ~/conda.sh -/bin/bash ~/conda.sh -b -p ~/anaconda -export PATH="~/anaconda/bin:${PATH}" -source ~/anaconda/bin/activate -conda install -c conda-forge awscli --yes -set +x -export AWS_ACCESS_KEY_ID=${AWS_S3_ACCESS_KEY_FOR_PYTORCH_BINARY_UPLOAD} -export AWS_SECRET_ACCESS_KEY=${AWS_S3_ACCESS_SECRET_FOR_PYTORCH_BINARY_UPLOAD} -set -x -aws s3 cp ${ZIPFILE} s3://ossci-ios-build/ --acl public-read diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml deleted file mode 100644 index 77ee99295195657edf884acbf6049f36b1f1f709..0000000000000000000000000000000000000000 --- a/.circleci/unittest/linux/scripts/environment.yml +++ /dev/null @@ -1,16 +0,0 @@ -channels: - - pytorch - - defaults -dependencies: - - pytest - - pytest-cov - - pytest-mock - - pip - - libpng - - jpeg - - ca-certificates - - h5py - - pip: - - future - - scipy - - av diff --git a/.circleci/unittest/linux/scripts/install.sh b/.circleci/unittest/linux/scripts/install.sh deleted file mode 100755 index 54722842a746a4691710c85298e269f654fd505c..0000000000000000000000000000000000000000 --- a/.circleci/unittest/linux/scripts/install.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -unset PYTORCH_VERSION -# For unittest, nightly PyTorch is used as the following section, -# so no need to set PYTORCH_VERSION. 
-# In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config. - -set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env - -if [ "${CU_VERSION:-}" == cpu ] ; then - cudatoolkit="cpuonly" - version="cpu" -else - if [[ ${#CU_VERSION} -eq 4 ]]; then - CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}" - elif [[ ${#CU_VERSION} -eq 5 ]]; then - CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi - echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION: ${CU_VERSION} " - version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")" - cudatoolkit="pytorch-cuda=${version}" -fi - -case "$(uname -s)" in - Darwin*) os=MacOSX;; - *) os=Linux -esac - -printf "Installing PyTorch with %s\n" "${cudatoolkit}" -if [ "${os}" == "MacOSX" ]; then - conda install -y -c "pytorch-${UPLOAD_CHANNEL}" "pytorch-${UPLOAD_CHANNEL}"::pytorch "${cudatoolkit}" -else - conda install -y -c "pytorch-${UPLOAD_CHANNEL}" -c nvidia "pytorch-${UPLOAD_CHANNEL}"::pytorch[build="*${version}*"] "${cudatoolkit}" -fi - -printf "* Installing torchvision\n" -python setup.py develop diff --git a/.circleci/unittest/linux/scripts/post_process.sh b/.circleci/unittest/linux/scripts/post_process.sh deleted file mode 100755 index e97bf2a7b1b19fe99eaf0889a157f46c38cc0060..0000000000000000000000000000000000000000 --- a/.circleci/unittest/linux/scripts/post_process.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env diff --git a/.circleci/unittest/linux/scripts/run_test.sh b/.circleci/unittest/linux/scripts/run_test.sh deleted file mode 100755 index 8f6b8cb84850822c5476ebb87c3dc7bec0d57b9b..0000000000000000000000000000000000000000 --- a/.circleci/unittest/linux/scripts/run_test.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env - -python -m torch.utils.collect_env -pytest --junitxml=test-results/junit.xml -v --durations 20 diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh deleted file mode 100755 index 0574cdff1cf6fd2ec91cf01e4a34d37eb95a4717..0000000000000000000000000000000000000000 --- a/.circleci/unittest/linux/scripts/setup_env.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash - -# This script is for setting up environment in which unit test is ran. -# To speed up the CI time, the resulting environment is cached. -# -# Do not install PyTorch and torchvision here, otherwise they also get cached. - -set -e - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -# Avoid error: "fatal: unsafe repository" -git config --global --add safe.directory '*' -root_dir="$(git rev-parse --show-toplevel)" -conda_dir="${root_dir}/conda" -env_dir="${root_dir}/env" - -cd "${root_dir}" - -case "$(uname -s)" in - Darwin*) os=MacOSX;; - *) os=Linux -esac - -# 1. Install conda at ./conda -if [ ! -d "${conda_dir}" ]; then - printf "* Installing conda\n" - wget -O miniconda.sh "http://repo.continuum.io/miniconda/Miniconda3-latest-${os}-x86_64.sh" - bash ./miniconda.sh -b -f -p "${conda_dir}" -fi -eval "$(${conda_dir}/bin/conda shell.bash hook)" - -# 2. Create test environment at ./env -if [ ! -d "${env_dir}" ]; then - printf "* Creating a test environment\n" - conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" -fi -conda activate "${env_dir}" - -# 3. 
Install Conda dependencies -printf "* Installing dependencies (except PyTorch)\n" -FFMPEG_PIN="=4.2" -if [[ "${PYTHON_VERSION}" = "3.9" ]]; then - FFMPEG_PIN=">=4.2" -fi - -conda install -y -c pytorch "ffmpeg${FFMPEG_PIN}" -conda env update --file "${this_dir}/environment.yml" --prune diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml deleted file mode 100644 index 0e07ae80d0d42a639d887425db8b042e030c2cd2..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/environment.yml +++ /dev/null @@ -1,19 +0,0 @@ -channels: - - pytorch - - defaults -dependencies: - - pytest - - pytest-cov - - pytest-mock - - pip - - libpng - - jpeg - - ca-certificates - - hdf5 - - setuptools - - pip: - - future - - scipy - - av != 9.1.1 - - dataclasses - - h5py diff --git a/.circleci/unittest/windows/scripts/install.sh b/.circleci/unittest/windows/scripts/install.sh deleted file mode 100644 index 85920abb8da88d882b6603d28ddabffec62cd300..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/install.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env bash - -unset PYTORCH_VERSION -# For unittest, nightly PyTorch is used as the following section, -# so no need to set PYTORCH_VERSION. -# In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config. - -set -ex - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" -conda activate ./env - -# TODO, refactor the below logic to make it easy to understand how to get correct cuda_version. -if [ "${CU_VERSION:-}" == cpu ] ; then - cudatoolkit="cpuonly" - version="cpu" -else - if [[ ${#CU_VERSION} -eq 4 ]]; then - CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}" - elif [[ ${#CU_VERSION} -eq 5 ]]; then - CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi - - cuda_toolkit_pckg="cudatoolkit" - if [[ $CUDA_VERSION == 11.6 || $CUDA_VERSION == 11.7 ]]; then - cuda_toolkit_pckg="pytorch-cuda" - fi - - echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION" - version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")" - cudatoolkit="${cuda_toolkit_pckg}=${version}" -fi - -printf "Installing PyTorch with %s\n" "${cudatoolkit}" -conda install -y -c "pytorch-${UPLOAD_CHANNEL}" -c nvidia "pytorch-${UPLOAD_CHANNEL}"::pytorch[build="*${version}*"] "${cudatoolkit}" - -torch_cuda=$(python -c "import torch; print(torch.cuda.is_available())") -echo torch.cuda.is_available is $torch_cuda - -if [ ! 
-z "${CUDA_VERSION:-}" ] ; then - if [ "$torch_cuda" == "False" ]; then - echo "torch with cuda installed but torch.cuda.is_available() is False" - exit 1 - fi -fi - -source "$this_dir/set_cuda_envs.sh" - -printf "* Installing torchvision\n" -"$this_dir/vc_env_helper.bat" python setup.py develop diff --git a/.circleci/unittest/windows/scripts/install_conda.bat b/.circleci/unittest/windows/scripts/install_conda.bat deleted file mode 100644 index 6052ad08b106accec140ef3f0e27cb4fe893377a..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/install_conda.bat +++ /dev/null @@ -1 +0,0 @@ -start /wait "" "%miniconda_exe%" /S /InstallationType=JustMe /RegisterPython=0 /AddToPath=0 /D=%tmp_conda% diff --git a/.circleci/unittest/windows/scripts/post_process.sh b/.circleci/unittest/windows/scripts/post_process.sh deleted file mode 100644 index 5c5cbb758a9ef2b235e6e5af308bef77fc26a253..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/post_process.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" -conda activate ./env diff --git a/.circleci/unittest/windows/scripts/run_test.sh b/.circleci/unittest/windows/scripts/run_test.sh deleted file mode 100644 index 802ad37f511adc7ab38adc992738d570a40432c4..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/run_test.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" -conda activate ./env - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -source "$this_dir/set_cuda_envs.sh" - -python -m torch.utils.collect_env -pytest --junitxml=test-results/junit.xml -v --durations 20 diff --git a/.circleci/unittest/windows/scripts/set_cuda_envs.sh b/.circleci/unittest/windows/scripts/set_cuda_envs.sh deleted file mode 100644 index 7db3137b5944034cb556a341658fa0db95c75761..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/set_cuda_envs.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash -set -ex - -echo CU_VERSION is "${CU_VERSION}" -echo CUDA_VERSION is "${CUDA_VERSION}" - -# Currenly, CU_VERSION and CUDA_VERSION are not consistent. -# to understand this code, see https://github.com/pytorch/vision/issues/4443 -version="cpu" -if [[ ! -z "${CUDA_VERSION}" ]] ; then - version="$CUDA_VERSION" -else - if [[ ${#CU_VERSION} -eq 5 ]]; then - version="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi -fi - -# Don't use if [[ "$version" == "cpu" ]]; then exit 0 fi. -# It would exit the shell. One result is cpu tests would not run if the shell exit. -# Unless there's an error, Don't exit. -if [[ "$version" != "cpu" ]]; then - # set cuda envs - export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/bin:/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/libnvvp:$PATH" - export CUDA_PATH_V${version/./_}="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}" - export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}" - - if [ ! -d "$CUDA_PATH" ]; then - echo "$CUDA_PATH" does not exist - exit 1 - fi - - if [ ! 
-f "${CUDA_PATH}\include\nvjpeg.h" ]; then - echo "nvjpeg does not exist" - exit 1 - fi - - # check cuda driver version - for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do - if [[ -x "$path" ]]; then - "$path" || echo "true"; - break - fi - done - - which nvcc - nvcc --version - env | grep CUDA -fi diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh deleted file mode 100644 index 5eeb2e17b48976243d6736c7fe5c4b3edd1e582e..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/setup_env.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash - -# This script is for setting up environment in which unit test is ran. -# To speed up the CI time, the resulting environment is cached. -# -# Do not install PyTorch and torchvision here, otherwise they also get cached. - -set -e - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -root_dir="$(git rev-parse --show-toplevel)" -conda_dir="${root_dir}/conda" -env_dir="${root_dir}/env" - -cd "${root_dir}" - -# 1. Install conda at ./conda -if [ ! -d "${conda_dir}" ]; then - printf "* Installing conda\n" - export tmp_conda="$(echo $conda_dir | tr '/' '\\')" - export miniconda_exe="$(echo $root_dir | tr '/' '\\')\\miniconda.exe" - curl --output miniconda.exe https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -O - "$this_dir/install_conda.bat" - unset tmp_conda - unset miniconda_exe -fi - -eval "$(${conda_dir}/Scripts/conda.exe 'shell.bash' 'hook')" - -# 2. Create test environment at ./env -if [ ! -d "${env_dir}" ]; then - printf "* Creating a test environment\n" - conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" -fi -conda activate "${env_dir}" - -# 3. Install Conda dependencies -printf "* Installing dependencies (except PyTorch)\n" -conda env update --file "${this_dir}/environment.yml" --prune - -# 4. Downgrade setuptools on Python 3.7. -# See https://github.com/pytorch/vision/pull/5868 -if [[ "${PYTHON_VERSION}" == '3.7' ]]; then - pip install --upgrade setuptools==58.0.4 -fi diff --git a/.circleci/unittest/windows/scripts/vc_env_helper.bat b/.circleci/unittest/windows/scripts/vc_env_helper.bat deleted file mode 100644 index 9410135677a4fdc1113d96c5a422583992c688c3..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/vc_env_helper.bat +++ /dev/null @@ -1,39 +0,0 @@ -@echo on - -set VC_VERSION_LOWER=16 -set VC_VERSION_UPPER=17 - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VS15INSTALLDIR=%%i" - set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" - goto vswhere - ) -) - -:vswhere -if "%VSDEVCMD_ARGS%" == "" ( - call "%VS15VCVARSALL%" x64 || exit /b 1 -) else ( - call "%VS15VCVARSALL%" x64 %VSDEVCMD_ARGS% || exit /b 1 -) - -@echo on - -set DISTUTILS_USE_SDK=1 - -set args=%1 -shift -:start -if [%1] == [] goto done -set args=%args% %1 -shift -goto start - -:done -if "%args%" == "" ( - echo Usage: vc_env_helper.bat [command] [args] - echo e.g. 
vc_env_helper.bat cl /c test.cpp -) - -%args% || exit /b 1 diff --git a/.clang-format b/.clang-format index 6d0ab740db4bd2ce6debe0008785a7d7c7468461..95d60445f4a51826e8a26e4b47c8233222261dda 100644 --- a/.clang-format +++ b/.clang-format @@ -60,9 +60,6 @@ MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: false PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 @@ -85,4 +82,11 @@ SpacesInSquareBrackets: false Standard: Cpp11 TabWidth: 8 UseTab: Never +--- +Language: ObjC +ColumnLimit: 120 +AlignAfterOpenBracket: Align +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false ... diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index eec9385478805eb123b2f88e30e428342632a1ae..5e88f5b9bb7b678fe14530eaec363d982f94686c 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -7,3 +7,7 @@ d367a01a18a3ae6bee13d8be3b63fd6a581ea46f # Upgrade usort to 1.0.2 and black to 22.3.0 (#5106) 6ca9c76adb6daf2695d603ad623a9cf1c4f4806f +# Fix unnecessary exploded black formatting (#7709) +a335d916db0694770e8152f41e19195de3134523 +# Renaming: `BoundingBox` -> `BoundingBoxes` (#7778) +332bff937c6711666191880fab57fa2f23ae772e diff --git a/.gitattributes b/.gitattributes index f9d672d7fb5b2db73cfff9cea21f7afb344f663c..22d0452f8d7e02ba33fa717d8a1792a76b050182 100644 --- a/.gitattributes +++ b/.gitattributes @@ -6,6 +6,3 @@ # To ignore it use below *.ipynb linguist-documentation - -# To exclude autogenerated files from code reviews -.circleci/config.yml linguist-generated=true diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index a073146ebedf6ae14ba5bf62fadb65ade8e0318d..ba811554c439216ec72175977938a6f2196bc0d8 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -48,7 +48,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py + wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py # For security purposes, please check the contents of collect_env.py before running it. python collect_env.py ``` diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 27d0f2a1f0b239bd5108a9ce77a81f69bb11edfe..1a3402466f4e03fa36c69260c1cf17cca893646d 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1 +1,10 @@ tracking_issue: 2447 + +# List of workflows that will be re-run in case of failures +# https://github.com/pytorch/test-infra/blob/main/torchci/lib/bot/retryBot.ts +retryable_workflows: +- Build Linux +- Build Macos +- Build M1 +- Build Windows +- Tests diff --git a/.github/scripts/cmake.sh b/.github/scripts/cmake.sh new file mode 100755 index 0000000000000000000000000000000000000000..ef3e5f61dad934d6060b41d49979e1478a0cc38c --- /dev/null +++ b/.github/scripts/cmake.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +./.github/scripts/setup-env.sh + +# Activate conda environment +set +x && eval "$($(which conda) shell.bash hook)" && conda deactivate && conda activate ci && set -x + +# Setup the OS_TYPE environment variable that should be used for conditions involving the OS below. 
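+# Roughly how the detection below plays out on the CI runners: `uname` prints
+# "Linux" on the Linux runners, "Darwin" on both the x86_64 and M1 macOS
+# runners, and a kernel name starting with "MSYS" under the bash environment
+# on the Windows runners (hence the MSYS* glob). A minimal, self-contained
+# sketch of the same idea:
+#
+#   kernel="$(uname)"        # e.g. "Linux", "Darwin", "MSYS_NT-10.0-..."
+#   case "${kernel}" in
+#     Linux)  echo linux ;;
+#     Darwin) echo macos ;;
+#     MSYS*)  echo windows ;;
+#     *)      echo "unknown: ${kernel}" >&2 ;;
+#   esac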
+case $(uname) in + Linux) + OS_TYPE=linux + ;; + Darwin) + OS_TYPE=macos + ;; + MSYS*) + OS_TYPE=windows + ;; + *) + echo "Unknown OS type:" $(uname) + exit 1 + ;; +esac + +if [[ $OS_TYPE == macos ]]; then + JOBS=$(sysctl -n hw.logicalcpu) +else + JOBS=$(nproc) +fi + +TORCH_PATH=$(python -c "import pathlib, torch; print(pathlib.Path(torch.__path__[0]))") +if [[ $OS_TYPE == windows ]]; then + PACKAGING_DIR="${PWD}/packaging" + export PATH="${TORCH_PATH}/lib:${PATH}" +fi + +Torch_DIR="${TORCH_PATH}/share/cmake/Torch" +if [[ "${GPU_ARCH_TYPE}" == "cuda" ]]; then + WITH_CUDA=1 +else + WITH_CUDA=0 +fi + +echo '::group::Prepare CMake builds' +mkdir -p cpp_build + +pushd test/tracing/frcnn +python trace_model.py +mkdir -p build +mv fasterrcnn_resnet50_fpn.pt build +popd + +pushd examples/cpp/hello_world +python trace_model.py +mkdir -p build +mv resnet18.pt build +popd + +# This was only needed for the tracing above +pip uninstall -y torchvision +echo '::endgroup::' + +echo '::group::Build and install libtorchvision' +pushd cpp_build + +# On macOS, CMake is looking for the library (*.dylib) and the header (*.h) separately. By default, it prefers to load +# the header from other packages that install the library. This easily leads to a mismatch if the library installed +# from conda doesn't have the exact same version. Thus, we need to explicitly set CMAKE_FIND_FRAMEWORK=NEVER to force +# it to not load anything from other installed frameworks. Resources: +# https://stackoverflow.com/questions/36523911/osx-homebrew-cmake-libpng-version-mismatch-issue +# https://cmake.org/cmake/help/latest/variable/CMAKE_FIND_FRAMEWORK.html +cmake .. -DTorch_DIR="${Torch_DIR}" -DWITH_CUDA="${WITH_CUDA}" \ + -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ + -DCMAKE_FIND_FRAMEWORK=NEVER \ + -DCMAKE_INSTALL_PREFIX="${CONDA_PREFIX}" +if [[ $OS_TYPE == windows ]]; then + "${PACKAGING_DIR}/windows/internal/vc_env_helper.bat" "${PACKAGING_DIR}/windows/internal/build_cmake.bat" $JOBS +else + make -j$JOBS + make install +fi + +popd +echo '::endgroup::' + +echo '::group::Build and run project that uses Faster-RCNN' +pushd test/tracing/frcnn/build + +cmake .. -DTorch_DIR="${Torch_DIR}" -DWITH_CUDA="${WITH_CUDA}" \ + -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ + -DCMAKE_FIND_FRAMEWORK=NEVER +if [[ $OS_TYPE == windows ]]; then + "${PACKAGING_DIR}/windows/internal/vc_env_helper.bat" "${PACKAGING_DIR}/windows/internal/build_frcnn.bat" $JOBS + cd Release + cp ../fasterrcnn_resnet50_fpn.pt . +else + make -j$JOBS +fi + +./test_frcnn_tracing + +popd +echo '::endgroup::' + +echo '::group::Build and run C++ example' +pushd examples/cpp/hello_world/build + +cmake .. -DTorch_DIR="${Torch_DIR}" \ + -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ + -DCMAKE_FIND_FRAMEWORK=NEVER +if [[ $OS_TYPE == windows ]]; then + "${PACKAGING_DIR}/windows/internal/vc_env_helper.bat" "${PACKAGING_DIR}/windows/internal/build_cpp_example.bat" $JOBS + cd Release + cp ../resnet18.pt . 
+else + make -j$JOBS +fi + +./hello-world + +popd +echo '::endgroup::' diff --git a/.circleci/unittest/linux/scripts/run-clang-format.py b/.github/scripts/run-clang-format.py similarity index 99% rename from .circleci/unittest/linux/scripts/run-clang-format.py rename to .github/scripts/run-clang-format.py index 5c61b2519e04617b3f7aedc8600c350579e27d39..670fd97833a7c7395c71771ca3c2060b9930cc9e 100755 --- a/.circleci/unittest/linux/scripts/run-clang-format.py +++ b/.github/scripts/run-clang-format.py @@ -48,7 +48,7 @@ except ImportError: DEVNULL = open(os.devnull, "wb") -DEFAULT_EXTENSIONS = "c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx,cu" +DEFAULT_EXTENSIONS = "c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx,cu,mm" class ExitStatus: diff --git a/.github/scripts/setup-env.sh b/.github/scripts/setup-env.sh new file mode 100755 index 0000000000000000000000000000000000000000..a4f113c367fa801bd0e95bded875eac4e9f8f15c --- /dev/null +++ b/.github/scripts/setup-env.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +# Prepare conda +set +x && eval "$($(which conda) shell.bash hook)" && set -x + +# Setup the OS_TYPE environment variable that should be used for conditions involving the OS below. +case $(uname) in + Linux) + OS_TYPE=linux + ;; + Darwin) + OS_TYPE=macos + ;; + MSYS*) + OS_TYPE=windows + ;; + *) + echo "Unknown OS type:" $(uname) + exit 1 + ;; +esac + +if [[ "${OS_TYPE}" == "macos" && $(uname -m) == x86_64 ]]; then + echo '::group::Uninstall system JPEG libraries on macOS' + # The x86 macOS runners, e.g. the GitHub Actions native "macos-12" runner, has some JPEG and PNG libraries + # installed by default that interfere with our build. We uninstall them here and use the one from conda below. + IMAGE_LIBS=$(brew list | grep -E "jpeg|png") + for lib in $IMAGE_LIBS; do + brew uninstall --ignore-dependencies --force "${lib}" + done + echo '::endgroup::' +fi + +echo '::group::Create build environment' +# See https://github.com/pytorch/vision/issues/7296 for ffmpeg +conda create \ + --name ci \ + --quiet --yes \ + python="${PYTHON_VERSION}" pip \ + ninja cmake \ + libpng \ + 'ffmpeg<4.3' +conda activate ci +conda install --quiet --yes libjpeg-turbo -c pytorch +pip install --progress-bar=off --upgrade setuptools + +# See https://github.com/pytorch/vision/issues/6790 +if [[ "${PYTHON_VERSION}" != "3.11" ]]; then + pip install --progress-bar=off av!=10.0.0 +fi + +echo '::endgroup::' + +if [[ "${OS_TYPE}" == windows && "${GPU_ARCH_TYPE}" == cuda ]]; then + echo '::group::Install VisualStudio CUDA extensions on Windows' + if [[ "${VC_YEAR:-}" == "2022" ]]; then + TARGET_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/MSBuild/Microsoft/VC/v170/BuildCustomizations" + else + TARGET_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/MSBuild/Microsoft/VC/v160/BuildCustomizations" + fi + mkdir -p "${TARGET_DIR}" + cp -r "${CUDA_HOME}/MSBuildExtensions/"* "${TARGET_DIR}" + echo '::endgroup::' +fi + +echo '::group::Install PyTorch' +# TODO: Can we maybe have this as environment variable in the job template? For example, `IS_RELEASE`. 
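+# Roughly: the block below picks CHANNEL=test when the workflow runs against a
+# release branch (either as the PR base branch or as the pushed ref) and
+# CHANNEL=nightly otherwise, then maps GPU_ARCH_TYPE/GPU_ARCH_VERSION onto the
+# suffix of the PyTorch wheel index ("cpu", or "cu" plus the CUDA version with
+# the dot removed). A hedged worked example, using the CUDA 11.8 configuration
+# from the CMake workflow:
+#
+#   CHANNEL=nightly GPU_ARCH_TYPE=cuda GPU_ARCH_VERSION=11.8
+#   GPU_ARCH_ID="cu$(echo "${GPU_ARCH_VERSION}" | sed 's/\.//')"   # -> cu118
+#   # -> PYTORCH_WHEEL_INDEX=https://download.pytorch.org/whl/nightly/cu118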
+if [[ (${GITHUB_EVENT_NAME} = 'pull_request' && (${GITHUB_BASE_REF} = 'release'*)) || (${GITHUB_REF} = 'refs/heads/release'*) ]]; then + CHANNEL=test +else + CHANNEL=nightly +fi + +case $GPU_ARCH_TYPE in + cpu) + GPU_ARCH_ID="cpu" + ;; + cuda) + VERSION_WITHOUT_DOT=$(echo "${GPU_ARCH_VERSION}" | sed 's/\.//') + GPU_ARCH_ID="cu${VERSION_WITHOUT_DOT}" + ;; + *) + echo "Unknown GPU_ARCH_TYPE=${GPU_ARCH_TYPE}" + exit 1 + ;; +esac +PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" +pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" + +if [[ $GPU_ARCH_TYPE == 'cuda' ]]; then + python -c "import torch; exit(not torch.cuda.is_available())" +fi +echo '::endgroup::' + +echo '::group::Install third party dependencies prior to TorchVision install' +# Installing with `easy_install`, e.g. `python setup.py install` or `python setup.py develop`, has some quirks when +# when pulling in third-party dependencies. For example: +# - On Windows, we often hit an SSL error although `pip` can install just fine. +# - It happily pulls in pre-releases, which can lead to more problems down the line. +# `pip` does not unless explicitly told to do so. +# Thus, we use `easy_install` to extract the third-party dependencies here and install them upfront with `pip`. +python setup.py egg_info +# The requires.txt cannot be used with `pip install -r` directly. The requirements are listed at the top and the +# optional dependencies come in non-standard syntax after a blank line. Thus, we just extract the header. +sed -e '/^$/,$d' *.egg-info/requires.txt | tee requirements.txt +pip install --progress-bar=off -r requirements.txt +echo '::endgroup::' + +echo '::group::Install TorchVision' +python setup.py develop +echo '::endgroup::' + +echo '::group::Collect environment information' +conda list +python -m torch.utils.collect_env +echo '::endgroup::' diff --git a/.github/scripts/unittest.sh b/.github/scripts/unittest.sh new file mode 100755 index 0000000000000000000000000000000000000000..bb2ad73715abf228a365922b11c20a6549d5479c --- /dev/null +++ b/.github/scripts/unittest.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -euo pipefail + +./.github/scripts/setup-env.sh + +# Activate conda environment +eval "$($(which conda) shell.bash hook)" && conda deactivate && conda activate ci + +echo '::group::Install testing utilities' +pip install --progress-bar=off pytest pytest-mock pytest-cov +echo '::endgroup::' + +python test/smoke_test.py +pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 diff --git a/.github/workflows/build-cmake.yml b/.github/workflows/build-cmake.yml new file mode 100644 index 0000000000000000000000000000000000000000..23f2b4b06ec832f85c5f9689ccf4b5624839570b --- /dev/null +++ b/.github/workflows/build-cmake.yml @@ -0,0 +1,83 @@ +name: CMake + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + linux: + strategy: + matrix: + include: + - runner: linux.12xlarge + gpu-arch-type: cpu + - runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ 
matrix.gpu-arch-version }} + + ./.github/scripts/cmake.sh + + macos: + strategy: + matrix: + include: + - runner: macos-12 + - runner: macos-m1-12 + fail-fast: false + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.1 + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/cmake.sh + + windows: + strategy: + matrix: + include: + - runner: windows.4xlarge + gpu-arch-type: cpu + - runner: windows.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/windows_job.yml@release/2.1 + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export VC_YEAR=2022 + export VSDEVCMD_ARGS="" + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/cmake.sh diff --git a/.github/workflows/build-conda-linux.yml b/.github/workflows/build-conda-linux.yml new file mode 100644 index 0000000000000000000000000000000000000000..8da9d488f7e9332caa2c314a9e76123429f7fc56 --- /dev/null +++ b/.github/workflows/build-conda-linux.yml @@ -0,0 +1,52 @@ +name: Build Linux Conda + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: conda + os: linux + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: "" + post-script: "" + conda-package-directory: packaging/torchvision + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_conda_linux.yml@release/2.1 + with: + conda-package-directory: ${{ matrix.conda-package-directory }} + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + secrets: + CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} diff --git a/.github/workflows/build-conda-m1.yml b/.github/workflows/build-conda-m1.yml new file mode 100644 index 0000000000000000000000000000000000000000..4a347e1baf5a074717b5fab92371a8408fcaff94 --- /dev/null +++ b/.github/workflows/build-conda-m1.yml @@ -0,0 +1,53 @@ +name: Build M1 Conda + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + 
generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: conda + os: macos-arm64 + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: "" + post-script: "" + conda-package-directory: packaging/torchvision + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_conda_macos.yml@release/2.1 + with: + conda-package-directory: ${{ matrix.conda-package-directory }} + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + runner-type: macos-m1-12 + trigger-event: ${{ github.event_name }} + secrets: + CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} diff --git a/.github/workflows/build-conda-macos.yml b/.github/workflows/build-conda-macos.yml new file mode 100644 index 0000000000000000000000000000000000000000..aca1b12754cdfeb8a3b2817a0876875a01d35828 --- /dev/null +++ b/.github/workflows/build-conda-macos.yml @@ -0,0 +1,53 @@ +name: Build Macos Conda + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: conda + os: macos + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: "" + post-script: "" + conda-package-directory: packaging/torchvision + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_conda_macos.yml@release/2.1 + with: + conda-package-directory: ${{ matrix.conda-package-directory }} + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + runner-type: macos-12 + trigger-event: ${{ github.event_name }} + secrets: + CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} diff --git a/.github/workflows/build-conda-windows.yml b/.github/workflows/build-conda-windows.yml new file mode 100644 index 0000000000000000000000000000000000000000..f03e4c57fc1c54e5627ec271da4614bdacad778e --- /dev/null +++ b/.github/workflows/build-conda-windows.yml @@ -0,0 +1,52 @@ +name: Build Windows Conda + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only 
get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: conda + os: windows + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + env-script: packaging/windows/internal/vc_env_helper.bat + post-script: "" + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_conda_windows.yml@release/2.1 + with: + conda-package-directory: ${{ matrix.conda-package-directory }} + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + secrets: + CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} diff --git a/.github/workflows/build-m1-binaries.yml b/.github/workflows/build-m1-binaries.yml deleted file mode 100644 index 81b15172a7e279bd4dadecf04ffb94e89e006e98..0000000000000000000000000000000000000000 --- a/.github/workflows/build-m1-binaries.yml +++ /dev/null @@ -1,160 +0,0 @@ -name: Build on M1 -on: - pull_request: - paths: - - .github/workflows/build-m1-binaries.yml - push: - branches: - - nightly - - main - - release/* - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - workflow_dispatch: -env: - CHANNEL: "nightly" -jobs: - build_wheels: - name: "Build TorchVision M1 wheels" - runs-on: macos-m1-12 - strategy: - matrix: - py_vers: [ "3.8", "3.9", "3.10" ] - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Set Release CHANNEL (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Setup miniconda - uses: pytorch/test-infra/.github/actions/setup-miniconda@main - - name: Build TorchVision M1 wheel - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - # Needed for JPEG library detection as setup.py detects conda presence by running `shutil.which('conda')` - set -ex - . 
packaging/pkg_helpers.bash - # if we are uploading to test channell, our version consist only of the base: 0.x.x - no date string or suffix added - if [[ $CHANNEL == "test" ]]; then - setup_base_build_version - else - setup_build_version - fi - - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg wheel pkg-config - conda run -p ${ENV_NAME} python3 -mpip install torch --pre --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} - conda run -p ${ENV_NAME} python3 -mpip install delocate - conda run -p ${ENV_NAME} python3 setup.py bdist_wheel - export PYTORCH_VERSION="$(conda run -p ${ENV_NAME} python3 -mpip show torch | grep ^Version: | sed 's/Version: *//')" - conda run -p ${ENV_NAME} DYLD_FALLBACK_LIBRARY_PATH="${ENV_NAME}/lib" delocate-wheel -v --ignore-missing-dependencies dist/*.whl - conda env remove -p ${ENV_NAME} - - name: Test wheel - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-test-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - set -ex - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy - conda run -p ${ENV_NAME} python3 -mpip install torch --pre --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} - conda run -p ${ENV_NAME} python3 -mpip install dist/*.whl - # Test torch is importable, by changing cwd and running import commands - conda run --cwd /tmp -p ${ENV_NAME} python3 -c "import torchvision;print('torchvision version is ', torchvision.__version__)" - conda run --cwd /tmp -p ${ENV_NAME} python3 -c "import torch;import torchvision;print('Is torchvision useable?', all(x is not None for x in [torch.ops.image.decode_png, torch.ops.torchvision.roi_align]))" - conda run --cwd /tmp -p ${ENV_NAME} python3 -c "import torchvision;print(torchvision.io.read_image('${PWD}/gallery/assets/dog1.jpg').shape)" - conda env remove -p ${ENV_NAME} - - name: Upload wheel to GitHub - uses: actions/upload-artifact@v3 - with: - name: torchvision-py${{ matrix.py_vers }}-macos11-m1 - path: dist/ - - name: Upload wheel to S3 - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/')) }} - shell: arch -arch arm64 bash {0} - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - run: | - for pkg in dist/*; do - aws s3 cp "$pkg" "s3://pytorch/whl/${CHANNEL}/cpu/" --acl public-read - done - build_conda: - name: "Build TorchVision M1 conda packages" - runs-on: macos-m1-12 - strategy: - matrix: - py_vers: [ "3.8", "3.9", "3.10" ] - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Set CHANNEL Release (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Setup miniconda - uses: pytorch/test-infra/.github/actions/setup-miniconda@main - - name: Install conda-build and purge previous artifacts - shell: arch -arch arm64 bash {0} - run: | - conda install -yq conda-build - conda build purge-all - - name: Build TorchVision M1 conda package - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - 
PYTHON_VERSION: ${{ matrix.py_vers }} - CU_VERSION: cpu - run: | - set -ex - . packaging/pkg_helpers.bash - - if [[ $CHANNEL == "test" ]]; then - setup_base_build_version - else - setup_build_version - fi - - setup_conda_pytorch_constraint - export SOURCE_ROOT_DIR=$(pwd) - conda build \ - -c defaults \ - $CONDA_CHANNEL_FLAGS \ - --no-anaconda-upload \ - --python "$PYTHON_VERSION" \ - --output-folder=dist/ \ - packaging/torchvision - - name: Upload package to GitHub - uses: actions/upload-artifact@v3 - with: - name: torchvision-py${{ matrix.py_vers }}-macos11-m1-conda - path: dist/ - - name: Upload package to conda - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/')) }} - shell: arch -arch arm64 bash {0} - env: - CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - conda install -yq anaconda-client - set -x - export ANACONDA_PATH=$(conda info --base)/bin - $ANACONDA_PATH/anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload dist/osx-arm64/*.tar.bz2 -u "pytorch-${CHANNEL}" --label main --no-progress --force diff --git a/.github/workflows/build-wheels-aarch64-linux.yml b/.github/workflows/build-wheels-aarch64-linux.yml new file mode 100644 index 0000000000000000000000000000000000000000..30bcd3955012607624f1cb5e838104c992325acf --- /dev/null +++ b/.github/workflows/build-wheels-aarch64-linux.yml @@ -0,0 +1,53 @@ +name: Build Aarch64 Linux Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: wheel + os: linux-aarch64 + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + with-cuda: disable + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@release/2.1 + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + architecture: aarch64 + setup-miniconda: false + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml new file mode 100644 index 0000000000000000000000000000000000000000..e04c7383eaa1ca7a226905bb147a41bcc61c01aa --- /dev/null +++ b/.github/workflows/build-wheels-linux.yml @@ -0,0 +1,50 @@ +name: Build Linux Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look 
like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: wheel + os: linux + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@release/2.1 + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/build-wheels-m1.yml b/.github/workflows/build-wheels-m1.yml new file mode 100644 index 0000000000000000000000000000000000000000..b4c4becc7106e9cf66e6774ab2e046541343277b --- /dev/null +++ b/.github/workflows/build-wheels-m1.yml @@ -0,0 +1,51 @@ +name: Build M1 Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: wheel + os: macos-arm64 + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@release/2.1 + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + runner-type: macos-m1-12 + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/build-wheels-macos.yml b/.github/workflows/build-wheels-macos.yml new file mode 100644 index 0000000000000000000000000000000000000000..f79ace74583cbbe514c8640e6f2f6f514bfc57f2 --- /dev/null +++ b/.github/workflows/build-wheels-macos.yml @@ -0,0 +1,51 @@ +name: Build Macos Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary 
build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: wheel + os: macos + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@release/2.1 + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + runner-type: macos-12 + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/build-wheels-windows.yml b/.github/workflows/build-wheels-windows.yml new file mode 100644 index 0000000000000000000000000000000000000000..c9b3ebd72e75d516922ab517dca8b3d7e216c492 --- /dev/null +++ b/.github/workflows/build-wheels-windows.yml @@ -0,0 +1,52 @@ +name: Build Windows Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: wheel + os: windows + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + env-script: packaging/windows/internal/vc_env_helper.bat + post-script: "python packaging/wheel/relocate.py" + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_windows.yml@release/2.1 + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + env-script: ${{ matrix.env-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000000000000000000000000000000000000..724ee09a472a7e7011be412f0b395804986ccade --- /dev/null +++ 
b/.github/workflows/docs.yml @@ -0,0 +1,127 @@ +name: Docs + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + - v[0-9]+.[0-9]+.[0-9] + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + build: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + upload-artifact: docs + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + # FIXME: not sure why we need this. `ldd torchvision/video_reader.so` shows that it + # already links against the one pulled from conda. However, at runtime it pulls from + # /lib64 + # Should we maybe always do this in `./.github/scripts/setup-env.sh` so that we don't + # have to pay attention in all other workflows? + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" + + cd docs + + echo '::group::Install doc requirements' + pip install --progress-bar=off -r requirements.txt + echo '::endgroup::' + + if [[ ${{ github.event_name }} == push && (${{ github.ref_type }} == tag || (${{ github.ref_type }} == branch && ${{ github.ref_name }} == release/*)) ]]; then + echo '::group::Enable version string sanitization' + # This environment variable just has to exist and must not be empty. The actual value is arbitrary. + # See docs/source/conf.py for details + export TORCHVISION_SANITIZE_VERSION_STR_IN_DOCS=1 + echo '::endgroup::' + fi + + # The runner does not have sufficient memory to run with as many processes as there are + # cores (`-j auto`). Thus, we limit to a single process (`-j 1`) here. + sed -i -e 's/-j auto/-j 1/' Makefile + make html + + # Below is an imperfect way for us to add "try on collab" links to all of our gallery examples. + # sphinx-gallery will convert all gallery examples to .ipynb notebooks and stores them in + # build/html/_downloads//.ipynb + # We copy all those ipynb files in a more convenient folder so that we can more easily link to them. + mkdir build/html/_generated_ipynb_notebooks + for file in `find build/html/_downloads`; do + if [[ $file == *.ipynb ]]; then + cp $file build/html/_generated_ipynb_notebooks/ + fi + done + + cp -r build/html "${RUNNER_ARTIFACT_DIR}" + + # On PRs we also want to upload the docs into our S3 bucket for preview. + if [[ ${{ github.event_name == 'pull_request' }} ]]; then + cp -r build/html/* "${RUNNER_DOCS_DIR}" + fi + + upload: + needs: build + if: github.repository == 'pytorch/vision' && github.event_name == 'push' && + ((github.ref_type == 'branch' && github.ref_name == 'main') || github.ref_type == 'tag') + permissions: + contents: write + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + download-artifact: docs + ref: gh-pages + script: | + set -euo pipefail + + REF_TYPE=${{ github.ref_type }} + REF_NAME=${{ github.ref_name }} + + if [[ "${REF_TYPE}" == branch ]]; then + TARGET_FOLDER="${REF_NAME}" + elif [[ "${REF_TYPE}" == tag ]]; then + case "${REF_NAME}" in + *-rc*) + echo "Aborting upload since this is an RC tag: ${REF_NAME}" + exit 0 + ;; + *) + # Strip the leading "v" as well as the trailing patch version. 
For example: + # 'v0.15.2' -> '0.15' + TARGET_FOLDER=$(echo "${REF_NAME}" | sed 's/v\([0-9]\+\)\.\([0-9]\+\)\.[0-9]\+/\1.\2/') + ;; + esac + fi + echo "Target Folder: ${TARGET_FOLDER}" + + mkdir -p "${TARGET_FOLDER}" + rm -rf "${TARGET_FOLDER}"/* + mv "${RUNNER_ARTIFACT_DIR}"/html/* "${TARGET_FOLDER}" + git add "${TARGET_FOLDER}" || true + + if [[ "${TARGET_FOLDER}" == main ]]; then + mkdir -p _static + rm -rf _static/* + cp -r "${TARGET_FOLDER}"/_static/* _static + git add _static || true + fi + + git config user.name 'pytorchbot' + git config user.email 'soumith+bot@pytorch.org' + git config http.postBuffer 524288000 + git commit -m "auto-generating sphinx docs" || true + git push diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000000000000000000000000000000000..917bc54c8327136a91ae6312e8f938d6d5e918fe --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,106 @@ +name: Lint + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + python-source-and-configs: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + script: | + set -euo pipefail + + echo '::group::Setup environment' + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda create --name ci --quiet --yes python=3.8 pip + conda activate ci + echo '::endgroup::' + + echo '::group::Install lint tools' + pip install --progress-bar=off pre-commit + echo '::endgroup::' + + set +e + pre-commit run --all-files + + if [ $? -ne 0 ]; then + git --no-pager diff + exit 1 + fi + + c-source: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + script: | + set -euo pipefail + + echo '::group::Setup environment' + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + # clang-format needs some shared libraries that conflict with the system ones. Thus, we install them from conda + # and prepend the libraries to linker path to prioritize them. `ncurses=5` is only available on the conda-forge + # channel. Since we are not building or testing here, this is fine. + conda create --name ci --quiet --yes -c conda-forge python=3.8 ncurses=5 libgcc + conda activate ci + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" + echo '::endgroup::' + + echo '::group::Install lint tools' + curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o ./clang-format + chmod +x ./clang-format + echo '::endgroup::' + + echo '::group::Lint C source' + set +e + ./.github/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable ./clang-format + + if [ $? 
-ne 0 ]; then + git --no-pager diff + exit 1 + fi + echo '::endgroup::' + + python-types: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Install lint tools' + pip install --progress-bar=off mypy + echo '::endgroup::' + + echo '::group::Lint Python types' + mypy --install-types --non-interactive --config-file mypy.ini + echo '::endgroup::' + + bc: + if: github.event.pull_request + runs-on: ubuntu-latest + steps: + - name: Run BC Lint Action + uses: pytorch/test-infra/.github/actions/bc-lint@release/2.1 + with: + repo: ${{ github.event.pull_request.head.repo.full_name }} + base_sha: ${{ github.event.pull_request.base.sha }} + head_sha: ${{ github.event.pull_request.head.sha }} diff --git a/.github/workflows/test-m1.yml b/.github/workflows/test-m1.yml deleted file mode 100644 index 1e5f79f82fd764eb8fe725b927144fe51a961a0d..0000000000000000000000000000000000000000 --- a/.github/workflows/test-m1.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Unit-tests on M1 -on: - pull_request: - push: - branches: - - nightly - - main - - release/* - workflow_dispatch: -env: - CHANNEL: "nightly" -jobs: - tests: - name: "Unit-tests on M1" - runs-on: macos-m1-12 - strategy: - matrix: - py_vers: [ "3.8"] - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set Release CHANNEL (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Install TorchVision - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - . ~/miniconda3/etc/profile.d/conda.sh - # Needed for JPEG library detection as setup.py detects conda presence by running `shutil.which('conda')` - export PATH=~/miniconda3/bin:$PATH - set -ex - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg scipy - conda run -p ${ENV_NAME} python3 -mpip install --pre torch --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} - conda run -p ${ENV_NAME} python3 setup.py develop - conda run -p ${ENV_NAME} python3 -mpip install pytest pytest-mock av - - name: Run tests - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - . 
~/miniconda3/etc/profile.d/conda.sh - set -ex - conda run -p ${ENV_NAME} --no-capture-output python3 -u -mpytest -v --tb=long --durations 20 - conda env remove -p ${ENV_NAME} diff --git a/.github/workflows/tests-schedule.yml b/.github/workflows/tests-schedule.yml index ecc283cac27e968ff70c0c2f405775acf15d2fd4..5426fdc997a58ad5d5afb6d48969595ca879bf4d 100644 --- a/.github/workflows/tests-schedule.yml +++ b/.github/workflows/tests-schedule.yml @@ -18,11 +18,14 @@ jobs: - name: Set up python uses: actions/setup-python@v2 with: - python-version: 3.7 + python-version: 3.8 - name: Upgrade system packages run: python -m pip install --upgrade pip setuptools wheel + - name: SSL + run: python -c 'import ssl; print(ssl.OPENSSL_VERSION)' + - name: Checkout repository uses: actions/checkout@v2 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..7c1a334e1085c003a25e178083d1fc377f4d3741 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,168 @@ +name: Tests + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + unittests-linux: + strategy: + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + runner: ["linux.12xlarge"] + gpu-arch-type: ["cpu"] + include: + - python-version: 3.8 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + timeout: 120 + script: | + set -euo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/unittest.sh + + unittests-macos: + strategy: + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + runner: ["macos-12"] + include: + - python-version: "3.8" + runner: macos-m1-12 + fail-fast: false + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.1 + with: + repository: pytorch/vision + # We need an increased timeout here, since the macos-12 runner is the free one from GH + # and needs roughly 2 hours to just run the test suite + timeout: 240 + runner: ${{ matrix.runner }} + script: | + set -euo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/unittest.sh + + unittests-windows: + strategy: + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + runner: ["windows.4xlarge"] + gpu-arch-type: ["cpu"] + include: + - python-version: "3.8" + runner: windows.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/windows_job.yml@release/2.1 + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + timeout: 120 + script: | + set -euxo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export VC_YEAR=2019 + export VSDEVCMD_ARGS="" + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/unittest.sh + + onnx: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: 
pytorch/vision + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Install ONNX' + pip install --progress-bar=off onnx onnxruntime + echo '::endgroup::' + + echo '::group::Install testing utilities' + pip install --progress-bar=off pytest + echo '::endgroup::' + + echo '::group::Run ONNX tests' + pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_onnx.py + echo '::endgroup::' + + unittests-extended: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Pre-download model weights' + pip install --progress-bar=off aiohttp aiofiles tqdm + python scripts/download_model_urls.py + echo '::endgroup::' + + echo '::group::Install testing utilities' + pip install --progress-bar=off pytest + echo '::endgroup::' + + echo '::group::Run extended unittests' + export PYTORCH_TEST_WITH_EXTENDED=1 + pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_extended_*.py + echo '::endgroup::' diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml new file mode 100644 index 0000000000000000000000000000000000000000..ac81f3ff155c35d152b64f8ec55e0af5cb23370e --- /dev/null +++ b/.github/workflows/update-viablestrict.yml @@ -0,0 +1,23 @@ +name: Update viable/strict + +on: + pull_request: + paths: + - .github/workflows/update-viablestrict.yml + schedule: + - cron: 10,40 * * * * + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +jobs: + do_update_viablestrict: + uses: pytorch/test-infra/.github/workflows/update-viablestrict.yml@release/2.1 + with: + repository: pytorch/vision + required_checks: "Build Linux,Build M1,Build Macos,Build Windows,Tests,CMake,Lint,Docs" + secrets: + ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }} + GITHUB_DEPLOY_KEY : ${{ secrets.VISION_GITHUB_DEPLOY_KEY }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4c51a531267f44279aa2ce2226569b213732642d..762ebf6fce0b59e20d113a8b77ae684edddfbaff 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,19 +6,21 @@ repos: - id: check-toml - id: check-yaml exclude: packaging/.* + args: + - --allow-multiple-documents - id: mixed-line-ending args: [--fix=lf] - id: end-of-file-fixer - repo: https://github.com/omnilib/ufmt - rev: v1.3.2 + rev: v1.3.3 hooks: - id: ufmt additional_dependencies: - black == 22.3.0 - usort == 1.0.2 - - repo: https://gitlab.com/pycqa/flake8 + - repo: https://github.com/PyCQA/flake8 rev: 5.0.4 hooks: - id: flake8 diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..37db28b2badfdc4fd42ceaeb8aa301780d3b16f9 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,14 @@ +cff-version: 1.2.0 +title: "TorchVision: PyTorch's Computer Vision library" +message: >- + If you find TorchVision useful in your work, please + consider citing the following BibTeX entry. 
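Not part of the patch — a minimal sketch of reproducing the `unittests-extended` job above locally, assuming `pytest` is installed and the command is run from a torchvision checkout (the CI-only junit report path is dropped here):

```python
# Local stand-in for the "unittests-extended" workflow step above.
# Assumes: run from the repository root, pytest installed, model weights reachable.
import glob
import os
import sys

import pytest

# Same switch the workflow exports before running the extended tests.
os.environ["PYTORCH_TEST_WITH_EXTENDED"] = "1"

# Expand the same test-file pattern the workflow passes to pytest.
extended_tests = glob.glob("test/test_extended_*.py")
sys.exit(pytest.main(["-v", "--durations=25", *extended_tests]))
```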
+type: software +authors: + - given-names: TorchVision maintainers and contributors +url: "https://github.com/pytorch/vision" +license: "BSD-3-Clause" +date-released: "2016-11-06" +journal: "GitHub repository" +publisher: "GitHub" +key: "torchvision2016" diff --git a/CMakeLists.txt b/CMakeLists.txt index 85b878307cf4e89425d258ce6c69675b66cdefcb..0cd485d7a24860e058e6f6024b6e47531759279b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,10 @@ -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.18) project(torchvision) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) file(STRINGS version.txt TORCHVISION_VERSION) option(WITH_CUDA "Enable CUDA support" OFF) +option(WITH_MPS "Enable MPS support" OFF) option(WITH_PNG "Enable features requiring LibPNG." ON) option(WITH_JPEG "Enable features requiring LibJPEG." ON) option(USE_PYTHON "Link to Python when building" OFF) @@ -13,11 +14,11 @@ if(WITH_CUDA) add_definitions(-D__CUDA_NO_HALF_OPERATORS__) add_definitions(-DWITH_CUDA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") - # CUDA-11.x can not be compiled using C++14 standard on Windows - string(REGEX MATCH "^[0-9]+" CUDA_MAJOR ${CMAKE_CUDA_COMPILER_VERSION}) - if(${CUDA_MAJOR} GREATER 10 AND MSVC) - set(CMAKE_CXX_STANDARD 17) - endif() +endif() + +if(WITH_MPS) + enable_language(OBJC OBJCXX) + add_definitions(-DWITH_MPS) endif() find_package(Torch REQUIRED) @@ -84,6 +85,9 @@ list(APPEND ALLOW_LISTED ${TVCPP} ${TVCPP}/io/image ${TVCPP}/io/image/cpu ${TVCP if(WITH_CUDA) list(APPEND ALLOW_LISTED ${TVCPP}/ops/cuda ${TVCPP}/ops/autocast) endif() +if(WITH_MPS) + list(APPEND ALLOW_LISTED ${TVCPP}/ops/mps) +endif() FOREACH(DIR ${ALLOW_LISTED}) file(GLOB ALL_SOURCES ${ALL_SOURCES} ${DIR}/*.*) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3eedb6261a481109f06d567dc86ccc03b7dc133d..b41c0fe8939a81a19c50b5514073d85557753a23 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,22 +4,22 @@ We want to make contributing to this project as easy and transparent as possible ## TL;DR -We appreciate all contributions. If you are interested in contributing to Torchvision, there are many ways to help out. +We appreciate all contributions. If you are interested in contributing to Torchvision, there are many ways to help out. Your contributions may fall into the following categories: -- It helps the project if you could +- It helps the project if you could - Report issues you're facing - - Give a :+1: on issues that others reported and that are relevant to you + - Give a :+1: on issues that others reported and that are relevant to you - Answering queries on the issue tracker, investigating bugs are very valuable contributions to the project. -- You would like to improve the documentation. This is no less important than improving the library itself! +- You would like to improve the documentation. This is no less important than improving the library itself! If you find a typo in the documentation, do not hesitate to submit a GitHub pull request. - If you would like to fix a bug - please pick one from the [list of open issues labelled as "help wanted"](https://github.com/pytorch/vision/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) - comment on the issue that you want to work on this issue - - send a PR with your fix, see below. + - send a PR with your fix, see below. - If you plan to contribute new features, utility functions or extensions, please first open an issue and discuss the feature with us. 
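An aside on the `WITH_MPS` option introduced in the CMakeLists.txt hunk above: a small sketch, assuming PyTorch >= 1.12 (where `torch.backends.mps` exists), for checking whether the local PyTorch build can actually use MPS before configuring torchvision with `-DWITH_MPS=ON`:

```python
# Runtime check before building torchvision with -DWITH_MPS=ON.
# Assumes PyTorch >= 1.12, which ships torch.backends.mps.
import torch

mps_built = torch.backends.mps.is_built()          # PyTorch compiled with MPS support?
mps_available = torch.backends.mps.is_available()  # usable MPS device on this machine?
print(f"MPS built: {mps_built}, available: {mps_available}")
```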
@@ -30,30 +30,49 @@ clear and has sufficient instructions to be able to reproduce the issue. ## Development installation -### Install PyTorch Nightly + +### Dependencies + +Start by installing the **nightly** build of PyTorch following the [official +instructions](https://pytorch.org/get-started/locally/). + +**Optionally**, install `libpng` and `libjpeg-turbo` if you want to enable +support for +native encoding / decoding of PNG and JPEG formats in +[torchvision.io](https://pytorch.org/vision/stable/io.html#image): ```bash -conda install pytorch -c pytorch-nightly -# or with pip (see https://pytorch.org/get-started/locally/) -# pip install numpy -# pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html +conda install libpng libjpeg-turbo -c pytorch ``` -### Install Torchvision +Note: you can use the `TORCHVISION_INCLUDE` and `TORCHVISION_LIBRARY` +environment variables to tell the build system where to find those libraries if +they are in specific locations. Take a look at +[setup.py](https://github.com/pytorch/vision/blob/main/setup.py) for more +details. + +### Clone and install torchvision ```bash git clone https://github.com/pytorch/vision.git cd vision -python setup.py develop +python setup.py develop # use install instead of develop if you don't care about development. # or, for OSX # MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py develop -# for C++ debugging, please use DEBUG=1 +# for C++ debugging, use DEBUG=1 # DEBUG=1 python setup.py develop -pip install flake8 typing mypy pytest pytest-mock scipy ``` -You may also have to install `libpng-dev` and `libjpeg-turbo8-dev` libraries: -```bash -conda install libpng jpeg + +By default, GPU support is built if CUDA is found and `torch.cuda.is_available()` is true. It's possible to force +building GPU support by setting `FORCE_CUDA=1` environment variable, which is useful when building a docker image. + +We don't officially support building from source using `pip`, but _if_ you do, you'll need to use the +`--no-build-isolation` flag. + +Other development dependencies include: + +``` +pip install flake8 typing mypy pytest pytest-mock scipy ``` ## Development Process @@ -66,12 +85,12 @@ If you plan to modify the code or documentation, please follow the steps below: 4. Ensure the test suite passes. 5. Make sure your code passes the formatting checks (see below). -For more details about pull requests, -please read [GitHub's guides](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). +For more details about pull requests, +please read [GitHub's guides](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). -If you would like to contribute a new model, please see [here](#New-model). +If you would like to contribute a new model, please see [here](#New-architecture-or-improved-model-weights). -If you would like to contribute a new dataset, please see [here](#New-dataset). +If you would like to contribute a new dataset, please see [here](#New-dataset). ### Code formatting and typing @@ -83,7 +102,7 @@ Instead of relying directly on `black` however, we rely on [ufmt](https://github.com/omnilib/ufmt), for compatibility reasons with Facebook internal infrastructure. 
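Not part of the diff — a sketch of how the `FORCE_CUDA`, `TORCHVISION_INCLUDE` and `TORCHVISION_LIBRARY` variables mentioned in the development-installation section above can be combined when driving the editable build; the library prefixes are hypothetical placeholders:

```python
# Hypothetical driver for the editable build described above.
# The /opt/libjpeg-turbo paths are placeholders, not real defaults.
import os
import subprocess

env = dict(os.environ)
env["FORCE_CUDA"] = "1"                                    # build CUDA ops even without a visible GPU
env["TORCHVISION_INCLUDE"] = "/opt/libjpeg-turbo/include"  # where libjpeg/libpng headers live
env["TORCHVISION_LIBRARY"] = "/opt/libjpeg-turbo/lib"      # where the shared libraries live

subprocess.run(["python", "setup.py", "develop"], check=True, env=env)
```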
-To format your code, install `ufmt` with `pip install ufmt==1.3.2 black==22.3.0 usort==1.0.2` and use e.g.: +To format your code, install `ufmt` with `pip install ufmt==1.3.3 black==22.3.0 usort==1.0.2` and use e.g.: ```bash ufmt format torchvision @@ -126,8 +145,8 @@ mypy --config-file mypy.ini ### Unit tests -If you have modified the code by adding a new feature or a bug-fix, please add unit tests for that. To run a specific -test: +If you have modified the code by adding a new feature or a bug-fix, please add unit tests for that. To run a specific +test: ```bash pytest test/ -vvv -k # e.g. pytest test/test_transforms.py -vvv -k test_center_crop @@ -136,7 +155,7 @@ pytest test/ -vvv -k If you would like to run all tests: ```bash pytest test -vvv -``` +``` Tests that require internet access should be in `test/test_internet.py`. @@ -189,18 +208,18 @@ with "transforms" in their name. ### New architecture or improved model weights Please refer to the guidelines in [Contributing to Torchvision - Models](https://github.com/pytorch/vision/blob/main/CONTRIBUTING_MODELS.md). - + ### New dataset -More details on how to add a new dataset will be provided later. Please, do not send any PR with a new dataset without discussing +Please, do not send any PR with a new dataset without discussing it in an issue as, most likely, it will not be accepted. ### Pull Request -If all previous checks (flake8, mypy, unit tests) are passing, please send a PR. Submitted PR will pass other tests on -different operation systems, python versions and hardwares. +If all previous checks (flake8, mypy, unit tests) are passing, please send a PR. Submitted PR will pass other tests on +different operating systems, python versions and hardware. -For more details about pull requests workflow, +For more details about pull requests workflow, please read [GitHub's guides](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). ## License diff --git a/CONTRIBUTING_MODELS.md b/CONTRIBUTING_MODELS.md index 82845e6579aaddc4aca10f36bf95a3862811be64..390a25a0f8985767e8a9e39c43f6ad372befd1ca 100644 --- a/CONTRIBUTING_MODELS.md +++ b/CONTRIBUTING_MODELS.md @@ -20,13 +20,13 @@ So, before starting any work and submitting a PR there are a few critical things ### 1. Preparation work -- Start by looking into this [issue](https://github.com/pytorch/vision/issues/2707) in order to have an idea of the models that are being considered, express your willingness to add a new model and discuss with the community whether or not this model should be included in TorchVision. It is very important at this stage to make sure that there is an agreement on the value of having this model in TorchVision and there is no one else already working on it. +- Start by looking into this [issue](https://github.com/pytorch/vision/issues/2707) in order to have an idea of the models that are being considered, express your willingness to add a new model and discuss with the community whether this model should be included in TorchVision. It is very important at this stage to make sure that there is an agreement on the value of having this model in TorchVision and there is no one else already working on it. - If the decision is to include the new model, then please create a new ticket which will be used for all design and implementation discussions prior to the PR. 
One of the TorchVision maintainers will reach out at this stage and this will be your POC from this point onwards in order to provide support, guidance and regular feedback. ### 2. Implement the model -Please take a look at existing models in TorchVision to get familiar with the idioms. Also please look at recent contributions for new models. If in doubt about any design decisions you can ask for feedback on the issue created in step 1. Example of things to take into account: +Please take a look at existing models in TorchVision to get familiar with the idioms. Also, please look at recent contributions for new models. If in doubt about any design decisions you can ask for feedback on the issue created in step 1. Example of things to take into account: - The implementation should be as close as possible to the canonical implementation/paper - The PR must include the code implementation, documentation and tests @@ -34,7 +34,7 @@ Please take a look at existing models in TorchVision to get familiar with the id - The weights need to reproduce closely the results of the paper in terms of accuracy, even though the final weights to be deployed will be those trained by the TorchVision maintainers - The PR description should include commands/configuration used to train the model, so that the TorchVision maintainers can easily run them to verify the implementation and generate the final model to be released - Make sure we re-use existing components as much as possible (inheritance) -- New primitives (transforms, losses, etc) can be added if necessary, but the final location will be determined after discussion with the dedicated maintainer +- New primitives (transforms, losses, etc.) can be added if necessary, but the final location will be determined after discussion with the dedicated maintainer - Please take a look at the detailed [implementation and documentation guidelines](https://github.com/pytorch/vision/issues/5319) for a fine grain list of things not to be missed ### 3. Train the model with reference scripts diff --git a/MANIFEST.in b/MANIFEST.in index 75f238c0a2c97812ebe5fdf3e2b43667c7c7f6af..9e45188df355dac6e7e8e3657cd48959f8a2d968 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ -include README.rst +include README.md include LICENSE recursive-exclude * __pycache__ diff --git a/README.md b/README.md index 981a96322b901cf9ab213a120d9e456924af18a2..b7df756a3578435295f6f3a039da7ba2403d090d 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ pip3 install torchvision* ##### 编译安装 ```bash -git clone -b dtk-23.10-v0.14.1 http://developer.hpccube.com/codes/aicomponent/vision.git +git clone -b dtk-23.10-v0.16.0 http://developer.hpccube.com/codes/aicomponent/vision.git cd vision PYTORCH_ROCM_ARCH="gfx906;gfx926" python3 setup.py bdist_wheel pip3 install dist/vision* diff --git a/README_ORIGIN.md b/README_ORIGIN.md new file mode 100644 index 0000000000000000000000000000000000000000..373b6b79548f524171985fcdd0cdea906dc1f78a --- /dev/null +++ b/README_ORIGIN.md @@ -0,0 +1,150 @@ +# torchvision + +[![total torchvision downloads](https://pepy.tech/badge/torchvision)](https://pepy.tech/project/torchvision) +[![documentation](https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Ftorchvision%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pytorch.org/vision/stable/index.html) + +The torchvision package consists of popular datasets, model architectures, and common image transformations for computer +vision. 
+ +## Installation + +Please refer to the [official +instructions](https://pytorch.org/get-started/locally/) to install the stable +versions of `torch` and `torchvision` on your system. + +To build source, refer to our [contributing +page](https://github.com/pytorch/vision/blob/main/CONTRIBUTING.md#development-installation). + +The following is the corresponding `torchvision` versions and supported Python +versions. + +| `torch` | `torchvision` | Python | +| ------------------ | ------------------ | ------------------- | +| `main` / `nightly` | `main` / `nightly` | `>=3.8`, `<=3.11` | +| `2.1` | `0.16` | `>=3.8`, `<=3.11` | +| `2.0` | `0.15` | `>=3.8`, `<=3.11` | +| `1.13` | `0.14` | `>=3.7.2`, `<=3.10` | + +
+ older versions + +| `torch` | `torchvision` | Python | +|---------|-------------------|---------------------------| +| `1.12` | `0.13` | `>=3.7`, `<=3.10` | +| `1.11` | `0.12` | `>=3.7`, `<=3.10` | +| `1.10` | `0.11` | `>=3.6`, `<=3.9` | +| `1.9` | `0.10` | `>=3.6`, `<=3.9` | +| `1.8` | `0.9` | `>=3.6`, `<=3.9` | +| `1.7` | `0.8` | `>=3.6`, `<=3.9` | +| `1.6` | `0.7` | `>=3.6`, `<=3.8` | +| `1.5` | `0.6` | `>=3.5`, `<=3.8` | +| `1.4` | `0.5` | `==2.7`, `>=3.5`, `<=3.8` | +| `1.3` | `0.4.2` / `0.4.3` | `==2.7`, `>=3.5`, `<=3.7` | +| `1.2` | `0.4.1` | `==2.7`, `>=3.5`, `<=3.7` | +| `1.1` | `0.3` | `==2.7`, `>=3.5`, `<=3.7` | +| `<=1.0` | `0.2` | `==2.7`, `>=3.5`, `<=3.7` | + +
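As a quick companion to the compatibility tables above, a minimal sketch for printing the locally installed `torch` / `torchvision` pair in the same major.minor form the table uses:

```python
# Print the installed torch / torchvision pair to compare against the table above.
import torch
import torchvision


def major_minor(version: str) -> str:
    # "2.1.0+cu118" -> "2.1"
    return ".".join(version.split("+")[0].split(".")[:2])


print(f"torch {major_minor(torch.__version__)} / torchvision {major_minor(torchvision.__version__)}")
```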
+ +## Image Backends + +Torchvision currently supports the following image backends: + +- torch tensors +- PIL images: + - [Pillow](https://python-pillow.org/) + - [Pillow-SIMD](https://github.com/uploadcare/pillow-simd) - a **much faster** drop-in replacement for Pillow with SIMD. + +Read more in our [docs](https://pytorch.org/vision/stable/transforms.html). + +## [UNSTABLE] Video Backend + +Torchvision currently supports the following video backends: + +- [pyav](https://github.com/PyAV-Org/PyAV) (default) - Pythonic binding for ffmpeg libraries. +- video_reader - This needs ffmpeg to be installed and torchvision to be built from source. There shouldn't be any + conflicting version of ffmpeg installed. Currently, this is only supported on Linux. + +``` +conda install -c conda-forge ffmpeg +python setup.py install +``` + +# Using the models on C++ + +TorchVision provides an example project for how to use the models on C++ using JIT Script. + +Installation From source: + +``` +mkdir build +cd build +# Add -DWITH_CUDA=on support for the CUDA if needed +cmake .. +make +make install +``` + +Once installed, the library can be accessed in cmake (after properly configuring `CMAKE_PREFIX_PATH`) via the +`TorchVision::TorchVision` target: + +``` +find_package(TorchVision REQUIRED) +target_link_libraries(my-target PUBLIC TorchVision::TorchVision) +``` + +The `TorchVision` package will also automatically look for the `Torch` package and add it as a dependency to +`my-target`, so make sure that it is also available to cmake via the `CMAKE_PREFIX_PATH`. + +For an example setup, take a look at `examples/cpp/hello_world`. + +Python linking is disabled by default when compiling TorchVision with CMake, this allows you to run models without any +Python dependency. In some special cases where TorchVision's operators are used from Python code, you may need to link +to Python. This can be done by passing `-DUSE_PYTHON=on` to CMake. + +### TorchVision Operators + +In order to get the torchvision operators registered with torch (eg. for the JIT), all you need to do is to ensure that +you `#include <torchvision/vision.h>` in your project. + +## Documentation + +You can find the API documentation on the pytorch website: https://pytorch.org/vision/stable/index.html + +## Contributing + +See the [CONTRIBUTING](CONTRIBUTING.md) file for how to help out. + +## Disclaimer on Datasets + +This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, +vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to +determine whether you have permission to use the dataset under the dataset's license. + +If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset +to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML +community! + +## Pre-trained Model License + +The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the +dataset used for training. It is your responsibility to determine whether you have permission to use the models for your +use case. + +More specifically, SWAG models are released under the CC-BY-NC 4.0 license. See +[SWAG LICENSE](https://github.com/facebookresearch/SWAG/blob/main/LICENSE) for additional details.
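To complement the image/video backend lists above, a short sketch of the existing backend query/switch helpers; the non-default choices only work when the corresponding backend is installed (accimage) or built in (video_reader):

```python
# Query and switch torchvision's image / video backends at runtime.
import torchvision

print(torchvision.get_image_backend())  # "PIL" unless another backend was selected
print(torchvision.get_video_backend())  # "pyav" by default

# Only valid if the optional backends are actually available:
# torchvision.set_image_backend("accimage")      # needs the accimage package
# torchvision.set_video_backend("video_reader")  # needs a source build with ffmpeg
```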
+ +## Citing TorchVision + +If you find TorchVision useful in your work, please consider citing the following BibTeX entry: + +```bibtex +@software{torchvision2016, + title = {TorchVision: PyTorch's Computer Vision library}, + author = {TorchVision maintainers and contributors}, + year = 2016, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/pytorch/vision}} +} +``` diff --git a/README_ORIGIN.rst b/README_ORIGIN.rst deleted file mode 100644 index c3605cc3c9507373466ed73081061e2276320dd0..0000000000000000000000000000000000000000 --- a/README_ORIGIN.rst +++ /dev/null @@ -1,198 +0,0 @@ -torchvision -=========== - -.. image:: https://pepy.tech/badge/torchvision - :target: https://pepy.tech/project/torchvision - -.. image:: https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Ftorchvision%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v - :target: https://pytorch.org/vision/stable/index.html - - -The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision. - - -Installation -============ - -We recommend Anaconda as Python package management system. Please refer to `pytorch.org `_ -for the detail of PyTorch (``torch``) installation. The following is the corresponding ``torchvision`` versions and -supported Python versions. - -+--------------------------+--------------------------+---------------------------------+ -| ``torch`` | ``torchvision`` | ``python`` | -+==========================+==========================+=================================+ -| ``main`` / ``nightly`` | ``main`` / ``nightly`` | ``>=3.7``, ``<=3.10`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.12.0`` | ``0.13.0`` | ``>=3.7``, ``<=3.10`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.11.0`` | ``0.12.0`` | ``>=3.7``, ``<=3.10`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.10.2`` | ``0.11.3`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.10.1`` | ``0.11.2`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.10.0`` | ``0.11.1`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.9.1`` | ``0.10.1`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.9.0`` | ``0.10.0`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.8.2`` | ``0.9.2`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.8.1`` | ``0.9.1`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.8.0`` | ``0.9.0`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.7.1`` | ``0.8.2`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.7.0`` | ``0.8.1`` | ``>=3.6``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.7.0`` | ``0.8.0`` | ``>=3.6``, ``<=3.8`` 
| -+--------------------------+--------------------------+---------------------------------+ -| ``1.6.0`` | ``0.7.0`` | ``>=3.6``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.5.1`` | ``0.6.1`` | ``>=3.5``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.5.0`` | ``0.6.0`` | ``>=3.5``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.4.0`` | ``0.5.0`` | ``==2.7``, ``>=3.5``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.3.1`` | ``0.4.2`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.3.0`` | ``0.4.1`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.2.0`` | ``0.4.0`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.1.0`` | ``0.3.0`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``<=1.0.1`` | ``0.2.2`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ - -Anaconda: - -.. code:: bash - - conda install torchvision -c pytorch - -pip: - -.. code:: bash - - pip install torchvision - -From source: - -.. code:: bash - - python setup.py install - # or, for OSX - # MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install - - -We don't officially support building from source using ``pip``, but *if* you do, -you'll need to use the ``--no-build-isolation`` flag. -In case building TorchVision from source fails, install the nightly version of PyTorch following -the linked guide on the `contributing page `_ and retry the install. - -By default, GPU support is built if CUDA is found and ``torch.cuda.is_available()`` is true. -It's possible to force building GPU support by setting ``FORCE_CUDA=1`` environment variable, -which is useful when building a docker image. - -Image Backend -============= -Torchvision currently supports the following image backends: - -* `Pillow`_ (default) - -* `Pillow-SIMD`_ - a **much faster** drop-in replacement for Pillow with SIMD. If installed will be used as the default. - -* `accimage`_ - if installed can be activated by calling :code:`torchvision.set_image_backend('accimage')` - -* `libpng`_ - can be installed via conda :code:`conda install libpng` or any of the package managers for debian-based and RHEL-based Linux distributions. - -* `libjpeg`_ - can be installed via conda :code:`conda install jpeg` or any of the package managers for debian-based and RHEL-based Linux distributions. `libjpeg-turbo`_ can be used as well. - -**Notes:** ``libpng`` and ``libjpeg`` must be available at compilation time in order to be available. Make sure that it is available on the standard library locations, -otherwise, add the include and library paths in the environment variables ``TORCHVISION_INCLUDE`` and ``TORCHVISION_LIBRARY``, respectively. - -.. _libpng : http://www.libpng.org/pub/png/libpng.html -.. _Pillow : https://python-pillow.org/ -.. _Pillow-SIMD : https://github.com/uploadcare/pillow-simd -.. _accimage: https://github.com/pytorch/accimage -.. _libjpeg: http://ijg.org/ -.. 
_libjpeg-turbo: https://libjpeg-turbo.org/ - -Video Backend -============= -Torchvision currently supports the following video backends: - -* `pyav`_ (default) - Pythonic binding for ffmpeg libraries. - -.. _pyav : https://github.com/PyAV-Org/PyAV - -* video_reader - This needs ffmpeg to be installed and torchvision to be built from source. There shouldn't be any conflicting version of ffmpeg installed. Currently, this is only supported on Linux. - -.. code:: bash - - conda install -c conda-forge ffmpeg - python setup.py install - - -Using the models on C++ -======================= -TorchVision provides an example project for how to use the models on C++ using JIT Script. - -Installation From source: - -.. code:: bash - - mkdir build - cd build - # Add -DWITH_CUDA=on support for the CUDA if needed - cmake .. - make - make install - -Once installed, the library can be accessed in cmake (after properly configuring ``CMAKE_PREFIX_PATH``) via the :code:`TorchVision::TorchVision` target: - -.. code:: rest - - find_package(TorchVision REQUIRED) - target_link_libraries(my-target PUBLIC TorchVision::TorchVision) - -The ``TorchVision`` package will also automatically look for the ``Torch`` package and add it as a dependency to ``my-target``, -so make sure that it is also available to cmake via the ``CMAKE_PREFIX_PATH``. - -For an example setup, take a look at ``examples/cpp/hello_world``. - -Python linking is disabled by default when compiling TorchVision with CMake, this allows you to run models without any Python -dependency. In some special cases where TorchVision's operators are used from Python code, you may need to link to Python. This -can be done by passing ``-DUSE_PYTHON=on`` to CMake. - -TorchVision Operators ---------------------- -In order to get the torchvision operators registered with torch (eg. for the JIT), all you need to do is to ensure that you -:code:`#include ` in your project. - -Documentation -============= -You can find the API documentation on the pytorch website: https://pytorch.org/vision/stable/index.html - -Contributing -============ - -See the `CONTRIBUTING `_ file for how to help out. - -Disclaimer on Datasets -====================== - -This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license. - -If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML community! - -Pre-trained Model License -========================= - -The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the dataset used for training. It is your responsibility to determine whether you have permission to use the models for your use case. - -More specifically, SWAG models are released under the CC-BY-NC 4.0 license. See `SWAG LICENSE `_ for additional details. diff --git a/android/README.md b/android/README.md new file mode 100644 index 0000000000000000000000000000000000000000..788c83f26de72593717e97af749ccadb77daab5f --- /dev/null +++ b/android/README.md @@ -0,0 +1,3 @@ +## Status + +The Android demo of TorchVision is currently unmaintained, untested and likely out-of-date. 
diff --git a/android/build.gradle b/android/build.gradle index f28ba9112ff5709472fdbe511a0c53ecd1cd6c2d..f7995a07f5b619fd777ee5c52ce757115f2bf069 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -14,7 +14,7 @@ allprojects { androidSupportAppCompatV7Version = "28.0.0" fbjniJavaOnlyVersion = "0.0.3" - soLoaderNativeLoaderVersion = "0.10.4" + soLoaderNativeLoaderVersion = "0.10.5" pytorchAndroidVersion = "1.12" } diff --git a/android/gradle.properties b/android/gradle.properties index 1b6b275f63f64e360ea4dd1c688340e35e90345c..8204b73b05197d56e927a3bbdd7051e70db10fda 100644 --- a/android/gradle.properties +++ b/android/gradle.properties @@ -1,6 +1,6 @@ ABI_FILTERS=armeabi-v7a,arm64-v8a,x86,x86_64 -VERSION_NAME=0.14.0-SNAPSHOT +VERSION_NAME=0.15.0-SNAPSHOT GROUP=org.pytorch MAVEN_GROUP=org.pytorch SONATYPE_STAGING_PROFILE=orgpytorch diff --git a/android/ops/CMakeLists.txt b/android/ops/CMakeLists.txt index ad42adbfa71e63fc73d2de0e6dcae4d333cbca37..fb8d4348e8ea77948a8e8acc54ac5ede0ba53760 100644 --- a/android/ops/CMakeLists.txt +++ b/android/ops/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.4.1) set(TARGET torchvision_ops) project(${TARGET} CXX) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) string(APPEND CMAKE_CXX_FLAGS " -DMOBILE") diff --git a/cmake/iOS.cmake b/cmake/iOS.cmake index d42ea4c9232c171312fdff20d42733d9ef379de1..935c57f11b9268504f2769d56eeffdba02a44b5f 100644 --- a/cmake/iOS.cmake +++ b/cmake/iOS.cmake @@ -10,11 +10,11 @@ # SIMULATOR - used to build for the Simulator platforms, which have an x86 arch. # # CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder -# By default this location is automatcially chosen based on the IOS_PLATFORM value above. +# By default this location is automatically chosen based on the IOS_PLATFORM value above. # If set manually, it will override the default location and force the user of a particular Developer Platform # # CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder -# By default this location is automatcially chosen based on the CMAKE_IOS_DEVELOPER_ROOT value. +# By default this location is automatically chosen based on the CMAKE_IOS_DEVELOPER_ROOT value. # In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path. 
# If set manually, this will force the use of a specific SDK version @@ -100,7 +100,7 @@ if(IOS_DEPLOYMENT_TARGET) set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") endif() -# Hidden visibilty is required for cxx on iOS +# Hidden visibility is required for cxx on iOS set(CMAKE_C_FLAGS_INIT "${XCODE_IOS_PLATFORM_VERSION_FLAGS}") set(CMAKE_CXX_FLAGS_INIT "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -fvisibility-inlines-hidden") diff --git a/docs/Makefile b/docs/Makefile index 389a07a604e29769030bbd2e3df0d9252686487f..f462ff223032e8b44ff3f6a1429f164777596dd5 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -33,6 +33,7 @@ clean: rm -rf $(SOURCEDIR)/auto_examples/ # sphinx-gallery rm -rf $(SOURCEDIR)/gen_modules/ # sphinx-gallery rm -rf $(SOURCEDIR)/generated/ # autosummary + rm -rf $(SOURCEDIR)/models/generated # autosummary .PHONY: help Makefile docset diff --git a/docs/requirements.txt b/docs/requirements.txt index 09a11359ae758854b303403738188b8e3e934336..2a50d9b8f45c672a59ebd81a430d8674682eb498 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,3 +5,4 @@ sphinx-gallery>=0.11.1 sphinx==5.0.0 tabulate -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +pycocotools diff --git a/docs/source/beta_status.py b/docs/source/beta_status.py index 925894df5c5787ef453e86e2eb7ce326735a3b8e..cc79ca8972f3019f615b47560a5c02d25ea0e160 100644 --- a/docs/source/beta_status.py +++ b/docs/source/beta_status.py @@ -4,15 +4,26 @@ from docutils.parsers.rst import Directive class BetaStatus(Directive): has_content = True + text = "The {api_name} is in Beta stage, and backward compatibility is not guaranteed." + node = nodes.warning def run(self): - api_name = " ".join(self.content) - text = f"The {api_name} is in Beta stage, and backward compatibility is not guaranteed." - return [nodes.warning("", nodes.paragraph("", "", nodes.Text(text)))] + text = self.text.format(api_name=" ".join(self.content)) + return [self.node("", nodes.paragraph("", "", nodes.Text(text)))] + + +class V2BetaStatus(BetaStatus): + text = ( + "The {api_name} is in Beta stage, and while we do not expect disruptive breaking changes, " + "some APIs may slightly change according to user feedback. Please submit any feedback you may have " + "in this issue: https://github.com/pytorch/vision/issues/6753." + ) + node = nodes.note def setup(app): app.add_directive("betastatus", BetaStatus) + app.add_directive("v2betastatus", V2BetaStatus) return { "version": "0.1", "parallel_read_safe": True, diff --git a/docs/source/conf.py b/docs/source/conf.py index 231d3cad416dcd8189121ee94fc931cb9f135816..cd3a28658cbd721d3135aa3971dd4654fd278891 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -29,6 +29,7 @@ from pathlib import Path import pytorch_sphinx_theme import torchvision import torchvision.models as M +from sphinx_gallery.sorting import ExplicitOrder from tabulate import tabulate sys.path.append(os.path.abspath(".")) @@ -55,11 +56,65 @@ extensions = [ "beta_status", ] +# We override sphinx-gallery's example header to prevent sphinx-gallery from +# creating a note at the top of the renderred notebook. +# https://github.com/sphinx-gallery/sphinx-gallery/blob/451ccba1007cc523f39cbcc960ebc21ca39f7b75/sphinx_gallery/gen_rst.py#L1267-L1271 +# This is because we also want to add a link to google collab, so we write our own note in each example. +from sphinx_gallery import gen_rst + +gen_rst.EXAMPLE_HEADER = """ +.. DO NOT EDIT. +.. 
THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "{0}" +.. LINE NUMBERS ARE GIVEN BELOW. + +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_{1}: + +""" + + +class CustomGalleryExampleSortKey: + # See https://sphinx-gallery.github.io/stable/configuration.html#sorting-gallery-examples + # and https://github.com/sphinx-gallery/sphinx-gallery/blob/master/sphinx_gallery/sorting.py + def __init__(self, src_dir): + self.src_dir = src_dir + + transforms_subsection_order = [ + "plot_transforms_getting_started.py", + "plot_transforms_illustrations.py", + "plot_transforms_e2e.py", + "plot_cutmix_mixup.py", + "plot_custom_transforms.py", + "plot_tv_tensors.py", + "plot_custom_tv_tensors.py", + ] + + def __call__(self, filename): + if "gallery/transforms" in self.src_dir: + try: + return self.transforms_subsection_order.index(filename) + except ValueError as e: + raise ValueError( + "Looks like you added an example in gallery/transforms? " + "You need to specify its order in docs/source/conf.py. Look for CustomGalleryExampleSortKey." + ) from e + else: + # For other subsections we just sort alphabetically by filename + return filename + + sphinx_gallery_conf = { "examples_dirs": "../../gallery/", # path to your example scripts "gallery_dirs": "auto_examples", # path to where to save gallery generated output + "subsection_order": ExplicitOrder(["../../gallery/transforms", "../../gallery/others"]), "backreferences_dir": "gen_modules/backreferences", "doc_module": ("torchvision",), + "remove_config_comments": True, + "ignore_pattern": "helpers.py", + "within_subsection_order": CustomGalleryExampleSortKey, } napoleon_use_ivar = True @@ -88,17 +143,15 @@ author = "Torch Contributors" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. -# -# The short X.Y version. -version = "main (" + torchvision.__version__ + " )" -# The full version, including alpha/beta/rc tags. -release = "main" -VERSION = os.environ.get("VERSION", None) -if VERSION: +# version: The short X.Y version. +# release: The full version, including alpha/beta/rc tags. +if os.environ.get("TORCHVISION_SANITIZE_VERSION_STR_IN_DOCS", None): # Turn 1.11.0aHASH into 1.11 (major.minor only) - version = ".".join(version.split(".")[:2]) + version = release = ".".join(torchvision.__version__.split(".")[:2]) html_title = " ".join((project, version, "documentation")) - release = version +else: + version = f"main ({torchvision.__version__})" + release = "main" # The language for content autogenerated by Sphinx. Refer to documentation @@ -138,7 +191,7 @@ html_theme_options = { "logo_only": True, "pytorch_project": "docs", "navigation_with_keys": True, - "analytics_id": "UA-117752657-2", + "analytics_id": "GTM-T8XT4PS", } html_logo = "_static/img/pytorch-logo-dark.svg" @@ -318,7 +371,7 @@ def inject_weight_metadata(app, what, name, obj, options, lines): used within the autoclass directive. """ - if obj.__name__.endswith(("_Weights", "_QuantizedWeights")): + if getattr(obj, ".__name__", "").endswith(("_Weights", "_QuantizedWeights")): if len(obj) == 0: lines[:] = ["There are no available pre-trained weights."] @@ -331,7 +384,7 @@ def inject_weight_metadata(app, what, name, obj, options, lines): ] if obj.__doc__ != "An enumeration.": - # We only show the custom enum doc if it was overriden. 
The default one from Python is "An enumeration" + # We only show the custom enum doc if it was overridden. The default one from Python is "An enumeration" lines.append("") lines.append(obj.__doc__) @@ -362,6 +415,13 @@ def inject_weight_metadata(app, what, name, obj, options, lines): max_visible = 3 v_sample = ", ".join(v[:max_visible]) v = f"{v_sample}, ... ({len(v)-max_visible} omitted)" if len(v) > max_visible else v_sample + elif k == "_ops": + v = f"{v:.2f}" + k = "GIPS" if obj.__name__.endswith("_QuantizedWeights") else "GFLOPS" + elif k == "_file_size": + k = "File size" + v = f"{v:.1f} MB" + table.append((str(k), str(v))) table = tabulate(table, tablefmt="rst") lines += [".. rst-class:: table-weights"] # Custom CSS class, see custom_torchvision.css @@ -385,19 +445,27 @@ def generate_weights_table(module, table_name, metrics, dataset, include_pattern if exclude_patterns is not None: weights = [w for w in weights if all(p not in str(w) for p in exclude_patterns)] + ops_name = "GIPS" if "QuantizedWeights" in weights_endswith else "GFLOPS" + metrics_keys, metrics_names = zip(*metrics) - column_names = ["Weight"] + list(metrics_names) + ["Params", "Recipe"] + column_names = ["Weight"] + list(metrics_names) + ["Params"] + [ops_name, "Recipe"] # Final column order column_names = [f"**{name}**" for name in column_names] # Add bold - content = [ - ( + content = [] + for w in weights: + row = [ f":class:`{w} <{type(w).__name__}>`", *(w.meta["_metrics"][dataset][metric] for metric in metrics_keys), f"{w.meta['num_params']/1e6:.1f}M", + f"{w.meta['_ops']:.2f}", f"`link <{w.meta['recipe']}>`__", - ) - for w in weights - ] + ] + + content.append(row) + + column_widths = ["110"] + ["18"] * len(metrics_names) + ["18"] * 2 + ["10"] + widths_table = " ".join(column_widths) + table = tabulate(content, headers=column_names, tablefmt="rst") generated_dir = Path("generated") @@ -405,7 +473,7 @@ def generate_weights_table(module, table_name, metrics, dataset, include_pattern with open(generated_dir / f"{table_name}_table.rst", "w+") as table_file: table_file.write(".. rst-class:: table-weights\n") # Custom CSS class, see custom_torchvision.css table_file.write(".. table::\n") - table_file.write(f" :widths: 100 {'20 ' * len(metrics_names)} 20 10\n\n") + table_file.write(f" :widths: {widths_table} \n\n") table_file.write(f"{textwrap.indent(table, ' ' * 4)}\n\n") diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index f3f0b466d622be175cb9efc1688ca621bfc01d1b..588c1f781edbe8c5ef63d75bd20178e27266b9c3 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -1,3 +1,5 @@ +.. _datasets: + Datasets ======== @@ -80,7 +82,6 @@ Image detection or segmentation CocoDetection CelebA Cityscapes - GTSRB Kitti OxfordIIITPet SBDataset @@ -149,6 +150,14 @@ Video classification Kinetics UCF101 +Video prediction +~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + :template: class_dataset.rst + + MovingMNIST .. _base_classes_datasets: @@ -162,3 +171,12 @@ Base classes for custom datasets DatasetFolder ImageFolder VisionDataset + +Transforms v2 +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + wrap_dataset_for_transforms_v2 diff --git a/docs/source/index.rst b/docs/source/index.rst index 79dbebdd047d626eb8ec10347b6a226cbf564a08..dc5fdefaefb032ea7db7eb0e478d23bb7a7e37d8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -32,6 +32,7 @@ architectures, and common image transformations for computer vision. 
:caption: Package Reference transforms + tv_tensors models datasets utils diff --git a/docs/source/io.rst b/docs/source/io.rst index 258a1ee16dcaccd582543bd1fe2ae70aeed55fbf..1da9bb6882a9fbc4b91b9ce787de42164c825b7d 100644 --- a/docs/source/io.rst +++ b/docs/source/io.rst @@ -1,11 +1,37 @@ -Reading/Writing images and videos -================================= +Decoding / Encoding images and videos +===================================== .. currentmodule:: torchvision.io The :mod:`torchvision.io` package provides functions for performing IO -operations. They are currently specific to reading and writing video and -images. +operations. They are currently specific to reading and writing images and +videos. + +Images +------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + read_image + decode_image + encode_jpeg + decode_jpeg + write_jpeg + encode_png + decode_png + write_png + read_file + write_file + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + ImageReadMode + + Video ----- @@ -20,7 +46,7 @@ Video Fine-grained video API ----------------------- +^^^^^^^^^^^^^^^^^^^^^^ In addition to the :mod:`read_video` function, we provide a high-performance lower-level API for more fine-grained control compared to the :mod:`read_video` function. @@ -61,28 +87,3 @@ Example of inspecting a video: # the constructor we select a default video stream, but # in practice, we can set whichever stream we would like video.set_current_stream("video:0") - - -Image ------ - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - ImageReadMode - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - read_image - decode_image - encode_jpeg - decode_jpeg - write_jpeg - encode_png - decode_png - write_png - read_file - write_file diff --git a/docs/source/models.rst b/docs/source/models.rst index 10618434f9bf190255a89a3902a55c80272b9f00..155407786025401414e87c1f58e096e470c6807d 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -120,13 +120,12 @@ behavior, such as batch normalization. To switch between these modes, use # Set model to eval mode model.eval() -Model Registration Mechanism ----------------------------- - -.. betastatus:: registration mechanism +Listing and retrieving available models +--------------------------------------- -As of v0.14, TorchVision offers a new model registration mechanism which allows retreaving models -and weights by their names. Here are a few examples on how to use them: +As of v0.14, TorchVision offers a new mechanism which allows listing and +retrieving models and weights by their names. Here are a few examples on how to +use them: .. code:: python @@ -148,7 +147,7 @@ and weights by their names. Here are a few examples on how to use them: weights_enum2 = get_model_weights(torchvision.models.quantization.mobilenet_v3_large) assert weights_enum == weights_enum2 -Here are the available public methods of the model registration mechanism: +Here are the available public functions to retrieve models and their corresponding weights: .. currentmodule:: torchvision.models .. 
autosummary:: @@ -518,6 +517,7 @@ pre-trained weights: models/video_mvit models/video_resnet models/video_s3d + models/video_swin_transformer | diff --git a/docs/source/models/alexnet.rst b/docs/source/models/alexnet.rst index 080c241983bedf78f6b64fc19a8f5b7cecaf7e06..8e94b4eeed905983648cdefe50b29b95b4a4c41b 100644 --- a/docs/source/models/alexnet.rst +++ b/docs/source/models/alexnet.rst @@ -14,7 +14,7 @@ and is based on `One weird trick for parallelizing convolutional neural networks Model builders -------------- -The following model builders can be used to instanciate an AlexNet model, with or +The following model builders can be used to instantiate an AlexNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.alexnet.AlexNet`` base class. Please refer to the `source code diff --git a/docs/source/models/efficientnet.rst b/docs/source/models/efficientnet.rst index 4df547b3cbd3bac54e61a0270f5e60b010d227f5..cbc9718959af40e414a1a00a3cb5454305a3e16d 100644 --- a/docs/source/models/efficientnet.rst +++ b/docs/source/models/efficientnet.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate an EfficientNet model, with or +The following model builders can be used to instantiate an EfficientNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.efficientnet.EfficientNet`` base class. Please refer to the `source code diff --git a/docs/source/models/efficientnetv2.rst b/docs/source/models/efficientnetv2.rst index 05c953b13277ac3bc9a82bd98310c2b042eeff63..3066c28ebd482a128f12656c966c246bfb8f0de9 100644 --- a/docs/source/models/efficientnetv2.rst +++ b/docs/source/models/efficientnetv2.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate an EfficientNetV2 model, with or +The following model builders can be used to instantiate an EfficientNetV2 model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.efficientnet.EfficientNet`` base class. Please refer to the `source code diff --git a/docs/source/models/fcos.rst b/docs/source/models/fcos.rst index 1bcc42676784fd72c775ac1485f117602bd213c4..085f26549b8dd40899fe2d08d55064406f676c13 100644 --- a/docs/source/models/fcos.rst +++ b/docs/source/models/fcos.rst @@ -3,7 +3,7 @@ FCOS .. currentmodule:: torchvision.models.detection -The RetinaNet model is based on the `FCOS: Fully Convolutional One-Stage Object Detection +The FCOS model is based on the `FCOS: Fully Convolutional One-Stage Object Detection `__ paper. .. betastatus:: detection module @@ -12,7 +12,7 @@ Model builders -------------- The following model builders can be used to instantiate a FCOS model, with or -without pre-trained weights. All the model buidlers internally rely on the +without pre-trained weights. All the model builders internally rely on the ``torchvision.models.detection.fcos.FCOS`` base class. Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/googlenet.rst b/docs/source/models/googlenet.rst index ed4f1345e232ff36f4e960838c8c7ed7baa765d6..91ea03ddf3d48e1342f6a4be77e3344d8f635f0c 100644 --- a/docs/source/models/googlenet.rst +++ b/docs/source/models/googlenet.rst @@ -10,7 +10,7 @@ paper. 
Model builders -------------- -The following model builders can be used to instanciate a GoogLeNet model, with or +The following model builders can be used to instantiate a GoogLeNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.googlenet.GoogLeNet`` base class. Please refer to the `source code diff --git a/docs/source/models/googlenet_quant.rst b/docs/source/models/googlenet_quant.rst index acb2737b52b8d3752e692fc768899f4ece7a5c38..4358389b3e50c2c7b025a3c097fecd80af5f6306 100644 --- a/docs/source/models/googlenet_quant.rst +++ b/docs/source/models/googlenet_quant.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate a quantized GoogLeNet +The following model builders can be used to instantiate a quantized GoogLeNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.quantization.googlenet.QuantizableGoogLeNet`` base class. Please refer to the `source code diff --git a/docs/source/models/inception.rst b/docs/source/models/inception.rst index 72aa9724d4199686bca029329a7aab86cb91b7ad..e162eef5d30531bb357717186ca84a8b3cf8402b 100644 --- a/docs/source/models/inception.rst +++ b/docs/source/models/inception.rst @@ -10,7 +10,7 @@ Computer Vision `__ paper. Model builders -------------- -The following model builders can be used to instanciate an InceptionV3 model, with or +The following model builders can be used to instantiate an InceptionV3 model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.inception.Inception3`` base class. Please refer to the `source code `_ for diff --git a/docs/source/models/inception_quant.rst b/docs/source/models/inception_quant.rst index 397fd10df3c173c6a33bc0232433316fe8d46d6a..d26f1ab09da533b9c3496a5f835a540cf16f29df 100644 --- a/docs/source/models/inception_quant.rst +++ b/docs/source/models/inception_quant.rst @@ -10,7 +10,7 @@ Computer Vision `__ paper. Model builders -------------- -The following model builders can be used to instanciate a quantized Inception +The following model builders can be used to instantiate a quantized Inception model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.quantization.inception.QuantizableInception3`` base class. Please refer to the `source code diff --git a/docs/source/models/mnasnet.rst b/docs/source/models/mnasnet.rst index e31b4aca1b695073b8762e8d5a276cc3f70b7427..fd9ea5115857b0c85a2b7b949a24a99015f9374a 100644 --- a/docs/source/models/mnasnet.rst +++ b/docs/source/models/mnasnet.rst @@ -11,7 +11,7 @@ Search for Mobile `__ paper. Model builders -------------- -The following model builders can be used to instanciate an MNASNet model. +The following model builders can be used to instantiate an MNASNet model. All the model builders internally rely on the ``torchvision.models.mnasnet.MNASNet`` base class. Please refer to the `source code diff --git a/docs/source/models/retinanet.rst b/docs/source/models/retinanet.rst index 8613ae9aaab61fd931ce910bc95f7cedf7797887..910692ef3a5a91df23a4389af527c110f703bc88 100644 --- a/docs/source/models/retinanet.rst +++ b/docs/source/models/retinanet.rst @@ -12,7 +12,7 @@ Model builders -------------- The following model builders can be used to instantiate a RetinaNet model, with or -without pre-trained weights. All the model buidlers internally rely on the +without pre-trained weights. 
All the model builders internally rely on the ``torchvision.models.detection.retinanet.RetinaNet`` base class. Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/ssd.rst b/docs/source/models/ssd.rst index 7d73b234a283088f74df46ceffb540f4c5b8a169..68b0bb224df3a22466bd1cd42bbcd06183769950 100644 --- a/docs/source/models/ssd.rst +++ b/docs/source/models/ssd.rst @@ -12,7 +12,7 @@ The SSD model is based on the `SSD: Single Shot MultiBox Detector Model builders -------------- -The following model builders can be used to instanciate a SSD model, with or +The following model builders can be used to instantiate a SSD model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.detection.SSD`` base class. Please refer to the `source code diff --git a/docs/source/models/swin_transformer.rst b/docs/source/models/swin_transformer.rst index 35b529879541ede53bb16776e0de957038b109a6..b302f5bd79d390e658d7614de8d15471f8d1bb6e 100644 --- a/docs/source/models/swin_transformer.rst +++ b/docs/source/models/swin_transformer.rst @@ -15,7 +15,7 @@ Model builders -------------- The following model builders can be used to instantiate an SwinTransformer model (original and V2) with and without pre-trained weights. -All the model builders internally rely on the ``torchvision.models.swin_transformer.SwinTransformer`` +All the model builders internally rely on the ``torchvision.models.swin_transformer.SwinTransformer`` base class. Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/vgg.rst b/docs/source/models/vgg.rst index a9fa9aabfb10bc131ba680c639dcab89b6420524..77b5686927c99c39075fab6d0a9f9c24de491134 100644 --- a/docs/source/models/vgg.rst +++ b/docs/source/models/vgg.rst @@ -11,7 +11,7 @@ Model builders -------------- The following model builders can be used to instantiate a VGG model, with or -without pre-trained weights. All the model buidlers internally rely on the +without pre-trained weights. All the model builders internally rely on the ``torchvision.models.vgg.VGG`` base class. Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/video_swin_transformer.rst b/docs/source/models/video_swin_transformer.rst new file mode 100644 index 0000000000000000000000000000000000000000..e31e69759b45681e5619ab1befe9a5bc4bc2cdf6 --- /dev/null +++ b/docs/source/models/video_swin_transformer.rst @@ -0,0 +1,27 @@ +Video SwinTransformer +===================== + +.. currentmodule:: torchvision.models.video + +The Video SwinTransformer model is based on the `Video Swin Transformer `__ paper. + +.. betastatus:: video module + + +Model builders +-------------- + +The following model builders can be used to instantiate a VideoResNet model, with or +without pre-trained weights. All the model builders internally rely on the +``torchvision.models.video.swin_transformer.SwinTransformer3d`` base class. Please refer to the `source +code +`_ for +more details about this class. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + swin3d_t + swin3d_s + swin3d_b diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 5909b68966ba8c7fec477c4d84c3ced4f9c8d08a..2aa1fc5ba1ebdc8e8c199098686c1d3661d95669 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -5,123 +5,549 @@ Transforming and augmenting images .. 
currentmodule:: torchvision.transforms -Transforms are common image transformations available in the -``torchvision.transforms`` module. They can be chained together using -:class:`Compose`. -Most transform classes have a function equivalent: :ref:`functional -transforms ` give fine-grained control over the -transformations. -This is useful if you have to build a more complex transformation pipeline -(e.g. in the case of segmentation tasks). - -Most transformations accept both `PIL `_ -images and tensor images, although some transformations are :ref:`PIL-only -` and some are :ref:`tensor-only -`. The :ref:`conversion_transforms` may be used to -convert to and from PIL images. - -The transformations that accept tensor images also accept batches of tensor -images. A Tensor Image is a tensor with ``(C, H, W)`` shape, where ``C`` is a -number of channels, ``H`` and ``W`` are image height and width. A batch of -Tensor Images is a tensor of ``(B, C, H, W)`` shape, where ``B`` is a number -of images in the batch. +Torchvision supports common computer vision transformations in the +``torchvision.transforms`` and ``torchvision.transforms.v2`` modules. Transforms +can be used to transform or augment data for training or inference of different +tasks (image classification, detection, segmentation, video classification). + +.. code:: python + + # Image Classification + import torch + from torchvision.transforms import v2 + + H, W = 32, 32 + img = torch.randint(0, 256, size=(3, H, W), dtype=torch.uint8) + + transforms = v2.Compose([ + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomHorizontalFlip(p=0.5), + v2.ToDtype(torch.float32, scale=True), + v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + img = transforms(img) + +.. code:: python + + # Detection (re-using imports and transforms from above) + from torchvision import tv_tensors + + img = torch.randint(0, 256, size=(3, H, W), dtype=torch.uint8) + boxes = torch.randint(0, H // 2, size=(3, 4)) + boxes[:, 2:] += boxes[:, :2] + boxes = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=(H, W)) + + # The same transforms can be used! + img, boxes = transforms(img, boxes) + # And you can pass arbitrary input structures + output_dict = transforms({"image": img, "boxes": boxes}) + +Transforms are typically passed as the ``transform`` or ``transforms`` argument +to the :ref:`Datasets `. + +Start here +---------- + +Whether you're new to Torchvision transforms, or you're already experienced with +them, we encourage you to start with +:ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py` in +order to learn more about what can be done with the new v2 transforms. + +Then, browse the sections in below this page for general information and +performance tips. The available transforms and functionals are listed in the +:ref:`API reference `. + +More information and tutorials can also be found in our :ref:`example gallery +`, e.g. :ref:`sphx_glr_auto_examples_transforms_plot_transforms_e2e.py` +or :ref:`sphx_glr_auto_examples_transforms_plot_custom_transforms.py`. + +.. _conventions: + +Supported input types and conventions +------------------------------------- + +Most transformations accept both `PIL `_ images +and tensor inputs. Both CPU and CUDA tensors are supported. +The result of both backends (PIL or Tensors) should be very +close. In general, we recommend relying on the tensor backend :ref:`for +performance `. 
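To illustrate, the same transform instance can be applied to either backend. A
minimal sketch (the 128×128 ``uint8`` image and the variable names below are
only illustrative):

.. code:: python

    import torch
    from torchvision.transforms import v2
    from torchvision.transforms.v2 import functional as F

    resize = v2.Resize(size=(64, 64), antialias=True)

    # Arbitrary example image: a random 3-channel uint8 tensor.
    tensor_img = torch.randint(0, 256, size=(3, 128, 128), dtype=torch.uint8)
    pil_img = F.to_pil_image(tensor_img)  # same content, PIL backend

    out_tensor = resize(tensor_img)  # uint8 tensor of shape (3, 64, 64)
    out_pil = resize(pil_img)        # PIL.Image.Image of size 64x64

Both outputs contain (nearly) the same pixels; the tensor path is generally the
faster one.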
The :ref:`conversion transforms +` may be used to convert to and from PIL images, or for +converting dtypes and ranges. + +Tensor image are expected to be of shape ``(C, H, W)``, where ``C`` is the +number of channels, and ``H`` and ``W`` refer to height and width. Most +transforms support batched tensor input. A batch of Tensor images is a tensor of +shape ``(N, C, H, W)``, where ``N`` is a number of images in the batch. The +:ref:`v2 ` transforms generally accept an arbitrary number of leading +dimensions ``(..., C, H, W)`` and can handle batched images or batched videos. + +.. _range_and_dtype: + +Dtype and expected value range +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The expected range of the values of a tensor image is implicitly defined by the tensor dtype. Tensor images with a float dtype are expected to have -values in ``[0, 1)``. Tensor images with an integer dtype are expected to +values in ``[0, 1]``. Tensor images with an integer dtype are expected to have values in ``[0, MAX_DTYPE]`` where ``MAX_DTYPE`` is the largest value -that can be represented in that dtype. +that can be represented in that dtype. Typically, images of dtype +``torch.uint8`` are expected to have values in ``[0, 255]``. -Randomized transformations will apply the same transformation to all the -images of a given batch, but they will produce different transformations -across calls. For reproducible transformations across calls, you may use -:ref:`functional transforms `. +Use :class:`~torchvision.transforms.v2.ToDtype` to convert both the dtype and +range of the inputs. -The following examples illustrate the use of the available transforms: +.. _v1_or_v2: - * :ref:`sphx_glr_auto_examples_plot_transforms.py` +V1 or V2? Which one should I use? +--------------------------------- - .. figure:: ../source/auto_examples/images/sphx_glr_plot_transforms_001.png - :align: center - :scale: 65% +**TL;DR** We recommending using the ``torchvision.transforms.v2`` transforms +instead of those in ``torchvision.transforms``. They're faster and they can do +more things. Just change the import and you should be good to go. + +In Torchvision 0.15 (March 2023), we released a new set of transforms available +in the ``torchvision.transforms.v2`` namespace. These transforms have a lot of +advantages compared to the v1 ones (in ``torchvision.transforms``): + +- They can transform images **but also** bounding boxes, masks, or videos. This + provides support for tasks beyond image classification: detection, segmentation, + video classification, etc. See + :ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py` + and :ref:`sphx_glr_auto_examples_transforms_plot_transforms_e2e.py`. +- They support more transforms like :class:`~torchvision.transforms.v2.CutMix` + and :class:`~torchvision.transforms.v2.MixUp`. See + :ref:`sphx_glr_auto_examples_transforms_plot_cutmix_mixup.py`. +- They're :ref:`faster `. +- They support arbitrary input structures (dicts, lists, tuples, etc.). +- Future improvements and features will be added to the v2 transforms only. + +These transforms are **fully backward compatible** with the v1 ones, so if +you're already using tranforms from ``torchvision.transforms``, all you need to +do to is to update the import to ``torchvision.transforms.v2``. In terms of +output, there might be negligible differences due to implementation differences. + +.. note:: + + The v2 transforms are still BETA, but at this point we do not expect + disruptive changes to be made to their public APIs. 
We're planning to make + them fully stable in version 0.17. Please submit any feedback you may have + `here `_. + +.. _transforms_perf: + +Performance considerations +-------------------------- - * :ref:`sphx_glr_auto_examples_plot_scripted_tensor_transforms.py` +We recommend the following guidelines to get the best performance out of the +transforms: - .. figure:: ../source/auto_examples/images/sphx_glr_plot_scripted_tensor_transforms_001.png - :align: center - :scale: 30% +- Rely on the v2 transforms from ``torchvision.transforms.v2`` +- Use tensors instead of PIL images +- Use ``torch.uint8`` dtype, especially for resizing +- Resize with bilinear or bicubic mode -.. warning:: +This is what a typical transform pipeline could look like: - Since v0.8.0 all random transformations are using torch default random generator to sample random parameters. - It is a backward compatibility breaking change and user should set the random state as following: +.. code:: python + + from torchvision.transforms import v2 + transforms = v2.Compose([ + v2.ToImage(), # Convert to tensor, only needed if you had a PIL image + v2.ToDtype(torch.uint8, scale=True), # optional, most input are already uint8 at this point + # ... + v2.RandomResizedCrop(size=(224, 224), antialias=True), # Or Resize(antialias=True) + # ... + v2.ToDtype(torch.float32, scale=True), # Normalize expects float input + v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + +The above should give you the best performance in a typical training environment +that relies on the :class:`torch.utils.data.DataLoader` with ``num_workers > +0``. + +Transforms tend to be sensitive to the input strides / memory format. Some +transforms will be faster with channels-first images while others prefer +channels-last. Like ``torch`` operators, most transforms will preserve the +memory format of the input, but this may not always be respected due to +implementation details. You may want to experiment a bit if you're chasing the +very best performance. Using :func:`torch.compile` on individual transforms may +also help factoring out the memory format variable (e.g. on +:class:`~torchvision.transforms.v2.Normalize`). Note that we're talking about +**memory format**, not :ref:`tensor shape `. + +Note that resize transforms like :class:`~torchvision.transforms.v2.Resize` +and :class:`~torchvision.transforms.v2.RandomResizedCrop` typically prefer +channels-last input and tend **not** to benefit from :func:`torch.compile` at +this time. - .. code:: python +.. _functional_transforms: - # Previous versions - # import random - # random.seed(12) +Transform classes, functionals, and kernels +------------------------------------------- - # Now - import torch - torch.manual_seed(17) +Transforms are available as classes like +:class:`~torchvision.transforms.v2.Resize`, but also as functionals like +:func:`~torchvision.transforms.v2.functional.resize` in the +``torchvision.transforms.v2.functional`` namespace. +This is very much like the :mod:`torch.nn` package which defines both classes +and functional equivalents in :mod:`torch.nn.functional`. - Please, keep in mind that the same seed for torch random generator and Python random generator will not - produce the same results. +The functionals support PIL images, pure tensors, or :ref:`TVTensors +`, e.g. both ``resize(image_tensor)`` and ``resize(boxes)`` are +valid. +.. 
note:: -Scriptable transforms ---------------------- + Random transforms like :class:`~torchvision.transforms.v2.RandomCrop` will + randomly sample some parameter each time they're called. Their functional + counterpart (:func:`~torchvision.transforms.v2.functional.crop`) does not do + any kind of random sampling and thus have a slighlty different + parametrization. The ``get_params()`` class method of the transforms class + can be used to perform parameter sampling when using the functional APIs. -In order to script the transformations, please use ``torch.nn.Sequential`` instead of :class:`Compose`. + +The ``torchvision.transforms.v2.functional`` namespace also contains what we +call the "kernels". These are the low-level functions that implement the +core functionalities for specific types, e.g. ``resize_bounding_boxes`` or +```resized_crop_mask``. They are public, although not documented. Check the +`code +`_ +to see which ones are available (note that those starting with a leading +underscore are **not** public!). Kernels are only really useful if you want +:ref:`torchscript support ` for types like bounding +boxes or masks. + +.. _transforms_torchscript: + +Torchscript support +------------------- + +Most transform classes and functionals support torchscript. For composing +transforms, use :class:`torch.nn.Sequential` instead of +:class:`~torchvision.transforms.v2.Compose`: .. code:: python transforms = torch.nn.Sequential( - transforms.CenterCrop(10), - transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + CenterCrop(10), + Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), ) scripted_transforms = torch.jit.script(transforms) -Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor`` and does not require -`lambda` functions or ``PIL.Image``. +.. warning:: -For any custom transformations to be used with ``torch.jit.script``, they should be derived from ``torch.nn.Module``. + v2 transforms support torchscript, but if you call ``torch.jit.script()`` on + a v2 **class** transform, you'll actually end up with its (scripted) v1 + equivalent. This may lead to slightly different results between the + scripted and eager executions due to implementation differences between v1 + and v2. + If you really need torchscript support for the v2 transforms, we recommend + scripting the **functionals** from the + ``torchvision.transforms.v2.functional`` namespace to avoid surprises. -Compositions of transforms --------------------------- + +Also note that the functionals only support torchscript for pure tensors, which +are always treated as images. If you need torchscript support for other types +like bounding boxes or masks, you can rely on the :ref:`low-level kernels +`. + +For any custom transformations to be used with ``torch.jit.script``, they should +be derived from ``torch.nn.Module``. + +See also: :ref:`sphx_glr_auto_examples_others_plot_scripted_tensor_transforms.py`. + +.. _v2_api_ref: + +V2 API reference - Recommended +------------------------------ + +Geometry +^^^^^^^^ + +Resizing +"""""""" .. autosummary:: :toctree: generated/ :template: class.rst - Compose + v2.Resize + v2.ScaleJitter + v2.RandomShortestSize + v2.RandomResize + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.resize + +Cropping +"""""""" + +.. 
autosummary:: + :toctree: generated/ + :template: class.rst + + v2.RandomCrop + v2.RandomResizedCrop + v2.RandomIoUCrop + v2.CenterCrop + v2.FiveCrop + v2.TenCrop + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.crop + v2.functional.resized_crop + v2.functional.ten_crop + v2.functional.center_crop + v2.functional.five_crop + +Others +"""""" + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.RandomHorizontalFlip + v2.RandomVerticalFlip + v2.Pad + v2.RandomZoomOut + v2.RandomRotation + v2.RandomAffine + v2.RandomPerspective + v2.ElasticTransform + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.horizontal_flip + v2.functional.vertical_flip + v2.functional.pad + v2.functional.rotate + v2.functional.affine + v2.functional.perspective + v2.functional.elastic + +Color +^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.ColorJitter + v2.RandomChannelPermutation + v2.RandomPhotometricDistort + v2.Grayscale + v2.RandomGrayscale + v2.GaussianBlur + v2.RandomInvert + v2.RandomPosterize + v2.RandomSolarize + v2.RandomAdjustSharpness + v2.RandomAutocontrast + v2.RandomEqualize + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.permute_channels + v2.functional.rgb_to_grayscale + v2.functional.to_grayscale + v2.functional.gaussian_blur + v2.functional.invert + v2.functional.posterize + v2.functional.solarize + v2.functional.adjust_sharpness + v2.functional.autocontrast + v2.functional.adjust_contrast + v2.functional.equalize + v2.functional.adjust_brightness + v2.functional.adjust_saturation + v2.functional.adjust_hue + v2.functional.adjust_gamma + + +Composition +^^^^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.Compose + v2.RandomApply + v2.RandomChoice + v2.RandomOrder + +Miscellaneous +^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.LinearTransformation + v2.Normalize + v2.RandomErasing + v2.Lambda + v2.SanitizeBoundingBoxes + v2.ClampBoundingBoxes + v2.UniformTemporalSubsample + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.normalize + v2.functional.erase + v2.functional.clamp_bounding_boxes + v2.functional.uniform_temporal_subsample + +.. _conversion_transforms: + +Conversion +^^^^^^^^^^ + +.. note:: + Beware, some of these conversion transforms below will scale the values + while performing the conversion, while some may not do any scaling. By + scaling, we mean e.g. that a ``uint8`` -> ``float32`` would map the [0, + 255] range into [0, 1] (and vice-versa). See :ref:`range_and_dtype`. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.ToImage + v2.ToPureTensor + v2.PILToTensor + v2.ToPILImage + v2.ToDtype + v2.ConvertBoundingBoxFormat + +functionals + +.. autosummary:: + :toctree: generated/ + :template: functional.rst + + v2.functional.to_image + v2.functional.pil_to_tensor + v2.functional.to_pil_image + v2.functional.to_dtype + v2.functional.convert_bounding_box_format + + +Deprecated + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.ToTensor + v2.functional.to_tensor + v2.ConvertImageDtype + v2.functional.convert_image_dtype + +Auto-Augmentation +^^^^^^^^^^^^^^^^^ + +`AutoAugment `_ is a common Data Augmentation technique that can improve the accuracy of Image Classification models. 
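A policy can be dropped into a pipeline like any other v2 transform. A minimal
sketch (the 224×224 ``uint8`` image is arbitrary; ``AutoAugment()`` defaults to
the ImageNet policy):

.. code:: python

    import torch
    from torchvision.transforms import v2

    augment = v2.Compose([
        v2.AutoAugment(),  # ImageNet policy by default
        v2.ToDtype(torch.float32, scale=True),
    ])
    img = torch.randint(0, 256, size=(3, 224, 224), dtype=torch.uint8)
    augmented = augment(img)  # a float32 tensor of the same shape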
+Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that +ImageNet policies provide significant improvements when applied to other datasets. +In TorchVision we implemented 3 policies learned on the following datasets: ImageNet, CIFAR10 and SVHN. +The new transform can be used standalone or mixed-and-matched with existing transforms: + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.AutoAugment + v2.RandAugment + v2.TrivialAugmentWide + v2.AugMix + +CutMix - MixUp +^^^^^^^^^^^^^^ -Transforms on PIL Image and torch.\*Tensor ------------------------------------------- +CutMix and MixUp are special transforms that +are meant to be used on batches rather than on individual images, because they +are combining pairs of images together. These can be used after the dataloader +(once the samples are batched), or part of a collation function. See +:ref:`sphx_glr_auto_examples_transforms_plot_cutmix_mixup.py` for detailed usage examples. .. autosummary:: :toctree: generated/ :template: class.rst + v2.CutMix + v2.MixUp + +Developer tools +^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.register_kernel + + +V1 API Reference +---------------- + +Geometry +^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Resize + RandomCrop + RandomResizedCrop CenterCrop - ColorJitter FiveCrop - Grayscale + TenCrop Pad + RandomRotation RandomAffine - RandomApply - RandomCrop - RandomGrayscale - RandomHorizontalFlip RandomPerspective - RandomResizedCrop - RandomRotation + ElasticTransform + RandomHorizontalFlip RandomVerticalFlip - Resize - TenCrop + + +Color +^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + ColorJitter + Grayscale + RandomGrayscale GaussianBlur RandomInvert RandomPosterize @@ -130,23 +556,20 @@ Transforms on PIL Image and torch.\*Tensor RandomAutocontrast RandomEqualize - -.. _transforms_pil_only: - -Transforms on PIL Image only ----------------------------- +Composition +^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :template: class.rst + Compose + RandomApply RandomChoice RandomOrder -.. _transforms_tensor_only: - -Transforms on torch.\*Tensor only ---------------------------------- +Miscellaneous +^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ @@ -155,13 +578,17 @@ Transforms on torch.\*Tensor only LinearTransformation Normalize RandomErasing - ConvertImageDtype - -.. _conversion_transforms: + Lambda -Conversion Transforms ---------------------- +Conversion +^^^^^^^^^^ +.. note:: + Beware, some of these conversion transforms below will scale the values + while performing the conversion, while some may not do any scaling. By + scaling, we mean e.g. that a ``uint8`` -> ``float32`` would map the [0, + 255] range into [0, 1] (and vice-versa). See :ref:`range_and_dtype`. + .. autosummary:: :toctree: generated/ :template: class.rst @@ -169,20 +596,10 @@ Conversion Transforms ToPILImage ToTensor PILToTensor + ConvertImageDtype - -Generic Transforms ------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - Lambda - - -Automatic Augmentation Transforms ---------------------------------- +Auto-Augmentation +^^^^^^^^^^^^^^^^^ `AutoAugment `_ is a common Data Augmentation technique that can improve the accuracy of Image Classification models. 
Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that @@ -200,57 +617,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran TrivialAugmentWide AugMix -.. _functional_transforms: + Functional Transforms ---------------------- +^^^^^^^^^^^^^^^^^^^^^ .. currentmodule:: torchvision.transforms.functional -Functional transforms give you fine-grained control of the transformation pipeline. -As opposed to the transformations above, functional transforms don't contain a random number -generator for their parameters. -That means you have to specify/generate all parameters, but the functional transform will give you -reproducible results across calls. - -Example: -you can apply a functional transform with the same parameters to multiple images like this: - -.. code:: python - - import torchvision.transforms.functional as TF - import random - - def my_segmentation_transforms(image, segmentation): - if random.random() > 0.5: - angle = random.randint(-30, 30) - image = TF.rotate(image, angle) - segmentation = TF.rotate(segmentation, angle) - # more transforms ... - return image, segmentation - - -Example: -you can use a functional transform to build transform classes with custom behavior: - -.. code:: python - - import torchvision.transforms.functional as TF - import random - - class MyRotationTransform: - """Rotate by one of the given angles.""" - - def __init__(self, angles): - self.angles = angles - - def __call__(self, x): - angle = random.choice(self.angles) - return TF.rotate(x, angle) - - rotation_transform = MyRotationTransform(angles=[-30, -15, 0, 15, 30]) - - .. autosummary:: :toctree: generated/ :template: function.rst diff --git a/docs/source/tv_tensors.rst b/docs/source/tv_tensors.rst new file mode 100644 index 0000000000000000000000000000000000000000..cb8a3c45fa9ca2c53754a110570a9bd0dab4d7ca --- /dev/null +++ b/docs/source/tv_tensors.rst @@ -0,0 +1,29 @@ +.. _tv_tensors: + +TVTensors +========== + +.. currentmodule:: torchvision.tv_tensors + +TVTensors are :class:`torch.Tensor` subclasses which the v2 :ref:`transforms +` use under the hood to dispatch their inputs to the appropriate +lower-level kernels. Most users do not need to manipulate TVTensors directly. + +Refer to +:ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py` for +an introduction to TVTensors, or +:ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py` for more advanced +info. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Image + Video + BoundingBoxFormat + BoundingBoxes + Mask + TVTensor + set_return_type + wrap diff --git a/docs/source/utils.rst b/docs/source/utils.rst index 276f730c2940fa778d2064c666d24f38e01e0699..cda04de900ad8f43a9ac855631d3c67f0a149c69 100644 --- a/docs/source/utils.rst +++ b/docs/source/utils.rst @@ -4,7 +4,7 @@ Utils ===== The ``torchvision.utils`` module contains various utilities, mostly :ref:`for -vizualization `. +visualization `. .. currentmodule:: torchvision.utils diff --git a/examples/cpp/hello_world/CMakeLists.txt b/examples/cpp/hello_world/CMakeLists.txt index 3ca59e4c199e4e557ae3c3dfc414686507ded01d..7d49178b8b339ed02739adec1fbb3c8ec64882b2 100644 --- a/examples/cpp/hello_world/CMakeLists.txt +++ b/examples/cpp/hello_world/CMakeLists.txt @@ -17,4 +17,4 @@ add_executable(hello-world main.cpp) # which also adds all the necessary torch dependencies. 
target_compile_features(hello-world PUBLIC cxx_range_for) target_link_libraries(hello-world TorchVision::TorchVision) -set_property(TARGET hello-world PROPERTY CXX_STANDARD 14) +set_property(TARGET hello-world PROPERTY CXX_STANDARD 17) diff --git a/gallery/README.rst b/gallery/README.rst index 868afe743518523fe73b029e6250291518304376..8dfea35527640aea39b4659d3bd2b3873d1ad708 100644 --- a/gallery/README.rst +++ b/gallery/README.rst @@ -1,4 +1,4 @@ -Example gallery -=============== +.. _gallery: -Below is a gallery of examples +Examples and tutorials +====================== diff --git a/gallery/assets/coco/images/000000000001.jpg b/gallery/assets/coco/images/000000000001.jpg new file mode 120000 index 0000000000000000000000000000000000000000..9be80c7c27300ce0b8fe589a9e41b13fef33c2b8 --- /dev/null +++ b/gallery/assets/coco/images/000000000001.jpg @@ -0,0 +1 @@ +../../astronaut.jpg \ No newline at end of file diff --git a/gallery/assets/coco/images/000000000002.jpg b/gallery/assets/coco/images/000000000002.jpg new file mode 120000 index 0000000000000000000000000000000000000000..9f8efef9928aec7e07a66a3581f1d09c2184393e --- /dev/null +++ b/gallery/assets/coco/images/000000000002.jpg @@ -0,0 +1 @@ +../../dog2.jpg \ No newline at end of file diff --git a/gallery/assets/coco/instances.json b/gallery/assets/coco/instances.json new file mode 100644 index 0000000000000000000000000000000000000000..fe0e09270bfba4390db27fd796fbc943c2c76362 --- /dev/null +++ b/gallery/assets/coco/instances.json @@ -0,0 +1 @@ +{"images": [{"file_name": "000000000001.jpg", "height": 512, "width": 512, "id": 1}, {"file_name": "000000000002.jpg", "height": 500, "width": 500, "id": 2}], "annotations": [{"segmentation": [[40.0, 511.0, 26.0, 487.0, 28.0, 438.0, 17.0, 397.0, 24.0, 346.0, 38.0, 306.0, 61.0, 250.0, 111.0, 206.0, 111.0, 187.0, 120.0, 183.0, 136.0, 159.0, 159.0, 150.0, 181.0, 148.0, 182.0, 132.0, 175.0, 132.0, 168.0, 120.0, 154.0, 102.0, 153.0, 62.0, 188.0, 35.0, 191.0, 29.0, 208.0, 20.0, 210.0, 22.0, 227.0, 16.0, 240.0, 16.0, 276.0, 31.0, 285.0, 39.0, 301.0, 88.0, 297.0, 108.0, 281.0, 128.0, 273.0, 138.0, 266.0, 138.0, 264.0, 153.0, 257.0, 162.0, 256.0, 174.0, 284.0, 197.0, 300.0, 221.0, 303.0, 236.0, 337.0, 258.0, 357.0, 306.0, 361.0, 351.0, 358.0, 511.0]], "iscrowd": 0, "image_id": 1, "bbox": [17.0, 16.0, 344.0, 495.0], "category_id": 1, "id": 1}, {"segmentation": [[0.0, 411.0, 43.0, 401.0, 99.0, 395.0, 105.0, 351.0, 124.0, 326.0, 181.0, 294.0, 227.0, 280.0, 245.0, 262.0, 259.0, 234.0, 262.0, 207.0, 271.0, 140.0, 283.0, 139.0, 301.0, 162.0, 309.0, 181.0, 341.0, 175.0, 362.0, 139.0, 369.0, 139.0, 377.0, 163.0, 378.0, 203.0, 381.0, 212.0, 380.0, 220.0, 382.0, 242.0, 404.0, 264.0, 392.0, 293.0, 384.0, 295.0, 385.0, 316.0, 399.0, 343.0, 391.0, 448.0, 452.0, 475.0, 457.0, 494.0, 436.0, 498.0, 402.0, 491.0, 369.0, 488.0, 366.0, 496.0, 319.0, 496.0, 302.0, 485.0, 226.0, 469.0, 128.0, 456.0, 74.0, 458.0, 29.0, 439.0, 0.0, 445.0]], "iscrowd": 0, "image_id": 2, "bbox": [0.0, 139.0, 457.0, 359.0], "category_id": 18, "id": 2}]} diff --git a/gallery/others/README.rst b/gallery/others/README.rst new file mode 100644 index 0000000000000000000000000000000000000000..fafb007d98522d5888a26868d0ecc420df434ca7 --- /dev/null +++ b/gallery/others/README.rst @@ -0,0 +1,2 @@ +Others +------ diff --git a/gallery/plot_optical_flow.py b/gallery/others/plot_optical_flow.py similarity index 90% rename from gallery/plot_optical_flow.py rename to gallery/others/plot_optical_flow.py index 
b0a93209877e9afda6a52914a29c727b3f5ed4c3..3ab1449341729cced1c571c9eefefcadc586c08d 100644 --- a/gallery/plot_optical_flow.py +++ b/gallery/others/plot_optical_flow.py @@ -3,6 +3,10 @@ Optical Flow: Predicting movement with the RAFT model ===================================================== +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + Optical flow is the task of predicting movement between two images, usually two consecutive frames of a video. Optical flow models take two images as input, and predict a flow: the flow indicates the displacement of every single pixel in the @@ -42,7 +46,7 @@ def plot(imgs, **imshow_kwargs): plt.tight_layout() -################################### +# %% # Reading Videos Using Torchvision # -------------------------------- # We will first read a video using :func:`~torchvision.io.read_video`. @@ -62,7 +66,7 @@ video_url = "https://download.pytorch.org/tutorial/pexelscom_pavel_danilyuk_bask video_path = Path(tempfile.mkdtemp()) / "basketball.mp4" _ = urlretrieve(video_url, video_path) -######################### +# %% # :func:`~torchvision.io.read_video` returns the video frames, audio frames and # the metadata associated with the video. In our case, we only need the video # frames. @@ -79,11 +83,12 @@ img2_batch = torch.stack([frames[101], frames[151]]) plot(img1_batch) -######################### +# %% # The RAFT model accepts RGB images. We first get the frames from -# :func:`~torchvision.io.read_video` and resize them to ensure their -# dimensions are divisible by 8. Then we use the transforms bundled into the -# weights in order to preprocess the input and rescale its values to the +# :func:`~torchvision.io.read_video` and resize them to ensure their dimensions +# are divisible by 8. Note that we explicitly use ``antialias=False``, because +# this is how those models were trained. Then we use the transforms bundled into +# the weights in order to preprocess the input and rescale its values to the # required ``[-1, 1]`` interval. from torchvision.models.optical_flow import Raft_Large_Weights @@ -93,8 +98,8 @@ transforms = weights.transforms() def preprocess(img1_batch, img2_batch): - img1_batch = F.resize(img1_batch, size=[520, 960]) - img2_batch = F.resize(img2_batch, size=[520, 960]) + img1_batch = F.resize(img1_batch, size=[520, 960], antialias=False) + img2_batch = F.resize(img2_batch, size=[520, 960], antialias=False) return transforms(img1_batch, img2_batch) @@ -103,7 +108,7 @@ img1_batch, img2_batch = preprocess(img1_batch, img2_batch) print(f"shape = {img1_batch.shape}, dtype = {img1_batch.dtype}") -#################################### +# %% # Estimating Optical flow using RAFT # ---------------------------------- # We will use our RAFT implementation from @@ -124,12 +129,12 @@ list_of_flows = model(img1_batch.to(device), img2_batch.to(device)) print(f"type = {type(list_of_flows)}") print(f"length = {len(list_of_flows)} = number of iterations of the model") -#################################### +# %% # The RAFT model outputs lists of predicted flows where each entry is a # (N, 2, H, W) batch of predicted flows that corresponds to a given "iteration" # in the model. For more details on the iterative nature of the model, please # refer to the `original paper `_. Here, we -# are only interested in the final predicted flows (they are the most acccurate +# are only interested in the final predicted flows (they are the most accurate # ones), so we will just retrieve the last item in the list. 
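# (Concretely, this is just an indexing operation, e.g.
#  ``predicted_flows = list_of_flows[-1]``; the snippet is only illustrative,
#  and ``predicted_flows`` is the variable used in the cells below.)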
# # As described above, a flow is a tensor with dimensions (2, H, W) (or (N, 2, H, @@ -143,10 +148,10 @@ print(f"shape = {predicted_flows.shape} = (N, 2, H, W)") print(f"min = {predicted_flows.min()}, max = {predicted_flows.max()}") -#################################### +# %% # Visualizing predicted flows # --------------------------- -# Torchvision provides the :func:`~torchvision.utils.flow_to_image` utlity to +# Torchvision provides the :func:`~torchvision.utils.flow_to_image` utility to # convert a flow into an RGB image. It also supports batches of flows. # each "direction" in the flow will be mapped to a given RGB color. In the # images below, pixels with similar colors are assumed by the model to be moving @@ -165,7 +170,7 @@ img1_batch = [(img1 + 1) / 2 for img1 in img1_batch] grid = [[img1, flow_img] for (img1, flow_img) in zip(img1_batch, flow_imgs)] plot(grid) -#################################### +# %% # Bonus: Creating GIFs of predicted flows # --------------------------------------- # In the example above we have only shown the predicted flows of 2 pairs of @@ -186,7 +191,7 @@ plot(grid) # output_folder = "/tmp/" # Update this to the folder of your choice # write_jpeg(flow_img, output_folder + f"predicted_flow_{i}.jpg") -#################################### +# %% # Once the .jpg flow images are saved, you can convert them into a video or a # GIF using ffmpeg with e.g.: # diff --git a/gallery/plot_repurposing_annotations.py b/gallery/others/plot_repurposing_annotations.py similarity index 94% rename from gallery/plot_repurposing_annotations.py rename to gallery/others/plot_repurposing_annotations.py index 7bb68617a17b0b99160d371b0f77204ea14035eb..b1617cacd99170497ebda357b4bc975b580354ff 100644 --- a/gallery/plot_repurposing_annotations.py +++ b/gallery/others/plot_repurposing_annotations.py @@ -3,6 +3,10 @@ Repurposing masks into bounding boxes ===================================== +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + The following example illustrates the operations available the :ref:`torchvision.ops ` module for repurposing segmentation masks into object localization annotations for different tasks @@ -20,7 +24,7 @@ import matplotlib.pyplot as plt import torchvision.transforms.functional as F -ASSETS_DIRECTORY = "assets" +ASSETS_DIRECTORY = "../assets" plt.rcParams["savefig.bbox"] = "tight" @@ -36,7 +40,7 @@ def show(imgs): axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) -#################################### +# %% # Masks # ----- # In tasks like instance and panoptic segmentation, masks are commonly defined, and are defined by this package, @@ -53,7 +57,7 @@ def show(imgs): # A nice property of masks is that they can be easily repurposed to be used in methods to solve a variety of object # localization tasks. -#################################### +# %% # Converting Masks to Bounding Boxes # ----------------------------------------------- # For example, the :func:`~torchvision.ops.masks_to_boxes` operation can be used to @@ -70,7 +74,7 @@ img = read_image(img_path) mask = read_image(mask_path) -######################### +# %% # Here the masks are represented as a PNG Image, with floating point values. # Each pixel is encoded as different colors, with 0 being background. # Notice that the spatial dimensions of image and mask match. @@ -79,7 +83,7 @@ print(mask.size()) print(img.size()) print(mask) -############################ +# %% # We get the unique colors, as these would be the object ids. 
obj_ids = torch.unique(mask) @@ -91,7 +95,7 @@ obj_ids = obj_ids[1:] # Note that this snippet would work as well if the masks were float values instead of ints. masks = mask == obj_ids[:, None, None] -######################## +# %% # Now the masks are a boolean tensor. # The first dimension in this case 3 and denotes the number of instances: there are 3 people in the image. # The other two dimensions are height and width, which are equal to the dimensions of the image. @@ -101,7 +105,7 @@ masks = mask == obj_ids[:, None, None] print(masks.size()) print(masks) -#################################### +# %% # Let us visualize an image and plot its corresponding segmentation masks. # We will use the :func:`~torchvision.utils.draw_segmentation_masks` to draw the segmentation masks. @@ -113,7 +117,7 @@ for mask in masks: show(drawn_masks) -#################################### +# %% # To convert the boolean masks into bounding boxes. # We will use the :func:`~torchvision.ops.masks_to_boxes` from the torchvision.ops module # It returns the boxes in ``(xmin, ymin, xmax, ymax)`` format. @@ -124,7 +128,7 @@ boxes = masks_to_boxes(masks) print(boxes.size()) print(boxes) -#################################### +# %% # As the shape denotes, there are 3 boxes and in ``(xmin, ymin, xmax, ymax)`` format. # These can be visualized very easily with :func:`~torchvision.utils.draw_bounding_boxes` utility # provided in :ref:`torchvision.utils `. @@ -134,7 +138,7 @@ from torchvision.utils import draw_bounding_boxes drawn_boxes = draw_bounding_boxes(img, boxes, colors="red") show(drawn_boxes) -################################### +# %% # These boxes can now directly be used by detection models in torchvision. # Here is demo with a Faster R-CNN model loaded from # :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` @@ -153,7 +157,7 @@ target["labels"] = labels = torch.ones((masks.size(0),), dtype=torch.int64) detection_outputs = model(img.unsqueeze(0), [target]) -#################################### +# %% # Converting Segmentation Dataset to Detection Dataset # ---------------------------------------------------- # diff --git a/gallery/others/plot_scripted_tensor_transforms.py b/gallery/others/plot_scripted_tensor_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..128ce7778f3c19811fd124566ade6e99049159db --- /dev/null +++ b/gallery/others/plot_scripted_tensor_transforms.py @@ -0,0 +1,136 @@ +""" +=================== +Torchscript support +=================== + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +This example illustrates `torchscript +`_ support of the torchvision +:ref:`transforms ` on Tensor images. +""" + +# %% +from pathlib import Path + +import matplotlib.pyplot as plt + +import torch +import torch.nn as nn + +import torchvision.transforms as v1 +from torchvision.io import read_image + +plt.rcParams["savefig.bbox"] = 'tight' +torch.manual_seed(1) + +# If you're trying to run that on collab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +import sys +sys.path += ["../transforms"] +from helpers import plot +ASSETS_PATH = Path('../assets') + + +# %% +# Most transforms support torchscript. 
For composing transforms, we use +# :class:`torch.nn.Sequential` instead of +# :class:`~torchvision.transforms.v2.Compose`: + +dog1 = read_image(str(ASSETS_PATH / 'dog1.jpg')) +dog2 = read_image(str(ASSETS_PATH / 'dog2.jpg')) + +transforms = torch.nn.Sequential( + v1.RandomCrop(224), + v1.RandomHorizontalFlip(p=0.3), +) + +scripted_transforms = torch.jit.script(transforms) + +plot([dog1, scripted_transforms(dog1), dog2, scripted_transforms(dog2)]) + + +# %% +# .. warning:: +# +# Above we have used transforms from the ``torchvision.transforms`` +# namespace, i.e. the "v1" transforms. The v2 transforms from the +# ``torchvision.transforms.v2`` namespace are the :ref:`recommended +# ` way to use transforms in your code. +# +# The v2 transforms also support torchscript, but if you call +# ``torch.jit.script()`` on a v2 **class** transform, you'll actually end up +# with its (scripted) v1 equivalent. This may lead to slightly different +# results between the scripted and eager executions due to implementation +# differences between v1 and v2. +# +# If you really need torchscript support for the v2 transforms, **we +# recommend scripting the functionals** from the +# ``torchvision.transforms.v2.functional`` namespace to avoid surprises. +# +# Below we now show how to combine image transformations and a model forward +# pass, while using ``torch.jit.script`` to obtain a single scripted module. +# +# Let's define a ``Predictor`` module that transforms the input tensor and then +# applies an ImageNet model on it. + +from torchvision.models import resnet18, ResNet18_Weights + + +class Predictor(nn.Module): + + def __init__(self): + super().__init__() + weights = ResNet18_Weights.DEFAULT + self.resnet18 = resnet18(weights=weights, progress=False).eval() + self.transforms = weights.transforms(antialias=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + with torch.no_grad(): + x = self.transforms(x) + y_pred = self.resnet18(x) + return y_pred.argmax(dim=1) + + +# %% +# Now, let's define scripted and non-scripted instances of ``Predictor`` and +# apply it on multiple tensor images of the same size + +device = "cuda" if torch.cuda.is_available() else "cpu" + +predictor = Predictor().to(device) +scripted_predictor = torch.jit.script(predictor).to(device) + +batch = torch.stack([dog1, dog2]).to(device) + +res = predictor(batch) +res_scripted = scripted_predictor(batch) + +# %% +# We can verify that the prediction of the scripted and non-scripted models are +# the same: + +import json + +with open(Path('../assets') / 'imagenet_class_index.json') as labels_file: + labels = json.load(labels_file) + +for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)): + assert pred == pred_scripted + print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}") + +# %% +# Since the model is scripted, it can be easily dumped on disk and re-used + +import tempfile + +with tempfile.NamedTemporaryFile() as f: + scripted_predictor.save(f.name) + + dumped_scripted_predictor = torch.jit.load(f.name) + res_scripted_dumped = dumped_scripted_predictor(batch) +assert (res_scripted_dumped == res_scripted).all() + +# %% diff --git a/gallery/plot_video_api.py b/gallery/others/plot_video_api.py similarity index 94% rename from gallery/plot_video_api.py rename to gallery/others/plot_video_api.py index d83a508eabe09e892d6f9a3408cb6954e86d8356..ac9eb0ba27d8948719e6beeac9545af6f725f362 100644 --- a/gallery/plot_video_api.py +++ b/gallery/others/plot_video_api.py @@ -1,20 +1,24 @@ """ -======================= 
+========= Video API -======================= +========= + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. This example illustrates some of the APIs that torchvision offers for videos, together with the examples on how to build datasets and more. """ -#################################### +# %% # 1. Introduction: building a new video object and examining the properties # ------------------------------------------------------------------------- # First we select a video to test the object out. For the sake of argument # we're using one from kinetics400 dataset. # To create it, we need to define the path and the stream we want to use. -###################################### +# %% # Chosen video statistics: # # - WUzgd7C1pWA.mp4 @@ -32,6 +36,7 @@ videos, together with the examples on how to build datasets and more. import torch import torchvision from torchvision.datasets.utils import download_url +torchvision.set_video_backend("video_reader") # Download the sample video download_url( @@ -41,7 +46,7 @@ download_url( ) video_path = "./WUzgd7C1pWA.mp4" -###################################### +# %% # Streams are defined in a similar fashion as torch devices. We encode them as strings in a form # of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. # The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. @@ -51,7 +56,7 @@ stream = "video" video = torchvision.io.VideoReader(video_path, stream) video.get_metadata() -###################################### +# %% # Here we can see that video has two streams - a video and an audio stream. # Currently available stream types include ['video', 'audio']. # Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id @@ -60,7 +65,7 @@ video.get_metadata() # users can access the one they want. # If only stream type is passed, the decoder auto-detects first stream of that type and returns it. -###################################### +# %% # Let's read all the frames from the video stream. By default, the return value of # ``next(video_reader)`` is a dict containing the following fields. # @@ -84,7 +89,7 @@ approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] print("Approx total number of datapoints we can expect: ", approx_nf) print("Read data size: ", frames[0].size(0) * len(frames)) -###################################### +# %% # But what if we only want to read certain time segment of the video? # That can be done easily using the combination of our ``seek`` function, and the fact that each call # to next returns the presentation timestamp of the returned frame in seconds. @@ -106,7 +111,7 @@ for frame, pts in itertools.islice(video.seek(2), 10): print("Total number of frames: ", len(frames)) -###################################### +# %% # Or if we wanted to read from 2nd to 5th second, # We seek into a second second of the video, # then we utilize the itertools takewhile to get the @@ -124,7 +129,7 @@ approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] print("We can expect approx: ", approx_nf) print("Tensor size: ", frames[0].size()) -#################################### +# %% # 2. 
Building a sample read_video function # ---------------------------------------------------------------------------------------- # We can utilize the methods above to build the read video function that follows @@ -169,21 +174,21 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au vf, af, info, meta = example_read_video(video) print(vf.size(), af.size()) -#################################### +# %% # 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) # ------------------------------------------------------------------------------------------------------- # Cool, so now we can use the same principle to make the sample dataset. # We suggest trying out iterable dataset for this purpose. # Here, we are going to build an example dataset that reads randomly selected 10 frames of video. -#################################### +# %% # Make sample dataset import os os.makedirs("./dataset", exist_ok=True) os.makedirs("./dataset/1", exist_ok=True) os.makedirs("./dataset/2", exist_ok=True) -#################################### +# %% # Download the videos from torchvision.datasets.utils import download_url download_url( @@ -211,7 +216,7 @@ download_url( "v_SoccerJuggling_g24_c01.avi" ) -#################################### +# %% # Housekeeping and utilities import os import random @@ -231,7 +236,7 @@ def get_samples(root, extensions=(".mp4", ".avi")): _, class_to_idx = _find_classes(root) return make_dataset(root, class_to_idx, extensions=extensions) -#################################### +# %% # We are going to define the dataset and some basic arguments. # We assume the structure of the FolderDataset, and add the following parameters: # @@ -286,7 +291,7 @@ class RandomDataset(torch.utils.data.IterableDataset): 'end': current_pts} yield output -#################################### +# %% # Given a path of videos in a folder structure, i.e: # # - dataset @@ -308,7 +313,7 @@ frame_transform = t.Compose(transforms) dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) -#################################### +# %% from torch.utils.data import DataLoader loader = DataLoader(dataset, batch_size=12) data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} @@ -320,7 +325,7 @@ for batch in loader: data['tensorsize'].append(batch['video'][i].size()) print(data) -#################################### +# %% # 4. Data Visualization # ---------------------------------- # Example of visualized video @@ -333,7 +338,7 @@ for i in range(16): plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) plt.axis("off") -#################################### +# %% # Cleanup the video and dataset: import os import shutil diff --git a/gallery/plot_visualization_utils.py b/gallery/others/plot_visualization_utils.py similarity index 93% rename from gallery/plot_visualization_utils.py rename to gallery/others/plot_visualization_utils.py index b04e0b6cffa3e5756694c0375fddea9cd58e14c7..98089c54dbb4bb6ed8b46eb54669f1b762d8f131 100644 --- a/gallery/plot_visualization_utils.py +++ b/gallery/others/plot_visualization_utils.py @@ -3,6 +3,10 @@ Visualization utilities ======================= +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + This example illustrates some of the utilities that torchvision offers for visualizing images, bounding boxes, segmentation masks and keypoints. 
""" @@ -30,7 +34,7 @@ def show(imgs): axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) -#################################### +# %% # Visualizing a grid of images # ---------------------------- # The :func:`~torchvision.utils.make_grid` function can be used to create a @@ -41,14 +45,14 @@ from torchvision.utils import make_grid from torchvision.io import read_image from pathlib import Path -dog1_int = read_image(str(Path('assets') / 'dog1.jpg')) -dog2_int = read_image(str(Path('assets') / 'dog2.jpg')) +dog1_int = read_image(str(Path('../assets') / 'dog1.jpg')) +dog2_int = read_image(str(Path('../assets') / 'dog2.jpg')) dog_list = [dog1_int, dog2_int] grid = make_grid(dog_list) show(grid) -#################################### +# %% # Visualizing bounding boxes # -------------------------- # We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an @@ -64,7 +68,7 @@ result = draw_bounding_boxes(dog1_int, boxes, colors=colors, width=5) show(result) -##################################### +# %% # Naturally, we can also plot bounding boxes produced by torchvision detection # models. Here is a demo with a Faster R-CNN model loaded from # :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` @@ -85,7 +89,7 @@ model = model.eval() outputs = model(images) print(outputs) -##################################### +# %% # Let's plot the boxes detected by our model. We will only plot the boxes with a # score greater than a given threshold. @@ -96,7 +100,7 @@ dogs_with_boxes = [ ] show(dogs_with_boxes) -##################################### +# %% # Visualizing segmentation masks # ------------------------------ # The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to @@ -125,7 +129,7 @@ batch = torch.stack([transforms(d) for d in dog_list]) output = model(batch)['out'] print(output.shape, output.min().item(), output.max().item()) -##################################### +# %% # As we can see above, the output of the segmentation model is a tensor of shape # ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score, and # we can normalize them into ``[0, 1]`` by using a softmax. After the softmax, @@ -147,7 +151,7 @@ dog_and_boat_masks = [ show(dog_and_boat_masks) -##################################### +# %% # As expected, the model is confident about the dog class, but not so much for # the boat class. # @@ -162,7 +166,7 @@ print(f"shape = {boolean_dog_masks.shape}, dtype = {boolean_dog_masks.dtype}") show([m.float() for m in boolean_dog_masks]) -##################################### +# %% # The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you # can read it as the following query: "For which pixels is 'dog' the most likely # class?" @@ -184,11 +188,11 @@ dogs_with_masks = [ ] show(dogs_with_masks) -##################################### +# %% # We can plot more than one mask per image! Remember that the model returned as # many masks as there are classes. Let's ask the same query as above, but this # time for *all* classes, not just the dog class: "For each pixel and each class -# C, is class C the most most likely class?" +# C, is class C the most likely class?" 
# # This one is a bit more involved, so we'll first show how to do it with a # single image, and then we'll generalize to the batch @@ -204,7 +208,7 @@ print(f"dog1_all_classes_masks = {dog1_all_classes_masks.shape}, dtype = {dog1_a dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.6) show(dog_with_all_masks) -##################################### +# %% # We can see in the image above that only 2 masks were drawn: the mask for the # background and the mask for the dog. This is because the model thinks that # only these 2 classes are the most likely ones across all the pixels. If the @@ -231,7 +235,7 @@ dogs_with_masks = [ show(dogs_with_masks) -##################################### +# %% # .. _instance_seg_output: # # Instance segmentation models @@ -265,7 +269,7 @@ model = model.eval() output = model(images) print(output) -##################################### +# %% # Let's break this down. For each image in the batch, the model outputs some # detections (or instances). The number of detections varies for each input # image. Each instance is described by its bounding box, its label, its score @@ -288,7 +292,7 @@ dog1_masks = dog1_output['masks'] print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, " f"min = {dog1_masks.min()}, max = {dog1_masks.max()}") -##################################### +# %% # Here the masks correspond to probabilities indicating, for each pixel, how # likely it is to belong to the predicted label of that instance. Those # predicted labels correspond to the 'labels' element in the same output dict. @@ -297,7 +301,7 @@ print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, " print("For the first dog, the following instances were detected:") print([weights.meta["categories"][label] for label in dog1_output['labels']]) -##################################### +# %% # Interestingly, the model detects two persons in the image. Let's go ahead and # plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks` # expects boolean masks, we need to convert those probabilities into boolean @@ -315,14 +319,14 @@ dog1_bool_masks = dog1_bool_masks.squeeze(1) show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.9)) -##################################### +# %% # The model seems to have properly detected the dog, but it also confused trees -# with people. Looking more closely at the scores will help us plotting more +# with people. Looking more closely at the scores will help us plot more # relevant masks: print(dog1_output['scores']) -##################################### +# %% # Clearly the model is more confident about the dog detection than it is about # the people detections. That's good news. When plotting the masks, we can ask # for only those that have a good score. Let's use a score threshold of .75 @@ -341,12 +345,12 @@ dogs_with_masks = [ ] show(dogs_with_masks) -##################################### +# %% # The two 'people' masks in the first image where not selected because they have -# a lower score than the score threshold. Similarly in the second image, the +# a lower score than the score threshold. Similarly, in the second image, the # instance with class 15 (which corresponds to 'bench') was not selected. -##################################### +# %% # .. 
_keypoint_output: # # Visualizing keypoints @@ -360,7 +364,7 @@ show(dogs_with_masks) from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights from torchvision.io import read_image -person_int = read_image(str(Path("assets") / "person1.jpg")) +person_int = read_image(str(Path("../assets") / "person1.jpg")) weights = KeypointRCNN_ResNet50_FPN_Weights.DEFAULT transforms = weights.transforms() @@ -373,7 +377,7 @@ model = model.eval() outputs = model([person_float]) print(outputs) -##################################### +# %% # As we see the output contains a list of dictionaries. # The output list is of length batch_size. # We currently have just a single image so length of list is 1. @@ -388,7 +392,7 @@ scores = outputs[0]['scores'] print(kpts) print(scores) -##################################### +# %% # The KeypointRCNN model detects there are two instances in the image. # If you plot the boxes by using :func:`~draw_bounding_boxes` # you would recognize they are the person and the surfboard. @@ -402,7 +406,7 @@ keypoints = kpts[idx] print(keypoints) -##################################### +# %% # Great, now we have the keypoints corresponding to the person. # Each keypoint is represented by x, y coordinates and the visibility. # We can now use the :func:`~torchvision.utils.draw_keypoints` function to draw keypoints. @@ -413,7 +417,7 @@ from torchvision.utils import draw_keypoints res = draw_keypoints(person_int, keypoints, colors="blue", radius=3) show(res) -##################################### +# %% # As we see the keypoints appear as colored circles over the image. # The coco keypoints for a person are ordered and represent the following list.\ @@ -424,7 +428,7 @@ coco_keypoints = [ "left_knee", "right_knee", "left_ankle", "right_ankle", ] -##################################### +# %% # What if we are interested in joining the keypoints? # This is especially useful in creating pose detection or action recognition. # We can join the keypoints easily using the `connectivity` parameter. @@ -450,7 +454,7 @@ connect_skeleton = [ (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), (13, 15), (14, 16) ] -##################################### +# %% # We pass the above list to the connectivity parameter to connect the keypoints. # diff --git a/gallery/plot_scripted_tensor_transforms.py b/gallery/plot_scripted_tensor_transforms.py deleted file mode 100644 index 995383d460321af7928d69f249d23a92df981ea8..0000000000000000000000000000000000000000 --- a/gallery/plot_scripted_tensor_transforms.py +++ /dev/null @@ -1,141 +0,0 @@ -""" -========================= -Tensor transforms and JIT -========================= - -This example illustrates various features that are now supported by the -:ref:`image transformations ` on Tensor images. In particular, we -show how image transforms can be performed on GPU, and how one can also script -them using JIT compilation. - -Prior to v0.8.0, transforms in torchvision have traditionally been PIL-centric -and presented multiple limitations due to that. Now, since v0.8.0, transforms -implementations are Tensor and PIL compatible and we can achieve the following -new features: - -- transform multi-band torch tensor images (with more than 3-4 channels) -- torchscript transforms together with your model for deployment -- support for GPU acceleration -- batched transformation such as for videos -- read and decode data directly as torch tensor with torchscript support (for PNG and JPEG image formats) - -.. 
note:: - These features are only possible with **Tensor** images. -""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np - -import torch -import torchvision.transforms as T -from torchvision.io import read_image - - -plt.rcParams["savefig.bbox"] = 'tight' -torch.manual_seed(1) - - -def show(imgs): - fix, axs = plt.subplots(ncols=len(imgs), squeeze=False) - for i, img in enumerate(imgs): - img = T.ToPILImage()(img.to('cpu')) - axs[0, i].imshow(np.asarray(img)) - axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) - - -#################################### -# The :func:`~torchvision.io.read_image` function allows to read an image and -# directly load it as a tensor - -dog1 = read_image(str(Path('assets') / 'dog1.jpg')) -dog2 = read_image(str(Path('assets') / 'dog2.jpg')) -show([dog1, dog2]) - -#################################### -# Transforming images on GPU -# -------------------------- -# Most transforms natively support tensors on top of PIL images (to visualize -# the effect of the transforms, you may refer to see -# :ref:`sphx_glr_auto_examples_plot_transforms.py`). -# Using tensor images, we can run the transforms on GPUs if cuda is available! - -import torch.nn as nn - -transforms = torch.nn.Sequential( - T.RandomCrop(224), - T.RandomHorizontalFlip(p=0.3), -) - -device = 'cuda' if torch.cuda.is_available() else 'cpu' -dog1 = dog1.to(device) -dog2 = dog2.to(device) - -transformed_dog1 = transforms(dog1) -transformed_dog2 = transforms(dog2) -show([transformed_dog1, transformed_dog2]) - -#################################### -# Scriptable transforms for easier deployment via torchscript -# ----------------------------------------------------------- -# We now show how to combine image transformations and a model forward pass, -# while using ``torch.jit.script`` to obtain a single scripted module. -# -# Let's define a ``Predictor`` module that transforms the input tensor and then -# applies an ImageNet model on it. 
- -from torchvision.models import resnet18, ResNet18_Weights - - -class Predictor(nn.Module): - - def __init__(self): - super().__init__() - weights = ResNet18_Weights.DEFAULT - self.resnet18 = resnet18(weights=weights, progress=False).eval() - self.transforms = weights.transforms() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - with torch.no_grad(): - x = self.transforms(x) - y_pred = self.resnet18(x) - return y_pred.argmax(dim=1) - - -#################################### -# Now, let's define scripted and non-scripted instances of ``Predictor`` and -# apply it on multiple tensor images of the same size - -predictor = Predictor().to(device) -scripted_predictor = torch.jit.script(predictor).to(device) - -batch = torch.stack([dog1, dog2]).to(device) - -res = predictor(batch) -res_scripted = scripted_predictor(batch) - -#################################### -# We can verify that the prediction of the scripted and non-scripted models are -# the same: - -import json - -with open(Path('assets') / 'imagenet_class_index.json') as labels_file: - labels = json.load(labels_file) - -for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)): - assert pred == pred_scripted - print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}") - -#################################### -# Since the model is scripted, it can be easily dumped on disk and re-used - -import tempfile - -with tempfile.NamedTemporaryFile() as f: - scripted_predictor.save(f.name) - - dumped_scripted_predictor = torch.jit.load(f.name) - res_scripted_dumped = dumped_scripted_predictor(batch) -assert (res_scripted_dumped == res_scripted).all() diff --git a/gallery/transforms/README.rst b/gallery/transforms/README.rst new file mode 100644 index 0000000000000000000000000000000000000000..1b8b1b08155ae339948c20d13f2f55d5a580a6bc --- /dev/null +++ b/gallery/transforms/README.rst @@ -0,0 +1,4 @@ +.. _transforms_gallery: + +Transforms +---------- diff --git a/gallery/transforms/helpers.py b/gallery/transforms/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..e94d717eb7df9a7585cf1704262368208bf0e786 --- /dev/null +++ b/gallery/transforms/helpers.py @@ -0,0 +1,50 @@ +import matplotlib.pyplot as plt +import torch +from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks +from torchvision import tv_tensors +from torchvision.transforms.v2 import functional as F + + +def plot(imgs, row_title=None, **imshow_kwargs): + if not isinstance(imgs[0], list): + # Make a 2d grid even if there's just 1 row + imgs = [imgs] + + num_rows = len(imgs) + num_cols = len(imgs[0]) + _, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False) + for row_idx, row in enumerate(imgs): + for col_idx, img in enumerate(row): + boxes = None + masks = None + if isinstance(img, tuple): + img, target = img + if isinstance(target, dict): + boxes = target.get("boxes") + masks = target.get("masks") + elif isinstance(target, tv_tensors.BoundingBoxes): + boxes = target + else: + raise ValueError(f"Unexpected target type: {type(target)}") + img = F.to_image(img) + if img.dtype.is_floating_point and img.min() < 0: + # Poor man's re-normalization for the colors to be OK-ish. 
This + # is useful for images coming out of Normalize() + img -= img.min() + img /= img.max() + + img = F.to_dtype(img, torch.uint8, scale=True) + if boxes is not None: + img = draw_bounding_boxes(img, boxes, colors="yellow", width=3) + if masks is not None: + img = draw_segmentation_masks(img, masks.to(torch.bool), colors=["green"] * masks.shape[0], alpha=.65) + + ax = axs[row_idx, col_idx] + ax.imshow(img.permute(1, 2, 0).numpy(), **imshow_kwargs) + ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) + + if row_title is not None: + for row_idx in range(num_rows): + axs[row_idx, 0].set(ylabel=row_title[row_idx]) + + plt.tight_layout() diff --git a/gallery/transforms/plot_custom_transforms.py b/gallery/transforms/plot_custom_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..898c2cd0beaedbe21c5b112aed556a689518bb49 --- /dev/null +++ b/gallery/transforms/plot_custom_transforms.py @@ -0,0 +1,121 @@ +""" +=================================== +How to write your own v2 transforms +=================================== + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +This guide explains how to write transforms that are compatible with the +torchvision transforms V2 API. +""" + +# %% +import torch +from torchvision import tv_tensors +from torchvision.transforms import v2 + + +# %% +# Just create a ``nn.Module`` and override the ``forward`` method +# =============================================================== +# +# In most cases, this is all you're going to need, as long as you already know +# the structure of the input that your transform will expect. For example if +# you're just doing image classification, your transform will typically accept a +# single image as input, or a ``(img, label)`` input. So you can just hard-code +# your ``forward`` method to accept just that, e.g. +# +# .. code:: python +# +# class MyCustomTransform(torch.nn.Module): +# def forward(self, img, label): +# # Do some transformations +# return new_img, new_label +# +# .. note:: +# +# This means that if you have a custom transform that is already compatible +# with the V1 transforms (those in ``torchvision.transforms``), it will +# still work with the V2 transforms without any change! +# +# We will illustrate this more completely below with a typical detection case, +# where our samples are just images, bounding boxes and labels: + +class MyCustomTransform(torch.nn.Module): + def forward(self, img, bboxes, label): # we assume inputs are always structured like this + print( + f"I'm transforming an image of shape {img.shape} " + f"with bboxes = {bboxes}\n{label = }" + ) + # Do some transformations. Here, we're just passing though the input + return img, bboxes, label + + +transforms = v2.Compose([ + MyCustomTransform(), + v2.RandomResizedCrop((224, 224), antialias=True), + v2.RandomHorizontalFlip(p=1), + v2.Normalize(mean=[0, 0, 0], std=[1, 1, 1]) +]) + +H, W = 256, 256 +img = torch.rand(3, H, W) +bboxes = tv_tensors.BoundingBoxes( + torch.tensor([[0, 10, 10, 20], [50, 50, 70, 70]]), + format="XYXY", + canvas_size=(H, W) +) +label = 3 + +out_img, out_bboxes, out_label = transforms(img, bboxes, label) +# %% +print(f"Output image shape: {out_img.shape}\nout_bboxes = {out_bboxes}\n{out_label = }") +# %% +# .. 
note:: +# While working with TVTensor classes in your code, make sure to +# familiarize yourself with this section: +# :ref:`tv_tensor_unwrapping_behaviour` +# +# Supporting arbitrary input structures +# ===================================== +# +# In the section above, we have assumed that you already know the structure of +# your inputs and that you're OK with hard-coding this expected structure in +# your code. If you want your custom transforms to be as flexible as possible, +# this can be a bit limiting. +# +# A key feature of the builtin Torchvision V2 transforms is that they can accept +# arbitrary input structure and return the same structure as output (with +# transformed entries). For example, transforms can accept a single image, or a +# tuple of ``(img, label)``, or an arbitrary nested dictionary as input: + +structured_input = { + "img": img, + "annotations": (bboxes, label), + "something_that_will_be_ignored": (1, "hello") +} +structured_output = v2.RandomHorizontalFlip(p=1)(structured_input) + +assert isinstance(structured_output, dict) +assert structured_output["something_that_will_be_ignored"] == (1, "hello") +print(f"The transformed bboxes are:\n{structured_output['annotations'][0]}") + +# %% +# If you want to reproduce this behavior in your own transform, we invite you to +# look at our `code +# `_ +# and adapt it to your needs. +# +# In brief, the core logic is to unpack the input into a flat list using `pytree +# `_, and +# then transform only the entries that can be transformed (the decision is made +# based on the **class** of the entries, as all TVTensors are +# tensor-subclasses) plus some custom logic that is out of score here - check the +# code for details. The (potentially transformed) entries are then repacked and +# returned, in the same structure as the input. +# +# We do not provide public dev-facing tools to achieve that at this time, but if +# this is something that would be valuable to you, please let us know by opening +# an issue on our `GitHub repo `_. diff --git a/gallery/transforms/plot_custom_tv_tensors.py b/gallery/transforms/plot_custom_tv_tensors.py new file mode 100644 index 0000000000000000000000000000000000000000..bf5ee198837ca34c40ec279293184b2db19efb9d --- /dev/null +++ b/gallery/transforms/plot_custom_tv_tensors.py @@ -0,0 +1,119 @@ +""" +==================================== +How to write your own TVTensor class +==================================== + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +This guide is intended for advanced users and downstream library maintainers. We explain how to +write your own TVTensor class, and how to make it compatible with the built-in +Torchvision v2 transforms. Before continuing, make sure you have read +:ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py`. +""" + +# %% +import torch +from torchvision import tv_tensors +from torchvision.transforms import v2 + +# %% +# We will create a very simple class that just inherits from the base +# :class:`~torchvision.tv_tensors.TVTensor` class. It will be enough to cover +# what you need to know to implement your more elaborate uses-cases. If you need +# to create a class that carries meta-data, take a look at how the +# :class:`~torchvision.tv_tensors.BoundingBoxes` class is `implemented +# `_. 
+ + +class MyTVTensor(tv_tensors.TVTensor): + pass + + +my_dp = MyTVTensor([1, 2, 3]) +my_dp + +# %% +# Now that we have defined our custom TVTensor class, we want it to be +# compatible with the built-in torchvision transforms, and the functional API. +# For that, we need to implement a kernel which performs the core of the +# transformation, and then "hook" it to the functional that we want to support +# via :func:`~torchvision.transforms.v2.functional.register_kernel`. +# +# We illustrate this process below: we create a kernel for the "horizontal flip" +# operation of our MyTVTensor class, and register it to the functional API. + +from torchvision.transforms.v2 import functional as F + + +@F.register_kernel(functional="hflip", tv_tensor_cls=MyTVTensor) +def hflip_my_tv_tensor(my_dp, *args, **kwargs): + print("Flipping!") + out = my_dp.flip(-1) + return tv_tensors.wrap(out, like=my_dp) + + +# %% +# To understand why :func:`~torchvision.tv_tensors.wrap` is used, see +# :ref:`tv_tensor_unwrapping_behaviour`. Ignore the ``*args, **kwargs`` for now, +# we will explain it below in :ref:`param_forwarding`. +# +# .. note:: +# +# In our call to ``register_kernel`` above we used a string +# ``functional="hflip"`` to refer to the functional we want to hook into. We +# could also have used the functional *itself*, i.e. +# ``@register_kernel(functional=F.hflip, ...)``. +# +# Now that we have registered our kernel, we can call the functional API on a +# ``MyTVTensor`` instance: + +my_dp = MyTVTensor(torch.rand(3, 256, 256)) +_ = F.hflip(my_dp) + +# %% +# And we can also use the +# :class:`~torchvision.transforms.v2.RandomHorizontalFlip` transform, since it relies on :func:`~torchvision.transforms.v2.functional.hflip` internally: +t = v2.RandomHorizontalFlip(p=1) +_ = t(my_dp) + +# %% +# .. note:: +# +# We cannot register a kernel for a transform class, we can only register a +# kernel for a **functional**. The reason we can't register a transform +# class is because one transform may internally rely on more than one +# functional, so in general we can't register a single kernel for a given +# class. +# +# .. _param_forwarding: +# +# Parameter forwarding, and ensuring future compatibility of your kernels +# ----------------------------------------------------------------------- +# +# The functional API that you're hooking into is public and therefore +# **backward** compatible: we guarantee that the parameters of these functionals +# won't be removed or renamed without a proper deprecation cycle. However, we +# don't guarantee **forward** compatibility, and we may add new parameters in +# the future. +# +# Imagine that in a future version, Torchvision adds a new ``inplace`` parameter +# to its :func:`~torchvision.transforms.v2.functional.hflip` functional. If you +# already defined and registered your own kernel as + +def hflip_my_tv_tensor(my_dp): # noqa + print("Flipping!") + out = my_dp.flip(-1) + return tv_tensors.wrap(out, like=my_dp) + + +# %% +# then calling ``F.hflip(my_dp)`` will **fail**, because ``hflip`` will try to +# pass the new ``inplace`` parameter to your kernel, but your kernel doesn't +# accept it. +# +# For this reason, we recommend to always define your kernels with +# ``*args, **kwargs`` in their signature, as done above. This way, your kernel +# will be able to accept any new parameter that we may add in the future. +# (Technically, adding `**kwargs` only should be enough). 
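To make the parameter-forwarding advice concrete, here is a minimal sketch that reuses the hypothetical ``MyTVTensor`` from above and registers a kernel for another functional, assuming ``resize`` accepts registered kernels the same way ``hflip`` does. The kernel name and the nearest-neighbour resizing are made up for illustration; the ``*args, **kwargs`` catch-all is what keeps the kernel working if the functional gains new parameters later.

import torch
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F


@F.register_kernel(functional="resize", tv_tensor_cls=MyTVTensor)
def resize_my_tv_tensor(my_dp, size, *args, **kwargs):
    # We only use ``size``; interpolation, antialias and any parameter added in
    # the future are silently absorbed by *args / **kwargs.
    out = torch.nn.functional.interpolate(my_dp.unsqueeze(0), size=list(size), mode="nearest").squeeze(0)
    return tv_tensors.wrap(out, like=my_dp)


_ = F.resize(MyTVTensor(torch.rand(3, 256, 256)), size=[224, 224])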
diff --git a/gallery/transforms/plot_cutmix_mixup.py b/gallery/transforms/plot_cutmix_mixup.py new file mode 100644 index 0000000000000000000000000000000000000000..d26b027b121ad849b6638f3387460aa8d6ec9ed5 --- /dev/null +++ b/gallery/transforms/plot_cutmix_mixup.py @@ -0,0 +1,150 @@ + +""" +=========================== +How to use CutMix and MixUp +=========================== + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +:class:`~torchvision.transforms.v2.CutMix` and +:class:`~torchvision.transforms.v2.MixUp` are popular augmentation strategies +that can improve classification accuracy. + +These transforms are slightly different from the rest of the Torchvision +transforms, because they expect +**batches** of samples as input, not individual images. In this example we'll +explain how to use them: after the ``DataLoader``, or as part of a collation +function. +""" + +# %% +import torch +from torchvision.datasets import FakeData +from torchvision.transforms import v2 + + +NUM_CLASSES = 100 + +# %% +# Pre-processing pipeline +# ----------------------- +# +# We'll use a simple but typical image classification pipeline: + +preproc = v2.Compose([ + v2.PILToTensor(), + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomHorizontalFlip(p=0.5), + v2.ToDtype(torch.float32, scale=True), # to float32 in [0, 1] + v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), # typically from ImageNet +]) + +dataset = FakeData(size=1000, num_classes=NUM_CLASSES, transform=preproc) + +img, label = dataset[0] +print(f"{type(img) = }, {img.dtype = }, {img.shape = }, {label = }") + +# %% +# +# One important thing to note is that neither CutMix nor MixUp are part of this +# pre-processing pipeline. We'll add them a bit later once we define the +# DataLoader. Just as a refresher, this is what the DataLoader and training loop +# would look like if we weren't using CutMix or MixUp: + +from torch.utils.data import DataLoader + +dataloader = DataLoader(dataset, batch_size=4, shuffle=True) + +for images, labels in dataloader: + print(f"{images.shape = }, {labels.shape = }") + print(labels.dtype) + # + break +# %% + +# %% +# Where to use MixUp and CutMix +# ----------------------------- +# +# After the DataLoader +# ^^^^^^^^^^^^^^^^^^^^ +# +# Now let's add CutMix and MixUp. The simplest way to do this right after the +# DataLoader: the Dataloader has already batched the images and labels for us, +# and this is exactly what these transforms expect as input: + +dataloader = DataLoader(dataset, batch_size=4, shuffle=True) + +cutmix = v2.CutMix(num_classes=NUM_CLASSES) +mixup = v2.MixUp(num_classes=NUM_CLASSES) +cutmix_or_mixup = v2.RandomChoice([cutmix, mixup]) + +for images, labels in dataloader: + print(f"Before CutMix/MixUp: {images.shape = }, {labels.shape = }") + images, labels = cutmix_or_mixup(images, labels) + print(f"After CutMix/MixUp: {images.shape = }, {labels.shape = }") + + # + break +# %% +# +# Note how the labels were also transformed: we went from a batched label of +# shape (batch_size,) to a tensor of shape (batch_size, num_classes). The +# transformed labels can still be passed as-is to a loss function like +# :func:`torch.nn.functional.cross_entropy`. +# +# As part of the collation function +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Passing the transforms after the DataLoader is the simplest way to use CutMix +# and MixUp, but one disadvantage is that it does not take advantage of the +# DataLoader multi-processing. 
For that, we can pass those transforms as part of +# the collation function (refer to the `PyTorch docs +# `_ to learn +# more about collation). + +from torch.utils.data import default_collate + + +def collate_fn(batch): + return cutmix_or_mixup(*default_collate(batch)) + + +dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2, collate_fn=collate_fn) + +for images, labels in dataloader: + print(f"{images.shape = }, {labels.shape = }") + # No need to call cutmix_or_mixup, it's already been called as part of the DataLoader! + # + break + +# %% +# Non-standard input format +# ------------------------- +# +# So far we've used a typical sample structure where we pass ``(images, +# labels)`` as inputs. MixUp and CutMix will magically work by default with most +# common sample structures: tuples where the second parameter is a tensor label, +# or dict with a "label[s]" key. Look at the documentation of the +# ``labels_getter`` parameter for more details. +# +# If your samples have a different structure, you can still use CutMix and MixUp +# by passing a callable to the ``labels_getter`` parameter. For example: + +batch = { + "imgs": torch.rand(4, 3, 224, 224), + "target": { + "classes": torch.randint(0, NUM_CLASSES, size=(4,)), + "some_other_key": "this is going to be passed-through" + } +} + + +def labels_getter(batch): + return batch["target"]["classes"] + + +out = v2.CutMix(num_classes=NUM_CLASSES, labels_getter=labels_getter)(batch) +print(f"{out['imgs'].shape = }, {out['target']['classes'].shape = }") diff --git a/gallery/transforms/plot_transforms_e2e.py b/gallery/transforms/plot_transforms_e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..6c58b4a5a9ad6ca4161a984dec675a5dcfa2ce55 --- /dev/null +++ b/gallery/transforms/plot_transforms_e2e.py @@ -0,0 +1,181 @@ +""" +=============================================================== +Transforms v2: End-to-end object detection/segmentation example +=============================================================== + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +Object detection and segmentation tasks are natively supported: +``torchvision.transforms.v2`` enables jointly transforming images, videos, +bounding boxes, and masks. + +This example showcases an end-to-end instance segmentation training case using +Torchvision utils from ``torchvision.datasets``, ``torchvision.models`` and +``torchvision.transforms.v2``. Everything covered here can be applied similarly +to object detection or semantic segmentation tasks. +""" + +# %% +import pathlib + +import torch +import torch.utils.data + +from torchvision import models, datasets, tv_tensors +from torchvision.transforms import v2 + +torch.manual_seed(0) + +# This loads fake data for illustration purposes of this example. In practice, you'll have +# to replace this with the proper data. +# If you're trying to run that on collab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +ROOT = pathlib.Path("../assets") / "coco" +IMAGES_PATH = str(ROOT / "images") +ANNOTATIONS_PATH = str(ROOT / "instances.json") +from helpers import plot + + +# %% +# Dataset preparation +# ------------------- +# +# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently +# returns. 
+
+dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH)
+
+sample = dataset[0]
+img, target = sample
+print(f"{type(img) = }\n{type(target) = }\n{type(target[0]) = }\n{target[0].keys() = }")
+
+
+# %%
+# Torchvision datasets preserve the data structure and types as intended by the
+# dataset authors. So by default, the output structure may not always be
+# compatible with the models or the transforms.
+#
+# To overcome that, we can use the
+# :func:`~torchvision.datasets.wrap_dataset_for_transforms_v2` function. For
+# :class:`~torchvision.datasets.CocoDetection`, this changes the target
+# structure to a single dictionary of lists:
+
+dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=("boxes", "labels", "masks"))
+
+sample = dataset[0]
+img, target = sample
+print(f"{type(img) = }\n{type(target) = }\n{target.keys() = }")
+print(f"{type(target['boxes']) = }\n{type(target['labels']) = }\n{type(target['masks']) = }")
+
+# %%
+# We used the ``target_keys`` parameter to specify the kind of output we're
+# interested in. Our dataset now returns a target which is a dict where the
+# values are :ref:`TVTensors ` (all are :class:`torch.Tensor`
+# subclasses). We dropped all unnecessary keys from the previous output, but
+# if you need any of the original keys, e.g. "image_id", you can still ask for
+# it.
+#
+# .. note::
+#
+#     If you just want to do detection, you don't need and shouldn't pass
+#     "masks" in ``target_keys``: if masks are present in the sample, they will
+#     be transformed, slowing down your transformations unnecessarily.
+#
+# As a baseline, let's have a look at a sample without transformations:
+
+plot([dataset[0], dataset[1]])
+
+
+# %%
+# Transforms
+# ----------
+#
+# Let's now define our pre-processing transforms. All the transforms know how
+# to handle images, bounding boxes and masks when relevant.
+#
+# Transforms are typically passed as the ``transforms`` parameter of the
+# dataset so that they can leverage multi-processing from the
+# :class:`torch.utils.data.DataLoader`.
+
+transforms = v2.Compose(
+    [
+        v2.ToImage(),
+        v2.RandomPhotometricDistort(p=1),
+        v2.RandomZoomOut(fill={tv_tensors.Image: (123, 117, 104), "others": 0}),
+        v2.RandomIoUCrop(),
+        v2.RandomHorizontalFlip(p=1),
+        v2.SanitizeBoundingBoxes(),
+        v2.ToDtype(torch.float32, scale=True),
+    ]
+)
+
+dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH, transforms=transforms)
+dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=["boxes", "labels", "masks"])
+
+# %%
+# A few things are worth noting here:
+#
+# - We're converting the PIL image into a
+#   :class:`~torchvision.tv_tensors.Image` object. This isn't strictly
+#   necessary, but relying on Tensors (here: a Tensor subclass) will
+#   :ref:`generally be faster `.
+# - We are calling :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` to
+#   make sure we remove degenerate bounding boxes, as well as their
+#   corresponding labels and masks.
+#   :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` should be placed
+#   at least once at the end of a detection pipeline; it is particularly
+#   critical if :class:`~torchvision.transforms.v2.RandomIoUCrop` was used.
+#
+# Let's look at how a sample looks with our augmentation pipeline in place:
+
+# sphinx_gallery_thumbnail_number = 2
+plot([dataset[0], dataset[1]])
+
+
+# %%
+# We can see that the images were color-distorted, zoomed in or out, and flipped.
+# The bounding boxes and the masks were transformed accordingly, as the quick
+# check below confirms.
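As a quick sanity check, and purely as an illustrative sketch (it reuses the ``dataset`` defined just above and is not required for training), we can pull a single sample and confirm that the image is now a float tensor and that the boxes, labels and masks stayed in sync:

img, target = dataset[0]
print(f"{type(img) = }, {img.dtype = }, {img.shape = }")
print(f"{target['boxes'].shape = }, {target['labels'].shape = }, {target['masks'].shape = }")
# SanitizeBoundingBoxes removes degenerate boxes together with their labels and
# masks, so the first dimension of all three should agree.
assert target["boxes"].shape[0] == target["labels"].shape[0] == target["masks"].shape[0]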
And without any further ado, we can start training. +# +# Data loading and training loop +# ------------------------------ +# +# Below we're using Mask-RCNN which is an instance segmentation model, but +# everything we've covered in this tutorial also applies to object detection and +# semantic segmentation tasks. + +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=2, + # We need a custom collation function here, since the object detection + # models expect a sequence of images and target dictionaries. The default + # collation function tries to torch.stack() the individual elements, + # which fails in general for object detection, because the number of bouding + # boxes varies between the images of a same batch. + collate_fn=lambda batch: tuple(zip(*batch)), +) + +model = models.get_model("maskrcnn_resnet50_fpn_v2", weights=None, weights_backbone=None).train() + +for imgs, targets in data_loader: + loss_dict = model(imgs, targets) + # Put your training logic here + + print(f"{[img.shape for img in imgs] = }") + print(f"{[type(target) for target in targets] = }") + for name, loss_val in loss_dict.items(): + print(f"{name:<20}{loss_val:.3f}") + +# %% +# Training References +# ------------------- +# +# From there, you can check out the `torchvision references +# `_ where you'll find +# the actual training scripts we use to train our models. +# +# **Disclaimer** The code in our references is more complex than what you'll +# need for your own use-cases: this is because we're supporting different +# backends (PIL, tensors, TVTensors) and different transforms namespaces (v1 and +# v2). So don't be afraid to simplify and only keep what you need. diff --git a/gallery/transforms/plot_transforms_getting_started.py b/gallery/transforms/plot_transforms_getting_started.py new file mode 100644 index 0000000000000000000000000000000000000000..c61d1cc1be0681a708d6bc08a8ac3369c4c54ddc --- /dev/null +++ b/gallery/transforms/plot_transforms_getting_started.py @@ -0,0 +1,266 @@ +""" +================================== +Getting started with transforms v2 +================================== + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +This example illustrates all of what you need to know to get started with the +new :mod:`torchvision.transforms.v2` API. We'll cover simple tasks like +image classification, and more advanced ones like object detection / +segmentation. +""" + +# %% +# First, a bit of setup +from pathlib import Path +import torch +import matplotlib.pyplot as plt +plt.rcParams["savefig.bbox"] = 'tight' + +from torchvision.transforms import v2 +from torchvision.io import read_image + +torch.manual_seed(1) + +# If you're trying to run that on collab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +from helpers import plot +img = read_image(str(Path('../assets') / 'astronaut.jpg')) +print(f"{type(img) = }, {img.dtype = }, {img.shape = }") + +# %% +# The basics +# ---------- +# +# The Torchvision transforms behave like a regular :class:`torch.nn.Module` (in +# fact, most of them are): instantiate a transform, pass an input, get a +# transformed output: + +transform = v2.RandomCrop(size=(224, 224)) +out = transform(img) + +plot([img, out]) + +# %% +# I just want to do image classification +# -------------------------------------- +# +# If you just care about image classification, things are very simple. 
A basic +# classification pipeline may look like this: + +transforms = v2.Compose([ + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomHorizontalFlip(p=0.5), + v2.ToDtype(torch.float32, scale=True), + v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), +]) +out = transforms(img) + +plot([img, out]) + +# %% +# Such transformation pipeline is typically passed as the ``transform`` argument +# to the :ref:`Datasets `, e.g. ``ImageNet(..., +# transform=transforms)``. +# +# That's pretty much all there is. From there, read through our :ref:`main docs +# ` to learn more about recommended practices and conventions, or +# explore more :ref:`examples ` e.g. how to use augmentation +# transforms like :ref:`CutMix and MixUp +# `. +# +# .. note:: +# +# If you're already relying on the ``torchvision.transforms`` v1 API, +# we recommend to :ref:`switch to the new v2 transforms`. It's +# very easy: the v2 transforms are fully compatible with the v1 API, so you +# only need to change the import! +# +# Detection, Segmentation, Videos +# ------------------------------- +# +# The new Torchvision transforms in the ``torchvision.transforms.v2`` namespace +# support tasks beyond image classification: they can also transform bounding +# boxes, segmentation / detection masks, or videos. +# +# Let's briefly look at a detection example with bounding boxes. + +from torchvision import tv_tensors # we'll describe this a bit later, bare with us + +boxes = tv_tensors.BoundingBoxes( + [ + [15, 10, 370, 510], + [275, 340, 510, 510], + [130, 345, 210, 425] + ], + format="XYXY", canvas_size=img.shape[-2:]) + +transforms = v2.Compose([ + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomPhotometricDistort(p=1), + v2.RandomHorizontalFlip(p=1), +]) +out_img, out_boxes = transforms(img, boxes) +print(type(boxes), type(out_boxes)) + +plot([(img, boxes), (out_img, out_boxes)]) + +# %% +# +# The example above focuses on object detection. But if we had masks +# (:class:`torchvision.tv_tensors.Mask`) for object segmentation or semantic +# segmentation, or videos (:class:`torchvision.tv_tensors.Video`), we could have +# passed them to the transforms in exactly the same way. +# +# By now you likely have a few questions: what are these TVTensors, how do we +# use them, and what is the expected input/output of those transforms? We'll +# answer these in the next sections. + +# %% +# +# .. _what_are_tv_tensors: +# +# What are TVTensors? +# -------------------- +# +# TVTensors are :class:`torch.Tensor` subclasses. The available TVTensors are +# :class:`~torchvision.tv_tensors.Image`, +# :class:`~torchvision.tv_tensors.BoundingBoxes`, +# :class:`~torchvision.tv_tensors.Mask`, and +# :class:`~torchvision.tv_tensors.Video`. +# +# TVTensors look and feel just like regular tensors - they **are** tensors. +# Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()`` +# or any ``torch.*`` operator will also work on a TVTensor: + +img_dp = tv_tensors.Image(torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8)) + +print(f"{isinstance(img_dp, torch.Tensor) = }") +print(f"{img_dp.dtype = }, {img_dp.shape = }, {img_dp.sum() = }") + +# %% +# These TVTensor classes are at the core of the transforms: in order to +# transform a given input, the transforms first look at the **class** of the +# object, and dispatch to the appropriate implementation accordingly. 
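To make that dispatching a bit more tangible, here is a small sketch reusing the ``img`` and ``boxes`` defined in the detection example above: a single ``Resize`` call treats each input according to its class, resizing the pixels of the image and rescaling the coordinates of the bounding boxes to the new canvas.

resize = v2.Resize(size=(224, 224), antialias=True)
resized_img, resized_boxes = resize(img, boxes)
print(f"{resized_img.shape = }")  # the pixel data was resized
print(f"{boxes[0] = }\n{resized_boxes[0] = }")  # the box coordinates were rescaled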
+# +# You don't need to know much more about TVTensors at this point, but advanced +# users who want to learn more can refer to +# :ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py`. +# +# What do I pass as input? +# ------------------------ +# +# Above, we've seen two examples: one where we passed a single image as input +# i.e. ``out = transforms(img)``, and one where we passed both an image and +# bounding boxes, i.e. ``out_img, out_boxes = transforms(img, boxes)``. +# +# In fact, transforms support **arbitrary input structures**. The input can be a +# single image, a tuple, an arbitrarily nested dictionary... pretty much +# anything. The same structure will be returned as output. Below, we use the +# same detection transforms, but pass a tuple (image, target_dict) as input and +# we're getting the same structure as output: + +target = { + "boxes": boxes, + "labels": torch.arange(boxes.shape[0]), + "this_is_ignored": ("arbitrary", {"structure": "!"}) +} + +# Re-using the transforms and definitions from above. +out_img, out_target = transforms(img, target) + +# sphinx_gallery_thumbnail_number = 4 +plot([(img, target["boxes"]), (out_img, out_target["boxes"])]) +print(f"{out_target['this_is_ignored']}") + +# %% +# We passed a tuple so we get a tuple back, and the second element is the +# tranformed target dict. Transforms don't really care about the structure of +# the input; as mentioned above, they only care about the **type** of the +# objects and transforms them accordingly. +# +# *Foreign* objects like strings or ints are simply passed-through. This can be +# useful e.g. if you want to associate a path with every single sample when +# debugging! +# +# .. _passthrough_heuristic: +# +# .. note:: +# +# **Disclaimer** This note is slightly advanced and can be safely skipped on +# a first read. +# +# Pure :class:`torch.Tensor` objects are, in general, treated as images (or +# as videos for video-specific transforms). Indeed, you may have noticed +# that in the code above we haven't used the +# :class:`~torchvision.tv_tensors.Image` class at all, and yet our images +# got transformed properly. Transforms follow the following logic to +# determine whether a pure Tensor should be treated as an image (or video), +# or just ignored: +# +# * If there is an :class:`~torchvision.tv_tensors.Image`, +# :class:`~torchvision.tv_tensors.Video`, +# or :class:`PIL.Image.Image` instance in the input, all other pure +# tensors are passed-through. +# * If there is no :class:`~torchvision.tv_tensors.Image` or +# :class:`~torchvision.tv_tensors.Video` instance, only the first pure +# :class:`torch.Tensor` will be transformed as image or video, while all +# others will be passed-through. Here "first" means "first in a depth-wise +# traversal". +# +# This is what happened in the detection example above: the first pure +# tensor was the image so it got transformed properly, and all other pure +# tensor instances like the ``labels`` were passed-through (although labels +# can still be transformed by some transforms like +# :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`!). +# +# .. _transforms_datasets_intercompatibility: +# +# Transforms and Datasets intercompatibility +# ------------------------------------------ +# +# Roughly speaking, the output of the datasets must correspond to the input of +# the transforms. How to do that depends on whether you're using the torchvision +# :ref:`built-in datatsets `, or your own custom datasets. 
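Before moving on to datasets, the pass-through heuristic from the note above can be illustrated with a tiny sketch. The tensor names are made up for this example, and the behavior shown is simply the one documented above: with no ``Image``, ``Video`` or PIL image in the input, only the first pure tensor is treated as an image and everything else is returned untouched.

flip = v2.RandomHorizontalFlip(p=1)
plain_image = torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8)
plain_labels = torch.tensor([0, 1, 2])
flipped_image, untouched_labels = flip(plain_image, plain_labels)
print(torch.equal(flipped_image, plain_image.flip(-1)))  # True: the first pure tensor was flipped
print(torch.equal(untouched_labels, plain_labels))       # True: the second one was passed through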
+# +# Using built-in datasets +# ^^^^^^^^^^^^^^^^^^^^^^^ +# +# If you're just doing image classification, you don't need to do anything. Just +# use ``transform`` argument of the dataset e.g. ``ImageNet(..., +# transform=transforms)`` and you're good to go. +# +# Torchvision also supports datasets for object detection or segmentation like +# :class:`torchvision.datasets.CocoDetection`. Those datasets predate +# the existence of the :mod:`torchvision.transforms.v2` module and of the +# TVTensors, so they don't return TVTensors out of the box. +# +# An easy way to force those datasets to return TVTensors and to make them +# compatible with v2 transforms is to use the +# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2` function: +# +# .. code-block:: python +# +# from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2 +# +# dataset = CocoDetection(..., transforms=my_transforms) +# dataset = wrap_dataset_for_transforms_v2(dataset) +# # Now the dataset returns TVTensors! +# +# Using your own datasets +# ^^^^^^^^^^^^^^^^^^^^^^^ +# +# If you have a custom dataset, then you'll need to convert your objects into +# the appropriate TVTensor classes. Creating TVTensor instances is very easy, +# refer to :ref:`tv_tensor_creation` for more details. +# +# There are two main places where you can implement that conversion logic: +# +# - At the end of the datasets's ``__getitem__`` method, before returning the +# sample (or by sub-classing the dataset). +# - As the very first step of your transforms pipeline +# +# Either way, the logic will depend on your specific dataset. diff --git a/gallery/plot_transforms.py b/gallery/transforms/plot_transforms_illustrations.py similarity index 56% rename from gallery/plot_transforms.py rename to gallery/transforms/plot_transforms_illustrations.py index c6e44a14e229915903551b995753f6032c704e40..95ab455d0fd80d1752f9964b18c866172ae866bc 100644 --- a/gallery/plot_transforms.py +++ b/gallery/transforms/plot_transforms_illustrations.py @@ -3,317 +3,318 @@ Illustration of transforms ========================== -This example illustrates the various transforms available in :ref:`the -torchvision.transforms module `. +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +This example illustrates some of the various transforms available in :ref:`the +torchvision.transforms.v2 module `. """ +# %% # sphinx_gallery_thumbnail_path = "../../gallery/assets/transforms_thumbnail.png" from PIL import Image from pathlib import Path import matplotlib.pyplot as plt -import numpy as np import torch -import torchvision.transforms as T - +from torchvision.transforms import v2 plt.rcParams["savefig.bbox"] = 'tight' -orig_img = Image.open(Path('assets') / 'astronaut.jpg') + # if you change the seed, make sure that the randomly-applied transforms # properly show that the image can be both transformed and *not* transformed! 
torch.manual_seed(0) - -def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs): - if not isinstance(imgs[0], list): - # Make a 2d grid even if there's just 1 row - imgs = [imgs] - - num_rows = len(imgs) - num_cols = len(imgs[0]) + with_orig - fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False) - for row_idx, row in enumerate(imgs): - row = [orig_img] + row if with_orig else row - for col_idx, img in enumerate(row): - ax = axs[row_idx, col_idx] - ax.imshow(np.asarray(img), **imshow_kwargs) - ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) - - if with_orig: - axs[0, 0].set(title='Original image') - axs[0, 0].title.set_size(8) - if row_title is not None: - for row_idx in range(num_rows): - axs[row_idx, 0].set(ylabel=row_title[row_idx]) - - plt.tight_layout() - - -#################################### +# If you're trying to run that on collab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +from helpers import plot +orig_img = Image.open(Path('../assets') / 'astronaut.jpg') + +# %% +# Geometric Transforms +# -------------------- +# Geometric image transformation refers to the process of altering the geometric properties of an image, +# such as its shape, size, orientation, or position. +# It involves applying mathematical operations to the image pixels or coordinates to achieve the desired transformation. +# # Pad -# --- +# ~~~ # The :class:`~torchvision.transforms.Pad` transform # (see also :func:`~torchvision.transforms.functional.pad`) -# fills image borders with some pixel values. -padded_imgs = [T.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)] -plot(padded_imgs) +# pads all image borders with some pixel values. +padded_imgs = [v2.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)] +plot([orig_img] + padded_imgs) -#################################### +# %% # Resize -# ------ +# ~~~~~~ # The :class:`~torchvision.transforms.Resize` transform # (see also :func:`~torchvision.transforms.functional.resize`) # resizes an image. -resized_imgs = [T.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] -plot(resized_imgs) +resized_imgs = [v2.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] +plot([orig_img] + resized_imgs) -#################################### +# %% # CenterCrop -# ---------- +# ~~~~~~~~~~ # The :class:`~torchvision.transforms.CenterCrop` transform # (see also :func:`~torchvision.transforms.functional.center_crop`) # crops the given image at the center. -center_crops = [T.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] -plot(center_crops) +center_crops = [v2.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] +plot([orig_img] + center_crops) -#################################### +# %% # FiveCrop -# -------- +# ~~~~~~~~ # The :class:`~torchvision.transforms.FiveCrop` transform # (see also :func:`~torchvision.transforms.functional.five_crop`) # crops the given image into four corners and the central crop. 
-(top_left, top_right, bottom_left, bottom_right, center) = T.FiveCrop(size=(100, 100))(orig_img) -plot([top_left, top_right, bottom_left, bottom_right, center]) +(top_left, top_right, bottom_left, bottom_right, center) = v2.FiveCrop(size=(100, 100))(orig_img) +plot([orig_img] + [top_left, top_right, bottom_left, bottom_right, center]) -#################################### -# Grayscale -# --------- -# The :class:`~torchvision.transforms.Grayscale` transform -# (see also :func:`~torchvision.transforms.functional.to_grayscale`) -# converts an image to grayscale -gray_img = T.Grayscale()(orig_img) -plot([gray_img], cmap='gray') - -#################################### -# Random transforms -# ----------------- -# The following transforms are random, which means that the same transfomer -# instance will produce different result each time it transforms a given image. -# -# ColorJitter -# ~~~~~~~~~~~ -# The :class:`~torchvision.transforms.ColorJitter` transform -# randomly changes the brightness, saturation, and other properties of an image. -jitter = T.ColorJitter(brightness=.5, hue=.3) -jitted_imgs = [jitter(orig_img) for _ in range(4)] -plot(jitted_imgs) - -#################################### -# GaussianBlur -# ~~~~~~~~~~~~ -# The :class:`~torchvision.transforms.GaussianBlur` transform -# (see also :func:`~torchvision.transforms.functional.gaussian_blur`) -# performs gaussian blur transform on an image. -blurrer = T.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5)) -blurred_imgs = [blurrer(orig_img) for _ in range(4)] -plot(blurred_imgs) - -#################################### +# %% # RandomPerspective # ~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomPerspective` transform # (see also :func:`~torchvision.transforms.functional.perspective`) # performs random perspective transform on an image. -perspective_transformer = T.RandomPerspective(distortion_scale=0.6, p=1.0) +perspective_transformer = v2.RandomPerspective(distortion_scale=0.6, p=1.0) perspective_imgs = [perspective_transformer(orig_img) for _ in range(4)] -plot(perspective_imgs) +plot([orig_img] + perspective_imgs) -#################################### +# %% # RandomRotation # ~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomRotation` transform # (see also :func:`~torchvision.transforms.functional.rotate`) # rotates an image with random angle. -rotater = T.RandomRotation(degrees=(0, 180)) +rotater = v2.RandomRotation(degrees=(0, 180)) rotated_imgs = [rotater(orig_img) for _ in range(4)] -plot(rotated_imgs) +plot([orig_img] + rotated_imgs) -#################################### +# %% # RandomAffine # ~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomAffine` transform # (see also :func:`~torchvision.transforms.functional.affine`) # performs random affine transform on an image. -affine_transfomer = T.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75)) +affine_transfomer = v2.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75)) affine_imgs = [affine_transfomer(orig_img) for _ in range(4)] -plot(affine_imgs) +plot([orig_img] + affine_imgs) -#################################### +# %% # ElasticTransform # ~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.ElasticTransform` transform # (see also :func:`~torchvision.transforms.functional.elastic_transform`) # Randomly transforms the morphology of objects in images and produces a # see-through-water-like effect. 
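+# The strength of the effect is mostly driven by ``alpha`` (magnitude of the
+# displacement field) and ``sigma`` (its smoothness). A hedged sketch of that
+# trade-off, with purely illustrative values:
+#
+# .. code-block:: python
+#
+#     subtle = v2.ElasticTransform(alpha=50.0, sigma=5.0)
+#     strong = v2.ElasticTransform(alpha=500.0, sigma=5.0)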
-elastic_transformer = T.ElasticTransform(alpha=250.0) +elastic_transformer = v2.ElasticTransform(alpha=250.0) transformed_imgs = [elastic_transformer(orig_img) for _ in range(2)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) -#################################### +# %% # RandomCrop # ~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomCrop` transform # (see also :func:`~torchvision.transforms.functional.crop`) # crops an image at a random location. -cropper = T.RandomCrop(size=(128, 128)) +cropper = v2.RandomCrop(size=(128, 128)) crops = [cropper(orig_img) for _ in range(4)] -plot(crops) +plot([orig_img] + crops) -#################################### +# %% # RandomResizedCrop # ~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomResizedCrop` transform # (see also :func:`~torchvision.transforms.functional.resized_crop`) # crops an image at a random location, and then resizes the crop to a given # size. -resize_cropper = T.RandomResizedCrop(size=(32, 32)) +resize_cropper = v2.RandomResizedCrop(size=(32, 32)) resized_crops = [resize_cropper(orig_img) for _ in range(4)] -plot(resized_crops) +plot([orig_img] + resized_crops) + +# %% +# Photometric Transforms +# ---------------------- +# Photometric image transformation refers to the process of modifying the photometric properties of an image, +# such as its brightness, contrast, color, or tone. +# These transformations are applied to change the visual appearance of an image +# while preserving its geometric structure. +# +# Except :class:`~torchvision.transforms.Grayscale`, the following transforms are random, +# which means that the same transform +# instance will produce different result each time it transforms a given image. +# +# Grayscale +# ~~~~~~~~~ +# The :class:`~torchvision.transforms.Grayscale` transform +# (see also :func:`~torchvision.transforms.functional.to_grayscale`) +# converts an image to grayscale +gray_img = v2.Grayscale()(orig_img) +plot([orig_img, gray_img], cmap='gray') -#################################### +# %% +# ColorJitter +# ~~~~~~~~~~~ +# The :class:`~torchvision.transforms.ColorJitter` transform +# randomly changes the brightness, contrast, saturation, hue, and other properties of an image. +jitter = v2.ColorJitter(brightness=.5, hue=.3) +jittered_imgs = [jitter(orig_img) for _ in range(4)] +plot([orig_img] + jittered_imgs) + +# %% +# GaussianBlur +# ~~~~~~~~~~~~ +# The :class:`~torchvision.transforms.GaussianBlur` transform +# (see also :func:`~torchvision.transforms.functional.gaussian_blur`) +# performs gaussian blur transform on an image. +blurrer = v2.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5.)) +blurred_imgs = [blurrer(orig_img) for _ in range(4)] +plot([orig_img] + blurred_imgs) + +# %% # RandomInvert # ~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomInvert` transform # (see also :func:`~torchvision.transforms.functional.invert`) # randomly inverts the colors of the given image. -inverter = T.RandomInvert() +inverter = v2.RandomInvert() invertered_imgs = [inverter(orig_img) for _ in range(4)] -plot(invertered_imgs) +plot([orig_img] + invertered_imgs) -#################################### +# %% # RandomPosterize # ~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomPosterize` transform # (see also :func:`~torchvision.transforms.functional.posterize`) # randomly posterizes the image by reducing the number of bits # of each color channel. 
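+# The smaller ``bits`` is, the fewer distinct values each channel keeps and the
+# stronger the effect. A hedged sketch with purely illustrative values
+# (``p=1.0`` only makes the transform always fire):
+#
+# .. code-block:: python
+#
+#     mild = v2.RandomPosterize(bits=6, p=1.0)
+#     strong = v2.RandomPosterize(bits=2, p=1.0)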
-posterizer = T.RandomPosterize(bits=2) +posterizer = v2.RandomPosterize(bits=2) posterized_imgs = [posterizer(orig_img) for _ in range(4)] -plot(posterized_imgs) +plot([orig_img] + posterized_imgs) -#################################### +# %% # RandomSolarize # ~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomSolarize` transform # (see also :func:`~torchvision.transforms.functional.solarize`) # randomly solarizes the image by inverting all pixel values above # the threshold. -solarizer = T.RandomSolarize(threshold=192.0) +solarizer = v2.RandomSolarize(threshold=192.0) solarized_imgs = [solarizer(orig_img) for _ in range(4)] -plot(solarized_imgs) +plot([orig_img] + solarized_imgs) -#################################### +# %% # RandomAdjustSharpness # ~~~~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomAdjustSharpness` transform # (see also :func:`~torchvision.transforms.functional.adjust_sharpness`) # randomly adjusts the sharpness of the given image. -sharpness_adjuster = T.RandomAdjustSharpness(sharpness_factor=2) +sharpness_adjuster = v2.RandomAdjustSharpness(sharpness_factor=2) sharpened_imgs = [sharpness_adjuster(orig_img) for _ in range(4)] -plot(sharpened_imgs) +plot([orig_img] + sharpened_imgs) -#################################### +# %% # RandomAutocontrast # ~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomAutocontrast` transform # (see also :func:`~torchvision.transforms.functional.autocontrast`) # randomly applies autocontrast to the given image. -autocontraster = T.RandomAutocontrast() +autocontraster = v2.RandomAutocontrast() autocontrasted_imgs = [autocontraster(orig_img) for _ in range(4)] -plot(autocontrasted_imgs) +plot([orig_img] + autocontrasted_imgs) -#################################### +# %% # RandomEqualize # ~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomEqualize` transform # (see also :func:`~torchvision.transforms.functional.equalize`) # randomly equalizes the histogram of the given image. -equalizer = T.RandomEqualize() +equalizer = v2.RandomEqualize() equalized_imgs = [equalizer(orig_img) for _ in range(4)] -plot(equalized_imgs) +plot([orig_img] + equalized_imgs) -#################################### +# %% +# Augmentation Transforms +# ----------------------- +# The following transforms are combinations of multiple transforms, +# either geometric or photometric, or both. +# # AutoAugment # ~~~~~~~~~~~ # The :class:`~torchvision.transforms.AutoAugment` transform # automatically augments data based on a given auto-augmentation policy. # See :class:`~torchvision.transforms.AutoAugmentPolicy` for the available policies. -policies = [T.AutoAugmentPolicy.CIFAR10, T.AutoAugmentPolicy.IMAGENET, T.AutoAugmentPolicy.SVHN] -augmenters = [T.AutoAugment(policy) for policy in policies] +policies = [v2.AutoAugmentPolicy.CIFAR10, v2.AutoAugmentPolicy.IMAGENET, v2.AutoAugmentPolicy.SVHN] +augmenters = [v2.AutoAugment(policy) for policy in policies] imgs = [ [augmenter(orig_img) for _ in range(4)] for augmenter in augmenters ] row_title = [str(policy).split('.')[-1] for policy in policies] -plot(imgs, row_title=row_title) +plot([[orig_img] + row for row in imgs], row_title=row_title) -#################################### +# %% # RandAugment # ~~~~~~~~~~~ -# The :class:`~torchvision.transforms.RandAugment` transform automatically augments the data. -augmenter = T.RandAugment() +# The :class:`~torchvision.transforms.RandAugment` is an alternate version of AutoAugment. 
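+# Unlike AutoAugment it needs no dataset-specific policy: its strength is
+# controlled by ``num_ops`` (how many operations are chained) and
+# ``magnitude``. A hedged sketch with purely illustrative values:
+#
+# .. code-block:: python
+#
+#     light = v2.RandAugment(num_ops=1, magnitude=5)
+#     heavy = v2.RandAugment(num_ops=3, magnitude=15)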
+augmenter = v2.RandAugment() imgs = [augmenter(orig_img) for _ in range(4)] -plot(imgs) +plot([orig_img] + imgs) -#################################### +# %% # TrivialAugmentWide # ~~~~~~~~~~~~~~~~~~ -# The :class:`~torchvision.transforms.TrivialAugmentWide` transform automatically augments the data. -augmenter = T.TrivialAugmentWide() +# The :class:`~torchvision.transforms.TrivialAugmentWide` is an alternate implementation of AutoAugment. +# However, instead of transforming an image multiple times, it transforms an image only once +# using a random transform from a given list with a random strength number. +augmenter = v2.TrivialAugmentWide() imgs = [augmenter(orig_img) for _ in range(4)] -plot(imgs) +plot([orig_img] + imgs) -#################################### +# %% # AugMix # ~~~~~~ -# The :class:`~torchvision.transforms.AugMix` transform automatically augments the data. -augmenter = T.AugMix() +# The :class:`~torchvision.transforms.AugMix` transform interpolates between augmented versions of an image. +augmenter = v2.AugMix() imgs = [augmenter(orig_img) for _ in range(4)] -plot(imgs) +plot([orig_img] + imgs) -#################################### -# Randomly-applied transforms +# %% +# Randomly-applied Transforms # --------------------------- # -# Some transforms are randomly-applied given a probability ``p``. That is, the -# transformed image may actually be the same as the original one, even when -# called with the same transformer instance! +# The following transforms are randomly-applied given a probability ``p``. That is, given ``p = 0.5``, +# there is a 50% chance to return the original image, and a 50% chance to return the transformed image, +# even when called with the same transform instance! # # RandomHorizontalFlip # ~~~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomHorizontalFlip` transform # (see also :func:`~torchvision.transforms.functional.hflip`) # performs horizontal flip of an image, with a given probability. -hflipper = T.RandomHorizontalFlip(p=0.5) +hflipper = v2.RandomHorizontalFlip(p=0.5) transformed_imgs = [hflipper(orig_img) for _ in range(4)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) -#################################### +# %% # RandomVerticalFlip # ~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomVerticalFlip` transform # (see also :func:`~torchvision.transforms.functional.vflip`) # performs vertical flip of an image, with a given probability. -vflipper = T.RandomVerticalFlip(p=0.5) +vflipper = v2.RandomVerticalFlip(p=0.5) transformed_imgs = [vflipper(orig_img) for _ in range(4)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) -#################################### +# %% # RandomApply # ~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomApply` transform # randomly applies a list of transforms, with a given probability. -applier = T.RandomApply(transforms=[T.RandomCrop(size=(64, 64))], p=0.5) +applier = v2.RandomApply(transforms=[v2.RandomCrop(size=(64, 64))], p=0.5) transformed_imgs = [applier(orig_img) for _ in range(4)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) diff --git a/gallery/transforms/plot_tv_tensors.py b/gallery/transforms/plot_tv_tensors.py new file mode 100644 index 0000000000000000000000000000000000000000..0cdbe9d083142ead856afe5387da0d4b1dd1ef0a --- /dev/null +++ b/gallery/transforms/plot_tv_tensors.py @@ -0,0 +1,224 @@ +""" +============= +TVTensors FAQ +============= + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. 
+ + +TVTensors are Tensor subclasses introduced together with +``torchvision.transforms.v2``. This example showcases what these TVTensors are +and how they behave. + +.. warning:: + + **Intended Audience** Unless you're writing your own transforms or your own TVTensors, you + probably do not need to read this guide. This is a fairly low-level topic + that most users will not need to worry about: you do not need to understand + the internals of TVTensors to efficiently rely on + ``torchvision.transforms.v2``. It may however be useful for advanced users + trying to implement their own datasets, transforms, or work directly with + the TVTensors. +""" + +# %% +import PIL.Image + +import torch +from torchvision import tv_tensors + + +# %% +# What are TVTensors? +# ------------------- +# +# TVTensors are zero-copy tensor subclasses: + +tensor = torch.rand(3, 256, 256) +image = tv_tensors.Image(tensor) + +assert isinstance(image, torch.Tensor) +assert image.data_ptr() == tensor.data_ptr() + +# %% +# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function +# for the input data. +# +# :mod:`torchvision.tv_tensors` supports four types of TVTensors: +# +# * :class:`~torchvision.tv_tensors.Image` +# * :class:`~torchvision.tv_tensors.Video` +# * :class:`~torchvision.tv_tensors.BoundingBoxes` +# * :class:`~torchvision.tv_tensors.Mask` +# +# What can I do with a TVTensor? +# ------------------------------ +# +# TVTensors look and feel just like regular tensors - they **are** tensors. +# Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()`` or +# any ``torch.*`` operator will also work on TVTensors. See +# :ref:`tv_tensor_unwrapping_behaviour` for a few gotchas. + +# %% +# .. _tv_tensor_creation: +# +# How do I construct a TVTensor? +# ------------------------------ +# +# Using the constructor +# ^^^^^^^^^^^^^^^^^^^^^ +# +# Each TVTensor class takes any tensor-like data that can be turned into a :class:`~torch.Tensor` + +image = tv_tensors.Image([[[[0, 1], [1, 0]]]]) +print(image) + + +# %% +# Similar to other PyTorch creations ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad`` +# parameters. + +float_image = tv_tensors.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True) +print(float_image) + + +# %% +# In addition, :class:`~torchvision.tv_tensors.Image` and :class:`~torchvision.tv_tensors.Mask` can also take a +# :class:`PIL.Image.Image` directly: + +image = tv_tensors.Image(PIL.Image.open("../assets/astronaut.jpg")) +print(image.shape, image.dtype) + +# %% +# Some TVTensors require additional metadata to be passed in ordered to be constructed. For example, +# :class:`~torchvision.tv_tensors.BoundingBoxes` requires the coordinate format as well as the size of the +# corresponding image (``canvas_size``) alongside the actual values. These +# metadata are required to properly transform the bounding boxes. + +bboxes = tv_tensors.BoundingBoxes( + [[17, 16, 344, 495], [0, 10, 0, 10]], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=image.shape[-2:] +) +print(bboxes) + +# %% +# Using ``tv_tensors.wrap()`` +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# You can also use the :func:`~torchvision.tv_tensors.wrap` function to wrap a tensor object +# into a TVTensor. This is useful when you already have an object of the +# desired type, which typically happens when writing transforms: you just want +# to wrap the output like the input. 
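+#
+# For instance, a custom transform could compute a plain tensor and then
+# re-wrap it. A hedged sketch (``ShiftBoxes`` is a made-up name, not a
+# torchvision class):
+#
+# .. code-block:: python
+#
+#     class ShiftBoxes(torch.nn.Module):
+#         def forward(self, boxes: tv_tensors.BoundingBoxes) -> tv_tensors.BoundingBoxes:
+#             shifted = boxes + 2  # plain torch.Tensor, see the unwrapping section below
+#             return tv_tensors.wrap(shifted, like=boxes)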
+ +new_bboxes = torch.tensor([0, 20, 30, 40]) +new_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes) +assert isinstance(new_bboxes, tv_tensors.BoundingBoxes) +assert new_bboxes.canvas_size == bboxes.canvas_size + +# %% +# The metadata of ``new_bboxes`` is the same as ``bboxes``, but you could pass +# it as a parameter to override it. +# +# .. _tv_tensor_unwrapping_behaviour: +# +# I had a TVTensor but now I have a Tensor. Help! +# ----------------------------------------------- +# +# By default, operations on :class:`~torchvision.tv_tensors.TVTensor` objects +# will return a pure Tensor: + + +assert isinstance(bboxes, tv_tensors.BoundingBoxes) + +# Shift bboxes by 3 pixels in both H and W +new_bboxes = bboxes + 3 + +assert isinstance(new_bboxes, torch.Tensor) +assert not isinstance(new_bboxes, tv_tensors.BoundingBoxes) + +# %% +# .. note:: +# +# This behavior only affects native ``torch`` operations. If you are using +# the built-in ``torchvision`` transforms or functionals, you will always get +# as output the same type that you passed as input (pure ``Tensor`` or +# ``TVTensor``). + +# %% +# But I want a TVTensor back! +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# You can re-wrap a pure tensor into a TVTensor by just calling the TVTensor +# constructor, or by using the :func:`~torchvision.tv_tensors.wrap` function +# (see more details above in :ref:`tv_tensor_creation`): + +new_bboxes = bboxes + 3 +new_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes) +assert isinstance(new_bboxes, tv_tensors.BoundingBoxes) + +# %% +# Alternatively, you can use the :func:`~torchvision.tv_tensors.set_return_type` +# as a global config setting for the whole program, or as a context manager +# (read its docs to learn more about caveats): + +with tv_tensors.set_return_type("TVTensor"): + new_bboxes = bboxes + 3 +assert isinstance(new_bboxes, tv_tensors.BoundingBoxes) + +# %% +# Why is this happening? +# ^^^^^^^^^^^^^^^^^^^^^^ +# +# **For performance reasons**. :class:`~torchvision.tv_tensors.TVTensor` +# classes are Tensor subclasses, so any operation involving a +# :class:`~torchvision.tv_tensors.TVTensor` object will go through the +# `__torch_function__ +# `_ +# protocol. This induces a small overhead, which we want to avoid when possible. +# This doesn't matter for built-in ``torchvision`` transforms because we can +# avoid the overhead there, but it could be a problem in your model's +# ``forward``. +# +# **The alternative isn't much better anyway.** For every operation where +# preserving the :class:`~torchvision.tv_tensors.TVTensor` type makes +# sense, there are just as many operations where returning a pure Tensor is +# preferable: for example, is ``img.sum()`` still an :class:`~torchvision.tv_tensors.Image`? +# If we were to preserve :class:`~torchvision.tv_tensors.TVTensor` types all +# the way, even model's logits or the output of the loss function would end up +# being of type :class:`~torchvision.tv_tensors.Image`, and surely that's not +# desirable. +# +# .. note:: +# +# This behaviour is something we're actively seeking feedback on. If you find this surprising or if you +# have any suggestions on how to better support your use-cases, please reach out to us via this issue: +# https://github.com/pytorch/vision/issues/7319 +# +# Exceptions +# ^^^^^^^^^^ +# +# There are a few exceptions to this "unwrapping" rule: +# :meth:`~torch.Tensor.clone`, :meth:`~torch.Tensor.to`, +# :meth:`torch.Tensor.detach`, and :meth:`~torch.Tensor.requires_grad_` retain +# the TVTensor type. 
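+#
+# A quick hedged sketch of those exceptions (``torch.float64`` is just an
+# arbitrary target dtype):
+#
+# .. code-block:: python
+#
+#     img = tv_tensors.Image(torch.rand(3, 32, 32))
+#     assert isinstance(img.clone(), tv_tensors.Image)
+#     assert isinstance(img.to(torch.float64), tv_tensors.Image)
+#     assert isinstance(img.detach(), tv_tensors.Image)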
+# +# Inplace operations on TVTensors like ``obj.add_()`` will preserve the type of +# ``obj``. However, the **returned** value of inplace operations will be a pure +# tensor: + +image = tv_tensors.Image([[[0, 1], [1, 0]]]) + +new_image = image.add_(1).mul_(2) + +# image got transformed in-place and is still a TVTensor Image, but new_image +# is a Tensor. They share the same underlying data and they're equal, just +# different classes. +assert isinstance(image, tv_tensors.Image) +print(image) + +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, tv_tensors.Image) +assert (new_image == image).all() +assert new_image.data_ptr() == image.data_ptr() diff --git a/hubconf.py b/hubconf.py index 57ce7a0d12a3ebd200493a565bd2ab439ab16c00..637827127cab488eb0cf7d08ff9eb120a1989155 100644 --- a/hubconf.py +++ b/hubconf.py @@ -20,6 +20,7 @@ from torchvision.models.efficientnet import ( ) from torchvision.models.googlenet import googlenet from torchvision.models.inception import inception_v3 +from torchvision.models.maxvit import maxvit_t from torchvision.models.mnasnet import mnasnet0_5, mnasnet0_75, mnasnet1_0, mnasnet1_3 from torchvision.models.mobilenetv2 import mobilenet_v2 from torchvision.models.mobilenetv3 import mobilenet_v3_large, mobilenet_v3_small @@ -68,6 +69,17 @@ from torchvision.models.shufflenetv2 import ( shufflenet_v2_x2_0, ) from torchvision.models.squeezenet import squeezenet1_0, squeezenet1_1 -from torchvision.models.swin_transformer import swin_b, swin_s, swin_t +from torchvision.models.swin_transformer import swin_b, swin_s, swin_t, swin_v2_b, swin_v2_s, swin_v2_t from torchvision.models.vgg import vgg11, vgg11_bn, vgg13, vgg13_bn, vgg16, vgg16_bn, vgg19, vgg19_bn +from torchvision.models.video import ( + mc3_18, + mvit_v1_b, + mvit_v2_s, + r2plus1d_18, + r3d_18, + s3d, + swin3d_b, + swin3d_s, + swin3d_t, +) from torchvision.models.vision_transformer import vit_b_16, vit_b_32, vit_h_14, vit_l_16, vit_l_32 diff --git a/ios/CMakeLists.txt b/ios/CMakeLists.txt index 6b9fd3925b2972faa7e0691187fe1b8cfd2d810f..4201240a42725dc52e05e67859347c65459e7e8e 100644 --- a/ios/CMakeLists.txt +++ b/ios/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.4.1) set(TARGET torchvision_ops) project(${TARGET} CXX) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) set(LIBTORCH_HEADER_ROOT ${LIBTORCH_HEADER_ROOT}) set(LIBRARY_OUTPUT_PATH ../lib) diff --git a/ios/LibTorchvision.podspec b/ios/LibTorchvision.podspec index ba87820e142f7804d5824e5975a9f091cacfe63a..b88fb70ac40fe786a323e2eb40065ac0ad537dcc 100644 --- a/ios/LibTorchvision.podspec +++ b/ios/LibTorchvision.podspec @@ -1,8 +1,8 @@ -pytorch_version = '1.12.0' +pytorch_version = '2.0.0' Pod::Spec.new do |s| s.name = 'LibTorchvision' - s.version = '0.13.0' + s.version = '0.15.1' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/vision' diff --git a/ios/README.md b/ios/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0b50245f1ee819ede8dbdc177c10e06db48e8408 --- /dev/null +++ b/ios/README.md @@ -0,0 +1,3 @@ +## Status + +The iOS demo of TorchVision is currently unmaintained, untested and likely out-of-date. 
diff --git a/maintainer_guide.md b/maintainer_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..24ac0943821d86a82ff2021add09799d730ca4b4 --- /dev/null +++ b/maintainer_guide.md @@ -0,0 +1,76 @@ +## Torchvision maintainers guide + +This document aims at documenting user-facing policies / principles used when +developing and maintaining torchvision. Other maintainer info (e.g. release +process) can be found in the meta-internal wiki. + +### What is public and what is private? + +For the Python API, torchvision largely follows the [PyTorch +policy](https://github.com/pytorch/pytorch/wiki/Public-API-definition-and-documentation) +which is consistent with other major packages +([numpy](https://numpy.org/neps/nep-0023-backwards-compatibility.html), +[scikit-learn](https://scikit-learn.org/dev/glossary.html#term-API) etc.). +We recognize that his policy is somewhat imperfect for some edge cases, and that +it's difficult to come up with an accurate technical definition. In broad terms, +which are usually well understood by users, the policy is that: + +- modules that can be accessed without leading underscore are public +- objects in a public file that don't have a leading underscore are public +- class attributes are public iff they have no leading underscore +- the rest of the modules / objects / class attributes are considered private + +The public API has backward-compatible (BC) guarantees defined in our +deprecation policy (see below). The private API has not BC guarantees. + +For C++, code is private. For Meta employees: if a C++ change breaks fbcode, fix +fbcode or revert the change. We should be careful about models running in +production and relying on torchvision ops. + +The `test` folder is not importable and is **private.** Even meta-internal +projects should *not* rely on it (it has happened in the past and is now +programmatically impossible). + +The training references do not have BC guarantees. Breaking changes are +possible, but we should make sure that the tutorials are still running properly, +and that their intended narrative is preserved (by e.g. checking outputs, +etc.). + +The rest of the folders (build, android, ios, etc.) are private and have no BC +guarantees. + +### Deprecation policy. + +Because they're disruptive, **deprecations should only be used sparingly**. + +We largely follow the [PyTorch +policy](https://github.com/pytorch/pytorch/wiki/PyTorch's-Python-Frontend-Backward-and-Forward-Compatibility-Policy): +breaking changes require a deprecation period of at least 2 versions. + +Deprecations should clearly indicate their deadline in the docs and warning +messages. Avoid not committing to a deadline, or keeping deprecated APIs for too +long: it gives no incentive for users to update their code, sends conflicting +messages ("why was this API removed while this other one is still around?"), and +accumulates debt in the project. + +### Should this attribute be public? Should this function be private? + +When designing an API it’s not always obvious what should be exposed as public, +and what should be kept as a private implementation detail. The following +guidelines can be useful: + +* Functional consistency throughout the library is a top priority, for users and + developers’ sake. In doubt and unless it’s clearly wrong, expose what other + similar classes expose. +* Think really hard about the users and their use-cases, and try to expose what + they would need to address those use-cases. Aggressively keep everything else + private. 
Remember that the “private -> public” direction is way smoother than + the “public -> private” one: in doubt, keep it private. +* When thinking about use-cases, the general API motto applies: make what’s + simple and common easy, and make what’s complex possible (80% / 20% rule). + There might be a ~1% left that’s not addressed: that’s OK. Also, **make what’s + wrong very hard**, if not impossible. + +As a good practice, always create new files and even classes with a leading +underscore in their name. This way, everything is private by default and the +only public surface is explicitly present in an `__init__.py` file. diff --git a/mypy.ini b/mypy.ini index aaeea57a6915aff7903a9f9c3b56975b354323dd..653f7c14ec2b6d63edb4512fc7379b18f1b22d84 100644 --- a/mypy.ini +++ b/mypy.ini @@ -7,6 +7,38 @@ allow_redefinition = True no_implicit_optional = True warn_redundant_casts = True +[mypy-torchvision.prototype.datapoints.*] + +; untyped definitions and calls +disallow_untyped_defs = True + +; None and Optional handling +no_implicit_optional = True + +; warnings +warn_unused_ignores = True + +; miscellaneous strictness flags +allow_redefinition = True + +[mypy-torchvision.prototype.transforms.*] + +; untyped definitions and calls +disallow_untyped_defs = True + +; None and Optional handling +no_implicit_optional = True + +; warnings +warn_unused_ignores = True + +; miscellaneous strictness flags +allow_redefinition = True + +[mypy-torchvision.prototype.datasets.*] + +ignore_errors = True + [mypy-torchvision.io.image.*] ignore_errors = True diff --git a/packaging/README.md b/packaging/README.md deleted file mode 100644 index 3ceac53030e7b89d1df93e03be86d18c667f49e6..0000000000000000000000000000000000000000 --- a/packaging/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Building torchvision packages for release - -TorchVision release packages are built by using `build_wheel.sh` and `build_conda.sh` for all permutations of -supported operating systems, compute platforms and python versions. - -OS/Python/Compute matrix is defined in https://github.com/pytorch/vision/blob/main/.circleci/regenerate.py diff --git a/packaging/build_cmake.sh b/packaging/build_cmake.sh deleted file mode 100755 index 99d98c67f1ad08355476f484b81fb812ab336502..0000000000000000000000000000000000000000 --- a/packaging/build_cmake.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash -set -ex - -PARALLELISM=8 -if [ -n "$MAX_JOBS" ]; then - PARALLELISM=$MAX_JOBS -fi - -if [[ "$(uname)" != Darwin && "$OSTYPE" != "msys" ]]; then - eval "$(./conda/bin/conda shell.bash hook)" - conda activate ./env -fi - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. 
"$script_dir/pkg_helpers.bash" - -export BUILD_TYPE=conda -setup_env -export SOURCE_ROOT_DIR="$PWD" -setup_conda_pytorch_constraint -setup_conda_cudatoolkit_plain_constraint - -if [[ "$OSTYPE" == "msys" ]]; then - conda install -yq conda-build cmake future - pip install dataclasses -fi - -setup_visual_studio_constraint -setup_junit_results_folder - -if [[ "$(uname)" == Darwin ]]; then - # TODO: this can be removed as soon as mkl's CMake support works with clang - # see https://github.com/pytorch/vision/pull/4203 for details - MKL_CONSTRAINT='mkl==2021.2.0' -else - MKL_CONSTRAINT='' -fi - -if [[ $CONDA_BUILD_VARIANT == "cpu" ]]; then - PYTORCH_MUTEX_CONSTRAINT='pytorch-mutex=1.0=cpu' -else - PYTORCH_MUTEX_CONSTRAINT='' -fi - -conda install -yq \pytorch=$PYTORCH_VERSION $CONDA_CUDATOOLKIT_CONSTRAINT $PYTORCH_MUTEX_CONSTRAINT $MKL_CONSTRAINT numpy -c nvidia -c "pytorch-${UPLOAD_CHANNEL}" -TORCH_PATH=$(dirname $(python -c "import torch; print(torch.__file__)")) - -if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - conda install -yq libpng jpeg -else - yum install -y libpng-devel libjpeg-turbo-devel -fi - -if [[ "$OSTYPE" == "msys" ]]; then - source .circleci/unittest/windows/scripts/set_cuda_envs.sh -fi - -mkdir cpp_build -pushd cpp_build - -# Generate libtorchvision files -cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch -DWITH_CUDA=$CMAKE_USE_CUDA - -# Compile and install libtorchvision -if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_cmake.bat" $PARALLELISM - CONDA_PATH=$(dirname $(which python)) - cp -r "C:/Program Files (x86)/torchvision/include/torchvision" $CONDA_PATH/include -else - make -j$PARALLELISM - make install - - if [[ "$(uname)" == Darwin ]]; then - CONDA_PATH=$(dirname $(dirname $(which python))) - cp -r /usr/local/include/torchvision $CONDA_PATH/include/ - export C_INCLUDE_PATH=/usr/local/include - export CPLUS_INCLUDE_PATH=/usr/local/include - fi -fi - -popd - -# Install torchvision locally -python setup.py develop - -# Trace, compile and run project that uses Faster-RCNN -pushd test/tracing/frcnn -mkdir build - -# Trace model -python trace_model.py -cp fasterrcnn_resnet50_fpn.pt build - -cd build -cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch -DWITH_CUDA=$CMAKE_USE_CUDA -if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_frcnn.bat" $PARALLELISM - mv fasterrcnn_resnet50_fpn.pt Release - cd Release - export PATH=$(cygpath -w "C:/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64"):$(cygpath -w "C:/Program Files (x86)/torchvision/bin"):$(cygpath -w $TORCH_PATH)/lib:$PATH -else - make -j$PARALLELISM -fi - -# Run traced program -./test_frcnn_tracing - -# Compile and run the CPP example -popd -cd examples/cpp/hello_world -mkdir build - -# Trace model -python trace_model.py -cp resnet18.pt build - -cd build -cmake .. 
-DTorch_DIR=$TORCH_PATH/share/cmake/Torch - -if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_cpp_example.bat" $PARALLELISM - mv resnet18.pt Release - cd Release -else - make -j$PARALLELISM -fi - -# Run CPP example -./hello-world diff --git a/packaging/build_conda.sh b/packaging/build_conda.sh deleted file mode 100755 index e80c7dfbe64c4da74fb535b09c78d94871aeb54b..0000000000000000000000000000000000000000 --- a/packaging/build_conda.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -set -ex - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. "$script_dir/pkg_helpers.bash" - -export BUILD_TYPE=conda -setup_env -export SOURCE_ROOT_DIR="$PWD" -setup_conda_pytorch_constraint -setup_conda_cudatoolkit_constraint -setup_visual_studio_constraint -setup_junit_results_folder -export CUDATOOLKIT_CHANNEL="nvidia" - -conda build -c $CUDATOOLKIT_CHANNEL $CONDA_CHANNEL_FLAGS --no-anaconda-upload --python "$PYTHON_VERSION" packaging/torchvision diff --git a/packaging/build_wheel.sh b/packaging/build_wheel.sh deleted file mode 100755 index 3299d16ec92ab1c50b53ea586b25779fdd1f62ea..0000000000000000000000000000000000000000 --- a/packaging/build_wheel.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -set -ex - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. "$script_dir/pkg_helpers.bash" - -export BUILD_TYPE=wheel -setup_env -setup_wheel_python -pip_install numpy pyyaml future ninja -pip_install --upgrade setuptools -setup_pip_pytorch_version -python setup.py clean - -# Copy binaries to be included in the wheel distribution -if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - python_exec="$(which python)" - bin_path=$(dirname $python_exec) - env_path=$(dirname $bin_path) - if [[ "$(uname)" == Darwin ]]; then - # Install delocate to relocate the required binaries - pip_install "delocate>=0.9" - else - cp "$bin_path/Library/bin/libpng16.dll" torchvision - cp "$bin_path/Library/bin/libjpeg.dll" torchvision - fi -else - # Install auditwheel to get some inspection utilities - pip_install auditwheel - - # Point to custom libraries - export LD_LIBRARY_PATH=$(pwd)/ext_libraries/lib:$LD_LIBRARY_PATH - export TORCHVISION_INCLUDE=$(pwd)/ext_libraries/include - export TORCHVISION_LIBRARY=$(pwd)/ext_libraries/lib -fi - -download_copy_ffmpeg - -if [[ "$OSTYPE" == "msys" ]]; then - IS_WHEEL=1 "$script_dir/windows/internal/vc_env_helper.bat" python setup.py bdist_wheel -else - IS_WHEEL=1 python setup.py bdist_wheel -fi - - -if [[ "$(uname)" == Darwin ]]; then - pushd dist/ - python_exec="$(which python)" - bin_path=$(dirname $python_exec) - env_path=$(dirname $bin_path) - for whl in *.whl; do - DYLD_FALLBACK_LIBRARY_PATH="$env_path/lib/:$DYLD_FALLBACK_LIBRARY_PATH" delocate-wheel -v --ignore-missing-dependencies $whl - done -else - if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" python $script_dir/wheel/relocate.py - else - LD_LIBRARY_PATH="/usr/local/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH" python $script_dir/wheel/relocate.py - fi -fi diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash deleted file mode 100644 index 195bc3a4561025fb5a7ccdf37f2e03d9f99a3fcb..0000000000000000000000000000000000000000 --- a/packaging/pkg_helpers.bash +++ /dev/null @@ -1,339 +0,0 @@ -# A set of useful bash functions for common functionality we need to do in -# many build scripts - - -# Setup CUDA environment variables, based on CU_VERSION -# -# 
Inputs: -# CU_VERSION (cpu, cu92, cu100) -# NO_CUDA_PACKAGE (bool) -# BUILD_TYPE (conda, wheel) -# -# Outputs: -# VERSION_SUFFIX (e.g., "") -# PYTORCH_VERSION_SUFFIX (e.g., +cpu) -# WHEEL_DIR (e.g., cu100/) -# CUDA_HOME (e.g., /usr/local/cuda-9.2, respected by torch.utils.cpp_extension) -# FORCE_CUDA (respected by torchvision setup.py) -# NVCC_FLAGS (respected by torchvision setup.py) -# -# Precondition: CUDA versions are installed in their conventional locations in -# /usr/local/cuda-* -# -# NOTE: Why VERSION_SUFFIX versus PYTORCH_VERSION_SUFFIX? If you're building -# a package with CUDA on a platform we support CUDA on, VERSION_SUFFIX == -# PYTORCH_VERSION_SUFFIX and everyone is happy. However, if you are building a -# package with only CPU bits (e.g., torchaudio), then VERSION_SUFFIX is always -# empty, but PYTORCH_VERSION_SUFFIX is +cpu (because that's how you get a CPU -# version of a Python package. But that doesn't apply if you're on OS X, -# since the default CU_VERSION on OS X is cpu. -setup_cuda() { - - # First, compute version suffixes. By default, assume no version suffixes - export VERSION_SUFFIX="" - export PYTORCH_VERSION_SUFFIX="" - export WHEEL_DIR="" - # Wheel builds need suffixes (but not if they're on OS X, which never has suffix) - if [[ "$BUILD_TYPE" == "wheel" ]] && [[ "$(uname)" != Darwin ]]; then - export PYTORCH_VERSION_SUFFIX="+$CU_VERSION" - # Match the suffix scheme of pytorch, unless this package does not have - # CUDA builds (in which case, use default) - if [[ -z "$NO_CUDA_PACKAGE" ]]; then - export VERSION_SUFFIX="$PYTORCH_VERSION_SUFFIX" - export WHEEL_DIR="$CU_VERSION/" - fi - fi - - # Now work out the CUDA settings - case "$CU_VERSION" in - cu117) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.7" - else - export CUDA_HOME=/usr/local/cuda-11.7/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" - ;; - cu116) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.6" - else - export CUDA_HOME=/usr/local/cuda-11.6/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" - ;; - cpu) - ;; - rocm*) - export FORCE_CUDA=1 - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - if [[ -n "$CUDA_HOME" ]]; then - # Adds nvcc binary to the search path so that CMake's `find_package(CUDA)` will pick the right one - export PATH="$CUDA_HOME/bin:$PATH" - export FORCE_CUDA=1 - fi -} - -# Populate build version if necessary, and add version suffix -# -# Inputs: -# BUILD_VERSION (e.g., 0.2.0 or empty) -# VERSION_SUFFIX (e.g., +cpu) -# -# Outputs: -# BUILD_VERSION (e.g., 0.2.0.dev20190807+cpu) -# -# Fill BUILD_VERSION if it doesn't exist already with a nightly string -# Usage: setup_build_version 0.2.0 -setup_build_version() { - if [[ -z "$BUILD_VERSION" ]]; then - if [[ -z "$1" ]]; then - setup_base_build_version - else - BUILD_VERSION="$1" - fi - BUILD_VERSION="$BUILD_VERSION.dev$(date "+%Y%m%d")$VERSION_SUFFIX" - else - BUILD_VERSION="$BUILD_VERSION$VERSION_SUFFIX" - fi - - # Set build version based on tag if on tag - if [[ -n "${CIRCLE_TAG}" ]]; then - # Strip tag - BUILD_VERSION="$(echo "${CIRCLE_TAG}" | sed -e 's/^v//' -e 's/-.*$//')${VERSION_SUFFIX}" - fi - - export BUILD_VERSION -} - -setup_base_build_version() { - SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - # version.txt for some reason has `a` character after major.minor.rev - # command below yields 
0.10.0 from version.txt containing 0.10.0a0 - BUILD_VERSION=$( cut -f 1 -d a "$SCRIPT_DIR/../version.txt" ) - export BUILD_VERSION -} - -# Set some useful variables for OS X, if applicable -setup_macos() { - if [[ "$(uname)" == Darwin ]]; then - export MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ - fi -} - - -# Top-level entry point for things every package will need to do -# -# Usage: setup_env 0.2.0 -setup_env() { - setup_cuda - setup_build_version "$1" - setup_macos -} - -# Function to retry functions that sometimes timeout or have flaky failures -retry () { - $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) -} - -# Inputs: -# PYTHON_VERSION (3.7, 3.8, 3.9) -# UNICODE_ABI (bool) -# -# Outputs: -# PATH modified to put correct Python version in PATH -# -# Precondition: If Linux, you are in a soumith/manylinux-cuda* Docker image -setup_wheel_python() { - if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - eval "$(conda shell.bash hook)" - conda env remove -n "env$PYTHON_VERSION" || true - conda create ${CONDA_CHANNEL_FLAGS} -yn "env$PYTHON_VERSION" python="$PYTHON_VERSION" - conda activate "env$PYTHON_VERSION" - # Install libpng from Anaconda (defaults) - conda install ${CONDA_CHANNEL_FLAGS} libpng "jpeg<=9b" -y - else - # Install native CentOS libJPEG, freetype and GnuTLS - yum install -y libjpeg-turbo-devel freetype gnutls - case "$PYTHON_VERSION" in - 3.7) python_abi=cp37-cp37m ;; - 3.8) python_abi=cp38-cp38 ;; - 3.9) python_abi=cp39-cp39 ;; - 3.10) python_abi=cp310-cp310 ;; - *) - echo "Unrecognized PYTHON_VERSION=$PYTHON_VERSION" - exit 1 - ;; - esac - # Download all the dependencies required to compile image and video_reader - # extensions - - mkdir -p ext_libraries - pushd ext_libraries - popd - export PATH="/opt/python/$python_abi/bin:$(pwd)/ext_libraries/bin:$PATH" - fi -} - -# Install with pip a bit more robustly than the default -pip_install() { - retry pip install --progress-bar off "$@" -} - -# Install torch with pip, respecting PYTORCH_VERSION, and record the installed -# version into PYTORCH_VERSION, if applicable -setup_pip_pytorch_version() { - if [[ -z "$PYTORCH_VERSION" ]]; then - # Install latest prerelease version of torch, per our nightlies, consistent - # with the requested cuda version - pip_install --pre torch -f "https://download.pytorch.org/whl/test/${WHEEL_DIR}torch_test.html" - if [[ "$CUDA_VERSION" == "cpu" ]]; then - # CUDA and CPU are ABI compatible on the CPU-only parts, so strip - # in this case - export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version: *//' | sed 's/+.\+//')" - else - export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version: *//')" - fi - else - pip_install "torch==$PYTORCH_VERSION$PYTORCH_VERSION_SUFFIX" \ - -f "https://download.pytorch.org/whl/${CU_VERSION}/torch_stable.html" \ - -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${CU_VERSION}/torch_${UPLOAD_CHANNEL}.html" - fi -} - -# Fill PYTORCH_VERSION with the latest conda nightly version, and -# CONDA_CHANNEL_FLAGS with appropriate flags to retrieve these versions -# -# You MUST have populated PYTORCH_VERSION_SUFFIX before hand. 
-setup_conda_pytorch_constraint() { - if [[ -z "$PYTORCH_VERSION" ]]; then - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch-test -c pytorch" - PYTHON="python" - # Check if we have python 3 instead and prefer that - if python3 --version >/dev/null 2>/dev/null; then - PYTHON="python3" - fi - export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-test]' | \ - ${PYTHON} -c "import os, sys, json, re; cuver = os.environ.get('CU_VERSION'); \ - cuver_1 = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \ - cuver_2 = (cuver[:-1] + '.' + cuver[-1]).replace('cu', 'cuda') if cuver != 'cpu' else cuver; \ - print(re.sub(r'\\+.*$', '', \ - [x['version'] for x in json.load(sys.stdin)['pytorch'] \ - if (x['platform'] == 'darwin' or cuver_1 in x['fn'] or cuver_2 in x['fn']) \ - and 'py' + os.environ['PYTHON_VERSION'] in x['fn']][-1]))")" - if [[ -z "$PYTORCH_VERSION" ]]; then - echo "PyTorch version auto detection failed" - echo "No package found for CU_VERSION=$CU_VERSION and PYTHON_VERSION=$PYTHON_VERSION" - exit 1 - fi - else - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch -c pytorch-${UPLOAD_CHANNEL}" - fi - if [[ "$CU_VERSION" == cpu ]]; then - export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==$PYTORCH_VERSION${PYTORCH_VERSION_SUFFIX}" - export CONDA_PYTORCH_CONSTRAINT="- pytorch==$PYTORCH_VERSION" - else - export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}" - export CONDA_PYTORCH_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}" - fi - if [[ "$OSTYPE" == msys && "$CU_VERSION" == cu92 ]]; then - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c defaults -c numba/label/dev" - fi -} - -# Translate CUDA_VERSION into CUDA_CUDATOOLKIT_CONSTRAINT -setup_conda_cudatoolkit_constraint() { - export CONDA_BUILD_VARIANT="cuda" - if [[ "$(uname)" == Darwin ]]; then - export CONDA_BUILD_VARIANT="cpu" - else - case "$CU_VERSION" in - cu117) - export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.7 # [not osx]" - ;; - cu116) - export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.6 # [not osx]" - ;; - cpu) - export CONDA_CUDATOOLKIT_CONSTRAINT="" - export CONDA_BUILD_VARIANT="cpu" - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - fi -} - -setup_conda_cudatoolkit_plain_constraint() { - export CONDA_BUILD_VARIANT="cuda" - export CMAKE_USE_CUDA=1 - if [[ "$(uname)" == Darwin ]]; then - export CONDA_BUILD_VARIANT="cpu" - export CMAKE_USE_CUDA=0 - else - case "$CU_VERSION" in - cu117) - export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda=11.7" - ;; - cu116) - export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda=11.6" - ;; - cpu) - export CONDA_CUDATOOLKIT_CONSTRAINT="" - export CONDA_BUILD_VARIANT="cpu" - export CMAKE_USE_CUDA=0 - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - fi -} - -# Build the proper compiler package before building the final package -setup_visual_studio_constraint() { - if [[ "$OSTYPE" == "msys" ]]; then - export VSTOOLCHAIN_PACKAGE=vs$VC_YEAR - conda build $CONDA_CHANNEL_FLAGS --no-anaconda-upload packaging/$VSTOOLCHAIN_PACKAGE - cp packaging/$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml packaging/torchvision/conda_build_config.yaml - fi -} - -setup_junit_results_folder() { - if [[ "$CI" == "true" ]]; then - export CONDA_PYTORCH_BUILD_RESULTS_DIRECTORY="${SOURCE_ROOT_DIR}/build_results/results.xml" - fi -} - - -download_copy_ffmpeg() { - if [[ "$OSTYPE" == "msys" ]]; then - # conda install -yq ffmpeg=4.2 -c pytorch - # curl -L -q 
https://anaconda.org/pytorch/ffmpeg/4.3/download/win-64/ffmpeg-4.3-ha925a31_0.tar.bz2 --output ffmpeg-4.3-ha925a31_0.tar.bz2 - # bzip2 --decompress --stdout ffmpeg-4.3-ha925a31_0.tar.bz2 | tar -x --file=- - # cp Library/bin/*.dll ../torchvision - echo "FFmpeg is disabled currently on Windows" - else - if [[ "$(uname)" == Darwin ]]; then - conda install -yq ffmpeg=4.2 -c pytorch - conda install -yq wget - else - # pushd ext_libraries - # wget -q https://anaconda.org/pytorch/ffmpeg/4.2/download/linux-64/ffmpeg-4.2-hf484d3e_0.tar.bz2 - # tar -xjvf ffmpeg-4.2-hf484d3e_0.tar.bz2 - # rm -rf ffmpeg-4.2-hf484d3e_0.tar.bz2 - # ldconfig - # which ffmpeg - # popd - echo "FFmpeg is disabled currently on Linux" - fi - fi -} diff --git a/packaging/post_build_script.sh b/packaging/post_build_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..ae7542f9f8a97a7e66d255dba0f7925b5c8584fe --- /dev/null +++ b/packaging/post_build_script.sh @@ -0,0 +1,2 @@ +#!/bin/bash +LD_LIBRARY_PATH="/usr/local/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH" python packaging/wheel/relocate.py diff --git a/packaging/pre_build_script.sh b/packaging/pre_build_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..e93a7267e651507f69787f05e1bde62cb1aef4a8 --- /dev/null +++ b/packaging/pre_build_script.sh @@ -0,0 +1,50 @@ +#!/bin/bash +if [[ "$(uname)" == Darwin ]]; then + # Uninstall Conflicting jpeg brew formulae + jpeg_packages=$(brew list | grep jpeg) + echo "Existing Jpeg-related Brew libraries" + echo $jpeg_packages + for pkg in $jpeg_packages; do + brew uninstall --ignore-dependencies --force $pkg || true + done + + conda install -yq wget +fi + +if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then + # Install libpng from Anaconda (defaults) + conda install libpng -yq + conda install -yq ffmpeg=4.2 libjpeg-turbo -c pytorch + + # Copy binaries to be included in the wheel distribution + if [[ "$OSTYPE" == "msys" ]]; then + python_exec="$(which python)" + bin_path=$(dirname $python_exec) + cp "$bin_path/Library/bin/libjpeg.dll" torchvision + fi +else + + if [[ "$ARCH" == "aarch64" ]]; then + conda install libpng -yq + conda install -yq ffmpeg=4.2 libjpeg-turbo -c pytorch-nightly + fi + + # Install native CentOS libJPEG, freetype and GnuTLS + yum install -y libjpeg-turbo-devel freetype gnutls + + # Download all the dependencies required to compile image and video_reader + # extensions + mkdir -p ext_libraries + pushd ext_libraries + popd + export PATH="$(pwd)/ext_libraries/bin:$PATH" + pip install auditwheel + + # Point to custom libraries + export LD_LIBRARY_PATH=$(pwd)/ext_libraries/lib:$LD_LIBRARY_PATH + export TORCHVISION_INCLUDE=$(pwd)/ext_libraries/include + export TORCHVISION_LIBRARY=$(pwd)/ext_libraries/lib +fi + +pip install numpy pyyaml future ninja +pip install --upgrade setuptools diff --git a/packaging/torchvision/conda_build_config.yaml b/packaging/torchvision/conda_build_config.yaml index 52b95952ddf14f4812790e5ba316503cadc2b15b..a7c25c6d53475a8c3a32b0d30b9df35727c753f2 100644 --- a/packaging/torchvision/conda_build_config.yaml +++ b/packaging/torchvision/conda_build_config.yaml @@ -7,7 +7,7 @@ c_compiler: cxx_compiler: - vs2017 # [win] python: - - 3.7 + - 3.8 # This differs from target_platform in that it determines what subdir the compiler # will target, not what subdir the compiler package will be itself. 
# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 diff --git a/packaging/torchvision/meta.yaml b/packaging/torchvision/meta.yaml index 105e28c453e33b763333d4c3b3e5e19b56b20f96..9adc13b558bb9646dd27ee508fa5e27121997ded 100644 --- a/packaging/torchvision/meta.yaml +++ b/packaging/torchvision/meta.yaml @@ -10,7 +10,7 @@ requirements: build: - {{ compiler('c') }} # [win] - libpng - - jpeg + - libjpeg-turbo # NOTE: The only ffmpeg version that we build is actually 4.2 - ffmpeg >=4.2 # [not win] @@ -23,11 +23,12 @@ requirements: run: - python - - defaults::numpy >=1.11 + - defaults::numpy >=1.11 # [py <= 310] + - numpy >=1.23.5 # [py >= 311] - requests - libpng - ffmpeg >=4.2 # [not win] - - jpeg + - libjpeg-turbo - pillow >=5.3.0, !=8.3.* - pytorch-mutex 1.0 {{ build_variant }} # [not osx ] {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }} @@ -61,7 +62,7 @@ test: requires: - pytest - scipy - - jpeg + - libjpeg-turbo - ca-certificates diff --git a/packaging/vs2017/activate.bat b/packaging/vs2017/activate.bat deleted file mode 100644 index ccecfc25442f0563990588edfb0e9f949a4b8af4..0000000000000000000000000000000000000000 --- a/packaging/vs2017/activate.bat +++ /dev/null @@ -1,44 +0,0 @@ -:: Set env vars that tell distutils to use the compiler that we put on path -SET DISTUTILS_USE_SDK=1 -SET MSSdk=1 - -SET "VS_VERSION=15.0" -SET "VS_MAJOR=15" -SET "VS_YEAR=2017" - -set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out" -set "MSYS2_ENV_CONV_EXCL=CL" - -:: For Python 3.5+, ensure that we link with the dynamic runtime. See -:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info -set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VSINSTALLDIR=%%i\" - goto :vswhere - ) -) - -:vswhere - -:: Shorten PATH to avoid the `input line too long` error. -SET MyPath=%PATH% - -setlocal EnableDelayedExpansion - -SET TempPath="%MyPath:;=";"%" -SET var= -FOR %%a IN (%TempPath%) DO ( - IF EXIST %%~sa ( - SET "var=!var!;%%~sa" - ) -) - -set "TempPath=!var:~1!" -endlocal & set "PATH=%TempPath%" - -:: Shorten current directory too -FOR %%A IN (.) DO CD "%%~sA" - -:: other things added by install_activate.bat at package build time diff --git a/packaging/vs2017/conda_build_config.yaml b/packaging/vs2017/conda_build_config.yaml deleted file mode 100644 index 2479ceb3e762b561c6b5d7b4daa5bb4d2cfded59..0000000000000000000000000000000000000000 --- a/packaging/vs2017/conda_build_config.yaml +++ /dev/null @@ -1,23 +0,0 @@ -blas_impl: - - mkl # [x86_64] -c_compiler: - - vs2017 # [win] -cxx_compiler: - - vs2017 # [win] -python: - - 3.7 -# This differs from target_platform in that it determines what subdir the compiler -# will target, not what subdir the compiler package will be itself. -# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 -# code on win-64 miniconda. 
-cross_compiler_target_platform: - - win-64 # [win] -target_platform: - - win-64 # [win] -vc: - - 14 -zip_keys: - - # [win] - - vc # [win] - - c_compiler # [win] - - cxx_compiler # [win] diff --git a/packaging/vs2017/install_activate.bat b/packaging/vs2017/install_activate.bat deleted file mode 100644 index 253d2f2c2c1d3431eeb1f7cb90f26260d9f71c9f..0000000000000000000000000000000000000000 --- a/packaging/vs2017/install_activate.bat +++ /dev/null @@ -1,29 +0,0 @@ -set YEAR=2017 -set VER=15 - -mkdir "%PREFIX%\etc\conda\activate.d" -COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - -IF "%cross_compiler_target_platform%" == "win-64" ( - set "target_platform=amd64" - echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - IF "%VSDEVCMD_ARGS%" == "" ( - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) ELSE ( - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) else ( - set "target_platform=x86" - echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd - ) diff --git a/packaging/vs2017/install_runtime.bat b/packaging/vs2017/install_runtime.bat deleted file mode 100644 index 5163c16cf24d49092b6a4aa5cfb1d18a19cc1549..0000000000000000000000000000000000000000 --- a/packaging/vs2017/install_runtime.bat +++ /dev/null @@ -1,49 +0,0 @@ -set VC_PATH=x86 -if "%ARCH%"=="64" ( - set VC_PATH=x64 -) - -set MSC_VER=2017 - -rem :: This should always be present for VC installed with VS. Not sure about VC installed with Visual C++ Build Tools 2015 -rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO ( -rem set SP=%%A -rem ) - -rem if not "%SP%" == "%PKG_VERSION%" ( -rem echo "Version detected from registry: %SP%" -rem echo "does not match version of package being built (%PKG_VERSION%)" -rem echo "Do you have current updates for VS 2015 installed?" -rem exit 1 -rem ) - - -REM ========== REQUIRES Win 10 SDK be installed, or files otherwise copied to location below! 
-robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E -if %ERRORLEVEL% GEQ 8 exit 1 - -REM ========== This one comes from visual studio 2017 -set "VC_VER=141" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" - goto :eof - ) -) - -@setlocal -call "%VS15VARSALL%" x64 - -set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%" - -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -@endlocal diff --git a/packaging/vs2017/meta.yaml b/packaging/vs2017/meta.yaml deleted file mode 100644 index 1f569525ee176da433857aa6ae5a565350320549..0000000000000000000000000000000000000000 --- a/packaging/vs2017/meta.yaml +++ /dev/null @@ -1,24 +0,0 @@ -{% set vcver="14.1" %} -{% set vcfeature="14" %} -{% set vsyear="2017" %} -{% set fullver="15.4.27004.2010" %} - -package: - name: vs{{ vsyear }} - version: {{ fullver }} - -build: - skip: True [not win] - script_env: - - VSDEVCMD_ARGS # [win] - -outputs: - - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }} - script: install_activate.bat - track_features: - # VS 2017 is binary-compatible with VS 2015/vc14. Tools are "v141". - strong: - - vc{{ vcfeature }} - about: - summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler - license: BSD 3-clause diff --git a/packaging/vs2019/conda_build_config.yaml b/packaging/vs2019/conda_build_config.yaml index 7bd8de2ea5bd629fedb307852d566a918a9fd623..b4dc99341d07a245acdfaf4383235230449943a1 100644 --- a/packaging/vs2019/conda_build_config.yaml +++ b/packaging/vs2019/conda_build_config.yaml @@ -5,7 +5,7 @@ c_compiler: cxx_compiler: - vs2019 # [win] python: - - 3.7 + - 3.8 # This differs from target_platform in that it determines what subdir the compiler # will target, not what subdir the compiler package will be itself. # For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 diff --git a/packaging/vs2019/install_runtime.bat b/packaging/vs2019/install_runtime.bat deleted file mode 100644 index e09a5ccfb0f42cc6de2a2f960d31faf2511ae094..0000000000000000000000000000000000000000 --- a/packaging/vs2019/install_runtime.bat +++ /dev/null @@ -1,49 +0,0 @@ -set VC_PATH=x86 -if "%ARCH%"=="64" ( - set VC_PATH=x64 -) - -set MSC_VER=2019 - -rem :: This should always be present for VC installed with VS. Not sure about VC installed with Visual C++ Build Tools 2015 -rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO ( -rem set SP=%%A -rem ) - -rem if not "%SP%" == "%PKG_VERSION%" ( -rem echo "Version detected from registry: %SP%" -rem echo "does not match version of package being built (%PKG_VERSION%)" -rem echo "Do you have current updates for VS 2015 installed?" 
-rem exit 1 -rem ) - - -REM ========== REQUIRES Win 10 SDK be installed, or files otherwise copied to location below! -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E -if %ERRORLEVEL% GEQ 8 exit 1 - -REM ========== This one comes from visual studio 2019 -set "VC_VER=142" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" - goto :eof - ) -) - -@setlocal -call "%VS15VARSALL%" x64 - -set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%" - -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -@endlocal diff --git a/packaging/wheel/relocate.py b/packaging/wheel/relocate.py index e6a4ef9d458888bdbe3e63d757c54253c6a5c662..fb110abd873def571154fe219f1a589dccb9eb06 100644 --- a/packaging/wheel/relocate.py +++ b/packaging/wheel/relocate.py @@ -2,7 +2,6 @@ import glob import hashlib -import io # Standard library imports import os @@ -65,21 +64,12 @@ PLATFORM_ARCH = platform.machine() PYTHON_VERSION = sys.version_info -def read_chunks(file, size=io.DEFAULT_BUFFER_SIZE): - """Yield pieces of data from a file-like object until EOF.""" - while True: - chunk = file.read(size) - if not chunk: - break - yield chunk - - def rehash(path, blocksize=1 << 20): """Return (hash, length) for path using hashlib.sha256()""" h = hashlib.sha256() length = 0 with open(path, "rb") as f: - for block in read_chunks(f, size=blocksize): + while block := f.read(blocksize): length += len(block) h.update(block) digest = "sha256=" + urlsafe_b64encode(h.digest()).decode("latin1").rstrip("=") @@ -191,7 +181,7 @@ def relocate_elf_library(patchelf, output_dir, output_library, binary): print("Copying dependencies to wheel directory") new_libraries_path = osp.join(output_dir, "torchvision.libs") - os.makedirs(new_libraries_path) + os.makedirs(new_libraries_path, exist_ok=True) new_names = {binary: binary_path} diff --git a/packaging/windows/internal/cuda_install.bat b/packaging/windows/internal/cuda_install.bat deleted file mode 100644 index 66e922289956382e0a6f2189d5eec0d8072d1b66..0000000000000000000000000000000000000000 --- a/packaging/windows/internal/cuda_install.bat +++ /dev/null @@ -1,143 +0,0 @@ -@echo on - -if "%CU_VERSION%" == "cpu" ( - echo Skipping for CPU builds - exit /b 0 -) - -set SRC_DIR=%~dp0\.. 
- -if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" - -rem in unit test workflow, we get CUDA_VERSION, for example 11.1 -if defined CUDA_VERSION ( - set CUDA_VER=%CUDA_VERSION:.=% -) else ( - set CUDA_VER=%CU_VERSION:cu=% -) - -set /a CUDA_VER=%CU_VERSION:cu=% -set CUDA_VER_MAJOR=%CUDA_VER:~0,-1% -set CUDA_VER_MINOR=%CUDA_VER:~-1,1% -set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR% -set CUDNN_FOLDER="cuda" -set CUDNN_LIB_FOLDER="lib\x64" - -if %CUDA_VER% EQU 116 goto cuda116 -if %CUDA_VER% EQU 117 goto cuda117 - -echo CUDA %CUDA_VERSION_STR% is not supported -exit /b 1 - -:cuda116 - -set CUDA_INSTALL_EXE=cuda_11.6.0_511.23_windows.exe -if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( - curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=thrust_11.6 nvcc_11.6 cuobjdump_11.6 nvprune_11.6 nvprof_11.6 cupti_11.6 cublas_11.6 cublas_dev_11.6 cudart_11.6 cufft_11.6 cufft_dev_11.6 curand_11.6 curand_dev_11.6 cusolver_11.6 cusolver_dev_11.6 cusparse_11.6 cusparse_dev_11.6 npp_11.6 npp_dev_11.6 nvjpeg_11.6 nvjpeg_dev_11.6 nvrtc_11.6 nvrtc_dev_11.6 nvml_dev_11.6" -) - -set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip -set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive -set CUDNN_LIB_FOLDER="lib" -if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( - curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - - rem Make sure windows path contains zlib dll - curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" - 7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" - xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" -) - -goto cuda_common - -:cuda117 - -set CUDA_INSTALL_EXE=cuda_11.7.0_516.01_windows.exe -if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( - curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=thrust_11.7 nvcc_11.7 cuobjdump_11.7 nvprune_11.7 nvprof_11.7 cupti_11.7 cublas_11.7 cublas_dev_11.7 cudart_11.7 cufft_11.7 cufft_dev_11.7 curand_11.7 curand_dev_11.7 cusolver_11.7 cusolver_dev_11.7 cusparse_11.7 cusparse_dev_11.7 npp_11.7 npp_dev_11.7 nvjpeg_11.7 nvjpeg_dev_11.7 nvrtc_11.7 nvrtc_dev_11.7 nvml_dev_11.7" -) - -set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip -set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive -set CUDNN_LIB_FOLDER="lib" -if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( - curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - - rem Make sure windows path contains zlib dll - curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" - 7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" - xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" -) - -goto cuda_common - 
-:cuda_common - -if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" ( - curl -k -L https://www.dropbox.com/s/9mcolalfdj4n979/NvToolsExt.7z?dl=1 --output "%SRC_DIR%\temp_build\NvToolsExt.7z" - if errorlevel 1 exit /b 1 -) - -echo Installing CUDA toolkit... -7z x %CUDA_SETUP_FILE% -o"%SRC_DIR%\temp_build\cuda" -pushd "%SRC_DIR%\temp_build\cuda" -sc config wuauserv start= disabled -sc stop wuauserv -sc query wuauserv - -start /wait setup.exe -s %ARGS% -loglevel:6 -log:"%cd%/cuda_install_logs" -echo %errorlevel% - -popd - -echo Installing VS integration... -rem It's for VS 2019 -if "%CUDA_VER_MAJOR%" == "10" ( - xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations" -) -if "%CUDA_VER_MAJOR%" == "11" ( - xcopy /Y "%SRC_DIR%\temp_build\cuda\visual_studio_integration\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations" -) - -echo Installing NvToolsExt... -7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt" -mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" -mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" -mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" -xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\bin\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" -xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" -xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" - -echo Setting up environment... -set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%" -set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" -set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" -set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" - -if not exist "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" ( - echo CUDA %CUDA_VERSION_STR% installed failed. - echo --------- RunDll32.exe.log - type "%SRC_DIR%\temp_build\cuda\cuda_install_logs\LOG.RunDll32.exe.log" - echo --------- setup.exe.log ------- - type "%SRC_DIR%\temp_build\cuda\cuda_install_logs\LOG.setup.exe.log" - exit /b 1 -) - -echo Installing cuDNN... 
-7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include" - -echo Cleaning temp files -rd /s /q "%SRC_DIR%\temp_build" || ver > nul diff --git a/packaging/windows/internal/driver_update.bat b/packaging/windows/internal/driver_update.bat deleted file mode 100644 index 00b43affc01cc302a3d6c527be197f1adcc0ba2f..0000000000000000000000000000000000000000 --- a/packaging/windows/internal/driver_update.bat +++ /dev/null @@ -1,25 +0,0 @@ -set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe" -curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -if errorlevel 1 exit /b 1 - -start /wait 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -s -noreboot -if errorlevel 1 exit /b 1 - -del 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe || ver > NUL - -setlocal EnableDelayedExpansion -set NVIDIA_GPU_EXISTS=0 -for /F "delims=" %%i in ('wmic path win32_VideoController get name') do ( - set GPUS=%%i - if not "x!GPUS:NVIDIA=!" == "x!GPUS!" ( - SET NVIDIA_GPU_EXISTS=1 - goto gpu_check_end - ) -) -:gpu_check_end -endlocal & set NVIDIA_GPU_EXISTS=%NVIDIA_GPU_EXISTS% - -if "%NVIDIA_GPU_EXISTS%" == "0" ( - echo "CUDA Driver installation Failed" - exit /b 1 -) diff --git a/packaging/windows/internal/vc_env_helper.bat b/packaging/windows/internal/vc_env_helper.bat index e85a372f93d58c87107c7dc1e2d7aa2a5e423445..d3484a66e9f9021a06512a4a7888c7d9329c1029 100644 --- a/packaging/windows/internal/vc_env_helper.bat +++ b/packaging/windows/internal/vc_env_helper.bat @@ -1,7 +1,11 @@ @echo on -set VC_VERSION_LOWER=16 -set VC_VERSION_UPPER=17 +set VC_VERSION_LOWER=17 +set VC_VERSION_UPPER=18 +if "%VC_YEAR%" == "2019" ( + set VC_VERSION_LOWER=16 + set VC_VERSION_UPPER=17 +) if "%VC_YEAR%" == "2017" ( set VC_VERSION_LOWER=15 set VC_VERSION_UPPER=16 diff --git a/packaging/windows/internal/vc_install_helper.sh b/packaging/windows/internal/vc_install_helper.sh deleted file mode 100644 index cdae18065b9f6e97e385fa2002131ef857562306..0000000000000000000000000000000000000000 --- a/packaging/windows/internal/vc_install_helper.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -set -ex - -if [[ "$CU_VERSION" == "cu92" ]]; then - export VC_YEAR=2017 - export VSDEVCMD_ARGS="-vcvars_ver=14.13" - powershell packaging/windows/internal/vs2017_install.ps1 -elif [[ "$CU_VERSION" == "cu100" ]]; then - export VC_YEAR=2017 - export VSDEVCMD_ARGS="" - powershell packaging/windows/internal/vs2017_install.ps1 -else - export VC_YEAR=2019 - export VSDEVCMD_ARGS="" -fi diff --git a/packaging/windows/internal/vs2017_install.ps1 b/packaging/windows/internal/vs2017_install.ps1 deleted file mode 100644 index 3e953de1ab7a0fa33238e10fbcd80564246c1a55..0000000000000000000000000000000000000000 --- a/packaging/windows/internal/vs2017_install.ps1 +++ /dev/null @@ -1,25 +0,0 @@ -$VS_DOWNLOAD_LINK = "https://aka.ms/vs/15/release/vs_buildtools.exe" -$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", 
- "--add Microsoft.VisualStudio.Component.VC.Tools.14.13", - "--add Microsoft.Component.MSBuild", - "--add Microsoft.VisualStudio.Component.Roslyn.Compiler", - "--add Microsoft.VisualStudio.Component.TextTemplating", - "--add Microsoft.VisualStudio.Component.VC.CoreIde", - "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest", - "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core", - "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64", - "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Win81") - -curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe -if ($LASTEXITCODE -ne 0) { - echo "Download of the VS 2017 installer failed" - exit 1 -} - -$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru -Remove-Item -Path vs_installer.exe -Force -$exitCode = $process.ExitCode -if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { - echo "VS 2017 installer exited with code $exitCode, which should be one of [0, 3010]." - exit 1 -} diff --git a/packaging/windows/internal/vs2019_install.ps1 b/packaging/windows/internal/vs2019_install.ps1 deleted file mode 100644 index e436051f0dbb2ce9361f3d1c33295249ba032bb2..0000000000000000000000000000000000000000 --- a/packaging/windows/internal/vs2019_install.ps1 +++ /dev/null @@ -1,21 +0,0 @@ -$VS_DOWNLOAD_LINK = "https://aka.ms/vs/16/release/vs_buildtools.exe" -$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", - "--add Microsoft.Component.MSBuild", - "--add Microsoft.VisualStudio.Component.Roslyn.Compiler", - "--add Microsoft.VisualStudio.Component.VC.CoreBuildTools", - "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest", - "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64") - -curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe -if ($LASTEXITCODE -ne 0) { - echo "Download of the VS 2019 installer failed" - exit 1 -} - -$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru -Remove-Item -Path vs_installer.exe -Force -$exitCode = $process.ExitCode -if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { - echo "VS 2019 installer exited with code $exitCode, which should be one of [0, 3010]." 
- exit 1 -} diff --git a/pyproject.toml b/pyproject.toml index 8f0be4245bd978c8d945bc14e3c276c3b017cf12..61e4a957fc563f9503eb1ef52bb93a701b1fbcb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ first_party_detection = false [tool.black] line-length = 120 -target-version = ["py37"] +target-version = ["py38"] [tool.ufmt] diff --git a/pytest.ini b/pytest.ini index ca7539448595a47ec17404e7eb07be60dc142e35..594f14964a1868374b67506025df277f4b7cfbc7 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,9 +1,9 @@ [pytest] addopts = - # show summary of all tests that did not pass - -ra + # show tests that (f)ailed, (E)rror, or (X)passed in the summary + -rfEX # Make tracebacks shorter - --tb=native + --tb=short # enable all warnings -Wd --ignore=test/test_datasets_download.py diff --git a/references/classification/README.md b/references/classification/README.md index da5cd98867de85b6c6e6d2f9dbc73750e8a63646..66ae871aedeec5af1f275690f64885911b015c8d 100644 --- a/references/classification/README.md +++ b/references/classification/README.md @@ -298,7 +298,7 @@ Here `$MODEL` is one of `googlenet`, `inception_v3`, `resnet18`, `resnet50`, `re ### Quantized ShuffleNet V2 -Here are commands that we use to quantized the `shufflenet_v2_x1_5` and `shufflenet_v2_x2_0` models. +Here are commands that we use to quantize the `shufflenet_v2_x1_5` and `shufflenet_v2_x2_0` models. ``` # For shufflenet_v2_x1_5 python train_quantization.py --device='cpu' --post-training-quantize --backend='fbgemm' \ diff --git a/references/classification/presets.py b/references/classification/presets.py index 5d1bf1cc71455fcb043538fecbad54050671d015..8653957a57646925f2f028041e3fc4b2e422ee94 100644 --- a/references/classification/presets.py +++ b/references/classification/presets.py @@ -1,9 +1,23 @@ import torch -from torchvision.transforms import autoaugment, transforms from torchvision.transforms.functional import InterpolationMode +def get_module(use_v2): + # We need a protected import to avoid the V2 warning in case just V1 is used + if use_v2: + import torchvision.transforms.v2 + + return torchvision.transforms.v2 + else: + import torchvision.transforms + + return torchvision.transforms + + class ClassificationPresetTrain: + # Note: this transform assumes that the input to forward() are always PIL + # images, regardless of the backend parameter. We may change that in the + # future though, if we change the output type from the dataset. 
def __init__( self, *, @@ -16,31 +30,48 @@ class ClassificationPresetTrain: ra_magnitude=9, augmix_severity=3, random_erase_prob=0.0, + backend="pil", + use_v2=False, ): - trans = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] + T = get_module(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tensor' or 'pil', but got {backend}") + + transforms.append(T.RandomResizedCrop(crop_size, interpolation=interpolation, antialias=True)) if hflip_prob > 0: - trans.append(transforms.RandomHorizontalFlip(hflip_prob)) + transforms.append(T.RandomHorizontalFlip(hflip_prob)) if auto_augment_policy is not None: if auto_augment_policy == "ra": - trans.append(autoaugment.RandAugment(interpolation=interpolation, magnitude=ra_magnitude)) + transforms.append(T.RandAugment(interpolation=interpolation, magnitude=ra_magnitude)) elif auto_augment_policy == "ta_wide": - trans.append(autoaugment.TrivialAugmentWide(interpolation=interpolation)) + transforms.append(T.TrivialAugmentWide(interpolation=interpolation)) elif auto_augment_policy == "augmix": - trans.append(autoaugment.AugMix(interpolation=interpolation, severity=augmix_severity)) + transforms.append(T.AugMix(interpolation=interpolation, severity=augmix_severity)) else: - aa_policy = autoaugment.AutoAugmentPolicy(auto_augment_policy) - trans.append(autoaugment.AutoAugment(policy=aa_policy, interpolation=interpolation)) - trans.extend( + aa_policy = T.AutoAugmentPolicy(auto_augment_policy) + transforms.append(T.AutoAugment(policy=aa_policy, interpolation=interpolation)) + + if backend == "pil": + transforms.append(T.PILToTensor()) + + transforms.extend( [ - transforms.PILToTensor(), - transforms.ConvertImageDtype(torch.float), - transforms.Normalize(mean=mean, std=std), + T.ToDtype(torch.float, scale=True) if use_v2 else T.ConvertImageDtype(torch.float), + T.Normalize(mean=mean, std=std), ] ) if random_erase_prob > 0: - trans.append(transforms.RandomErasing(p=random_erase_prob)) + transforms.append(T.RandomErasing(p=random_erase_prob)) - self.transforms = transforms.Compose(trans) + if use_v2: + transforms.append(T.ToPureTensor()) + + self.transforms = T.Compose(transforms) def __call__(self, img): return self.transforms(img) @@ -55,17 +86,34 @@ class ClassificationPresetEval: mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), interpolation=InterpolationMode.BILINEAR, + backend="pil", + use_v2=False, ): + T = get_module(use_v2) + transforms = [] + backend = backend.lower() + if backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tensor' or 'pil', but got {backend}") - self.transforms = transforms.Compose( - [ - transforms.Resize(resize_size, interpolation=interpolation), - transforms.CenterCrop(crop_size), - transforms.PILToTensor(), - transforms.ConvertImageDtype(torch.float), - transforms.Normalize(mean=mean, std=std), - ] - ) + transforms += [ + T.Resize(resize_size, interpolation=interpolation, antialias=True), + T.CenterCrop(crop_size), + ] + + if backend == "pil": + transforms.append(T.PILToTensor()) + + transforms += [ + T.ToDtype(torch.float, scale=True) if use_v2 else T.ConvertImageDtype(torch.float), + T.Normalize(mean=mean, std=std), + ] + + if use_v2: + transforms.append(T.ToPureTensor()) + + self.transforms = T.Compose(transforms) def __call__(self, img): return self.transforms(img) diff --git 
a/references/classification/train.py b/references/classification/train.py index 00af63018316588228ebfbe8629bd59799acca36..1bb0d86e9a592c84cae85839251997ad67db1130 100644 --- a/references/classification/train.py +++ b/references/classification/train.py @@ -7,12 +7,13 @@ import presets import torch import torch.utils.data import torchvision -import transforms +import torchvision.transforms import utils from sampler import RASampler from torch import nn from torch.utils.data.dataloader import default_collate from torchvision.transforms.functional import InterpolationMode +from transforms import get_mixup_cutmix def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema=None, scaler=None): @@ -128,10 +129,12 @@ def load_data(traindir, valdir, args): print(f"Loading dataset_train from {cache_path}") dataset, _ = torch.load(cache_path) else: + # We need a default value for the variables below because args may come + # from train_quantization.py which doesn't define them. auto_augment_policy = getattr(args, "auto_augment", None) random_erase_prob = getattr(args, "random_erase", 0.0) - ra_magnitude = args.ra_magnitude - augmix_severity = args.augmix_severity + ra_magnitude = getattr(args, "ra_magnitude", None) + augmix_severity = getattr(args, "augmix_severity", None) dataset = torchvision.datasets.ImageFolder( traindir, presets.ClassificationPresetTrain( @@ -141,6 +144,8 @@ def load_data(traindir, valdir, args): random_erase_prob=random_erase_prob, ra_magnitude=ra_magnitude, augmix_severity=augmix_severity, + backend=args.backend, + use_v2=args.use_v2, ), ) if args.cache_dataset: @@ -158,10 +163,17 @@ def load_data(traindir, valdir, args): else: if args.weights and args.test_only: weights = torchvision.models.get_weight(args.weights) - preprocessing = weights.transforms() + preprocessing = weights.transforms(antialias=True) + if args.backend == "tensor": + preprocessing = torchvision.transforms.Compose([torchvision.transforms.PILToTensor(), preprocessing]) + else: preprocessing = presets.ClassificationPresetEval( - crop_size=val_crop_size, resize_size=val_resize_size, interpolation=interpolation + crop_size=val_crop_size, + resize_size=val_resize_size, + interpolation=interpolation, + backend=args.backend, + use_v2=args.use_v2, ) dataset_test = torchvision.datasets.ImageFolder( @@ -206,18 +218,17 @@ def main(args): val_dir = os.path.join(args.data_path, "val") dataset, dataset_test, train_sampler, test_sampler = load_data(train_dir, val_dir, args) - collate_fn = None num_classes = len(dataset.classes) - mixup_transforms = [] - if args.mixup_alpha > 0.0: - mixup_transforms.append(transforms.RandomMixup(num_classes, p=1.0, alpha=args.mixup_alpha)) - if args.cutmix_alpha > 0.0: - mixup_transforms.append(transforms.RandomCutmix(num_classes, p=1.0, alpha=args.cutmix_alpha)) - if mixup_transforms: - mixupcutmix = torchvision.transforms.RandomChoice(mixup_transforms) + mixup_cutmix = get_mixup_cutmix( + mixup_alpha=args.mixup_alpha, cutmix_alpha=args.cutmix_alpha, num_categories=num_classes, use_v2=args.use_v2 + ) + if mixup_cutmix is not None: def collate_fn(batch): - return mixupcutmix(*default_collate(batch)) + return mixup_cutmix(*default_collate(batch)) + + else: + collate_fn = default_collate data_loader = torch.utils.data.DataLoader( dataset, @@ -314,11 +325,11 @@ def main(args): model_ema = None if args.model_ema: - # Decay adjustment that aims to keep the decay independent from other hyper-parameters originally proposed at: + # Decay adjustment that aims to 
keep the decay independent of other hyper-parameters originally proposed at: # https://github.com/facebookresearch/pycls/blob/f8cd9627/pycls/core/net.py#L123 # # total_ema_updates = (Dataset_size / n_GPUs) * epochs / (batch_size_per_gpu * EMA_steps) - # We consider constant = Dataset_size for a given dataset/setup and ommit it. Thus: + # We consider constant = Dataset_size for a given dataset/setup and omit it. Thus: # adjust = 1 / total_ema_updates ~= n_GPUs * batch_size_per_gpu * EMA_steps / epochs adjust = args.world_size * args.batch_size * args.model_ema_steps / args.epochs alpha = 1.0 - args.model_ema_decay @@ -505,6 +516,8 @@ def get_args_parser(add_help=True): "--ra-reps", default=3, type=int, help="number of repetitions for Repeated Augmentation (default: 3)" ) parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load") + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") return parser diff --git a/references/classification/transforms.py b/references/classification/transforms.py index 9a8ef7877d6d5525aead106695d495b7a05cb009..3d10388c36fc8af1374d235049f4e2c690fe436f 100644 --- a/references/classification/transforms.py +++ b/references/classification/transforms.py @@ -2,12 +2,35 @@ import math from typing import Tuple import torch +from presets import get_module from torch import Tensor from torchvision.transforms import functional as F -class RandomMixup(torch.nn.Module): - """Randomly apply Mixup to the provided batch and targets. +def get_mixup_cutmix(*, mixup_alpha, cutmix_alpha, num_categories, use_v2): + transforms_module = get_module(use_v2) + + mixup_cutmix = [] + if mixup_alpha > 0: + mixup_cutmix.append( + transforms_module.MixUp(alpha=mixup_alpha, num_categories=num_categories) + if use_v2 + else RandomMixUp(num_classes=num_categories, p=1.0, alpha=mixup_alpha) + ) + if cutmix_alpha > 0: + mixup_cutmix.append( + transforms_module.CutMix(alpha=mixup_alpha, num_categories=num_categories) + if use_v2 + else RandomCutMix(num_classes=num_categories, p=1.0, alpha=mixup_alpha) + ) + if not mixup_cutmix: + return None + + return transforms_module.RandomChoice(mixup_cutmix) + + +class RandomMixUp(torch.nn.Module): + """Randomly apply MixUp to the provided batch and targets. The class implements the data augmentations as described in the paper `"mixup: Beyond Empirical Risk Minimization" `_. @@ -89,8 +112,8 @@ class RandomMixup(torch.nn.Module): return s -class RandomCutmix(torch.nn.Module): - """Randomly apply Cutmix to the provided batch and targets. +class RandomCutMix(torch.nn.Module): + """Randomly apply CutMix to the provided batch and targets. The class implements the data augmentations as described in the paper `"CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features" `_. diff --git a/references/classification/utils.py b/references/classification/utils.py index c31f3928e8641acabf725be129895f0533ecd29e..3e6c2e89e39341d0d15aac17948c1f284f5d8353 100644 --- a/references/classification/utils.py +++ b/references/classification/utils.py @@ -365,12 +365,12 @@ def store_model_weights(model, checkpoint_path, checkpoint_key="model", strict=T checkpoint_path = os.path.abspath(checkpoint_path) output_dir = os.path.dirname(checkpoint_path) - # Deep copy to avoid side-effects on the model object. + # Deep copy to avoid side effects on the model object. 
    model = copy.deepcopy(model)
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    # Load the weights to the model to validate that everything works
-    # and remove unnecessary weights (such as auxiliaries, etc)
+    # and remove unnecessary weights (such as auxiliaries, etc.)
    if checkpoint_key == "model_ema":
        del checkpoint[checkpoint_key]["n_averaged"]
        torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(checkpoint[checkpoint_key], "module.")
diff --git a/references/depth/stereo/README.md b/references/depth/stereo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..22bcae27ab0e81cbd9899e5db185819f27c1f115
--- /dev/null
+++ b/references/depth/stereo/README.md
@@ -0,0 +1,180 @@
+# Stereo Matching reference training scripts
+
+This folder contains reference training scripts for Stereo Matching.
+They serve as a log of how to train specific models, so as to provide baseline
+training and evaluation scripts to quickly bootstrap research.
+
+
+### CREStereo
+
+The CREStereo model was trained on a dataset mixture of **CREStereo**, **ETH3D** and the additional split from **Middlebury2014**.
+A ratio of **88-6-6** was used in order to train a baseline weight set. We provide a multi-set variant as well.
+Both used 8 A100 GPUs and a batch size of 2 (so the effective batch size is 16). The
+rest of the hyper-parameters loosely follow the recipe from https://github.com/megvii-research/CREStereo.
+The original recipe trains for **300000** updates (or steps) on the dataset mixture. We modify the learning rate
+schedule to one that starts decaying much sooner. Throughout the experiments we found that this reduces
+overfitting at evaluation time and that gradient clipping helps stabilize the loss during a premature learning rate change.
+
+```
+torchrun --nproc_per_node 8 --nnodes 1 train.py \
+    --dataset-root $dataset_root \
+    --name $name_cre \
+    --model crestereo_base \
+    --train-datasets crestereo eth3d-train middlebury2014-other \
+    --dataset-steps 264000 18000 18000 \
+    --batch-size 2 \
+    --lr 0.0004 \
+    --min-lr 0.00002 \
+    --lr-decay-method cosine \
+    --warmup-steps 6000 \
+    --decay-after-steps 30000 \
+    --clip-grad-norm 1.0
+```
+
+We employ a multi-set fine-tuning stage where we uniformly sample from multiple datasets. Given that some of these datasets have extremely large images (``2048x2048`` or more) we opt for a very aggressive scale-range ``[0.2 - 0.8]`` such that as much of the original frame composition as possible is captured inside the ``384x512`` crop.
+
+```
+torchrun --nproc_per_node 8 --nnodes 1 train.py \
+    --dataset-root $dataset_root \
+    --name $name_things \
+    --model crestereo_base \
+    --train-datasets crestereo eth3d-train middlebury2014-other instereo2k fallingthings carla-highres sintel sceneflow-monkaa sceneflow-driving \
+    --dataset-steps 12000 12000 12000 12000 12000 12000 12000 12000 12000 \
+    --batch-size 2 \
+    --scale-range 0.2 0.8 \
+    --lr 0.0004 \
+    --lr-decay-method cosine \
+    --decay-after-steps 0 \
+    --warmup-steps 0 \
+    --min-lr 0.00002 \
+    --resume-path $checkpoint_dir/$name_cre.pth
+```
+
+
+### Evaluation
+
+Evaluating the base weights:
+
+```
+torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.CRESTEREO_ETH_MBL_V1
+```
+
+This should give an **mae of about 1.416** on the train set of `Middlebury2014`. Results may vary slightly depending on the batch size and the number of GPUs.
For the most accurate results use 1 GPU and `--batch-size 1`. The created log file should look like this, where the first key is the number of cascades and the nested key is the number of recursive iterations: + +``` +Dataset: middlebury2014-train @size: [384, 512]: +{ + 1: { + 2: {'mae': 2.363, 'rmse': 4.352, '1px': 0.611, '3px': 0.828, '5px': 0.891, 'relepe': 0.176, 'fl-all': 64.511} + 5: {'mae': 1.618, 'rmse': 3.71, '1px': 0.761, '3px': 0.879, '5px': 0.918, 'relepe': 0.154, 'fl-all': 77.128} + 10: {'mae': 1.416, 'rmse': 3.53, '1px': 0.777, '3px': 0.896, '5px': 0.933, 'relepe': 0.148, 'fl-all': 78.388} + 20: {'mae': 1.448, 'rmse': 3.583, '1px': 0.771, '3px': 0.893, '5px': 0.931, 'relepe': 0.145, 'fl-all': 77.7} + }, +} +{ + 2: { + 2: {'mae': 1.972, 'rmse': 4.125, '1px': 0.73, '3px': 0.865, '5px': 0.908, 'relepe': 0.169, 'fl-all': 74.396} + 5: {'mae': 1.403, 'rmse': 3.448, '1px': 0.793, '3px': 0.905, '5px': 0.937, 'relepe': 0.151, 'fl-all': 80.186} + 10: {'mae': 1.312, 'rmse': 3.368, '1px': 0.799, '3px': 0.912, '5px': 0.943, 'relepe': 0.148, 'fl-all': 80.379} + 20: {'mae': 1.376, 'rmse': 3.542, '1px': 0.796, '3px': 0.91, '5px': 0.942, 'relepe': 0.149, 'fl-all': 80.054} + }, +} +``` + +You can also evaluate the Finetuned weights: + +``` +torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.CRESTEREO_FINETUNE_MULTI_V1 +``` + +``` +Dataset: middlebury2014-train @size: [384, 512]: +{ + 1: { + 2: {'mae': 1.85, 'rmse': 3.797, '1px': 0.673, '3px': 0.862, '5px': 0.917, 'relepe': 0.171, 'fl-all': 69.736} + 5: {'mae': 1.111, 'rmse': 3.166, '1px': 0.838, '3px': 0.93, '5px': 0.957, 'relepe': 0.134, 'fl-all': 84.596} + 10: {'mae': 1.02, 'rmse': 3.073, '1px': 0.854, '3px': 0.938, '5px': 0.96, 'relepe': 0.129, 'fl-all': 86.042} + 20: {'mae': 0.993, 'rmse': 3.059, '1px': 0.855, '3px': 0.942, '5px': 0.967, 'relepe': 0.126, 'fl-all': 85.784} + }, +} +{ + 2: { + 2: {'mae': 1.667, 'rmse': 3.867, '1px': 0.78, '3px': 0.891, '5px': 0.922, 'relepe': 0.165, 'fl-all': 78.89} + 5: {'mae': 1.158, 'rmse': 3.278, '1px': 0.843, '3px': 0.926, '5px': 0.955, 'relepe': 0.135, 'fl-all': 84.556} + 10: {'mae': 1.046, 'rmse': 3.13, '1px': 0.85, '3px': 0.934, '5px': 0.96, 'relepe': 0.13, 'fl-all': 85.464} + 20: {'mae': 1.021, 'rmse': 3.102, '1px': 0.85, '3px': 0.935, '5px': 0.963, 'relepe': 0.129, 'fl-all': 85.417} + }, +} +``` + +Evaluating the author provided weights: + +``` +torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.MEGVII_V1 +``` + +``` +Dataset: middlebury2014-train @size: [384, 512]: +{ + 1: { + 2: {'mae': 1.704, 'rmse': 3.738, '1px': 0.738, '3px': 0.896, '5px': 0.933, 'relepe': 0.157, 'fl-all': 76.464} + 5: {'mae': 0.956, 'rmse': 2.963, '1px': 0.88, '3px': 0.948, '5px': 0.965, 'relepe': 0.124, 'fl-all': 88.186} + 10: {'mae': 0.792, 'rmse': 2.765, '1px': 0.905, '3px': 0.958, '5px': 0.97, 'relepe': 0.114, 'fl-all': 90.429} + 20: {'mae': 0.749, 'rmse': 2.706, '1px': 0.907, '3px': 0.961, '5px': 0.972, 'relepe': 0.113, 'fl-all': 90.807} + }, +} +{ + 2: { + 2: {'mae': 1.702, 'rmse': 3.784, '1px': 0.784, '3px': 0.894, '5px': 0.924, 'relepe': 0.172, 'fl-all': 80.313} + 5: {'mae': 0.932, 'rmse': 2.907, '1px': 0.877, '3px': 0.944, '5px': 0.963, 'relepe': 0.125, 'fl-all': 87.979} + 10: {'mae': 0.773, 'rmse': 2.768, '1px': 0.901, '3px': 0.958, 
'5px': 0.972, 'relepe': 0.117, 'fl-all': 90.43}
+        20: {'mae': 0.854, 'rmse': 2.971, '1px': 0.9, '3px': 0.957, '5px': 0.97, 'relepe': 0.122, 'fl-all': 90.269}
+    },
+}
+```
+
+# Concerns when training
+
+We encourage users to be aware of the **aspect-ratio** and **disparity scale** they are targeting when doing any sort of training or fine-tuning. The model is highly sensitive to these two factors; as a consequence of naive multi-set fine-tuning, one can achieve `0.2 mae` relatively fast. We recommend that users pay close attention to how they **balance dataset sizing** when training such networks.
+
+ Ideally, dataset scaling should be treated at an individual level and a thorough **EDA** of the disparity distribution in random crops at the desired training / inference size should be performed prior to any large compute investments.
+
+### Disparity scaling
+
+##### Sample A
+ The top row contains a sample from `Sintel` whereas the bottom row contains one from `Middlebury`.
+
+![Disparity1](assets/disparity-domain-drift.jpg)
+
+From left to right (`left_image`, `right_image`, `valid_mask`, `valid_mask & ground_truth`, `prediction`). **Darker is further away, lighter is closer**. In the case of `Sintel`, which is more closely aligned to the original distribution of `CREStereo`, we notice that the model accurately predicts the background scale, whereas in the case of `Middlebury2014` it cannot correctly estimate the continuous disparity. Notice that the frame composition is similar for both examples. The blue skybox in the `Sintel` scene behaves similarly to the `Middlebury` black background. However, because the `Middlebury` sample comes from an extremely large scene, the crop size of `384x512` does not correctly capture the general training distribution.
+
+
+
+##### Sample B
+
+The top row contains a scene from `Sceneflow` using the `Monkaa` split whilst the bottom row is a scene from `Middlebury`. This sample exhibits the same issues when it comes to **background estimation**. Given the exaggerated size of the `Middlebury` samples, the model **collapses the smooth background** of the sample to what it considers to be a mean background disparity value.
+
+![Disparity2](assets/disparity-background-mode-collapse.jpg)
+
+
+For more detail on why this behaviour occurs based on the training distribution proportions, you can read more about the network at: https://github.com/pytorch/vision/pull/6629#discussion_r978160493
+
+
+### Metric overfitting
+
+##### Learning is critical in the beginning
+
+We also advise users to make use of faster training schedules, as the performance gain over long periods of time is marginal. Here we exhibit the difference between an earlier decay schedule and a later decay schedule.
+
+![Loss1](assets/Loss.jpg)
+
+In **grey** we set the lr decay to begin after `30000` steps whilst in **orange** we opt for a very late learning rate decay at around `180000` steps. Although exhibiting stronger variance, we can notice that starting the learning rate decay earlier whilst employing `gradient-norm` clipping outperforms the default configuration.
+
+##### Gradient norm saves time
+
+![Loss2](assets/gradient-norm-removal.jpg)
+
+In **grey** we keep ``gradient norm`` enabled whilst in **orange** we do not. We can notice that removing the gradient norm exacerbates the performance decrease in the early stages, whilst also showcasing an almost complete collapse around the `60000` steps mark where we started decaying the lr for **orange**.
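+
+The schedule and clipping discussed in this section boil down to a few lines of PyTorch. The snippet below is only an illustrative sketch of that setup (the model, the loss and the exact factors are placeholders, not the reference `train.py` implementation); the step counts mirror the `--warmup-steps`, `--decay-after-steps`, `--min-lr` and `--clip-grad-norm` flags used above.
+
+```
+import torch
+
+model = torch.nn.Conv2d(3, 2, 3)  # stand-in for the stereo network
+optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004)
+
+# warmup -> flat -> cosine decay, chained with SequentialLR
+warmup, decay_after, total = 6000, 30000, 300000
+scheduler = torch.optim.lr_scheduler.SequentialLR(
+    optimizer,
+    schedulers=[
+        torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.05, total_iters=warmup),
+        torch.optim.lr_scheduler.ConstantLR(optimizer, factor=1.0, total_iters=decay_after - warmup),
+        torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total - decay_after, eta_min=0.00002),
+    ],
+    milestones=[warmup, decay_after],
+)
+
+for step in range(total):
+    loss = model(torch.rand(2, 3, 384, 512)).abs().mean()  # placeholder loss
+    optimizer.zero_grad()
+    loss.backward()
+    # gradient-norm clipping, i.e. what --clip-grad-norm 1.0 enables
+    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+    optimizer.step()
+    scheduler.step()
+```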
+
+Although both runs achieve an improvement of about ``0.1`` mae after the lr decay start, the benefits are observable much faster when ``gradient norm`` is employed, as the recovery period is no longer accounted for.
diff --git a/references/depth/stereo/cascade_evaluation.py b/references/depth/stereo/cascade_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee506ce398551537ad38a9a1f5eb01eb52c4ec7e
--- /dev/null
+++ b/references/depth/stereo/cascade_evaluation.py
@@ -0,0 +1,299 @@
+import os
+import warnings
+
+import torch
+import torchvision
+import torchvision.prototype.models.depth.stereo
+import utils
+from torch.nn import functional as F
+from train import make_eval_loader
+
+from utils.metrics import AVAILABLE_METRICS
+from visualization import make_prediction_image_side_to_side
+
+
+def get_args_parser(add_help=True):
+    import argparse
+
+    parser = argparse.ArgumentParser(description="PyTorch Stereo Matching Evaluation", add_help=add_help)
+    parser.add_argument("--dataset", type=str, default="middlebury2014-train", help="dataset to use")
+    parser.add_argument("--dataset-root", type=str, default="", help="root of the dataset")
+
+    parser.add_argument("--checkpoint", type=str, default="", help="path to weights")
+    parser.add_argument("--weights", type=str, default=None, help="torchvision API weight")
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="crestereo_base",
+        help="which model to use if not specifying a training checkpoint",
+    )
+    parser.add_argument("--img-folder", type=str, default="images")
+
+    parser.add_argument("--batch-size", type=int, default=1, help="batch size")
+    parser.add_argument("--workers", type=int, default=0, help="number of workers")
+
+    parser.add_argument("--eval-size", type=int, nargs="+", default=[384, 512], help="resize size")
+    parser.add_argument(
+        "--norm-mean", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="mean for image normalization"
+    )
+    parser.add_argument(
+        "--norm-std", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="std for image normalization"
+    )
+    parser.add_argument(
+        "--use-grayscale", action="store_true", help="use grayscale images instead of RGB", default=False
+    )
+    parser.add_argument("--max-disparity", type=float, default=None, help="maximum disparity")
+    parser.add_argument(
+        "--interpolation-strategy",
+        type=str,
+        default="bilinear",
+        help="interpolation strategy",
+        choices=["bilinear", "bicubic", "mixed"],
+    )
+
+    parser.add_argument("--n_iterations", nargs="+", type=int, default=[10], help="number of recurrent iterations")
+    parser.add_argument("--n_cascades", nargs="+", type=int, default=[1], help="number of cascades")
+    parser.add_argument(
+        "--metrics",
+        type=str,
+        nargs="+",
+        default=["mae", "rmse", "1px", "3px", "5px", "relepe"],
+        help="metrics to log",
+        choices=AVAILABLE_METRICS,
+    )
+    parser.add_argument("--mixed-precision", action="store_true", help="use mixed precision training")
+
+    parser.add_argument("--world-size", type=int, default=1, help="number of distributed processes")
+    parser.add_argument("--dist-url", type=str, default="env://", help="url used to set up distributed training")
+    parser.add_argument("--device", type=str, default="cuda", help="device to use for training")
+
+    parser.add_argument("--save-images", action="store_true", help="save images of the predictions")
+    parser.add_argument("--padder-type", type=str, default="kitti", help="padder type", choices=["kitti", "sintel"])
+
+    return parser
+
+
+def
cascade_inference(model, image_left, image_right, iterations, cascades): + # check that image size is divisible by 16 * (2 ** (cascades - 1)) + for image in [image_left, image_right]: + if image.shape[-2] % ((2 ** (cascades - 1))) != 0: + raise ValueError( + f"image height is not divisible by {16 * (2 ** (cascades - 1))}. Image shape: {image.shape[-2]}" + ) + + if image.shape[-1] % ((2 ** (cascades - 1))) != 0: + raise ValueError( + f"image width is not divisible by {16 * (2 ** (cascades - 1))}. Image shape: {image.shape[-2]}" + ) + + left_image_pyramid = [image_left] + right_image_pyramid = [image_right] + for idx in range(0, cascades - 1): + ds_factor = int(2 ** (idx + 1)) + ds_shape = (image_left.shape[-2] // ds_factor, image_left.shape[-1] // ds_factor) + left_image_pyramid += F.interpolate(image_left, size=ds_shape, mode="bilinear", align_corners=True).unsqueeze(0) + right_image_pyramid += F.interpolate(image_right, size=ds_shape, mode="bilinear", align_corners=True).unsqueeze( + 0 + ) + + flow_init = None + for left_image, right_image in zip(reversed(left_image_pyramid), reversed(right_image_pyramid)): + flow_pred = model(left_image, right_image, flow_init, num_iters=iterations) + # flow pred is a list + flow_init = flow_pred[-1] + + return flow_init + + +@torch.inference_mode() +def _evaluate( + model, + args, + val_loader, + *, + padder_mode, + print_freq=10, + writer=None, + step=None, + iterations=10, + cascades=1, + batch_size=None, + header=None, + save_images=False, + save_path="", +): + """Helper function to compute various metrics (epe, etc.) for a model on a given dataset. + We process as many samples as possible with ddp. + """ + model.eval() + header = header or "Test:" + device = torch.device(args.device) + metric_logger = utils.MetricLogger(delimiter=" ") + + iterations = iterations or args.recurrent_updates + + logger = utils.MetricLogger() + for meter_name in args.metrics: + logger.add_meter(meter_name, fmt="{global_avg:.4f}") + if "fl-all" not in args.metrics: + logger.add_meter("fl-all", fmt="{global_avg:.4f}") + + num_processed_samples = 0 + with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): + batch_idx = 0 + for blob in metric_logger.log_every(val_loader, print_freq, header): + image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob) + padder = utils.InputPadder(image_left.shape, mode=padder_mode) + image_left, image_right = padder.pad(image_left, image_right) + + disp_pred = cascade_inference(model, image_left, image_right, iterations, cascades) + disp_pred = disp_pred[:, :1, :, :] + disp_pred = padder.unpad(disp_pred) + + if save_images: + if args.distributed: + rank_prefix = args.rank + else: + rank_prefix = 0 + make_prediction_image_side_to_side( + disp_pred, disp_gt, valid_disp_mask, save_path, prefix=f"batch_{rank_prefix}_{batch_idx}" + ) + + metrics, _ = utils.compute_metrics(disp_pred, disp_gt, valid_disp_mask, metrics=logger.meters.keys()) + num_processed_samples += image_left.shape[0] + for name in metrics: + logger.meters[name].update(metrics[name], n=1) + + batch_idx += 1 + + num_processed_samples = utils.reduce_across_processes(num_processed_samples) / args.world_size + + print("Num_processed_samples: ", num_processed_samples) + if ( + hasattr(val_loader.dataset, "__len__") + and len(val_loader.dataset) != num_processed_samples + and torch.distributed.get_rank() == 0 + ): + warnings.warn( + f"Number of processed samples {num_processed_samples} is different" + f"from the dataset size 
{len(val_loader.dataset)}. This may happen if" + "the dataset is not divisible by the batch size. Try lowering the batch size for more accurate results." + ) + + if writer is not None and args.rank == 0: + for meter_name, meter_value in logger.meters.items(): + scalar_name = f"{meter_name} {header}" + writer.add_scalar(scalar_name, meter_value.avg, step) + + logger.synchronize_between_processes() + print(header, logger) + + logger_metrics = {k: v.global_avg for k, v in logger.meters.items()} + return logger_metrics + + +def evaluate(model, loader, args, writer=None, step=None): + os.makedirs(args.img_folder, exist_ok=True) + checkpoint_name = os.path.basename(args.checkpoint) or args.weights + image_checkpoint_folder = os.path.join(args.img_folder, checkpoint_name) + + metrics = {} + base_image_folder = os.path.join(image_checkpoint_folder, args.dataset) + os.makedirs(base_image_folder, exist_ok=True) + + for n_cascades in args.n_cascades: + for n_iters in args.n_iterations: + + config = f"{n_cascades}c_{n_iters}i" + config_image_folder = os.path.join(base_image_folder, config) + os.makedirs(config_image_folder, exist_ok=True) + + metrics[config] = _evaluate( + model, + args, + loader, + padder_mode=args.padder_type, + header=f"{args.dataset} evaluation@ size:{args.eval_size} n_cascades:{n_cascades} n_iters:{n_iters}", + batch_size=args.batch_size, + writer=writer, + step=step, + iterations=n_iters, + cascades=n_cascades, + save_path=config_image_folder, + save_images=args.save_images, + ) + + metric_log = [] + metric_log_dict = {} + # print the final results + for config in metrics: + config_tokens = config.split("_") + config_iters = config_tokens[1][:-1] + config_cascades = config_tokens[0][:-1] + + metric_log_dict[config_cascades] = metric_log_dict.get(config_cascades, {}) + metric_log_dict[config_cascades][config_iters] = metrics[config] + + evaluation_str = f"{args.dataset} evaluation@ size:{args.eval_size} n_cascades:{config_cascades} recurrent_updates:{config_iters}" + metrics_str = f"Metrics: {metrics[config]}" + metric_log.extend([evaluation_str, metrics_str]) + + print(evaluation_str) + print(metrics_str) + + eval_log_name = f"{checkpoint_name.replace('.pth', '')}_eval.log" + print("Saving eval log to: ", eval_log_name) + with open(eval_log_name, "w") as f: + f.write(f"Dataset: {args.dataset} @size: {args.eval_size}:\n") + # write the dict line by line for each key, and each value in the keys + for config_cascades in metric_log_dict: + f.write("{\n") + f.write(f"\t{config_cascades}: {{\n") + for config_iters in metric_log_dict[config_cascades]: + # convert every metric to 4 decimal places + metrics = metric_log_dict[config_cascades][config_iters] + metrics = {k: float(f"{v:.3f}") for k, v in metrics.items()} + f.write(f"\t\t{config_iters}: {metrics}\n") + f.write("\t},\n") + f.write("}\n") + + +def load_checkpoint(args): + utils.setup_ddp(args) + + if not args.weights: + checkpoint = torch.load(args.checkpoint, map_location=torch.device("cpu")) + if "model" in checkpoint: + experiment_args = checkpoint["args"] + model = torchvision.prototype.models.depth.stereo.__dict__[experiment_args.model](weights=None) + model.load_state_dict(checkpoint["model"]) + else: + model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=None) + model.load_state_dict(checkpoint) + + # set the appropriate devices + if args.distributed and args.device == "cpu": + raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun") + device = 
torch.device(args.device) + else: + model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=args.weights) + + # convert to DDP if need be + if args.distributed: + model = model.to(args.device) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.to(device) + + return model + + +def main(args): + model = load_checkpoint(args) + loader = make_eval_loader(args.dataset, args) + evaluate(model, loader, args) + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + main(args) diff --git a/references/depth/stereo/train.py b/references/depth/stereo/train.py new file mode 100644 index 0000000000000000000000000000000000000000..30c73628c619b9d0443640615e6c59a3cd678fad --- /dev/null +++ b/references/depth/stereo/train.py @@ -0,0 +1,788 @@ +import argparse +import os +import warnings +from pathlib import Path +from typing import List, Union + +import numpy as np +import torch +import torch.distributed as dist +import torchvision.models.optical_flow +import torchvision.prototype.models.depth.stereo +import utils +import visualization + +from parsing import make_dataset, make_eval_transform, make_train_transform, VALID_DATASETS +from torch import nn +from torchvision.transforms.functional import get_dimensions, InterpolationMode, resize +from utils.metrics import AVAILABLE_METRICS +from utils.norm import freeze_batch_norm + + +def make_stereo_flow(flow: Union[torch.Tensor, List[torch.Tensor]], model_out_channels: int) -> torch.Tensor: + """Helper function to make stereo flow from a given model output""" + if isinstance(flow, list): + return [make_stereo_flow(flow_i, model_out_channels) for flow_i in flow] + + B, C, H, W = flow.shape + # we need to add zero flow if the model outputs 2 channels + if C == 1 and model_out_channels == 2: + zero_flow = torch.zeros_like(flow) + # by convention the flow is X-Y axis, so we need the Y flow last + flow = torch.cat([flow, zero_flow], dim=1) + return flow + + +def make_lr_schedule(args: argparse.Namespace, optimizer: torch.optim.Optimizer) -> np.ndarray: + """Helper function to return a learning rate scheduler for CRE-stereo""" + if args.decay_after_steps < args.warmup_steps: + raise ValueError(f"decay_after_steps: {args.function} must be greater than warmup_steps: {args.warmup_steps}") + + warmup_steps = args.warmup_steps if args.warmup_steps else 0 + flat_lr_steps = args.decay_after_steps - warmup_steps if args.decay_after_steps else 0 + decay_lr_steps = args.total_iterations - flat_lr_steps + + max_lr = args.lr + min_lr = args.min_lr + + schedulers = [] + milestones = [] + + if warmup_steps > 0: + if args.lr_warmup_method == "linear": + warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=args.lr_warmup_factor, total_iters=warmup_steps + ) + elif args.lr_warmup_method == "constant": + warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR( + optimizer, factor=args.lr_warmup_factor, total_iters=warmup_steps + ) + else: + raise ValueError(f"Unknown lr warmup method {args.lr_warmup_method}") + schedulers.append(warmup_lr_scheduler) + milestones.append(warmup_steps) + + if flat_lr_steps > 0: + flat_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=max_lr, total_iters=flat_lr_steps) + schedulers.append(flat_lr_scheduler) + milestones.append(flat_lr_steps + warmup_steps) + + if decay_lr_steps > 0: + if args.lr_decay_method == "cosine": + decay_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=decay_lr_steps, 
eta_min=min_lr + ) + elif args.lr_decay_method == "linear": + decay_lr_scheduler = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=max_lr, end_factor=min_lr, total_iters=decay_lr_steps + ) + elif args.lr_decay_method == "exponential": + decay_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR( + optimizer, gamma=args.lr_decay_gamma, last_epoch=-1 + ) + else: + raise ValueError(f"Unknown lr decay method {args.lr_decay_method}") + schedulers.append(decay_lr_scheduler) + + scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers, milestones=milestones) + return scheduler + + +def shuffle_dataset(dataset): + """Shuffle the dataset""" + perm = torch.randperm(len(dataset)) + return torch.utils.data.Subset(dataset, perm) + + +def resize_dataset_to_n_steps( + dataset: torch.utils.data.Dataset, dataset_steps: int, samples_per_step: int, args: argparse.Namespace +) -> torch.utils.data.Dataset: + original_size = len(dataset) + if args.steps_is_epochs: + samples_per_step = original_size + target_size = dataset_steps * samples_per_step + + dataset_copies = [] + n_expands, remainder = divmod(target_size, original_size) + for idx in range(n_expands): + dataset_copies.append(dataset) + + if remainder > 0: + dataset_copies.append(torch.utils.data.Subset(dataset, list(range(remainder)))) + + if args.dataset_shuffle: + dataset_copies = [shuffle_dataset(dataset_copy) for dataset_copy in dataset_copies] + + dataset = torch.utils.data.ConcatDataset(dataset_copies) + return dataset + + +def get_train_dataset(dataset_root: str, args: argparse.Namespace) -> torch.utils.data.Dataset: + datasets = [] + for dataset_name in args.train_datasets: + transform = make_train_transform(args) + dataset = make_dataset(dataset_name, dataset_root, transform) + datasets.append(dataset) + + if len(datasets) == 0: + raise ValueError("No datasets specified for training") + + samples_per_step = args.world_size * args.batch_size + + for idx, (dataset, steps_per_dataset) in enumerate(zip(datasets, args.dataset_steps)): + datasets[idx] = resize_dataset_to_n_steps(dataset, steps_per_dataset, samples_per_step, args) + + dataset = torch.utils.data.ConcatDataset(datasets) + if args.dataset_order_shuffle: + dataset = shuffle_dataset(dataset) + + print(f"Training dataset: {len(dataset)} samples") + return dataset + + +@torch.inference_mode() +def _evaluate( + model, + args, + val_loader, + *, + padder_mode, + print_freq=10, + writer=None, + step=None, + iterations=None, + batch_size=None, + header=None, +): + """Helper function to compute various metrics (epe, etc.) 
for a model on a given dataset.""" + model.eval() + header = header or "Test:" + device = torch.device(args.device) + metric_logger = utils.MetricLogger(delimiter=" ") + + iterations = iterations or args.recurrent_updates + + logger = utils.MetricLogger() + for meter_name in args.metrics: + logger.add_meter(meter_name, fmt="{global_avg:.4f}") + if "fl-all" not in args.metrics: + logger.add_meter("fl-all", fmt="{global_avg:.4f}") + + num_processed_samples = 0 + with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): + for blob in metric_logger.log_every(val_loader, print_freq, header): + image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob) + padder = utils.InputPadder(image_left.shape, mode=padder_mode) + image_left, image_right = padder.pad(image_left, image_right) + + disp_predictions = model(image_left, image_right, flow_init=None, num_iters=iterations) + disp_pred = disp_predictions[-1][:, :1, :, :] + disp_pred = padder.unpad(disp_pred) + + metrics, _ = utils.compute_metrics(disp_pred, disp_gt, valid_disp_mask, metrics=logger.meters.keys()) + num_processed_samples += image_left.shape[0] + for name in metrics: + logger.meters[name].update(metrics[name], n=1) + + num_processed_samples = utils.reduce_across_processes(num_processed_samples) + + print("Num_processed_samples: ", num_processed_samples) + if ( + hasattr(val_loader.dataset, "__len__") + and len(val_loader.dataset) != num_processed_samples + and torch.distributed.get_rank() == 0 + ): + warnings.warn( + f"Number of processed samples {num_processed_samples} is different" + f"from the dataset size {len(val_loader.dataset)}. This may happen if" + "the dataset is not divisible by the batch size. Try lowering the batch size or GPU number for more accurate results." 
+ ) + + if writer is not None and args.rank == 0: + for meter_name, meter_value in logger.meters.items(): + scalar_name = f"{meter_name} {header}" + writer.add_scalar(scalar_name, meter_value.avg, step) + + logger.synchronize_between_processes() + print(header, logger) + + +def make_eval_loader(dataset_name: str, args: argparse.Namespace) -> torch.utils.data.DataLoader: + if args.weights: + weights = torchvision.models.get_weight(args.weights) + trans = weights.transforms() + + def preprocessing(image_left, image_right, disp, valid_disp_mask): + C_o, H_o, W_o = get_dimensions(image_left) + image_left, image_right = trans(image_left, image_right) + + C_t, H_t, W_t = get_dimensions(image_left) + scale_factor = W_t / W_o + + if disp is not None and not isinstance(disp, torch.Tensor): + disp = torch.from_numpy(disp) + if W_t != W_o: + disp = resize(disp, (H_t, W_t), mode=InterpolationMode.BILINEAR) * scale_factor + if valid_disp_mask is not None and not isinstance(valid_disp_mask, torch.Tensor): + valid_disp_mask = torch.from_numpy(valid_disp_mask) + if W_t != W_o: + valid_disp_mask = resize(valid_disp_mask, (H_t, W_t), mode=InterpolationMode.NEAREST) + return image_left, image_right, disp, valid_disp_mask + + else: + preprocessing = make_eval_transform(args) + + val_dataset = make_dataset(dataset_name, args.dataset_root, transforms=preprocessing) + if args.distributed: + sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=False) + else: + sampler = torch.utils.data.SequentialSampler(val_dataset) + + val_loader = torch.utils.data.DataLoader( + val_dataset, + sampler=sampler, + batch_size=args.batch_size, + pin_memory=True, + num_workers=args.workers, + ) + + return val_loader + + +def evaluate(model, loaders, args, writer=None, step=None): + for loader_name, loader in loaders.items(): + _evaluate( + model, + args, + loader, + iterations=args.recurrent_updates, + padder_mode=args.padder_type, + header=f"{loader_name} evaluation", + batch_size=args.batch_size, + writer=writer, + step=step, + ) + + +def run(model, optimizer, scheduler, train_loader, val_loaders, logger, writer, scaler, args): + device = torch.device(args.device) + # wrap the loader in a logger + loader = iter(logger.log_every(train_loader)) + # output channels + model_out_channels = model.module.output_channels if args.distributed else model.output_channels + + torch.set_num_threads(args.threads) + + sequence_criterion = utils.SequenceLoss( + gamma=args.gamma, + max_flow=args.max_disparity, + exclude_large_flows=args.flow_loss_exclude_large, + ).to(device) + + if args.consistency_weight: + consistency_criterion = utils.FlowSequenceConsistencyLoss( + args.gamma, + resize_factor=0.25, + rescale_factor=0.25, + rescale_mode="bilinear", + ).to(device) + else: + consistency_criterion = None + + if args.psnr_weight: + psnr_criterion = utils.PSNRLoss().to(device) + else: + psnr_criterion = None + + if args.smoothness_weight: + smoothness_criterion = utils.SmoothnessLoss().to(device) + else: + smoothness_criterion = None + + if args.photometric_weight: + photometric_criterion = utils.FlowPhotoMetricLoss( + ssim_weight=args.photometric_ssim_weight, + max_displacement_ratio=args.photometric_max_displacement_ratio, + ssim_use_padding=False, + ).to(device) + else: + photometric_criterion = None + + for step in range(args.start_step + 1, args.total_iterations + 1): + data_blob = next(loader) + optimizer.zero_grad() + + # unpack the data blob + image_left, image_right, disp_mask, valid_disp_mask = 
(x.to(device) for x in data_blob) + with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): + disp_predictions = model(image_left, image_right, flow_init=None, num_iters=args.recurrent_updates) + # different models have different outputs, make sure we get the right ones for this task + disp_predictions = make_stereo_flow(disp_predictions, model_out_channels) + # should the architecture or training loop require it, we have to adjust the disparity mask + # target to possibly look like an optical flow mask + disp_mask = make_stereo_flow(disp_mask, model_out_channels) + # sequence loss on top of the model outputs + + loss = sequence_criterion(disp_predictions, disp_mask, valid_disp_mask) * args.flow_loss_weight + + if args.consistency_weight > 0: + loss_consistency = consistency_criterion(disp_predictions) + loss += loss_consistency * args.consistency_weight + + if args.psnr_weight > 0: + loss_psnr = 0.0 + for pred in disp_predictions: + # predictions might have 2 channels + loss_psnr += psnr_criterion( + pred * valid_disp_mask.unsqueeze(1), + disp_mask * valid_disp_mask.unsqueeze(1), + ).mean() # mean the psnr loss over the batch + loss += loss_psnr / len(disp_predictions) * args.psnr_weight + + if args.photometric_weight > 0: + loss_photometric = 0.0 + for pred in disp_predictions: + # predictions might have 1 channel, therefore we need to inpute 0s for the second channel + if model_out_channels == 1: + pred = torch.cat([pred, torch.zeros_like(pred)], dim=1) + + loss_photometric += photometric_criterion( + image_left, image_right, pred, valid_disp_mask + ) # photometric loss already comes out meaned over the batch + loss += loss_photometric / len(disp_predictions) * args.photometric_weight + + if args.smoothness_weight > 0: + loss_smoothness = 0.0 + for pred in disp_predictions: + # predictions might have 2 channels + loss_smoothness += smoothness_criterion( + image_left, pred[:, :1, :, :] + ).mean() # mean the smoothness loss over the batch + loss += loss_smoothness / len(disp_predictions) * args.smoothness_weight + + with torch.no_grad(): + metrics, _ = utils.compute_metrics( + disp_predictions[-1][:, :1, :, :], # predictions might have 2 channels + disp_mask[:, :1, :, :], # so does the ground truth + valid_disp_mask, + args.metrics, + ) + + metrics.pop("fl-all", None) + logger.update(loss=loss, **metrics) + + if scaler is not None: + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + if args.clip_grad_norm: + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + if args.clip_grad_norm: + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) + optimizer.step() + + scheduler.step() + + if not dist.is_initialized() or dist.get_rank() == 0: + if writer is not None and step % args.tensorboard_log_frequency == 0: + # log the loss and metrics to tensorboard + + writer.add_scalar("loss", loss, step) + for name, value in logger.meters.items(): + writer.add_scalar(name, value.avg, step) + # log the images to tensorboard + pred_grid = visualization.make_training_sample_grid( + image_left, image_right, disp_mask, valid_disp_mask, disp_predictions + ) + writer.add_image("predictions", pred_grid, step, dataformats="HWC") + + # second thing we want to see is how relevant the iterative refinement is + pred_sequence_grid = visualization.make_disparity_sequence_grid(disp_predictions, disp_mask) + writer.add_image("sequence", pred_sequence_grid, 
step, dataformats="HWC") + + if step % args.save_frequency == 0: + if not args.distributed or args.rank == 0: + model_without_ddp = ( + model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model + ) + checkpoint = { + "model": model_without_ddp.state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + "step": step, + "args": args, + } + os.makedirs(args.checkpoint_dir, exist_ok=True) + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}_{step}.pth") + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}.pth") + + if step % args.valid_frequency == 0: + evaluate(model, val_loaders, args, writer, step) + model.train() + if args.freeze_batch_norm: + if isinstance(model, nn.parallel.DistributedDataParallel): + freeze_batch_norm(model.module) + else: + freeze_batch_norm(model) + + # one final save at the end + if not args.distributed or args.rank == 0: + model_without_ddp = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model + checkpoint = { + "model": model_without_ddp.state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + "step": step, + "args": args, + } + os.makedirs(args.checkpoint_dir, exist_ok=True) + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}_{step}.pth") + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}.pth") + + +def main(args): + args.total_iterations = sum(args.dataset_steps) + + # initialize DDP setting + utils.setup_ddp(args) + print(args) + + args.test_only = args.train_datasets is None + + # set the appropriate devices + if args.distributed and args.device == "cpu": + raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun") + device = torch.device(args.device) + + # select model architecture + model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=args.weights) + + # convert to DDP if need be + if args.distributed: + model = model.to(args.gpu) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = model.module + else: + model.to(device) + model_without_ddp = model + + os.makedirs(args.checkpoint_dir, exist_ok=True) + + val_loaders = {name: make_eval_loader(name, args) for name in args.test_datasets} + + # EVAL ONLY configurations + if args.test_only: + evaluate(model, val_loaders, args) + return + + # Sanity check for the parameter count + print(f"Parameter Count: {sum(p.numel() for p in model.parameters() if p.requires_grad)}") + + # Compose the training dataset + train_dataset = get_train_dataset(args.dataset_root, args) + + # initialize the optimizer + if args.optimizer == "adam": + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer == "sgd": + optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, momentum=0.9) + else: + raise ValueError(f"Unknown optimizer {args.optimizer}. 
Please choose between adam and sgd") + + # initialize the learning rate schedule + scheduler = make_lr_schedule(args, optimizer) + + # load them from checkpoint if needed + args.start_step = 0 + if args.resume_path is not None: + checkpoint = torch.load(args.resume_path, map_location="cpu") + if "model" in checkpoint: + # this means the user requested to resume from a training checkpoint + model_without_ddp.load_state_dict(checkpoint["model"]) + # this means the user wants to continue training from where it was left off + if args.resume_schedule: + optimizer.load_state_dict(checkpoint["optimizer"]) + scheduler.load_state_dict(checkpoint["scheduler"]) + args.start_step = checkpoint["step"] + 1 + # modify starting point of the dat + sample_start_step = args.start_step * args.batch_size * args.world_size + train_dataset = train_dataset[sample_start_step:] + + else: + # this means the user wants to finetune on top of a model state dict + # and that no other changes are required + model_without_ddp.load_state_dict(checkpoint) + + torch.backends.cudnn.benchmark = True + + # enable training mode + model.train() + if args.freeze_batch_norm: + freeze_batch_norm(model_without_ddp) + + # put dataloader on top of the dataset + # make sure to disable shuffling since the dataset is already shuffled + # in order to guarantee quasi randomness whilst retaining a deterministic + # dataset consumption order + if args.distributed: + # the train dataset is preshuffled in order to respect the iteration order + sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=False, drop_last=True) + else: + # the train dataset is already shuffled, so we can use a simple SequentialSampler + sampler = torch.utils.data.SequentialSampler(train_dataset) + + train_loader = torch.utils.data.DataLoader( + train_dataset, + sampler=sampler, + batch_size=args.batch_size, + pin_memory=True, + num_workers=args.workers, + ) + + # initialize the logger + if args.tensorboard_summaries: + from torch.utils.tensorboard import SummaryWriter + + tensorboard_path = Path(args.checkpoint_dir) / "tensorboard" + os.makedirs(tensorboard_path, exist_ok=True) + + tensorboard_run = tensorboard_path / f"{args.name}" + writer = SummaryWriter(tensorboard_run) + else: + writer = None + + logger = utils.MetricLogger(delimiter=" ") + + scaler = torch.cuda.amp.GradScaler() if args.mixed_precision else None + # run the training loop + # this will perform optimization, respectively logging and saving checkpoints + # when need be + run( + model=model, + optimizer=optimizer, + scheduler=scheduler, + train_loader=train_loader, + val_loaders=val_loaders, + logger=logger, + writer=writer, + scaler=scaler, + args=args, + ) + + +def get_args_parser(add_help=True): + import argparse + + parser = argparse.ArgumentParser(description="PyTorch Stereo Matching Training", add_help=add_help) + # checkpointing + parser.add_argument("--name", default="crestereo", help="name of the experiment") + parser.add_argument("--resume", type=str, default=None, help="from which checkpoint to resume") + parser.add_argument("--checkpoint-dir", type=str, default="checkpoints", help="path to the checkpoint directory") + + # dataset + parser.add_argument("--dataset-root", type=str, default="", help="path to the dataset root directory") + parser.add_argument( + "--train-datasets", + type=str, + nargs="+", + default=["crestereo"], + help="dataset(s) to train on", + choices=list(VALID_DATASETS.keys()), + ) + parser.add_argument( + "--dataset-steps", type=int, nargs="+", 
default=[300_000], help="number of steps for each dataset" + ) + parser.add_argument( + "--steps-is-epochs", action="store_true", help="if set, dataset-steps are interpreted as epochs" + ) + parser.add_argument( + "--test-datasets", + type=str, + nargs="+", + default=["middlebury2014-train"], + help="dataset(s) to test on", + choices=["middlebury2014-train"], + ) + parser.add_argument("--dataset-shuffle", type=bool, help="shuffle the dataset", default=True) + parser.add_argument("--dataset-order-shuffle", type=bool, help="shuffle the dataset order", default=True) + parser.add_argument("--batch-size", type=int, default=2, help="batch size per GPU") + parser.add_argument("--workers", type=int, default=4, help="number of workers per GPU") + parser.add_argument( + "--threads", + type=int, + default=16, + help="number of CPU threads per GPU. This can be changed around to speed-up transforms if needed. This can lead to worker thread contention so use with care.", + ) + + # model architecture + parser.add_argument( + "--model", + type=str, + default="crestereo_base", + help="model architecture", + choices=["crestereo_base", "raft_stereo"], + ) + parser.add_argument("--recurrent-updates", type=int, default=10, help="number of recurrent updates") + parser.add_argument("--freeze-batch-norm", action="store_true", help="freeze batch norm parameters") + + # loss parameters + parser.add_argument("--gamma", type=float, default=0.8, help="gamma parameter for the flow sequence loss") + parser.add_argument("--flow-loss-weight", type=float, default=1.0, help="weight for the flow loss") + parser.add_argument( + "--flow-loss-exclude-large", + action="store_true", + help="exclude large flow values from the loss. A large value is defined as a value greater than the ground truth flow norm", + default=False, + ) + parser.add_argument("--consistency-weight", type=float, default=0.0, help="consistency loss weight") + parser.add_argument( + "--consistency-resize-factor", + type=float, + default=0.25, + help="consistency loss resize factor to account for the fact that the flow is computed on a downsampled image", + ) + parser.add_argument("--psnr-weight", type=float, default=0.0, help="psnr loss weight") + parser.add_argument("--smoothness-weight", type=float, default=0.0, help="smoothness loss weight") + parser.add_argument("--photometric-weight", type=float, default=0.0, help="photometric loss weight") + parser.add_argument( + "--photometric-max-displacement-ratio", + type=float, + default=0.15, + help="Only pixels with a displacement smaller than this ratio of the image width will be considered for the photometric loss", + ) + parser.add_argument("--photometric-ssim-weight", type=float, default=0.85, help="photometric ssim loss weight") + + # transforms parameters + parser.add_argument("--gpu-transforms", action="store_true", help="use GPU transforms") + parser.add_argument( + "--eval-size", type=int, nargs="+", default=[384, 512], help="size of the images for evaluation" + ) + parser.add_argument("--resize-size", type=int, nargs=2, default=None, help="resize size") + parser.add_argument("--crop-size", type=int, nargs=2, default=[384, 512], help="crop size") + parser.add_argument("--scale-range", type=float, nargs=2, default=[0.6, 1.0], help="random scale range") + parser.add_argument("--rescale-prob", type=float, default=1.0, help="probability of resizing the image") + parser.add_argument( + "--scaling-type", type=str, default="linear", help="scaling type", choices=["exponential", "linear"] + ) + 
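# Illustrative note on the scale/crop arguments above (see RandomRescaleAndCrop in transforms.py below):
# with the default --scaling-type "linear" the value sampled from --scale-range is used directly as the
# resize factor, so [0.6, 1.0] shrinks images to 60-100% of their size before the --crop-size crop;
# with "exponential" the factor becomes 2**value, so the same range would instead enlarge images to
# roughly 1.5x-2.0x. In both cases the factor is clamped from below so the crop still fits.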
parser.add_argument("--flip-prob", type=float, default=0.5, help="probability of flipping the image") + parser.add_argument( + "--norm-mean", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="mean for image normalization" + ) + parser.add_argument( + "--norm-std", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="std for image normalization" + ) + parser.add_argument( + "--use-grayscale", action="store_true", help="use grayscale images instead of RGB", default=False + ) + parser.add_argument("--max-disparity", type=float, default=None, help="maximum disparity") + parser.add_argument( + "--interpolation-strategy", + type=str, + default="bilinear", + help="interpolation strategy", + choices=["bilinear", "bicubic", "mixed"], + ) + parser.add_argument("--spatial-shift-prob", type=float, default=1.0, help="probability of shifting the image") + parser.add_argument( + "--spatial-shift-max-angle", type=float, default=0.1, help="maximum angle for the spatial shift" + ) + parser.add_argument( + "--spatial-shift-max-displacement", type=float, default=2.0, help="maximum displacement for the spatial shift" + ) + parser.add_argument("--gamma-range", type=float, nargs="+", default=[0.8, 1.2], help="range for gamma correction") + parser.add_argument( + "--brightness-range", type=float, nargs="+", default=[0.8, 1.2], help="range for brightness correction" + ) + parser.add_argument( + "--contrast-range", type=float, nargs="+", default=[0.8, 1.2], help="range for contrast correction" + ) + parser.add_argument( + "--saturation-range", type=float, nargs="+", default=0.0, help="range for saturation correction" + ) + parser.add_argument("--hue-range", type=float, nargs="+", default=0.0, help="range for hue correction") + parser.add_argument( + "--asymmetric-jitter-prob", + type=float, + default=1.0, + help="probability of using asymmetric jitter instead of symmetric jitter", + ) + parser.add_argument("--occlusion-prob", type=float, default=0.5, help="probability of occluding the rightimage") + parser.add_argument( + "--occlusion-px-range", type=int, nargs="+", default=[50, 100], help="range for the number of occluded pixels" + ) + parser.add_argument("--erase-prob", type=float, default=0.0, help="probability of erasing in both images") + parser.add_argument( + "--erase-px-range", type=int, nargs="+", default=[50, 100], help="range for the number of erased pixels" + ) + parser.add_argument( + "--erase-num-repeats", type=int, default=1, help="number of times to repeat the erase operation" + ) + + # optimizer parameters + parser.add_argument("--optimizer", type=str, default="adam", help="optimizer", choices=["adam", "sgd"]) + parser.add_argument("--lr", type=float, default=4e-4, help="learning rate") + parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay") + parser.add_argument("--clip-grad-norm", type=float, default=0.0, help="clip grad norm") + + # lr_scheduler parameters + parser.add_argument("--min-lr", type=float, default=2e-5, help="minimum learning rate") + parser.add_argument("--warmup-steps", type=int, default=6_000, help="number of warmup steps") + parser.add_argument( + "--decay-after-steps", type=int, default=180_000, help="number of steps after which to start decay the lr" + ) + parser.add_argument( + "--lr-warmup-method", type=str, default="linear", help="warmup method", choices=["linear", "cosine"] + ) + parser.add_argument("--lr-warmup-factor", type=float, default=0.02, help="warmup factor for the learning rate") + parser.add_argument( + "--lr-decay-method", + 
type=str, + default="linear", + help="decay method", + choices=["linear", "cosine", "exponential"], + ) + parser.add_argument("--lr-decay-gamma", type=float, default=0.8, help="decay factor for the learning rate") + + # deterministic behaviour + parser.add_argument("--seed", type=int, default=42, help="seed for random number generators") + + # mixed precision training + parser.add_argument("--mixed-precision", action="store_true", help="use mixed precision training") + + # logging + parser.add_argument("--tensorboard-summaries", action="store_true", help="log to tensorboard") + parser.add_argument("--tensorboard-log-frequency", type=int, default=100, help="log frequency") + parser.add_argument("--save-frequency", type=int, default=1_000, help="save frequency") + parser.add_argument("--valid-frequency", type=int, default=1_000, help="validation frequency") + parser.add_argument( + "--metrics", + type=str, + nargs="+", + default=["mae", "rmse", "1px", "3px", "5px", "relepe"], + help="metrics to log", + choices=AVAILABLE_METRICS, + ) + + # distributed parameters + parser.add_argument("--world-size", type=int, default=8, help="number of distributed processes") + parser.add_argument("--dist-url", type=str, default="env://", help="url used to set up distributed training") + parser.add_argument("--device", type=str, default="cuda", help="device to use for training") + + # weights API + parser.add_argument("--weights", type=str, default=None, help="weights API url") + parser.add_argument( + "--resume-path", type=str, default=None, help="a path from which to resume or start fine-tuning" + ) + parser.add_argument("--resume-schedule", action="store_true", help="resume optimizer state") + + # padder parameters + parser.add_argument("--padder-type", type=str, default="kitti", help="padder type", choices=["kitti", "sintel"]) + return parser + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + main(args) diff --git a/references/depth/stereo/transforms.py b/references/depth/stereo/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4a6bab6d3b56922ea5339d4424356ef9eca292 --- /dev/null +++ b/references/depth/stereo/transforms.py @@ -0,0 +1,650 @@ +import random +from typing import Callable, List, Optional, Sequence, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F +from torch import Tensor + +T_FLOW = Union[Tensor, np.ndarray, None] +T_MASK = Union[Tensor, np.ndarray, None] +T_STEREO_TENSOR = Tuple[Tensor, Tensor] +T_COLOR_AUG_PARAM = Union[float, Tuple[float, float]] + + +def rand_float_range(size: Sequence[int], low: float, high: float) -> Tensor: + return (low - high) * torch.rand(size) + high + + +class InterpolationStrategy: + + _valid_modes: List[str] = ["mixed", "bicubic", "bilinear"] + + def __init__(self, mode: str = "mixed") -> None: + if mode not in self._valid_modes: + raise ValueError(f"Invalid interpolation mode: {mode}. 
Valid modes are: {self._valid_modes}") + + if mode == "mixed": + self.strategies = [F.InterpolationMode.BILINEAR, F.InterpolationMode.BICUBIC] + elif mode == "bicubic": + self.strategies = [F.InterpolationMode.BICUBIC] + elif mode == "bilinear": + self.strategies = [F.InterpolationMode.BILINEAR] + + def __call__(self) -> F.InterpolationMode: + return random.choice(self.strategies) + + @classmethod + def is_valid(mode: str) -> bool: + return mode in InterpolationStrategy._valid_modes + + @property + def valid_modes() -> List[str]: + return InterpolationStrategy._valid_modes + + +class ValidateModelInput(torch.nn.Module): + # Pass-through transform that checks the shape and dtypes to make sure the model gets what it expects + def forward(self, images: T_STEREO_TENSOR, disparities: T_FLOW, masks: T_MASK): + if images[0].shape != images[1].shape: + raise ValueError("img1 and img2 should have the same shape.") + h, w = images[0].shape[-2:] + if disparities[0] is not None and disparities[0].shape != (1, h, w): + raise ValueError(f"disparities[0].shape should be (1, {h}, {w}) instead of {disparities[0].shape}") + if masks[0] is not None: + if masks[0].shape != (h, w): + raise ValueError(f"masks[0].shape should be ({h}, {w}) instead of {masks[0].shape}") + if masks[0].dtype != torch.bool: + raise TypeError(f"masks[0] should be of dtype torch.bool instead of {masks[0].dtype}") + + return images, disparities, masks + + +class ConvertToGrayscale(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward( + self, + images: Tuple[PIL.Image.Image, PIL.Image.Image], + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + img_left = F.rgb_to_grayscale(images[0], num_output_channels=3) + img_right = F.rgb_to_grayscale(images[1], num_output_channels=3) + + return (img_left, img_right), disparities, masks + + +class MakeValidDisparityMask(torch.nn.Module): + def __init__(self, max_disparity: Optional[int] = 256) -> None: + super().__init__() + self.max_disparity = max_disparity + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + valid_masks = tuple( + torch.ones(images[idx].shape[-2:], dtype=torch.bool, device=images[idx].device) if mask is None else mask + for idx, mask in enumerate(masks) + ) + + valid_masks = tuple( + torch.logical_and(mask, disparity > 0).squeeze(0) if disparity is not None else mask + for mask, disparity in zip(valid_masks, disparities) + ) + + if self.max_disparity is not None: + valid_masks = tuple( + torch.logical_and(mask, disparity < self.max_disparity).squeeze(0) if disparity is not None else mask + for mask, disparity in zip(valid_masks, disparities) + ) + + return images, disparities, valid_masks + + +class ToGPU(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + dev_images = tuple(image.cuda() for image in images) + dev_disparities = tuple(map(lambda x: x.cuda() if x is not None else None, disparities)) + dev_masks = tuple(map(lambda x: x.cuda() if x is not None else None, masks)) + return dev_images, dev_disparities, dev_masks + + +class ConvertImageDtype(torch.nn.Module): + def __init__(self, 
dtype: torch.dtype): + super().__init__() + self.dtype = dtype + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + img_left = F.convert_image_dtype(images[0], dtype=self.dtype) + img_right = F.convert_image_dtype(images[1], dtype=self.dtype) + + img_left = img_left.contiguous() + img_right = img_right.contiguous() + + return (img_left, img_right), disparities, masks + + +class Normalize(torch.nn.Module): + def __init__(self, mean: List[float], std: List[float]) -> None: + super().__init__() + self.mean = mean + self.std = std + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + img_left = F.normalize(images[0], mean=self.mean, std=self.std) + img_right = F.normalize(images[1], mean=self.mean, std=self.std) + + img_left = img_left.contiguous() + img_right = img_right.contiguous() + + return (img_left, img_right), disparities, masks + + +class ToTensor(torch.nn.Module): + def forward( + self, + images: Tuple[PIL.Image.Image, PIL.Image.Image], + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + if images[0] is None: + raise ValueError("img_left is None") + if images[1] is None: + raise ValueError("img_right is None") + + img_left = F.pil_to_tensor(images[0]) + img_right = F.pil_to_tensor(images[1]) + disparity_tensors = () + mask_tensors = () + + for idx in range(2): + disparity_tensors += (torch.from_numpy(disparities[idx]),) if disparities[idx] is not None else (None,) + mask_tensors += (torch.from_numpy(masks[idx]),) if masks[idx] is not None else (None,) + + return (img_left, img_right), disparity_tensors, mask_tensors + + +class AsymmetricColorJitter(T.ColorJitter): + # p determines the probability of doing asymmetric vs symmetric color jittering + def __init__( + self, + brightness: T_COLOR_AUG_PARAM = 0, + contrast: T_COLOR_AUG_PARAM = 0, + saturation: T_COLOR_AUG_PARAM = 0, + hue: T_COLOR_AUG_PARAM = 0, + p: float = 0.2, + ): + super().__init__(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue) + self.p = p + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + if torch.rand(1) < self.p: + # asymmetric: different transform for img1 and img2 + img_left = super().forward(images[0]) + img_right = super().forward(images[1]) + else: + # symmetric: same transform for img1 and img2 + batch = torch.stack(images) + batch = super().forward(batch) + img_left, img_right = batch[0], batch[1] + + return (img_left, img_right), disparities, masks + + +class AsymetricGammaAdjust(torch.nn.Module): + def __init__(self, p: float, gamma_range: Tuple[float, float], gain: float = 1) -> None: + super().__init__() + self.gamma_range = gamma_range + self.gain = gain + self.p = p + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + gamma = rand_float_range((1,), low=self.gamma_range[0], high=self.gamma_range[1]).item() + + if torch.rand(1) < self.p: + # asymmetric: different transform for img1 and img2 + 
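# For reference, F.adjust_gamma maps float pixels to gain * img ** gamma, so gamma < 1 brightens and
# gamma > 1 darkens. Note that a single gamma is sampled above the branch, so as written both images
# receive the same correction here; the asymmetric/symmetric split only changes whether the two images
# are processed separately or as a stacked batch.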
img_left = F.adjust_gamma(images[0], gamma, gain=self.gain) + img_right = F.adjust_gamma(images[1], gamma, gain=self.gain) + else: + # symmetric: same transform for img1 and img2 + batch = torch.stack(images) + batch = F.adjust_gamma(batch, gamma, gain=self.gain) + img_left, img_right = batch[0], batch[1] + + return (img_left, img_right), disparities, masks + + +class RandomErase(torch.nn.Module): + # Produces multiple symmetric random erasures + # these can be viewed as occlusions present in both camera views. + # Similarly to Optical Flow occlusion prediction tasks, we mask these pixels in the disparity map + def __init__( + self, + p: float = 0.5, + erase_px_range: Tuple[int, int] = (50, 100), + value: Union[Tensor, float] = 0, + inplace: bool = False, + max_erase: int = 2, + ): + super().__init__() + self.min_px_erase = erase_px_range[0] + self.max_px_erase = erase_px_range[1] + if self.max_px_erase < 0: + raise ValueError("erase_px_range[1] should be equal or greater than 0") + if self.min_px_erase < 0: + raise ValueError("erase_px_range[0] should be equal or greater than 0") + if self.min_px_erase > self.max_px_erase: + raise ValueError("erase_prx_range[0] should be equal or lower than erase_px_range[1]") + + self.p = p + self.value = value + self.inplace = inplace + self.max_erase = max_erase + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: T_STEREO_TENSOR, + masks: T_STEREO_TENSOR, + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + if torch.rand(1) < self.p: + return images, disparities, masks + + image_left, image_right = images + mask_left, mask_right = masks + for _ in range(torch.randint(self.max_erase, size=(1,)).item()): + y, x, h, w, v = self._get_params(image_left) + image_right = F.erase(image_right, y, x, h, w, v, self.inplace) + image_left = F.erase(image_left, y, x, h, w, v, self.inplace) + # similarly to optical flow occlusion prediction, we consider + # any erasure pixels that are in both images to be occluded therefore + # we mark them as invalid + if mask_left is not None: + mask_left = F.erase(mask_left, y, x, h, w, False, self.inplace) + if mask_right is not None: + mask_right = F.erase(mask_right, y, x, h, w, False, self.inplace) + + return (image_left, image_right), disparities, (mask_left, mask_right) + + def _get_params(self, img: torch.Tensor) -> Tuple[int, int, int, int, float]: + img_h, img_w = img.shape[-2:] + crop_h, crop_w = ( + random.randint(self.min_px_erase, self.max_px_erase), + random.randint(self.min_px_erase, self.max_px_erase), + ) + crop_x, crop_y = (random.randint(0, img_w - crop_w), random.randint(0, img_h - crop_h)) + + return crop_y, crop_x, crop_h, crop_w, self.value + + +class RandomOcclusion(torch.nn.Module): + # This adds an occlusion in the right image + # the occluded patch works as a patch erase where the erase value is the mean + # of the pixels from the selected zone + def __init__(self, p: float = 0.5, occlusion_px_range: Tuple[int, int] = (50, 100), inplace: bool = False): + super().__init__() + + self.min_px_occlusion = occlusion_px_range[0] + self.max_px_occlusion = occlusion_px_range[1] + + if self.max_px_occlusion < 0: + raise ValueError("occlusion_px_range[1] should be greater or equal than 0") + if self.min_px_occlusion < 0: + raise ValueError("occlusion_px_range[0] should be greater or equal than 0") + if self.min_px_occlusion > self.max_px_occlusion: + raise ValueError("occlusion_px_range[0] should be lower than occlusion_px_range[1]") + + self.p = p + self.inplace = inplace 
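# Sketch of what forward() below does: a rectangle whose side lengths are drawn from occlusion_px_range
# is filled in the *right* image only, using that rectangle's per-channel mean value (computed in
# _get_params), so the patch looks like a flat, untextured region seen by one camera. As written, the
# early `return` fires when torch.rand(1) < p, so the occlusion is actually applied with probability (1 - p).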
+ + def forward( + self, + images: T_STEREO_TENSOR, + disparities: T_STEREO_TENSOR, + masks: T_STEREO_TENSOR, + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + left_image, right_image = images + + if torch.rand(1) < self.p: + return images, disparities, masks + + y, x, h, w, v = self._get_params(right_image) + right_image = F.erase(right_image, y, x, h, w, v, self.inplace) + + return ((left_image, right_image), disparities, masks) + + def _get_params(self, img: torch.Tensor) -> Tuple[int, int, int, int, float]: + img_h, img_w = img.shape[-2:] + crop_h, crop_w = ( + random.randint(self.min_px_occlusion, self.max_px_occlusion), + random.randint(self.min_px_occlusion, self.max_px_occlusion), + ) + + crop_x, crop_y = (random.randint(0, img_w - crop_w), random.randint(0, img_h - crop_h)) + occlusion_value = img[..., crop_y : crop_y + crop_h, crop_x : crop_x + crop_w].mean(dim=(-2, -1), keepdim=True) + + return (crop_y, crop_x, crop_h, crop_w, occlusion_value) + + +class RandomSpatialShift(torch.nn.Module): + # This transform applies a vertical shift and a slight angle rotation and the same time + def __init__( + self, p: float = 0.5, max_angle: float = 0.1, max_px_shift: int = 2, interpolation_type: str = "bilinear" + ) -> None: + super().__init__() + self.p = p + self.max_angle = max_angle + self.max_px_shift = max_px_shift + self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: T_STEREO_TENSOR, + masks: T_STEREO_TENSOR, + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + # the transform is applied only on the right image + # in order to mimic slight calibration issues + img_left, img_right = images + + INTERP_MODE = self._interpolation_mode_strategy() + + if torch.rand(1) < self.p: + # [0, 1] -> [-a, a] + shift = rand_float_range((1,), low=-self.max_px_shift, high=self.max_px_shift).item() + angle = rand_float_range((1,), low=-self.max_angle, high=self.max_angle).item() + # sample center point for the rotation matrix + y = torch.randint(size=(1,), low=0, high=img_right.shape[-2]).item() + x = torch.randint(size=(1,), low=0, high=img_right.shape[-1]).item() + # apply affine transformations + img_right = F.affine( + img_right, + angle=angle, + translate=[0, shift], # translation only on the y-axis + center=[x, y], + scale=1.0, + shear=0.0, + interpolation=INTERP_MODE, + ) + + return ((img_left, img_right), disparities, masks) + + +class RandomHorizontalFlip(torch.nn.Module): + def __init__(self, p: float = 0.5) -> None: + super().__init__() + self.p = p + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + img_left, img_right = images + dsp_left, dsp_right = disparities + mask_left, mask_right = masks + + if dsp_right is not None and torch.rand(1) < self.p: + img_left, img_right = F.hflip(img_left), F.hflip(img_right) + dsp_left, dsp_right = F.hflip(dsp_left), F.hflip(dsp_right) + if mask_left is not None and mask_right is not None: + mask_left, mask_right = F.hflip(mask_left), F.hflip(mask_right) + return ((img_right, img_left), (dsp_right, dsp_left), (mask_right, mask_left)) + + return images, disparities, masks + + +class Resize(torch.nn.Module): + def __init__(self, resize_size: Tuple[int, ...], interpolation_type: str = "bilinear") -> None: + super().__init__() + self.resize_size = 
list(resize_size) # doing this to keep mypy happy + self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + resized_images = () + resized_disparities = () + resized_masks = () + + INTERP_MODE = self._interpolation_mode_strategy() + + for img in images: + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the stereo models with antialias=True? + resized_images += (F.resize(img, self.resize_size, interpolation=INTERP_MODE, antialias=False),) + + for dsp in disparities: + if dsp is not None: + # rescale disparity to match the new image size + scale_x = self.resize_size[1] / dsp.shape[-1] + resized_disparities += (F.resize(dsp, self.resize_size, interpolation=INTERP_MODE) * scale_x,) + else: + resized_disparities += (None,) + + for mask in masks: + if mask is not None: + resized_masks += ( + # we squeeze and unsqueeze because the API requires > 3D tensors + F.resize( + mask.unsqueeze(0), + self.resize_size, + interpolation=F.InterpolationMode.NEAREST, + ).squeeze(0), + ) + else: + resized_masks += (None,) + + return resized_images, resized_disparities, resized_masks + + +class RandomRescaleAndCrop(torch.nn.Module): + # This transform will resize the input with a given proba, and then crop it. + # These are the reversed operations of the built-in RandomResizedCrop, + # although the order of the operations doesn't matter too much: resizing a + # crop would give the same result as cropping a resized image, up to + # interpolation artifact at the borders of the output. + # + # The reason we don't rely on RandomResizedCrop is because of a significant + # difference in the parametrization of both transforms, in particular, + # because of the way the random parameters are sampled in both transforms, + # which leads to fairly different results (and different epe). For more details see + # https://github.com/pytorch/vision/pull/5026/files#r762932579 + def __init__( + self, + crop_size: Tuple[int, int], + scale_range: Tuple[float, float] = (-0.2, 0.5), + rescale_prob: float = 0.8, + scaling_type: str = "exponential", + interpolation_type: str = "bilinear", + ) -> None: + super().__init__() + self.crop_size = crop_size + self.min_scale = scale_range[0] + self.max_scale = scale_range[1] + self.rescale_prob = rescale_prob + self.scaling_type = scaling_type + self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) + + if self.scaling_type == "linear" and self.min_scale < 0: + raise ValueError("min_scale must be >= 0 for linear scaling") + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + img_left, img_right = images + dsp_left, dsp_right = disparities + mask_left, mask_right = masks + INTERP_MODE = self._interpolation_mode_strategy() + + # randomly sample scale + h, w = img_left.shape[-2:] + # Note: in original code, they use + 1 instead of + 8 for sparse datasets (e.g. 
Kitti) + # It shouldn't matter much + min_scale = max((self.crop_size[0] + 8) / h, (self.crop_size[1] + 8) / w) + + # exponential scaling will draw a random scale in (min_scale, max_scale) and then raise + # 2 to the power of that random value. This final scale distribution will have a different + # mean and variance than a uniform distribution. Note that a scale of 1 will result in + # a rescaling of 2X the original size, whereas a scale of -1 will result in a rescaling + # of 0.5X the original size. + if self.scaling_type == "exponential": + scale = 2 ** torch.empty(1, dtype=torch.float32).uniform_(self.min_scale, self.max_scale).item() + # linear scaling will draw a random scale in (min_scale, max_scale) + elif self.scaling_type == "linear": + scale = torch.empty(1, dtype=torch.float32).uniform_(self.min_scale, self.max_scale).item() + + scale = max(scale, min_scale) + + new_h, new_w = round(h * scale), round(w * scale) + + if torch.rand(1).item() < self.rescale_prob: + # rescale the images + img_left = F.resize(img_left, size=(new_h, new_w), interpolation=INTERP_MODE) + img_right = F.resize(img_right, size=(new_h, new_w), interpolation=INTERP_MODE) + + resized_masks, resized_disparities = (), () + + for disparity, mask in zip(disparities, masks): + if disparity is not None: + if mask is None: + resized_disparity = F.resize(disparity, size=(new_h, new_w), interpolation=INTERP_MODE) + # rescale the disparity + resized_disparity = ( + resized_disparity * torch.tensor([scale], device=resized_disparity.device)[:, None, None] + ) + resized_mask = None + else: + resized_disparity, resized_mask = _resize_sparse_flow( + disparity, mask, scale_x=scale, scale_y=scale + ) + resized_masks += (resized_mask,) + resized_disparities += (resized_disparity,) + + else: + resized_disparities = disparities + resized_masks = masks + + disparities = resized_disparities + masks = resized_masks + + # Note: For sparse datasets (Kitti), the original code uses a "margin" + # See e.g. 
https://github.com/princeton-vl/RAFT/blob/master/core/utils/augmentor.py#L220:L220 + # We don't, not sure if it matters much + y0 = torch.randint(0, img_left.shape[1] - self.crop_size[0], size=(1,)).item() + x0 = torch.randint(0, img_right.shape[2] - self.crop_size[1], size=(1,)).item() + + img_left = F.crop(img_left, y0, x0, self.crop_size[0], self.crop_size[1]) + img_right = F.crop(img_right, y0, x0, self.crop_size[0], self.crop_size[1]) + if dsp_left is not None: + dsp_left = F.crop(disparities[0], y0, x0, self.crop_size[0], self.crop_size[1]) + if dsp_right is not None: + dsp_right = F.crop(disparities[1], y0, x0, self.crop_size[0], self.crop_size[1]) + + cropped_masks = () + for mask in masks: + if mask is not None: + mask = F.crop(mask, y0, x0, self.crop_size[0], self.crop_size[1]) + cropped_masks += (mask,) + + return ((img_left, img_right), (dsp_left, dsp_right), cropped_masks) + + +def _resize_sparse_flow( + flow: Tensor, valid_flow_mask: Tensor, scale_x: float = 1.0, scale_y: float = 0.0 +) -> Tuple[Tensor, Tensor]: + # This resizes both the flow and the valid_flow_mask mask (which is assumed to be reasonably sparse) + # There are as-many non-zero values in the original flow as in the resized flow (up to OOB) + # So for example if scale_x = scale_y = 2, the sparsity of the output flow is multiplied by 4 + + h, w = flow.shape[-2:] + + h_new = int(round(h * scale_y)) + w_new = int(round(w * scale_x)) + flow_new = torch.zeros(size=[1, h_new, w_new], dtype=flow.dtype) + valid_new = torch.zeros(size=[h_new, w_new], dtype=valid_flow_mask.dtype) + + jj, ii = torch.meshgrid(torch.arange(w), torch.arange(h), indexing="xy") + + ii_valid, jj_valid = ii[valid_flow_mask], jj[valid_flow_mask] + + ii_valid_new = torch.round(ii_valid.to(float) * scale_y).to(torch.long) + jj_valid_new = torch.round(jj_valid.to(float) * scale_x).to(torch.long) + + within_bounds_mask = (0 <= ii_valid_new) & (ii_valid_new < h_new) & (0 <= jj_valid_new) & (jj_valid_new < w_new) + + ii_valid = ii_valid[within_bounds_mask] + jj_valid = jj_valid[within_bounds_mask] + ii_valid_new = ii_valid_new[within_bounds_mask] + jj_valid_new = jj_valid_new[within_bounds_mask] + + valid_flow_new = flow[:, ii_valid, jj_valid] + valid_flow_new *= scale_x + + flow_new[:, ii_valid_new, jj_valid_new] = valid_flow_new + valid_new[ii_valid_new, jj_valid_new] = valid_flow_mask[ii_valid, jj_valid] + + return flow_new, valid_new.bool() + + +class Compose(torch.nn.Module): + def __init__(self, transforms: List[Callable]): + super().__init__() + self.transforms = transforms + + @torch.inference_mode() + def forward(self, images, disparities, masks): + for t in self.transforms: + images, disparities, masks = t(images, disparities, masks) + return images, disparities, masks diff --git a/references/depth/stereo/utils/losses.py b/references/depth/stereo/utils/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..c809cc74d0f49f1b87277dd0ecece413a0f079c2 --- /dev/null +++ b/references/depth/stereo/utils/losses.py @@ -0,0 +1,503 @@ +from typing import List, Optional + +import torch +from torch import nn, Tensor +from torch.nn import functional as F +from torchvision.prototype.models.depth.stereo.raft_stereo import grid_sample, make_coords_grid + + +def make_gaussian_kernel(kernel_size: int, sigma: float) -> torch.Tensor: + """Function to create a 2D Gaussian kernel.""" + + x = torch.arange(kernel_size, dtype=torch.float32) + y = torch.arange(kernel_size, dtype=torch.float32) + x = x - (kernel_size - 1) / 2 + y = y - 
(kernel_size - 1) / 2 + x, y = torch.meshgrid(x, y) + grid = (x**2 + y**2) / (2 * sigma**2) + kernel = torch.exp(-grid) + kernel = kernel / kernel.sum() + return kernel + + +def _sequence_loss_fn( + flow_preds: List[Tensor], + flow_gt: Tensor, + valid_flow_mask: Optional[Tensor], + gamma: Tensor, + max_flow: int = 256, + exclude_large: bool = False, + weights: Optional[Tensor] = None, +): + """Loss function defined over sequence of flow predictions""" + torch._assert( + gamma < 1, + "sequence_loss: `gamma` must be lower than 1, but got {}".format(gamma), + ) + + if exclude_large: + # exclude invalid pixels and extremely large diplacements + flow_norm = torch.sum(flow_gt**2, dim=1).sqrt() + if valid_flow_mask is not None: + valid_flow_mask = valid_flow_mask & (flow_norm < max_flow) + else: + valid_flow_mask = flow_norm < max_flow + + if valid_flow_mask is not None: + valid_flow_mask = valid_flow_mask.unsqueeze(1) + flow_preds = torch.stack(flow_preds) # shape = (num_flow_updates, batch_size, 2, H, W) + + abs_diff = (flow_preds - flow_gt).abs() + if valid_flow_mask is not None: + abs_diff = abs_diff * valid_flow_mask.unsqueeze(0) + + abs_diff = abs_diff.mean(axis=(1, 2, 3, 4)) + num_predictions = flow_preds.shape[0] + + # allocating on CPU and moving to device during run-time can force + # an unwanted GPU synchronization that produces a large overhead + if weights is None or len(weights) != num_predictions: + weights = gamma ** torch.arange(num_predictions - 1, -1, -1, device=flow_preds.device, dtype=flow_preds.dtype) + + flow_loss = (abs_diff * weights).sum() + return flow_loss, weights + + +class SequenceLoss(nn.Module): + def __init__(self, gamma: float = 0.8, max_flow: int = 256, exclude_large_flows: bool = False) -> None: + """ + Args: + gamma: value for the exponential weighting of the loss across frames + max_flow: maximum flow value to exclude + exclude_large_flows: whether to exclude large flows + """ + + super().__init__() + self.max_flow = max_flow + self.excluding_large = exclude_large_flows + self.register_buffer("gamma", torch.tensor([gamma])) + # cache the scale factor for the loss + self._weights = None + + def forward(self, flow_preds: List[Tensor], flow_gt: Tensor, valid_flow_mask: Optional[Tensor]) -> Tensor: + """ + Args: + flow_preds: list of flow predictions of shape (batch_size, C, H, W) + flow_gt: ground truth flow of shape (batch_size, C, H, W) + valid_flow_mask: mask of valid flow pixels of shape (batch_size, H, W) + """ + loss, weights = _sequence_loss_fn( + flow_preds, flow_gt, valid_flow_mask, self.gamma, self.max_flow, self.excluding_large, self._weights + ) + self._weights = weights + return loss + + def set_gamma(self, gamma: float) -> None: + self.gamma.fill_(gamma) + # reset the cached scale factor + self._weights = None + + +def _ssim_loss_fn( + source: Tensor, + reference: Tensor, + kernel: Tensor, + eps: float = 1e-8, + c1: float = 0.01**2, + c2: float = 0.03**2, + use_padding: bool = False, +) -> Tensor: + # ref: Algorithm section: https://en.wikipedia.org/wiki/Structural_similarity + # ref: Alternative implementation: https://kornia.readthedocs.io/en/latest/_modules/kornia/metrics/ssim.html#ssim + + torch._assert( + source.ndim == reference.ndim == 4, + "SSIM: `source` and `reference` must be 4-dimensional tensors", + ) + + torch._assert( + source.shape == reference.shape, + "SSIM: `source` and `reference` must have the same shape, but got {} and {}".format( + source.shape, reference.shape + ), + ) + + B, C, H, W = source.shape + kernel = 
kernel.unsqueeze(0).unsqueeze(0).repeat(C, 1, 1, 1) + if use_padding: + pad_size = kernel.shape[2] // 2 + source = F.pad(source, (pad_size, pad_size, pad_size, pad_size), "reflect") + reference = F.pad(reference, (pad_size, pad_size, pad_size, pad_size), "reflect") + + mu1 = F.conv2d(source, kernel, groups=C) + mu2 = F.conv2d(reference, kernel, groups=C) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + + mu1_mu2 = mu1 * mu2 + mu_img1_sq = F.conv2d(source.pow(2), kernel, groups=C) + mu_img2_sq = F.conv2d(reference.pow(2), kernel, groups=C) + mu_img1_mu2 = F.conv2d(source * reference, kernel, groups=C) + + sigma1_sq = mu_img1_sq - mu1_sq + sigma2_sq = mu_img2_sq - mu2_sq + sigma12 = mu_img1_mu2 - mu1_mu2 + + numerator = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2) + denominator = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2) + ssim = numerator / (denominator + eps) + + # doing 1 - ssim because we want to maximize the ssim + return 1 - ssim.mean(dim=(1, 2, 3)) + + +class SSIM(nn.Module): + def __init__( + self, + kernel_size: int = 11, + max_val: float = 1.0, + sigma: float = 1.5, + eps: float = 1e-12, + use_padding: bool = True, + ) -> None: + """SSIM loss function. + + Args: + kernel_size: size of the Gaussian kernel + max_val: constant scaling factor + sigma: sigma of the Gaussian kernel + eps: constant for division by zero + use_padding: whether to pad the input tensor such that we have a score for each pixel + """ + super().__init__() + + self.kernel_size = kernel_size + self.max_val = max_val + self.sigma = sigma + + gaussian_kernel = make_gaussian_kernel(kernel_size, sigma) + self.register_buffer("gaussian_kernel", gaussian_kernel) + + self.c1 = (0.01 * self.max_val) ** 2 + self.c2 = (0.03 * self.max_val) ** 2 + + self.use_padding = use_padding + self.eps = eps + + def forward(self, source: torch.Tensor, reference: torch.Tensor) -> torch.Tensor: + """ + Args: + source: source image of shape (batch_size, C, H, W) + reference: reference image of shape (batch_size, C, H, W) + + Returns: + SSIM loss of shape (batch_size,) + """ + return _ssim_loss_fn( + source, + reference, + kernel=self.gaussian_kernel, + c1=self.c1, + c2=self.c2, + use_padding=self.use_padding, + eps=self.eps, + ) + + +def _smoothness_loss_fn(img_gx: Tensor, img_gy: Tensor, val_gx: Tensor, val_gy: Tensor): + # ref: https://github.com/nianticlabs/monodepth2/blob/b676244e5a1ca55564eb5d16ab521a48f823af31/layers.py#L202 + + torch._assert( + img_gx.ndim >= 3, + "smoothness_loss: `img_gx` must be at least 3-dimensional tensor of shape (..., C, H, W)", + ) + + torch._assert( + img_gx.ndim == val_gx.ndim, + "smoothness_loss: `img_gx` and `depth_gx` must have the same dimensionality, but got {} and {}".format( + img_gx.ndim, val_gx.ndim + ), + ) + + for idx in range(img_gx.ndim): + torch._assert( + (img_gx.shape[idx] == val_gx.shape[idx] or (img_gx.shape[idx] == 1 or val_gx.shape[idx] == 1)), + "smoothness_loss: `img_gx` and `depth_gx` must have either the same shape or broadcastable shape, but got {} and {}".format( + img_gx.shape, val_gx.shape + ), + ) + + # -3 is channel dimension + weights_x = torch.exp(-torch.mean(torch.abs(val_gx), axis=-3, keepdim=True)) + weights_y = torch.exp(-torch.mean(torch.abs(val_gy), axis=-3, keepdim=True)) + + smoothness_x = img_gx * weights_x + smoothness_y = img_gy * weights_y + + smoothness = (torch.abs(smoothness_x) + torch.abs(smoothness_y)).mean(axis=(-3, -2, -1)) + return smoothness + + +class SmoothnessLoss(nn.Module): + def __init__(self) -> None: + super().__init__() + + def 
_x_gradient(self, img: Tensor) -> Tensor: + if img.ndim > 4: + original_shape = img.shape + is_reshaped = True + img = img.reshape(-1, *original_shape[-3:]) + else: + is_reshaped = False + + padded = F.pad(img, (0, 1, 0, 0), mode="replicate") + grad = padded[..., :, :-1] - padded[..., :, 1:] + if is_reshaped: + grad = grad.reshape(original_shape) + return grad + + def _y_gradient(self, x: torch.Tensor) -> torch.Tensor: + if x.ndim > 4: + original_shape = x.shape + is_reshaped = True + x = x.reshape(-1, *original_shape[-3:]) + else: + is_reshaped = False + + padded = F.pad(x, (0, 0, 0, 1), mode="replicate") + grad = padded[..., :-1, :] - padded[..., 1:, :] + if is_reshaped: + grad = grad.reshape(original_shape) + return grad + + def forward(self, images: Tensor, vals: Tensor) -> Tensor: + """ + Args: + images: tensor of shape (D1, D2, ..., DN, C, H, W) + vals: tensor of shape (D1, D2, ..., DN, 1, H, W) + + Returns: + smoothness loss of shape (D1, D2, ..., DN) + """ + img_gx = self._x_gradient(images) + img_gy = self._y_gradient(images) + + val_gx = self._x_gradient(vals) + val_gy = self._y_gradient(vals) + + return _smoothness_loss_fn(img_gx, img_gy, val_gx, val_gy) + + +def _flow_sequence_consistency_loss_fn( + flow_preds: List[Tensor], + gamma: float = 0.8, + resize_factor: float = 0.25, + rescale_factor: float = 0.25, + rescale_mode: str = "bilinear", + weights: Optional[Tensor] = None, +): + """Loss function defined over sequence of flow predictions""" + + # Simplified version of ref: https://arxiv.org/pdf/2006.11242.pdf + # In the original paper, an additional refinement network is used to refine a flow prediction. + # Each step performed by the recurrent module in Raft or CREStereo is a refinement step using a delta_flow update. + # which should be consistent with the previous step. In this implementation, we simplify the overall loss + # term and ignore left-right consistency loss or photometric loss which can be treated separately. 
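# In effect, the code below (optionally) downsamples every prediction by `resize_factor` and rescales the
# flow magnitudes by `rescale_factor`, then computes
#   loss = sum_k gamma**(K - 1 - k) * mean((flow[k+1] - flow[k])**2),  k = 0..K-1,  K = len(flow_preds) - 1
# i.e. a squared difference between consecutive refinement steps; with gamma < 1 (default 0.8) the later
# steps carry the largest weights, so the final updates are pushed hardest toward consistency.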
+ + torch._assert( + rescale_factor <= 1.0, + "sequence_consistency_loss: `rescale_factor` must be less than or equal to 1, but got {}".format( + rescale_factor + ), + ) + + flow_preds = torch.stack(flow_preds) # shape = (num_flow_updates, batch_size, 2, H, W) + N, B, C, H, W = flow_preds.shape + + # rescale flow predictions to account for bilinear upsampling artifacts + if rescale_factor: + flow_preds = ( + F.interpolate( + flow_preds.view(N * B, C, H, W), scale_factor=resize_factor, mode=rescale_mode, align_corners=True + ) + ) * rescale_factor + flow_preds = torch.stack(torch.chunk(flow_preds, N, dim=0), dim=0) + + # force the next prediction to be similar to the previous prediction + abs_diff = (flow_preds[1:] - flow_preds[:-1]).square() + abs_diff = abs_diff.mean(axis=(1, 2, 3, 4)) + + num_predictions = flow_preds.shape[0] - 1 # because we are comparing differences + if weights is None or len(weights) != num_predictions: + weights = gamma ** torch.arange(num_predictions - 1, -1, -1, device=flow_preds.device, dtype=flow_preds.dtype) + + flow_loss = (abs_diff * weights).sum() + return flow_loss, weights + + +class FlowSequenceConsistencyLoss(nn.Module): + def __init__( + self, + gamma: float = 0.8, + resize_factor: float = 0.25, + rescale_factor: float = 0.25, + rescale_mode: str = "bilinear", + ) -> None: + super().__init__() + self.gamma = gamma + self.resize_factor = resize_factor + self.rescale_factor = rescale_factor + self.rescale_mode = rescale_mode + self._weights = None + + def forward(self, flow_preds: List[Tensor]) -> Tensor: + """ + Args: + flow_preds: list of tensors of shape (batch_size, C, H, W) + + Returns: + sequence consistency loss of shape (batch_size,) + """ + loss, weights = _flow_sequence_consistency_loss_fn( + flow_preds, + gamma=self.gamma, + resize_factor=self.resize_factor, + rescale_factor=self.rescale_factor, + rescale_mode=self.rescale_mode, + weights=self._weights, + ) + self._weights = weights + return loss + + def set_gamma(self, gamma: float) -> None: + self.gamma.fill_(gamma) + # reset the cached scale factor + self._weights = None + + +def _psnr_loss_fn(source: torch.Tensor, target: torch.Tensor, max_val: float) -> torch.Tensor: + torch._assert( + source.shape == target.shape, + "psnr_loss: source and target must have the same shape, but got {} and {}".format(source.shape, target.shape), + ) + + # ref https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + return 10 * torch.log10(max_val**2 / ((source - target).pow(2).mean(axis=(-3, -2, -1)))) + + +class PSNRLoss(nn.Module): + def __init__(self, max_val: float = 256) -> None: + """ + Args: + max_val: maximum value of the input tensor. This refers to the maximum domain value of the input tensor. 
+ + """ + super().__init__() + self.max_val = max_val + + def forward(self, source: Tensor, target: Tensor) -> Tensor: + """ + Args: + source: tensor of shape (D1, D2, ..., DN, C, H, W) + target: tensor of shape (D1, D2, ..., DN, C, H, W) + + Returns: + psnr loss of shape (D1, D2, ..., DN) + """ + + # multiply by -1 as we want to maximize the psnr + return -1 * _psnr_loss_fn(source, target, self.max_val) + + +class FlowPhotoMetricLoss(nn.Module): + def __init__( + self, + ssim_weight: float = 0.85, + ssim_window_size: int = 11, + ssim_max_val: float = 1.0, + ssim_sigma: float = 1.5, + ssim_eps: float = 1e-12, + ssim_use_padding: bool = True, + max_displacement_ratio: float = 0.15, + ) -> None: + super().__init__() + + self._ssim_loss = SSIM( + kernel_size=ssim_window_size, + max_val=ssim_max_val, + sigma=ssim_sigma, + eps=ssim_eps, + use_padding=ssim_use_padding, + ) + + self._L1_weight = 1 - ssim_weight + self._SSIM_weight = ssim_weight + self._max_displacement_ratio = max_displacement_ratio + + def forward( + self, + source: Tensor, + reference: Tensor, + flow_pred: Tensor, + valid_mask: Optional[Tensor] = None, + ): + """ + Args: + source: tensor of shape (B, C, H, W) + reference: tensor of shape (B, C, H, W) + flow_pred: tensor of shape (B, 2, H, W) + valid_mask: tensor of shape (B, H, W) or None + + Returns: + photometric loss of shape + + """ + torch._assert( + source.ndim == 4, + "FlowPhotoMetricLoss: source must have 4 dimensions, but got {}".format(source.ndim), + ) + torch._assert( + reference.ndim == source.ndim, + "FlowPhotoMetricLoss: source and other must have the same number of dimensions, but got {} and {}".format( + source.ndim, reference.ndim + ), + ) + torch._assert( + flow_pred.shape[1] == 2, + "FlowPhotoMetricLoss: flow_pred must have 2 channels, but got {}".format(flow_pred.shape[1]), + ) + torch._assert( + flow_pred.ndim == 4, + "FlowPhotoMetricLoss: flow_pred must have 4 dimensions, but got {}".format(flow_pred.ndim), + ) + + B, C, H, W = source.shape + flow_channels = flow_pred.shape[1] + + max_displacements = [] + for dim in range(flow_channels): + shape_index = -1 - dim + max_displacements.append(int(self._max_displacement_ratio * source.shape[shape_index])) + + # mask out all pixels that have larger flow than the max flow allowed + max_flow_mask = torch.logical_and( + *[flow_pred[:, dim, :, :] < max_displacements[dim] for dim in range(flow_channels)] + ) + + if valid_mask is not None: + valid_mask = torch.logical_and(valid_mask, max_flow_mask).unsqueeze(1) + else: + valid_mask = max_flow_mask.unsqueeze(1) + + grid = make_coords_grid(B, H, W, device=str(source.device)) + resampled_grids = grid - flow_pred + resampled_grids = resampled_grids.permute(0, 2, 3, 1) + resampled_source = grid_sample(reference, resampled_grids, mode="bilinear") + + # compute SSIM loss + ssim_loss = self._ssim_loss(resampled_source * valid_mask, source * valid_mask) + l1_loss = (resampled_source * valid_mask - source * valid_mask).abs().mean(axis=(-3, -2, -1)) + loss = self._L1_weight * l1_loss + self._SSIM_weight * ssim_loss + + return loss.mean() diff --git a/references/depth/stereo/utils/metrics.py b/references/depth/stereo/utils/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..05b149fb048b70a95e32b485ac91de7de45c237a --- /dev/null +++ b/references/depth/stereo/utils/metrics.py @@ -0,0 +1,49 @@ +from typing import Dict, List, Optional, Tuple + +from torch import Tensor + +AVAILABLE_METRICS = ["mae", "rmse", "epe", "bad1", "bad2", "epe", "1px", "3px", 
"5px", "fl-all", "relepe"] + + +def compute_metrics( + flow_pred: Tensor, flow_gt: Tensor, valid_flow_mask: Optional[Tensor], metrics: List[str] +) -> Tuple[Dict[str, float], int]: + for m in metrics: + if m not in AVAILABLE_METRICS: + raise ValueError(f"Invalid metric: {m}. Valid metrics are: {AVAILABLE_METRICS}") + + metrics_dict = {} + + pixels_diffs = (flow_pred - flow_gt).abs() + # there is no Y flow in Stereo Matching, therefore flow.abs() = flow.pow(2).sum(dim=1).sqrt() + flow_norm = flow_gt.abs() + + if valid_flow_mask is not None: + valid_flow_mask = valid_flow_mask.unsqueeze(1) + pixels_diffs = pixels_diffs[valid_flow_mask] + flow_norm = flow_norm[valid_flow_mask] + + num_pixels = pixels_diffs.numel() + if "bad1" in metrics: + metrics_dict["bad1"] = (pixels_diffs > 1).float().mean().item() + if "bad2" in metrics: + metrics_dict["bad2"] = (pixels_diffs > 2).float().mean().item() + + if "mae" in metrics: + metrics_dict["mae"] = pixels_diffs.mean().item() + if "rmse" in metrics: + metrics_dict["rmse"] = pixels_diffs.pow(2).mean().sqrt().item() + if "epe" in metrics: + metrics_dict["epe"] = pixels_diffs.mean().item() + if "1px" in metrics: + metrics_dict["1px"] = (pixels_diffs < 1).float().mean().item() + if "3px" in metrics: + metrics_dict["3px"] = (pixels_diffs < 3).float().mean().item() + if "5px" in metrics: + metrics_dict["5px"] = (pixels_diffs < 5).float().mean().item() + if "fl-all" in metrics: + metrics_dict["fl-all"] = ((pixels_diffs < 3) & ((pixels_diffs / flow_norm) < 0.05)).float().mean().item() * 100 + if "relepe" in metrics: + metrics_dict["relepe"] = (pixels_diffs / flow_norm).mean().item() + + return metrics_dict, num_pixels diff --git a/references/depth/stereo/visualization.py b/references/depth/stereo/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..d043d274614969d206159e5bc4973e6844cf39f7 --- /dev/null +++ b/references/depth/stereo/visualization.py @@ -0,0 +1,126 @@ +import os +from typing import List + +import numpy as np +import torch +from torch import Tensor +from torchvision.utils import make_grid + + +@torch.no_grad() +def make_disparity_image(disparity: Tensor): + # normalize image to [0, 1] + disparity = disparity.detach().cpu() + disparity = (disparity - disparity.min()) / (disparity.max() - disparity.min()) + return disparity + + +@torch.no_grad() +def make_disparity_image_pairs(disparity: Tensor, image: Tensor): + disparity = make_disparity_image(disparity) + # image is in [-1, 1], bring it to [0, 1] + image = image.detach().cpu() + image = image * 0.5 + 0.5 + return disparity, image + + +@torch.no_grad() +def make_disparity_sequence(disparities: List[Tensor]): + # convert each disparity to [0, 1] + for idx, disparity_batch in enumerate(disparities): + disparities[idx] = torch.stack(list(map(make_disparity_image, disparity_batch))) + # make the list into a batch + disparity_sequences = torch.stack(disparities) + return disparity_sequences + + +@torch.no_grad() +def make_pair_grid(*inputs, orientation="horizontal"): + # make a grid of images with the outputs and references side by side + if orientation == "horizontal": + # interleave the outputs and references + canvas = torch.zeros_like(inputs[0]) + canvas = torch.cat([canvas] * len(inputs), dim=0) + size = len(inputs) + for idx, inp in enumerate(inputs): + canvas[idx::size, ...] 
= inp + grid = make_grid(canvas, nrow=len(inputs), padding=16, normalize=True, scale_each=True) + elif orientation == "vertical": + # interleave the outputs and references + canvas = torch.cat(inputs, dim=0) + size = len(inputs) + for idx, inp in enumerate(inputs): + canvas[idx::size, ...] = inp + grid = make_grid(canvas, nrow=len(inputs[0]), padding=16, normalize=True, scale_each=True) + else: + raise ValueError("Unknown orientation: {}".format(orientation)) + return grid + + +@torch.no_grad() +def make_training_sample_grid( + left_images: Tensor, + right_images: Tensor, + disparities: Tensor, + masks: Tensor, + predictions: List[Tensor], +) -> np.ndarray: + # detach images and renormalize to [0, 1] + images_left = left_images.detach().cpu() * 0.5 + 0.5 + images_right = right_images.detach().cpu() * 0.5 + 0.5 + # detach the disparities and predictions + disparities = disparities.detach().cpu() + predictions = predictions[-1].detach().cpu() + # keep only the first channel of pixels, and repeat it 3 times + disparities = disparities[:, :1, ...].repeat(1, 3, 1, 1) + predictions = predictions[:, :1, ...].repeat(1, 3, 1, 1) + # unsqueeze and repeat the masks + masks = masks.detach().cpu().unsqueeze(1).repeat(1, 3, 1, 1) + # make a grid that will self normalize across the batch + pred_grid = make_pair_grid(images_left, images_right, masks, disparities, predictions, orientation="horizontal") + pred_grid = pred_grid.permute(1, 2, 0).numpy() + pred_grid = (pred_grid * 255).astype(np.uint8) + return pred_grid + + +@torch.no_grad() +def make_disparity_sequence_grid(predictions: List[Tensor], disparities: Tensor) -> np.ndarray: + # right most we will be adding the ground truth + seq_len = len(predictions) + 1 + predictions = list(map(lambda x: x[:, :1, :, :].detach().cpu(), predictions + [disparities])) + sequence = make_disparity_sequence(predictions) + # swap axes to have them in the correct order for each batch sample + sequence = torch.swapaxes(sequence, 0, 1).contiguous().reshape(-1, 1, disparities.shape[-2], disparities.shape[-1]) + sequence = make_grid(sequence, nrow=seq_len, padding=16, normalize=True, scale_each=True) + sequence = sequence.permute(1, 2, 0).numpy() + sequence = (sequence * 255).astype(np.uint8) + return sequence + + +@torch.no_grad() +def make_prediction_image_side_to_side( + predictions: Tensor, disparities: Tensor, valid_mask: Tensor, save_path: str, prefix: str +) -> None: + import matplotlib.pyplot as plt + + # normalize the predictions and disparities in [0, 1] + predictions = (predictions - predictions.min()) / (predictions.max() - predictions.min()) + disparities = (disparities - disparities.min()) / (disparities.max() - disparities.min()) + predictions = predictions * valid_mask + disparities = disparities * valid_mask + + predictions = predictions.detach().cpu() + disparities = disparities.detach().cpu() + + for idx, (pred, gt) in enumerate(zip(predictions, disparities)): + pred = pred.permute(1, 2, 0).numpy() + gt = gt.permute(1, 2, 0).numpy() + # plot pred and gt side by side + fig, ax = plt.subplots(1, 2, figsize=(10, 5)) + ax[0].imshow(pred) + ax[0].set_title("Prediction") + ax[1].imshow(gt) + ax[1].set_title("Ground Truth") + save_name = os.path.join(save_path, "{}_{}.png".format(prefix, idx)) + plt.savefig(save_name) + plt.close() diff --git a/references/detection/coco_utils.py b/references/detection/coco_utils.py index 396de63297ba2e38cc307d9aff1334704edb0298..f40dcdff783d7a2ce26d5e453c13dd23b52cc212 100644 --- a/references/detection/coco_utils.py +++ 
b/references/detection/coco_utils.py @@ -1,4 +1,3 @@ -import copy import os import torch @@ -9,24 +8,6 @@ from pycocotools import mask as coco_mask from pycocotools.coco import COCO -class FilterAndRemapCocoCategories: - def __init__(self, categories, remap=True): - self.categories = categories - self.remap = remap - - def __call__(self, image, target): - anno = target["annotations"] - anno = [obj for obj in anno if obj["category_id"] in self.categories] - if not self.remap: - target["annotations"] = anno - return image, target - anno = copy.deepcopy(anno) - for obj in anno: - obj["category_id"] = self.categories.index(obj["category_id"]) - target["annotations"] = anno - return image, target - - def convert_coco_poly_to_mask(segmentations, height, width): masks = [] for polygons in segmentations: @@ -49,7 +30,6 @@ class ConvertCocoPolysToMask: w, h = image.size image_id = target["image_id"] - image_id = torch.tensor([image_id]) anno = target["annotations"] @@ -116,7 +96,7 @@ def _coco_remove_images_without_annotations(dataset, cat_list=None): # if all boxes have close to zero area, there is no annotation if _has_only_empty_bbox(anno): return False - # keypoints task have a slight different critera for considering + # keypoints task have a slight different criteria for considering # if an annotation is valid if "keypoints" not in anno[0]: return True @@ -126,10 +106,6 @@ def _coco_remove_images_without_annotations(dataset, cat_list=None): return True return False - if not isinstance(dataset, torchvision.datasets.CocoDetection): - raise TypeError( - f"This function expects dataset of type torchvision.datasets.CocoDetection, instead got {type(dataset)}" - ) ids = [] for ds_idx, img_id in enumerate(dataset.ids): ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) @@ -153,7 +129,7 @@ def convert_to_coco_api(ds): # find better way to get target # targets = ds.get_annotations(img_idx) img, targets = ds[img_idx] - image_id = targets["image_id"].item() + image_id = targets["image_id"] img_dict = {} img_dict["id"] = image_id img_dict["height"] = img.shape[-2] @@ -196,6 +172,7 @@ def convert_to_coco_api(ds): def get_coco_api_from_dataset(dataset): + # FIXME: This is... awful? 
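The `# FIXME` above refers to the unwrapping loop that follows: it peels `torch.utils.data.Subset` layers until it reaches the underlying `CocoDetection` dataset and its COCO API handle. A hedged sketch of that idea in isolation (the helper name is made up):

```python
import torch.utils.data
import torchvision

def find_coco_api(dataset, max_depth: int = 10):
    # walk down nested Subset wrappers looking for a CocoDetection instance
    for _ in range(max_depth):
        if isinstance(dataset, torchvision.datasets.CocoDetection):
            return dataset.coco  # the pycocotools COCO object
        if isinstance(dataset, torch.utils.data.Subset):
            dataset = dataset.dataset
        else:
            break
    return None  # caller falls back to converting the dataset into COCO format
```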
for _ in range(10): if isinstance(dataset, torchvision.datasets.CocoDetection): break @@ -220,7 +197,7 @@ class CocoDetection(torchvision.datasets.CocoDetection): return img, target -def get_coco(root, image_set, transforms, mode="instances"): +def get_coco(root, image_set, transforms, mode="instances", use_v2=False, with_masks=False): anno_file_template = "{}_{}2017.json" PATHS = { "train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))), @@ -228,17 +205,26 @@ def get_coco(root, image_set, transforms, mode="instances"): # "train": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))) } - t = [ConvertCocoPolysToMask()] - - if transforms is not None: - t.append(transforms) - transforms = T.Compose(t) - img_folder, ann_file = PATHS[image_set] img_folder = os.path.join(root, img_folder) ann_file = os.path.join(root, ann_file) - dataset = CocoDetection(img_folder, ann_file, transforms=transforms) + if use_v2: + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + target_keys = ["boxes", "labels", "image_id"] + if with_masks: + target_keys += ["masks"] + dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + else: + # TODO: handle with_masks for V1? + t = [ConvertCocoPolysToMask()] + if transforms is not None: + t.append(transforms) + transforms = T.Compose(t) + + dataset = CocoDetection(img_folder, ann_file, transforms=transforms) if image_set == "train": dataset = _coco_remove_images_without_annotations(dataset) @@ -246,7 +232,3 @@ def get_coco(root, image_set, transforms, mode="instances"): # dataset = torch.utils.data.Subset(dataset, [i for i in range(500)]) return dataset - - -def get_coco_kp(root, image_set, transforms): - return get_coco(root, image_set, transforms, mode="person_keypoints") diff --git a/references/detection/engine.py b/references/detection/engine.py index 0e5d55f189d482d34371f49c719c51660228244e..0e9bfffdf8af566c4bc13436361005c1e7b84dcb 100644 --- a/references/detection/engine.py +++ b/references/detection/engine.py @@ -26,7 +26,7 @@ def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, sc for images, targets in metric_logger.log_every(data_loader, print_freq, header): images = list(image.to(device) for image in images) - targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + targets = [{k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in t.items()} for t in targets] with torch.cuda.amp.autocast(enabled=scaler is not None): loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) @@ -97,7 +97,7 @@ def evaluate(model, data_loader, device): outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] model_time = time.time() - model_time - res = {target["image_id"].item(): output for target, output in zip(targets, outputs)} + res = {target["image_id"]: output for target, output in zip(targets, outputs)} evaluator_time = time.time() coco_evaluator.update(res) evaluator_time = time.time() - evaluator_time diff --git a/references/detection/group_by_aspect_ratio.py b/references/detection/group_by_aspect_ratio.py index 5312cc036d61f730771dbf3df9b6b1c4416dc2dd..d12e14b540cc788abb98f40134ca9738dcd88a9a 100644 --- a/references/detection/group_by_aspect_ratio.py +++ b/references/detection/group_by_aspect_ratio.py @@ -63,7 +63,7 @@ class GroupedBatchSampler(BatchSampler): expected_num_batches = 
len(self) num_remaining = expected_num_batches - num_batches if num_remaining > 0: - # for the remaining batches, take first the buffers with largest number + # for the remaining batches, take first the buffers with the largest number # of elements for group_id, _ in sorted(buffer_per_group.items(), key=lambda x: len(x[1]), reverse=True): remaining = self.batch_size - len(buffer_per_group[group_id]) diff --git a/references/detection/presets.py b/references/detection/presets.py index 779f3f218ca0b4092c2d32374ccfdfb9a41369f9..e9b6d56c8861263fbe70acc1f6e01bb56f172e2b 100644 --- a/references/detection/presets.py +++ b/references/detection/presets.py @@ -1,73 +1,114 @@ +from collections import defaultdict + import torch -import transforms as T +import transforms as reference_transforms + + +def get_modules(use_v2): + # We need a protected import to avoid the V2 warning in case just V1 is used + if use_v2: + import torchvision.transforms.v2 + import torchvision.tv_tensors + + return torchvision.transforms.v2, torchvision.tv_tensors + else: + return reference_transforms, None class DetectionPresetTrain: - def __init__(self, *, data_augmentation, hflip_prob=0.5, mean=(123.0, 117.0, 104.0)): + # Note: this transform assumes that the input to forward() are always PIL + # images, regardless of the backend parameter. + def __init__( + self, + *, + data_augmentation, + hflip_prob=0.5, + mean=(123.0, 117.0, 104.0), + backend="pil", + use_v2=False, + ): + + T, tv_tensors = get_modules(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tv_tensor": + transforms.append(T.ToImage()) + elif backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + if data_augmentation == "hflip": - self.transforms = T.Compose( - [ - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [T.RandomHorizontalFlip(p=hflip_prob)] elif data_augmentation == "lsj": - self.transforms = T.Compose( - [ - T.ScaleJitter(target_size=(1024, 1024)), - T.FixedSizeCrop(size=(1024, 1024), fill=mean), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [ + T.ScaleJitter(target_size=(1024, 1024), antialias=True), + # TODO: FixedSizeCrop below doesn't work on tensors! 
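Stepping back from the individual augmentation policies: the new `backend` argument only decides which transform opens the pipeline, everything else is appended after it. A hedged, standalone sketch of that dispatch with `torchvision.transforms.v2` (the helper below is illustrative, not part of the reference code):

```python
import torch
from torchvision.transforms import v2 as T

def leading_transforms(backend: str):
    backend = backend.lower()
    if backend == "tv_tensor":
        return [T.ToImage()]      # wrap the input into a tv_tensors.Image right away
    if backend == "tensor":
        return [T.PILToTensor()]  # plain uint8 tensor, converted from PIL up front
    if backend == "pil":
        return []                 # stay in PIL until the end of the pipeline
    raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}")

pipeline = T.Compose(
    leading_transforms("tv_tensor")
    + [T.RandomHorizontalFlip(p=0.5), T.ToDtype(torch.float, scale=True)]
)
```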
+ reference_transforms.FixedSizeCrop(size=(1024, 1024), fill=mean), + T.RandomHorizontalFlip(p=hflip_prob), + ] elif data_augmentation == "multiscale": - self.transforms = T.Compose( - [ - T.RandomShortestSize( - min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 - ), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [ + T.RandomShortestSize(min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333), + T.RandomHorizontalFlip(p=hflip_prob), + ] elif data_augmentation == "ssd": - self.transforms = T.Compose( - [ - T.RandomPhotometricDistort(), - T.RandomZoomOut(fill=list(mean)), - T.RandomIoUCrop(), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + fill = defaultdict(lambda: mean, {tv_tensors.Mask: 0}) if use_v2 else list(mean) + transforms += [ + T.RandomPhotometricDistort(), + T.RandomZoomOut(fill=fill), + T.RandomIoUCrop(), + T.RandomHorizontalFlip(p=hflip_prob), + ] elif data_augmentation == "ssdlite": - self.transforms = T.Compose( - [ - T.RandomIoUCrop(), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [ + T.RandomIoUCrop(), + T.RandomHorizontalFlip(p=hflip_prob), + ] else: raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"') + if backend == "pil": + # Note: we could just convert to pure tensors even in v2. + transforms += [T.ToImage() if use_v2 else T.PILToTensor()] + + transforms += [T.ToDtype(torch.float, scale=True)] + + if use_v2: + transforms += [ + T.ConvertBoundingBoxFormat(tv_tensors.BoundingBoxFormat.XYXY), + T.SanitizeBoundingBoxes(), + T.ToPureTensor(), + ] + + self.transforms = T.Compose(transforms) + def __call__(self, img, target): return self.transforms(img, target) class DetectionPresetEval: - def __init__(self): - self.transforms = T.Compose( - [ - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + def __init__(self, backend="pil", use_v2=False): + T, _ = get_modules(use_v2) + transforms = [] + backend = backend.lower() + if backend == "pil": + # Note: we could just convert to pure tensors even in v2? 
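For orientation, the v2 training pipeline assembled above for the plain `hflip` policy with the `tv_tensor` backend composes to roughly the following (a sketch, not the exact object the preset builds):

```python
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2 as T

hflip_train_v2 = T.Compose([
    T.ToImage(),                                                    # tv_tensor backend
    T.RandomHorizontalFlip(p=0.5),
    T.ToDtype(torch.float, scale=True),                             # uint8 [0, 255] -> float [0, 1]
    T.ConvertBoundingBoxFormat(tv_tensors.BoundingBoxFormat.XYXY),
    T.SanitizeBoundingBoxes(),                                      # drop degenerate boxes and their labels
    T.ToPureTensor(),                                               # hand plain tensors to the training loop
])
```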
+ transforms += [T.ToImage() if use_v2 else T.PILToTensor()] + elif backend == "tensor": + transforms += [T.PILToTensor()] + elif backend == "tv_tensor": + transforms += [T.ToImage()] + else: + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + + transforms += [T.ToDtype(torch.float, scale=True)] + + if use_v2: + transforms += [T.ToPureTensor()] + + self.transforms = T.Compose(transforms) def __call__(self, img, target): return self.transforms(img, target) diff --git a/references/detection/train.py b/references/detection/train.py index dea483c5f7537ded5da48bd523be8394928714e4..d165a2d3598da093b41c3df145cd3a732d04d56a 100644 --- a/references/detection/train.py +++ b/references/detection/train.py @@ -28,7 +28,7 @@ import torchvision import torchvision.models.detection import torchvision.models.detection.mask_rcnn import utils -from coco_utils import get_coco, get_coco_kp +from coco_utils import get_coco from engine import evaluate, train_one_epoch from group_by_aspect_ratio import create_aspect_ratio_groups, GroupedBatchSampler from torchvision.transforms import InterpolationMode @@ -40,23 +40,32 @@ def copypaste_collate_fn(batch): return copypaste(*utils.collate_fn(batch)) -def get_dataset(name, image_set, transform, data_path): - paths = {"coco": (data_path, get_coco, 91), "coco_kp": (data_path, get_coco_kp, 2)} - p, ds_fn, num_classes = paths[name] - - ds = ds_fn(p, image_set=image_set, transforms=transform) +def get_dataset(is_train, args): + image_set = "train" if is_train else "val" + num_classes, mode = {"coco": (91, "instances"), "coco_kp": (2, "person_keypoints")}[args.dataset] + with_masks = "mask" in args.model + ds = get_coco( + root=args.data_path, + image_set=image_set, + transforms=get_transform(is_train, args), + mode=mode, + use_v2=args.use_v2, + with_masks=with_masks, + ) return ds, num_classes -def get_transform(train, args): - if train: - return presets.DetectionPresetTrain(data_augmentation=args.data_augmentation) +def get_transform(is_train, args): + if is_train: + return presets.DetectionPresetTrain( + data_augmentation=args.data_augmentation, backend=args.backend, use_v2=args.use_v2 + ) elif args.weights and args.test_only: weights = torchvision.models.get_weight(args.weights) trans = weights.transforms() return lambda img, target: (trans(img), target) else: - return presets.DetectionPresetEval() + return presets.DetectionPresetEval(backend=args.backend, use_v2=args.use_v2) def get_args_parser(add_help=True): @@ -65,7 +74,12 @@ def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description="PyTorch Detection Training", add_help=add_help) parser.add_argument("--data-path", default="/datasets01/COCO/022719/", type=str, help="dataset path") - parser.add_argument("--dataset", default="coco", type=str, help="dataset name") + parser.add_argument( + "--dataset", + default="coco", + type=str, + help="dataset name. Use coco for object detection and instance segmentation and coco_kp for Keypoint detection", + ) parser.add_argument("--model", default="maskrcnn_resnet50_fpn", type=str, help="model name") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( @@ -159,10 +173,22 @@ def get_args_parser(add_help=True): help="Use CopyPaste data augmentation. 
Works only with data-augmentation='lsj'.", ) + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") + return parser def main(args): + if args.backend.lower() == "tv_tensor" and not args.use_v2: + raise ValueError("Use --use-v2 if you want to use the tv_tensor backend.") + if args.dataset not in ("coco", "coco_kp"): + raise ValueError(f"Dataset should be coco or coco_kp, got {args.dataset}") + if "keypoint" in args.model and args.dataset != "coco_kp": + raise ValueError("Oops, if you want Keypoint detection, set --dataset coco_kp") + if args.dataset == "coco_kp" and args.use_v2: + raise ValueError("KeyPoint detection doesn't support V2 transforms yet") + if args.output_dir: utils.mkdir(args.output_dir) @@ -177,8 +203,8 @@ def main(args): # Data loading code print("Loading data") - dataset, num_classes = get_dataset(args.dataset, "train", get_transform(True, args), args.data_path) - dataset_test, _ = get_dataset(args.dataset, "val", get_transform(False, args), args.data_path) + dataset, num_classes = get_dataset(is_train=True, args=args) + dataset_test, _ = get_dataset(is_train=False, args=args) print("Creating data loaders") if args.distributed: diff --git a/references/detection/transforms.py b/references/detection/transforms.py index d26bf6eac8566e21f6c9e7a50f1c38f28af20aa5..e07ccfc992153960b5360b59f24b33585ec62130 100644 --- a/references/detection/transforms.py +++ b/references/detection/transforms.py @@ -53,14 +53,17 @@ class PILToTensor(nn.Module): return image, target -class ConvertImageDtype(nn.Module): - def __init__(self, dtype: torch.dtype) -> None: +class ToDtype(nn.Module): + def __init__(self, dtype: torch.dtype, scale: bool = False) -> None: super().__init__() self.dtype = dtype + self.scale = scale def forward( self, image: Tensor, target: Optional[Dict[str, Tensor]] = None ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if not self.scale: + return image.to(dtype=self.dtype), target image = F.convert_image_dtype(image, self.dtype) return image, target @@ -293,11 +296,13 @@ class ScaleJitter(nn.Module): target_size: Tuple[int, int], scale_range: Tuple[float, float] = (0.1, 2.0), interpolation: InterpolationMode = InterpolationMode.BILINEAR, + antialias=True, ): super().__init__() self.target_size = target_size self.scale_range = scale_range self.interpolation = interpolation + self.antialias = antialias def forward( self, image: Tensor, target: Optional[Dict[str, Tensor]] = None @@ -315,14 +320,17 @@ class ScaleJitter(nn.Module): new_width = int(orig_width * r) new_height = int(orig_height * r) - image = F.resize(image, [new_height, new_width], interpolation=self.interpolation) + image = F.resize(image, [new_height, new_width], interpolation=self.interpolation, antialias=self.antialias) if target is not None: target["boxes"][:, 0::2] *= new_width / orig_width target["boxes"][:, 1::2] *= new_height / orig_height if "masks" in target: target["masks"] = F.resize( - target["masks"], [new_height, new_width], interpolation=InterpolationMode.NEAREST + target["masks"], + [new_height, new_width], + interpolation=InterpolationMode.NEAREST, + antialias=self.antialias, ) return image, target diff --git a/references/optical_flow/README.md b/references/optical_flow/README.md index a7ac0223739ab76bffe1999bd701dd74f37ee93b..6ad1d4079f7629d92421accc9bcce2ee391afd9f 100644 --- a/references/optical_flow/README.md +++ b/references/optical_flow/README.md @@ -56,7 
+56,7 @@ torchrun --nproc_per_node 1 --nnodes 1 train.py --val-dataset sintel --batch-siz This should give an epe of about 1.3822 on the clean pass and 2.7161 on the final pass of Sintel-train. Results may vary slightly depending on the batch -size and the number of GPUs. For the most accurate resuts use 1 GPU and +size and the number of GPUs. For the most accurate results use 1 GPU and `--batch-size 1`: ``` diff --git a/references/optical_flow/train.py b/references/optical_flow/train.py index be6ffe4ccefaa8ad2dd3544893b66f08f83449ce..ab99cc3ae55bef3f8c110b357d44543752abfee7 100644 --- a/references/optical_flow/train.py +++ b/references/optical_flow/train.py @@ -82,7 +82,7 @@ def _evaluate(model, args, val_dataset, *, padder_mode, num_flow_updates=None, b def inner_loop(blob): if blob[0].dim() == 3: - # input is not batched so we add an extra dim for consistency + # input is not batched, so we add an extra dim for consistency blob = [x[None, :, :, :] if x is not None else None for x in blob] image1, image2, flow_gt = blob[:3] @@ -150,7 +150,7 @@ def evaluate(model, args): for name in val_datasets: if name == "kitti": - # Kitti has different image sizes so we need to individually pad them, we can't batch. + # Kitti has different image sizes, so we need to individually pad them, we can't batch. # see comment in InputPadder if args.batch_size != 1 and (not args.distributed or args.rank == 0): warnings.warn( diff --git a/references/optical_flow/transforms.py b/references/optical_flow/transforms.py index 6011608183a7e5d8d1a7ecb1675b96fe0c430972..bc831a2ee52cb7ad1b87162c3035d134249ba633 100644 --- a/references/optical_flow/transforms.py +++ b/references/optical_flow/transforms.py @@ -164,7 +164,7 @@ class RandomResizeAndCrop(torch.nn.Module): # The reason we don't rely on RandomResizedCrop is because of a significant # difference in the parametrization of both transforms, in particular, # because of the way the random parameters are sampled in both transforms, - # which leads to fairly different resuts (and different epe). For more details see + # which leads to fairly different results (and different epe). For more details see # https://github.com/pytorch/vision/pull/5026/files#r762932579 def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, stretch_prob=0.8): super().__init__() @@ -196,8 +196,12 @@ class RandomResizeAndCrop(torch.nn.Module): if torch.rand(1).item() < self.resize_prob: # rescale the images - img1 = F.resize(img1, size=(new_h, new_w)) - img2 = F.resize(img2, size=(new_h, new_w)) + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the OF models with antialias=True? + img1 = F.resize(img1, size=(new_h, new_w), antialias=False) + img2 = F.resize(img2, size=(new_h, new_w), antialias=False) if valid_flow_mask is None: flow = F.resize(flow, size=(new_h, new_w)) flow = flow * torch.tensor([scale_x, scale_y])[:, None, None] @@ -208,7 +212,7 @@ class RandomResizeAndCrop(torch.nn.Module): # Note: For sparse datasets (Kitti), the original code uses a "margin" # See e.g. 
https://github.com/princeton-vl/RAFT/blob/master/core/utils/augmentor.py#L220:L220 - # We don't, not sure it matters much + # We don't, not sure if it matters much y0 = torch.randint(0, img1.shape[1] - self.crop_size[0], size=(1,)).item() x0 = torch.randint(0, img1.shape[2] - self.crop_size[1], size=(1,)).item() diff --git a/references/optical_flow/utils.py b/references/optical_flow/utils.py index 8b07e9de35c075424e9756a8ebe7ec13d4afb520..cd4b16eb0d8c9ed773d284e8702e6d87e687733f 100644 --- a/references/optical_flow/utils.py +++ b/references/optical_flow/utils.py @@ -181,7 +181,7 @@ def sequence_loss(flow_preds, flow_gt, valid_flow_mask, gamma=0.8, max_flow=400) if gamma > 1: raise ValueError(f"Gamma should be < 1, got {gamma}.") - # exlude invalid pixels and extremely large diplacements + # exclude invalid pixels and extremely large diplacements flow_norm = torch.sum(flow_gt**2, dim=1).sqrt() valid_flow_mask = valid_flow_mask & (flow_norm < max_flow) @@ -248,7 +248,7 @@ def setup_ddp(args): # https://discuss.pytorch.org/t/what-is-the-difference-between-rank-and-local-rank/61940/2 if all(key in os.environ for key in ("LOCAL_RANK", "RANK", "WORLD_SIZE")): - # if we're here, the script was called with torchrun. Otherwise + # if we're here, the script was called with torchrun. Otherwise, # these args will be set already by the run_with_submitit script args.local_rank = int(os.environ["LOCAL_RANK"]) args.rank = int(os.environ["RANK"]) diff --git a/references/segmentation/coco_utils.py b/references/segmentation/coco_utils.py index e02434012f1fc3517a284be04aa058a3ac32b79d..6a15dbefb526c1b01085ed05de0452b5e24d7c30 100644 --- a/references/segmentation/coco_utils.py +++ b/references/segmentation/coco_utils.py @@ -68,11 +68,6 @@ def _coco_remove_images_without_annotations(dataset, cat_list=None): # if more than 1k pixels occupied in the image return sum(obj["area"] for obj in anno) > 1000 - if not isinstance(dataset, torchvision.datasets.CocoDetection): - raise TypeError( - f"This function expects dataset of type torchvision.datasets.CocoDetection, instead got {type(dataset)}" - ) - ids = [] for ds_idx, img_id in enumerate(dataset.ids): ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) @@ -86,7 +81,7 @@ def _coco_remove_images_without_annotations(dataset, cat_list=None): return dataset -def get_coco(root, image_set, transforms): +def get_coco(root, image_set, transforms, use_v2=False): PATHS = { "train": ("train2017", os.path.join("annotations", "instances_train2017.json")), "val": ("val2017", os.path.join("annotations", "instances_val2017.json")), @@ -94,13 +89,24 @@ def get_coco(root, image_set, transforms): } CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72] - transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms]) - img_folder, ann_file = PATHS[image_set] img_folder = os.path.join(root, img_folder) ann_file = os.path.join(root, ann_file) - dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + # The 2 "Compose" below achieve the same thing: converting coco detection + # samples into segmentation-compatible samples. They just do it with + # slightly different implementations. 
We could refactor and unify, but + # keeping them separate helps keeping the v2 version clean + if use_v2: + import v2_extras + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + transforms = Compose([v2_extras.CocoDetectionToVOCSegmentation(), transforms]) + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + dataset = wrap_dataset_for_transforms_v2(dataset, target_keys={"masks", "labels"}) + else: + transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms]) + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) if image_set == "train": dataset = _coco_remove_images_without_annotations(dataset, CAT_LIST) diff --git a/references/segmentation/presets.py b/references/segmentation/presets.py index ed02ae660e4e4272b37cf1970ed3e1d8dcfeba2c..803769fcafce82d15f25637e67918dfa0f2d003b 100644 --- a/references/segmentation/presets.py +++ b/references/segmentation/presets.py @@ -1,39 +1,109 @@ import torch -import transforms as T + + +def get_modules(use_v2): + # We need a protected import to avoid the V2 warning in case just V1 is used + if use_v2: + import torchvision.transforms.v2 + import torchvision.tv_tensors + import v2_extras + + return torchvision.transforms.v2, torchvision.tv_tensors, v2_extras + else: + import transforms + + return transforms, None, None class SegmentationPresetTrain: - def __init__(self, *, base_size, crop_size, hflip_prob=0.5, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)): - min_size = int(0.5 * base_size) - max_size = int(2.0 * base_size) + def __init__( + self, + *, + base_size, + crop_size, + hflip_prob=0.5, + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + backend="pil", + use_v2=False, + ): + T, tv_tensors, v2_extras = get_modules(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tv_tensor": + transforms.append(T.ToImage()) + elif backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + + transforms += [T.RandomResize(min_size=int(0.5 * base_size), max_size=int(2.0 * base_size))] - trans = [T.RandomResize(min_size, max_size)] if hflip_prob > 0: - trans.append(T.RandomHorizontalFlip(hflip_prob)) - trans.extend( - [ - T.RandomCrop(crop_size), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - T.Normalize(mean=mean, std=std), + transforms += [T.RandomHorizontalFlip(hflip_prob)] + + if use_v2: + # We need a custom pad transform here, since the padding we want to perform here is fundamentally + # different from the padding in `RandomCrop` if `pad_if_needed=True`. 
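The custom pad used right below relies on the v2 convention that `fill` may be a mapping from input type to fill value, so images and masks are padded differently (masks get the ignore index 255). A minimal, hedged illustration of that convention with the public `v2.Pad` transform (sizes and values here are made up):

```python
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2 as T

image = tv_tensors.Image(torch.ones(3, 400, 400, dtype=torch.uint8))
mask = tv_tensors.Mask(torch.zeros(400, 400, dtype=torch.uint8))

# pad 80 pixels on the right and bottom: images are filled with 0, masks with 255
pad = T.Pad(padding=[0, 0, 80, 80], fill={tv_tensors.Mask: 255, "others": 0})
padded_image, padded_mask = pad(image, mask)

assert padded_image.shape[-2:] == (480, 480)
assert int(padded_mask[-1, -1]) == 255 and int(padded_image[0, -1, -1]) == 0
```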
+ transforms += [v2_extras.PadIfSmaller(crop_size, fill={tv_tensors.Mask: 255, "others": 0})] + + transforms += [T.RandomCrop(crop_size)] + + if backend == "pil": + transforms += [T.PILToTensor()] + + if use_v2: + img_type = tv_tensors.Image if backend == "tv_tensor" else torch.Tensor + transforms += [ + T.ToDtype(dtype={img_type: torch.float32, tv_tensors.Mask: torch.int64, "others": None}, scale=True) ] - ) - self.transforms = T.Compose(trans) + else: + # No need to explicitly convert masks as they're magically int64 already + transforms += [T.ToDtype(torch.float, scale=True)] + + transforms += [T.Normalize(mean=mean, std=std)] + if use_v2: + transforms += [T.ToPureTensor()] + + self.transforms = T.Compose(transforms) def __call__(self, img, target): return self.transforms(img, target) class SegmentationPresetEval: - def __init__(self, *, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)): - self.transforms = T.Compose( - [ - T.RandomResize(base_size, base_size), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - T.Normalize(mean=mean, std=std), - ] - ) + def __init__( + self, *, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), backend="pil", use_v2=False + ): + T, _, _ = get_modules(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tensor": + transforms += [T.PILToTensor()] + elif backend == "tv_tensor": + transforms += [T.ToImage()] + elif backend != "pil": + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + + if use_v2: + transforms += [T.Resize(size=(base_size, base_size))] + else: + transforms += [T.RandomResize(min_size=base_size, max_size=base_size)] + + if backend == "pil": + # Note: we could just convert to pure tensors even in v2? + transforms += [T.ToImage() if use_v2 else T.PILToTensor()] + + transforms += [ + T.ToDtype(torch.float, scale=True), + T.Normalize(mean=mean, std=std), + ] + if use_v2: + transforms += [T.ToPureTensor()] + + self.transforms = T.Compose(transforms) def __call__(self, img, target): return self.transforms(img, target) diff --git a/references/segmentation/train.py b/references/segmentation/train.py index bb57e65b801c5776ee9f2b950cc6c789c5f86da6..35ece7264a336a8ec1d97ce206bcf681b7a21027 100644 --- a/references/segmentation/train.py +++ b/references/segmentation/train.py @@ -14,24 +14,30 @@ from torch.optim.lr_scheduler import PolynomialLR from torchvision.transforms import functional as F, InterpolationMode -def get_dataset(dir_path, name, image_set, transform): +def get_dataset(args, is_train): def sbd(*args, **kwargs): + kwargs.pop("use_v2") return torchvision.datasets.SBDataset(*args, mode="segmentation", **kwargs) + def voc(*args, **kwargs): + kwargs.pop("use_v2") + return torchvision.datasets.VOCSegmentation(*args, **kwargs) + paths = { - "voc": (dir_path, torchvision.datasets.VOCSegmentation, 21), - "voc_aug": (dir_path, sbd, 21), - "coco": (dir_path, get_coco, 21), + "voc": (args.data_path, voc, 21), + "voc_aug": (args.data_path, sbd, 21), + "coco": (args.data_path, get_coco, 21), } - p, ds_fn, num_classes = paths[name] + p, ds_fn, num_classes = paths[args.dataset] - ds = ds_fn(p, image_set=image_set, transforms=transform) + image_set = "train" if is_train else "val" + ds = ds_fn(p, image_set=image_set, transforms=get_transform(is_train, args), use_v2=args.use_v2) return ds, num_classes -def get_transform(train, args): - if train: - return presets.SegmentationPresetTrain(base_size=520, crop_size=480) +def get_transform(is_train, args): + if is_train: 
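The v2 training preset above also converts each sample component to its own dtype in a single `ToDtype` call (float, scaled, for the image; int64 for the mask). A hedged sketch of that per-type conversion in isolation:

```python
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2 as T

to_dtype = T.ToDtype(
    dtype={tv_tensors.Image: torch.float32, tv_tensors.Mask: torch.int64, "others": None},
    scale=True,  # scaling is applied to image-like inputs; masks are simply cast
)

image = tv_tensors.Image(torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8))
mask = tv_tensors.Mask(torch.randint(0, 21, (64, 64), dtype=torch.uint8))
image, mask = to_dtype(image, mask)

assert image.dtype == torch.float32 and image.max() <= 1.0
assert mask.dtype == torch.int64
```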
+ return presets.SegmentationPresetTrain(base_size=520, crop_size=480, backend=args.backend, use_v2=args.use_v2) elif args.weights and args.test_only: weights = torchvision.models.get_weight(args.weights) trans = weights.transforms() @@ -44,7 +50,7 @@ def get_transform(train, args): return preprocessing else: - return presets.SegmentationPresetEval(base_size=520) + return presets.SegmentationPresetEval(base_size=520, backend=args.backend, use_v2=args.use_v2) def criterion(inputs, target): @@ -120,6 +126,12 @@ def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, devi def main(args): + if args.backend.lower() != "pil" and not args.use_v2: + # TODO: Support tensor backend in V1? + raise ValueError("Use --use-v2 if you want to use the tv_tensor or tensor backend.") + if args.use_v2 and args.dataset != "coco": + raise ValueError("v2 is only support supported for coco dataset for now.") + if args.output_dir: utils.mkdir(args.output_dir) @@ -134,8 +146,8 @@ def main(args): else: torch.backends.cudnn.benchmark = True - dataset, num_classes = get_dataset(args.data_path, args.dataset, "train", get_transform(True, args)) - dataset_test, _ = get_dataset(args.data_path, args.dataset, "val", get_transform(False, args)) + dataset, num_classes = get_dataset(args, is_train=True) + dataset_test, _ = get_dataset(args, is_train=False) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) @@ -260,7 +272,7 @@ def get_args_parser(add_help=True): parser.add_argument("--data-path", default="/datasets01/COCO/022719/", type=str, help="dataset path") parser.add_argument("--dataset", default="coco", type=str, help="dataset name") parser.add_argument("--model", default="fcn_resnet101", type=str, help="model name") - parser.add_argument("--aux-loss", action="store_true", help="auxiliar loss") + parser.add_argument("--aux-loss", action="store_true", help="auxiliary loss") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( "-b", "--batch-size", default=8, type=int, help="images per gpu, the total batch size is $NGPU x batch_size" @@ -307,6 +319,8 @@ def get_args_parser(add_help=True): # Mixed precision training parameters parser.add_argument("--amp", action="store_true", help="Use torch.cuda.amp for mixed precision training") + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") return parser diff --git a/references/segmentation/transforms.py b/references/segmentation/transforms.py index 518048db2faef36297c4a47c700d7235434fcc0b..6934b9f862ea62984c4e505e6544dac39dd1ab18 100644 --- a/references/segmentation/transforms.py +++ b/references/segmentation/transforms.py @@ -35,7 +35,7 @@ class RandomResize: def __call__(self, image, target): size = random.randint(self.min_size, self.max_size) - image = F.resize(image, size) + image = F.resize(image, size, antialias=True) target = F.resize(target, size, interpolation=T.InterpolationMode.NEAREST) return image, target @@ -81,11 +81,14 @@ class PILToTensor: return image, target -class ConvertImageDtype: - def __init__(self, dtype): +class ToDtype: + def __init__(self, dtype, scale=False): self.dtype = dtype + self.scale = scale def __call__(self, image, target): + if not self.scale: + return image.to(dtype=self.dtype), target image = F.convert_image_dtype(image, self.dtype) return image, target diff --git 
a/references/segmentation/utils.py b/references/segmentation/utils.py index 4ea24db83ed99dcc9b1edb6a646cadb6cfd07bb3..cb200f23d766b92440108cb05fe0fe093f89a08c 100644 --- a/references/segmentation/utils.py +++ b/references/segmentation/utils.py @@ -267,9 +267,9 @@ def init_distributed_mode(args): args.rank = int(os.environ["RANK"]) args.world_size = int(os.environ["WORLD_SIZE"]) args.gpu = int(os.environ["LOCAL_RANK"]) - elif "SLURM_PROCID" in os.environ: - args.rank = int(os.environ["SLURM_PROCID"]) - args.gpu = args.rank % torch.cuda.device_count() + # elif "SLURM_PROCID" in os.environ: + # args.rank = int(os.environ["SLURM_PROCID"]) + # args.gpu = args.rank % torch.cuda.device_count() elif hasattr(args, "rank"): pass else: diff --git a/references/segmentation/v2_extras.py b/references/segmentation/v2_extras.py new file mode 100644 index 0000000000000000000000000000000000000000..e1a8b53e02ba016a49e5c96f3ea5c70a87bb5c47 --- /dev/null +++ b/references/segmentation/v2_extras.py @@ -0,0 +1,83 @@ +"""This file only exists to be lazy-imported and avoid V2-related import warnings when just using V1.""" +import torch +from torchvision import tv_tensors +from torchvision.transforms import v2 + + +class PadIfSmaller(v2.Transform): + def __init__(self, size, fill=0): + super().__init__() + self.size = size + self.fill = v2._utils._setup_fill_arg(fill) + + def _get_params(self, sample): + _, height, width = v2._utils.query_chw(sample) + padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] + needs_padding = any(padding) + return dict(padding=padding, needs_padding=needs_padding) + + def _transform(self, inpt, params): + if not params["needs_padding"]: + return inpt + + fill = v2._utils._get_fill(self.fill, type(inpt)) + fill = v2._utils._convert_fill_arg(fill) + + return v2.functional.pad(inpt, padding=params["padding"], fill=fill) + + +class CocoDetectionToVOCSegmentation(v2.Transform): + """Turn samples from datasets.CocoDetection into the same format as VOCSegmentation. + + This is achieved in two steps: + + 1. COCO differentiates between 91 categories while VOC only supports 21, including background for both. Fortunately, + the COCO categories are a superset of the VOC ones and thus can be mapped. Instances of the 70 categories not + present in VOC are dropped and replaced by background. + 2. COCO only offers detection masks, i.e. a (N, H, W) bool-ish tensor, where the truthy values in each individual + mask denote the instance. However, a segmentation mask is a (H, W) integer tensor (typically torch.uint8), where + the value of each pixel denotes the category it belongs to. The detection masks are merged into one segmentation + mask while pixels that belong to multiple detection masks are marked as invalid. 
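Step 2 of the conversion described above boils down to a weighted max over the stacked instance masks, with pixels covered by more than one instance marked as invalid, as implemented by the helper further below. A tiny worked example (the labels are arbitrary VOC indices):

```python
import torch

instance_masks = torch.tensor(
    [[[1, 1, 0],
      [0, 1, 0],
      [0, 0, 0]],
     [[0, 1, 1],
      [0, 0, 1],
      [0, 0, 0]]],
    dtype=torch.uint8,
)                                                  # (N=2 instances, H=3, W=3)
labels = torch.tensor([5, 7], dtype=torch.uint8)   # hypothetical VOC class indices

segmentation, _ = (instance_masks * labels.reshape(-1, 1, 1)).max(dim=0)
segmentation[instance_masks.sum(dim=0) > 1] = 255  # overlapping instances -> invalid
# segmentation is now [[5, 255, 7], [0, 5, 7], [0, 0, 0]]
```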
+ """ + + COCO_TO_VOC_LABEL_MAP = dict( + zip( + [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72], + range(21), + ) + ) + INVALID_VALUE = 255 + + def _coco_detection_masks_to_voc_segmentation_mask(self, target): + if "masks" not in target: + return None + + instance_masks, instance_labels_coco = target["masks"], target["labels"] + + valid_labels_voc = [ + (idx, label_voc) + for idx, label_coco in enumerate(instance_labels_coco.tolist()) + if (label_voc := self.COCO_TO_VOC_LABEL_MAP.get(label_coco)) is not None + ] + + if not valid_labels_voc: + return None + + valid_voc_category_idcs, instance_labels_voc = zip(*valid_labels_voc) + + instance_masks = instance_masks[list(valid_voc_category_idcs)].to(torch.uint8) + instance_labels_voc = torch.tensor(instance_labels_voc, dtype=torch.uint8) + + # Calling `.max()` on the stacked detection masks works fine to separate background from foreground as long as + # there is at most a single instance per pixel. Overlapping instances will be filtered out in the next step. + segmentation_mask, _ = (instance_masks * instance_labels_voc.reshape(-1, 1, 1)).max(dim=0) + segmentation_mask[instance_masks.sum(dim=0) > 1] = self.INVALID_VALUE + + return segmentation_mask + + def forward(self, image, target): + segmentation_mask = self._coco_detection_masks_to_voc_segmentation_mask(target) + if segmentation_mask is None: + segmentation_mask = torch.zeros(v2.functional.get_size(image), dtype=torch.uint8) + + return image, tv_tensors.Mask(segmentation_mask) diff --git a/references/similarity/sampler.py b/references/similarity/sampler.py index f4564eca33e22c2d1f6fea1cee8a49236497295e..fe6517418ab092f1b859bc5802268e774411c40b 100644 --- a/references/similarity/sampler.py +++ b/references/similarity/sampler.py @@ -48,7 +48,7 @@ class PKSampler(Sampler): # Ensures there are enough classes to sample from if len(self.groups) < p: - raise ValueError("There are not enought classes to sample from") + raise ValueError("There are not enough classes to sample from") def __iter__(self): # Shuffle samples within groups diff --git a/references/video_classification/README.md b/references/video_classification/README.md index cbd303275e5e82667b730c8e6a523159f2de3827..39c5d8f1bbaee7a6dcde7145f928b10b0f030616 100644 --- a/references/video_classification/README.md +++ b/references/video_classification/README.md @@ -76,7 +76,7 @@ Input data augmentations at validation time (with optional parameters): 5. Convert BCHW to CBHW This translates in the following set of command-line arguments. Please note that `--batch-size` parameter controls the -batch size per GPU. Moreover note that our default `--lr` is configured for 64 GPUs which is how many we used for the +batch size per GPU. 
Moreover, note that our default `--lr` is configured for 64 GPUs which is how many we used for the Video resnet models: ``` # number of frames per clip diff --git a/references/video_classification/presets.py b/references/video_classification/presets.py index ef77405225786883990d110c3ed726870f0eaa16..f73802c9666cca56411e8b1c7b2483719c578c31 100644 --- a/references/video_classification/presets.py +++ b/references/video_classification/presets.py @@ -15,7 +15,11 @@ class VideoClassificationPresetTrain: ): trans = [ transforms.ConvertImageDtype(torch.float32), - transforms.Resize(resize_size), + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the video models with antialias=True? + transforms.Resize(resize_size, antialias=False), ] if hflip_prob > 0: trans.append(transforms.RandomHorizontalFlip(hflip_prob)) @@ -31,7 +35,11 @@ class VideoClassificationPresetEval: self.transforms = transforms.Compose( [ transforms.ConvertImageDtype(torch.float32), - transforms.Resize(resize_size), + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the video models with antialias=True? + transforms.Resize(resize_size, antialias=False), transforms.Normalize(mean=mean, std=std), transforms.CenterCrop(crop_size), ConvertBCHWtoCBHW(), diff --git a/scripts/download_model_urls.py b/scripts/download_model_urls.py new file mode 100644 index 0000000000000000000000000000000000000000..f5f53d71e98f1c0c82d74bdb5b6cca122c4090c2 --- /dev/null +++ b/scripts/download_model_urls.py @@ -0,0 +1,41 @@ +import asyncio +import sys +from pathlib import Path +from time import perf_counter +from urllib.parse import urlsplit + +import aiofiles +import aiohttp +from torchvision import models +from tqdm.asyncio import tqdm + + +async def main(download_root): + download_root.mkdir(parents=True, exist_ok=True) + urls = {weight.url for name in models.list_models() for weight in iter(models.get_model_weights(name))} + + async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=None)) as session: + await tqdm.gather(*[download(download_root, session, url) for url in urls]) + + +async def download(download_root, session, url): + response = await session.get(url, params=dict(source="ci")) + + assert response.ok + + file_name = Path(urlsplit(url).path).name + async with aiofiles.open(download_root / file_name, "wb") as f: + async for data in response.content.iter_any(): + await f.write(data) + + +if __name__ == "__main__": + download_root = ( + (Path(sys.argv[1]) if len(sys.argv) > 1 else Path("~/.cache/torch/hub/checkpoints")).expanduser().resolve() + ) + print(f"Downloading model weights to {download_root}") + start = perf_counter() + asyncio.get_event_loop().run_until_complete(main(download_root)) + stop = perf_counter() + minutes, seconds = divmod(stop - start, 60) + print(f"Download took {minutes:2.0f}m {seconds:2.0f}s") diff --git a/setup.cfg b/setup.cfg index f36195194cd058f3d95689eebc25d7b775acaabb..0f4ddbfab10c11315a9de75f7dcc35cf7ddeae52 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,7 @@ universal=1 [metadata] -license_file = LICENSE +license_files = LICENSE [pep8] max-line-length = 120 @@ -10,7 +10,7 @@ max-line-length = 120 [flake8] # note: we ignore all 501s (line too long) anyway as they're taken care of by black max-line-length = 120 -ignore = E203, E402, 
W503, W504, F821, E501 +ignore = E203, E402, W503, W504, F821, E501, B, C4, EXE per-file-ignores = __init__.py: F401, F403, F405 ./hubconf.py: F401 diff --git a/setup.py b/setup.py index be1a29609a805c7a4f122e3ff8040bb330046e6b..f0aa3f4ab2a13a19270bb3cc2109cde6a1001210 100644 --- a/setup.py +++ b/setup.py @@ -86,7 +86,6 @@ if os.getenv("PYTORCH_VERSION"): pytorch_dep += "==" + os.getenv("PYTORCH_VERSION") requirements = [ - "typing_extensions", "numpy", "requests", pytorch_dep, @@ -166,10 +165,13 @@ def get_extensions(): + glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp")) + glob.glob(os.path.join(extensions_dir, "ops", "quantized", "cpu", "*.cpp")) ) + source_mps = glob.glob(os.path.join(extensions_dir, "ops", "mps", "*.mm")) print("Compiling extensions with following flags:") force_cuda = os.getenv("FORCE_CUDA", "0") == "1" print(f" FORCE_CUDA: {force_cuda}") + force_mps = os.getenv("FORCE_MPS", "0") == "1" + print(f" FORCE_MPS: {force_mps}") debug_mode = os.getenv("DEBUG", "0") == "1" print(f" DEBUG: {debug_mode}") use_png = os.getenv("TORCHVISION_USE_PNG", "1") == "1" @@ -231,6 +233,8 @@ def get_extensions(): define_macros += [("WITH_HIP", None)] nvcc_flags = [] extra_compile_args["nvcc"] = nvcc_flags + elif torch.backends.mps.is_available() or force_mps: + sources += source_mps if sys.platform == "win32": define_macros += [("torchvision_EXPORTS", None)] @@ -247,6 +251,9 @@ def get_extensions(): extra_compile_args["nvcc"] = [f for f in nvcc_flags if not ("-O" in f or "-g" in f)] extra_compile_args["nvcc"].append("-O0") extra_compile_args["nvcc"].append("-g") + else: + print("Compiling with debug mode OFF") + extra_compile_args["cxx"].append("-g0") sources = [os.path.join(extensions_dir, s) for s in sources] @@ -327,6 +334,8 @@ def get_extensions(): use_jpeg = use_jpeg and jpeg_found if use_jpeg: print("Building torchvision with JPEG image support") + print(f" libjpeg include path: {jpeg_include}") + print(f" libjpeg lib path: {jpeg_lib}") image_link_flags.append("jpeg") if jpeg_conda: image_library += [jpeg_lib] @@ -352,11 +361,14 @@ def get_extensions(): image_macros += [("NVJPEG_FOUND", str(int(use_nvjpeg)))] image_path = os.path.join(extensions_dir, "io", "image") - image_src = ( - glob.glob(os.path.join(image_path, "*.cpp")) - + glob.glob(os.path.join(image_path, "cpu", "*.cpp")) - + glob.glob(os.path.join(image_path, "cuda", "*.cpp")) - ) + image_src = glob.glob(os.path.join(image_path, "*.cpp")) + glob.glob(os.path.join(image_path, "cpu", "*.cpp")) + + if is_rocm_pytorch: + image_src += glob.glob(os.path.join(image_path, "hip", "*.cpp")) + # we need to exclude this in favor of the hipified source + image_src.remove(os.path.join(image_path, "image.cpp")) + else: + image_src += glob.glob(os.path.join(image_path, "cuda", "*.cpp")) if use_png or use_jpeg: ext_modules.append( @@ -464,8 +476,8 @@ def get_extensions(): "swresample", "swscale", ], - extra_compile_args=["-std=c++14"] if os.name != "nt" else ["/std:c++14", "/MP"], - extra_link_args=["-std=c++14" if os.name != "nt" else "/std:c++14"], + extra_compile_args=["-std=c++17"] if os.name != "nt" else ["/std:c++17", "/MP"], + extra_link_args=["-std=c++17" if os.name != "nt" else "/std:c++17"], ) ) @@ -564,6 +576,7 @@ if __name__ == "__main__": url="https://github.com/pytorch/vision", description="image and video datasets and models for torch deep learning", long_description=readme, + long_description_content_type="text/markdown", license="BSD", # Package info packages=find_packages(exclude=("test",)), @@ -574,7 +587,7 
@@ if __name__ == "__main__": "scipy": ["scipy"], }, ext_modules=get_extensions(), - python_requires=">=3.7", + python_requires=">=3.8", cmdclass={ "build_ext": BuildExtension.with_options(no_python_abi_suffix=True), "clean": clean, diff --git a/test/assets/toosmall_png/heapbof.png b/test/assets/toosmall_png/heapbof.png new file mode 100644 index 0000000000000000000000000000000000000000..e720d1833423d20f7df5a5bab5411956ed01a879 Binary files /dev/null and b/test/assets/toosmall_png/heapbof.png differ diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py new file mode 100644 index 0000000000000000000000000000000000000000..ef5d5e1ec9690f4731059628db485df8a04a33f0 --- /dev/null +++ b/test/builtin_dataset_mocks.py @@ -0,0 +1,1582 @@ +import bz2 +import collections.abc +import csv +import functools +import gzip +import io +import itertools +import json +import lzma +import pathlib +import pickle +import random +import shutil +import unittest.mock +import xml.etree.ElementTree as ET +from collections import Counter, defaultdict + +import numpy as np +import pytest +import torch +from common_utils import combinations_grid +from datasets_utils import create_image_file, create_image_folder, make_tar, make_zip +from torch.nn.functional import one_hot +from torch.testing import make_tensor as _make_tensor +from torchvision.prototype import datasets + +make_tensor = functools.partial(_make_tensor, device="cpu") +make_scalar = functools.partial(make_tensor, ()) + + +__all__ = ["DATASET_MOCKS", "parametrize_dataset_mocks"] + + +class DatasetMock: + def __init__(self, name, *, mock_data_fn, configs): + # FIXME: error handling for unknown names + self.name = name + self.mock_data_fn = mock_data_fn + self.configs = configs + + def _parse_mock_info(self, mock_info): + if mock_info is None: + raise pytest.UsageError( + f"The mock data function for dataset '{self.name}' returned nothing. It needs to at least return an " + f"integer indicating the number of samples for the current `config`." + ) + elif isinstance(mock_info, int): + mock_info = dict(num_samples=mock_info) + elif not isinstance(mock_info, dict): + raise pytest.UsageError( + f"The mock data function for dataset '{self.name}' returned a {type(mock_info)}. The returned object " + f"should be a dictionary containing at least the number of samples for the key `'num_samples'`. If no " + f"additional information is required for specific tests, the number of samples can also be returned as " + f"an integer." + ) + elif "num_samples" not in mock_info: + raise pytest.UsageError( + f"The dictionary returned by the mock data function for dataset '{self.name}' has to contain a " + f"`'num_samples'` entry indicating the number of samples." + ) + + return mock_info + + def load(self, config): + # `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in + # test/test_prototype_builtin_datasets.py + root = pathlib.Path(datasets.home()) / self.name + # We cannot place the mock data upfront in `root`. Loading a dataset calls `OnlineResource.load`. In turn, + # this will only download **and** preprocess if the file is not present. In other words, if we already place + # the file in `root` before the resource is loaded, we are effectively skipping the preprocessing. + # To avoid that we first place the mock data in a temporary directory and patch the download logic to move it to + # `root` only when it is requested. 
+ tmp_mock_data_folder = root / "__mock__" + tmp_mock_data_folder.mkdir(parents=True) + + mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config)) + + def patched_download(resource, root, **kwargs): + src = tmp_mock_data_folder / resource.file_name + if not src.exists(): + raise pytest.UsageError( + f"Dataset '{self.name}' requires the file {resource.file_name} for {config}" + f"but it was not created by the mock data function." + ) + + dst = root / resource.file_name + shutil.move(str(src), str(root)) + + return dst + + with unittest.mock.patch( + "torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download + ): + dataset = datasets.load(self.name, **config) + + extra_files = list(tmp_mock_data_folder.glob("**/*")) + if extra_files: + raise pytest.UsageError( + ( + f"Dataset '{self.name}' created the following files for {config} in the mock data function, " + f"but they were not loaded:\n\n" + ) + + "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files) + ) + + tmp_mock_data_folder.rmdir() + + return dataset, mock_info + + +def config_id(name, config): + parts = [name] + for name, value in config.items(): + if isinstance(value, bool): + part = ("" if value else "no_") + name + else: + part = str(value) + parts.append(part) + return "-".join(parts) + + +def parametrize_dataset_mocks(*dataset_mocks, marks=None): + mocks = {} + for mock in dataset_mocks: + if isinstance(mock, DatasetMock): + mocks[mock.name] = mock + elif isinstance(mock, collections.abc.Mapping): + mocks.update(mock) + else: + raise pytest.UsageError( + f"The positional arguments passed to `parametrize_dataset_mocks` can either be a `DatasetMock`, " + f"a sequence of `DatasetMock`'s, or a mapping of names to `DatasetMock`'s, " + f"but got {mock} instead." 
+ ) + dataset_mocks = mocks + + if marks is None: + marks = {} + elif not isinstance(marks, collections.abc.Mapping): + raise pytest.UsageError() + + return pytest.mark.parametrize( + ("dataset_mock", "config"), + [ + pytest.param(dataset_mock, config, id=config_id(name, config), marks=marks.get(name, ())) + for name, dataset_mock in dataset_mocks.items() + for config in dataset_mock.configs + ], + ) + + +DATASET_MOCKS = {} + + +def register_mock(name=None, *, configs): + def wrapper(mock_data_fn): + nonlocal name + if name is None: + name = mock_data_fn.__name__ + DATASET_MOCKS[name] = DatasetMock(name, mock_data_fn=mock_data_fn, configs=configs) + + return mock_data_fn + + return wrapper + + +class MNISTMockData: + _DTYPES_ID = { + torch.uint8: 8, + torch.int8: 9, + torch.int16: 11, + torch.int32: 12, + torch.float32: 13, + torch.float64: 14, + } + + @classmethod + def _magic(cls, dtype, ndim): + return cls._DTYPES_ID[dtype] * 256 + ndim + 1 + + @staticmethod + def _encode(t): + return torch.tensor(t, dtype=torch.int32).numpy().tobytes()[::-1] + + @staticmethod + def _big_endian_dtype(dtype): + np_dtype = getattr(np, str(dtype).replace("torch.", ""))().dtype + return np.dtype(f">{np_dtype.kind}{np_dtype.itemsize}") + + @classmethod + def _create_binary_file(cls, root, filename, *, num_samples, shape, dtype, compressor, low=0, high): + with compressor(root / filename, "wb") as fh: + for meta in (cls._magic(dtype, len(shape)), num_samples, *shape): + fh.write(cls._encode(meta)) + + data = make_tensor((num_samples, *shape), dtype=dtype, low=low, high=high) + + fh.write(data.numpy().astype(cls._big_endian_dtype(dtype)).tobytes()) + + @classmethod + def generate( + cls, + root, + *, + num_categories, + num_samples=None, + images_file, + labels_file, + image_size=(28, 28), + image_dtype=torch.uint8, + label_size=(), + label_dtype=torch.uint8, + compressor=None, + ): + if num_samples is None: + num_samples = num_categories + if compressor is None: + compressor = gzip.open + + cls._create_binary_file( + root, + images_file, + num_samples=num_samples, + shape=image_size, + dtype=image_dtype, + compressor=compressor, + high=float("inf"), + ) + cls._create_binary_file( + root, + labels_file, + num_samples=num_samples, + shape=label_size, + dtype=label_dtype, + compressor=compressor, + high=num_categories, + ) + + return num_samples + + +def mnist(root, config): + prefix = "train" if config["split"] == "train" else "t10k" + return MNISTMockData.generate( + root, + num_categories=10, + images_file=f"{prefix}-images-idx3-ubyte.gz", + labels_file=f"{prefix}-labels-idx1-ubyte.gz", + ) + + +DATASET_MOCKS.update( + { + name: DatasetMock(name, mock_data_fn=mnist, configs=combinations_grid(split=("train", "test"))) + for name in ["mnist", "fashionmnist", "kmnist"] + } +) + + +@register_mock( + configs=combinations_grid( + split=("train", "test"), + image_set=("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST"), + ) +) +def emnist(root, config): + num_samples_map = {} + file_names = set() + for split, image_set in itertools.product( + ("train", "test"), + ("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST"), + ): + prefix = f"emnist-{image_set.replace('_', '').lower()}-{split}" + images_file = f"{prefix}-images-idx3-ubyte.gz" + labels_file = f"{prefix}-labels-idx1-ubyte.gz" + file_names.update({images_file, labels_file}) + num_samples_map[(split, image_set)] = MNISTMockData.generate( + root, + # The image sets that merge some lower case letters in their respective upper case 
variant, still use dense + # labels in the data files. Thus, num_categories != len(categories) there. + num_categories=47 if config["image_set"] in ("Balanced", "By_Merge") else 62, + images_file=images_file, + labels_file=labels_file, + ) + + make_zip(root, "emnist-gzip.zip", *file_names) + + return num_samples_map[(config["split"], config["image_set"])] + + +@register_mock(configs=combinations_grid(split=("train", "test", "test10k", "test50k", "nist"))) +def qmnist(root, config): + num_categories = 10 + if config["split"] == "train": + num_samples = num_samples_gen = num_categories + 2 + prefix = "qmnist-train" + suffix = ".gz" + compressor = gzip.open + elif config["split"].startswith("test"): + # The split 'test50k' is defined as the last 50k images beginning at index 10000. Thus, we need to create + # more than 10000 images for the dataset to not be empty. + num_samples_gen = 10001 + num_samples = { + "test": num_samples_gen, + "test10k": min(num_samples_gen, 10_000), + "test50k": num_samples_gen - 10_000, + }[config["split"]] + prefix = "qmnist-test" + suffix = ".gz" + compressor = gzip.open + else: # config["split"] == "nist" + num_samples = num_samples_gen = num_categories + 3 + prefix = "xnist" + suffix = ".xz" + compressor = lzma.open + + MNISTMockData.generate( + root, + num_categories=num_categories, + num_samples=num_samples_gen, + images_file=f"{prefix}-images-idx3-ubyte{suffix}", + labels_file=f"{prefix}-labels-idx2-int{suffix}", + label_size=(8,), + label_dtype=torch.int32, + compressor=compressor, + ) + return num_samples + + +class CIFARMockData: + NUM_PIXELS = 32 * 32 * 3 + + @classmethod + def _create_batch_file(cls, root, name, *, num_categories, labels_key, num_samples=1): + content = { + "data": make_tensor((num_samples, cls.NUM_PIXELS), dtype=torch.uint8).numpy(), + labels_key: torch.randint(0, num_categories, size=(num_samples,)).tolist(), + } + with open(pathlib.Path(root) / name, "wb") as fh: + pickle.dump(content, fh) + + @classmethod + def generate( + cls, + root, + name, + *, + folder, + train_files, + test_files, + num_categories, + labels_key, + ): + folder = root / folder + folder.mkdir() + files = (*train_files, *test_files) + for file in files: + cls._create_batch_file( + folder, + file, + num_categories=num_categories, + labels_key=labels_key, + ) + + make_tar(root, name, folder, compression="gz") + + +@register_mock(configs=combinations_grid(split=("train", "test"))) +def cifar10(root, config): + train_files = [f"data_batch_{idx}" for idx in range(1, 6)] + test_files = ["test_batch"] + + CIFARMockData.generate( + root=root, + name="cifar-10-python.tar.gz", + folder=pathlib.Path("cifar-10-batches-py"), + train_files=train_files, + test_files=test_files, + num_categories=10, + labels_key="labels", + ) + + return len(train_files if config["split"] == "train" else test_files) + + +@register_mock(configs=combinations_grid(split=("train", "test"))) +def cifar100(root, config): + train_files = ["train"] + test_files = ["test"] + + CIFARMockData.generate( + root=root, + name="cifar-100-python.tar.gz", + folder=pathlib.Path("cifar-100-python"), + train_files=train_files, + test_files=test_files, + num_categories=100, + labels_key="fine_labels", + ) + + return len(train_files if config["split"] == "train" else test_files) + + +@register_mock(configs=[dict()]) +def caltech101(root, config): + def create_ann_file(root, name): + import scipy.io + + box_coord = make_tensor((1, 4), dtype=torch.int32, low=0).numpy().astype(np.uint16) + obj_contour = make_tensor((2, 
int(torch.randint(3, 6, size=()))), dtype=torch.float64, low=0).numpy() + + scipy.io.savemat(str(pathlib.Path(root) / name), dict(box_coord=box_coord, obj_contour=obj_contour)) + + def create_ann_folder(root, name, file_name_fn, num_examples): + root = pathlib.Path(root) / name + root.mkdir(parents=True) + + for idx in range(num_examples): + create_ann_file(root, file_name_fn(idx)) + + images_root = root / "101_ObjectCategories" + anns_root = root / "Annotations" + + image_category_map = { + "Faces": "Faces_2", + "Faces_easy": "Faces_3", + "Motorbikes": "Motorbikes_16", + "airplanes": "Airplanes_Side_2", + } + + categories = ["Faces", "Faces_easy", "Motorbikes", "airplanes", "yin_yang"] + + num_images_per_category = 2 + for category in categories: + create_image_folder( + root=images_root, + name=category, + file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg", + num_examples=num_images_per_category, + ) + create_ann_folder( + root=anns_root, + name=image_category_map.get(category, category), + file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat", + num_examples=num_images_per_category, + ) + + (images_root / "BACKGROUND_Goodle").mkdir() + make_tar(root, f"{images_root.name}.tar.gz", images_root, compression="gz") + + make_tar(root, f"{anns_root.name}.tar", anns_root) + + return num_images_per_category * len(categories) + + +@register_mock(configs=[dict()]) +def caltech256(root, config): + dir = root / "256_ObjectCategories" + num_images_per_category = 2 + + categories = [ + (1, "ak47"), + (127, "laptop-101"), + (198, "spider"), + (257, "clutter"), + ] + + for category_idx, category in categories: + files = create_image_folder( + dir, + name=f"{category_idx:03d}.{category}", + file_name_fn=lambda image_idx: f"{category_idx:03d}_{image_idx + 1:04d}.jpg", + num_examples=num_images_per_category, + ) + if category == "spider": + open(files[0].parent / "RENAME2", "w").close() + + make_tar(root, f"{dir.name}.tar", dir) + + return num_images_per_category * len(categories) + + +@register_mock(configs=combinations_grid(split=("train", "val", "test"))) +def imagenet(root, config): + from scipy.io import savemat + + info = datasets.info("imagenet") + + if config["split"] == "train": + num_samples = len(info["wnids"]) + archive_name = "ILSVRC2012_img_train.tar" + + files = [] + for wnid in info["wnids"]: + create_image_folder( + root=root, + name=wnid, + file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG", + num_examples=1, + ) + files.append(make_tar(root, f"{wnid}.tar")) + elif config["split"] == "val": + num_samples = 3 + archive_name = "ILSVRC2012_img_val.tar" + files = [create_image_file(root, f"ILSVRC2012_val_{idx + 1:08d}.JPEG") for idx in range(num_samples)] + + devkit_root = root / "ILSVRC2012_devkit_t12" + data_root = devkit_root / "data" + data_root.mkdir(parents=True) + + with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file: + for label in torch.randint(0, len(info["wnids"]), (num_samples,)).tolist(): + file.write(f"{label}\n") + + num_children = 0 + synsets = [ + (idx, wnid, category, "", num_children, [], 0, 0) + for idx, (category, wnid) in enumerate(zip(info["categories"], info["wnids"]), 1) + ] + num_children = 1 + synsets.extend((0, "", "", "", num_children, [], 0, 0) for _ in range(5)) + synsets = np.array( + synsets, + dtype=np.dtype( + [ + ("ILSVRC2012_ID", "O"), + ("WNID", "O"), + ("words", "O"), + ("gloss", "O"), + ("num_children", "O"), + ("children", "O"), + ("wordnet_height", "O"), + ("num_train_images", "O"), + ] + ), + ) + 
savemat(data_root / "meta.mat", dict(synsets=synsets)) + + make_tar(root, devkit_root.with_suffix(".tar.gz").name, compression="gz") + else: # config["split"] == "test" + num_samples = 5 + archive_name = "ILSVRC2012_img_test_v10102019.tar" + files = [create_image_file(root, f"ILSVRC2012_test_{idx + 1:08d}.JPEG") for idx in range(num_samples)] + + make_tar(root, archive_name, *files) + + return num_samples + + +class CocoMockData: + @classmethod + def _make_annotations_json( + cls, + root, + name, + *, + images_meta, + fn, + ): + num_anns_per_image = torch.randint(1, 5, (len(images_meta),)) + num_anns_total = int(num_anns_per_image.sum()) + ann_ids_iter = iter(torch.arange(num_anns_total)[torch.randperm(num_anns_total)]) + + anns_meta = [] + for image_meta, num_anns in zip(images_meta, num_anns_per_image): + for _ in range(num_anns): + ann_id = int(next(ann_ids_iter)) + anns_meta.append(dict(fn(ann_id, image_meta), id=ann_id, image_id=image_meta["id"])) + anns_meta.sort(key=lambda ann: ann["id"]) + + with open(root / name, "w") as file: + json.dump(dict(images=images_meta, annotations=anns_meta), file) + + return num_anns_per_image + + @staticmethod + def _make_instances_data(ann_id, image_meta): + def make_rle_segmentation(): + height, width = image_meta["height"], image_meta["width"] + numel = height * width + counts = [] + while sum(counts) <= numel: + counts.append(int(torch.randint(5, 8, ()))) + if sum(counts) > numel: + counts[-1] -= sum(counts) - numel + return dict(counts=counts, size=[height, width]) + + return dict( + segmentation=make_rle_segmentation(), + bbox=make_tensor((4,), dtype=torch.float32, low=0).tolist(), + iscrowd=True, + area=float(make_scalar(dtype=torch.float32)), + category_id=int(make_scalar(dtype=torch.int64)), + ) + + @staticmethod + def _make_captions_data(ann_id, image_meta): + return dict(caption=f"Caption {ann_id} describing image {image_meta['id']}.") + + @classmethod + def _make_annotations(cls, root, name, *, images_meta): + num_anns_per_image = torch.zeros((len(images_meta),), dtype=torch.int64) + for annotations, fn in ( + ("instances", cls._make_instances_data), + ("captions", cls._make_captions_data), + ): + num_anns_per_image += cls._make_annotations_json( + root, f"{annotations}_{name}.json", images_meta=images_meta, fn=fn + ) + + return int(num_anns_per_image.sum()) + + @classmethod + def generate( + cls, + root, + *, + split, + year, + num_samples, + ): + annotations_dir = root / "annotations" + annotations_dir.mkdir() + + for split_ in ("train", "val"): + config_name = f"{split_}{year}" + + images_meta = [ + dict( + file_name=f"{idx:012d}.jpg", + id=idx, + width=width, + height=height, + ) + for idx, (height, width) in enumerate( + torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist() + ) + ] + + if split_ == split: + create_image_folder( + root, + config_name, + file_name_fn=lambda idx: images_meta[idx]["file_name"], + num_examples=num_samples, + size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]), + ) + make_zip(root, f"{config_name}.zip") + + cls._make_annotations( + annotations_dir, + config_name, + images_meta=images_meta, + ) + + make_zip(root, f"annotations_trainval{year}.zip", annotations_dir) + + return num_samples + + +@register_mock( + configs=combinations_grid( + split=("train", "val"), + year=("2017", "2014"), + annotations=("instances", "captions", None), + ) +) +def coco(root, config): + return CocoMockData.generate(root, split=config["split"], year=config["year"], num_samples=5) + + +class 
SBDMockData: + _NUM_CATEGORIES = 20 + + @classmethod + def _make_split_files(cls, root_map, *, split): + splits_and_idcs = [ + ("train", [0, 1, 2]), + ("val", [3]), + ] + if split == "train_noval": + splits_and_idcs.append(("train_noval", [0, 2])) + + ids_map = {split: [f"2008_{idx:06d}" for idx in idcs] for split, idcs in splits_and_idcs} + + for split, ids in ids_map.items(): + with open(root_map[split] / f"{split}.txt", "w") as fh: + fh.writelines(f"{id}\n" for id in ids) + + return sorted(set(itertools.chain(*ids_map.values()))), {split: len(ids) for split, ids in ids_map.items()} + + @classmethod + def _make_anns_folder(cls, root, name, ids): + from scipy.io import savemat + + anns_folder = root / name + anns_folder.mkdir() + + sizes = torch.randint(1, 9, size=(len(ids), 2)).tolist() + for id, size in zip(ids, sizes): + savemat( + anns_folder / f"{id}.mat", + { + "GTcls": { + "Boundaries": cls._make_boundaries(size), + "Segmentation": cls._make_segmentation(size), + } + }, + ) + return sizes + + @classmethod + def _make_boundaries(cls, size): + from scipy.sparse import csc_matrix + + return [ + [csc_matrix(torch.randint(0, 2, size=size, dtype=torch.uint8).numpy())] for _ in range(cls._NUM_CATEGORIES) + ] + + @classmethod + def _make_segmentation(cls, size): + return torch.randint(0, cls._NUM_CATEGORIES + 1, size=size, dtype=torch.uint8).numpy() + + @classmethod + def generate(cls, root, *, split): + archive_folder = root / "benchmark_RELEASE" + dataset_folder = archive_folder / "dataset" + dataset_folder.mkdir(parents=True, exist_ok=True) + + ids, num_samples_map = cls._make_split_files( + defaultdict(lambda: dataset_folder, {"train_noval": root}), split=split + ) + sizes = cls._make_anns_folder(dataset_folder, "cls", ids) + create_image_folder( + dataset_folder, "img", lambda idx: f"{ids[idx]}.jpg", num_examples=len(ids), size=lambda idx: sizes[idx] + ) + + make_tar(root, "benchmark.tgz", archive_folder, compression="gz") + + return num_samples_map[split] + + +@register_mock(configs=combinations_grid(split=("train", "val", "train_noval"))) +def sbd(root, config): + return SBDMockData.generate(root, split=config["split"]) + + +@register_mock(configs=[dict()]) +def semeion(root, config): + num_samples = 3 + num_categories = 10 + + images = torch.rand(num_samples, 256) + labels = one_hot(torch.randint(num_categories, size=(num_samples,)), num_classes=num_categories) + with open(root / "semeion.data", "w") as fh: + for image, one_hot_label in zip(images, labels): + image_columns = " ".join([f"{pixel.item():.4f}" for pixel in image]) + labels_columns = " ".join([str(label.item()) for label in one_hot_label]) + fh.write(f"{image_columns} {labels_columns} \n") + + return num_samples + + +class VOCMockData: + _TRAIN_VAL_FILE_NAMES = { + "2007": "VOCtrainval_06-Nov-2007.tar", + "2008": "VOCtrainval_14-Jul-2008.tar", + "2009": "VOCtrainval_11-May-2009.tar", + "2010": "VOCtrainval_03-May-2010.tar", + "2011": "VOCtrainval_25-May-2011.tar", + "2012": "VOCtrainval_11-May-2012.tar", + } + _TEST_FILE_NAMES = { + "2007": "VOCtest_06-Nov-2007.tar", + } + + @classmethod + def _make_split_files(cls, root, *, year, trainval): + split_folder = root / "ImageSets" + + if trainval: + idcs_map = { + "train": [0, 1, 2], + "val": [3, 4], + } + idcs_map["trainval"] = [*idcs_map["train"], *idcs_map["val"]] + else: + idcs_map = { + "test": [5], + } + ids_map = {split: [f"{year}_{idx:06d}" for idx in idcs] for split, idcs in idcs_map.items()} + + for task_sub_folder in ("Main", "Segmentation"): + task_folder = 
split_folder / task_sub_folder + task_folder.mkdir(parents=True, exist_ok=True) + for split, ids in ids_map.items(): + with open(task_folder / f"{split}.txt", "w") as fh: + fh.writelines(f"{id}\n" for id in ids) + + return sorted(set(itertools.chain(*ids_map.values()))), {split: len(ids) for split, ids in ids_map.items()} + + @classmethod + def _make_detection_anns_folder(cls, root, name, *, file_name_fn, num_examples): + folder = root / name + folder.mkdir(parents=True, exist_ok=True) + + for idx in range(num_examples): + cls._make_detection_ann_file(folder, file_name_fn(idx)) + + @classmethod + def _make_detection_ann_file(cls, root, name): + def add_child(parent, name, text=None): + child = ET.SubElement(parent, name) + child.text = str(text) + return child + + def add_name(obj, name="dog"): + add_child(obj, "name", name) + + def add_size(obj): + obj = add_child(obj, "size") + size = {"width": 0, "height": 0, "depth": 3} + for name, text in size.items(): + add_child(obj, name, text) + + def add_bndbox(obj): + obj = add_child(obj, "bndbox") + bndbox = {"xmin": 1, "xmax": 2, "ymin": 3, "ymax": 4} + for name, text in bndbox.items(): + add_child(obj, name, text) + + annotation = ET.Element("annotation") + add_size(annotation) + obj = add_child(annotation, "object") + add_name(obj) + add_bndbox(obj) + + with open(root / name, "wb") as fh: + fh.write(ET.tostring(annotation)) + + @classmethod + def generate(cls, root, *, year, trainval): + archive_folder = root + if year == "2011": + archive_folder = root / "TrainVal" + data_folder = archive_folder / "VOCdevkit" + else: + archive_folder = data_folder = root / "VOCdevkit" + data_folder = data_folder / f"VOC{year}" + data_folder.mkdir(parents=True, exist_ok=True) + + ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval) + for make_folder_fn, name, suffix in [ + (create_image_folder, "JPEGImages", ".jpg"), + (create_image_folder, "SegmentationClass", ".png"), + (cls._make_detection_anns_folder, "Annotations", ".xml"), + ]: + make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids)) + make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder) + + return num_samples_map + + +@register_mock( + configs=[ + *combinations_grid( + split=("train", "val", "trainval"), + year=("2007", "2008", "2009", "2010", "2011", "2012"), + task=("detection", "segmentation"), + ), + *combinations_grid( + split=("test",), + year=("2007",), + task=("detection", "segmentation"), + ), + ], +) +def voc(root, config): + trainval = config["split"] != "test" + return VOCMockData.generate(root, year=config["year"], trainval=trainval)[config["split"]] + + +class CelebAMockData: + @classmethod + def _make_ann_file(cls, root, name, data, *, field_names=None): + with open(root / name, "w") as file: + if field_names: + file.write(f"{len(data)}\r\n") + file.write(" ".join(field_names) + "\r\n") + file.writelines(" ".join(str(item) for item in row) + "\r\n" for row in data) + + _SPLIT_TO_IDX = { + "train": 0, + "val": 1, + "test": 2, + } + + @classmethod + def _make_split_file(cls, root): + num_samples_map = {"train": 4, "val": 3, "test": 2} + + data = [ + (f"{idx:06d}.jpg", cls._SPLIT_TO_IDX[split]) + for split, num_samples in num_samples_map.items() + for idx in range(num_samples) + ] + cls._make_ann_file(root, "list_eval_partition.txt", data) + + image_file_names, _ = zip(*data) + return image_file_names, num_samples_map + + @classmethod + def 
_make_identity_file(cls, root, image_file_names): + cls._make_ann_file( + root, "identity_CelebA.txt", [(name, int(make_scalar(low=1, dtype=torch.int))) for name in image_file_names] + ) + + @classmethod + def _make_attributes_file(cls, root, image_file_names): + field_names = ("5_o_Clock_Shadow", "Young") + data = [ + [name, *[" 1" if attr else "-1" for attr in make_tensor((len(field_names),), dtype=torch.bool)]] + for name in image_file_names + ] + cls._make_ann_file(root, "list_attr_celeba.txt", data, field_names=(*field_names, "")) + + @classmethod + def _make_bounding_boxes_file(cls, root, image_file_names): + field_names = ("image_id", "x_1", "y_1", "width", "height") + data = [ + [f"{name} ", *[f"{coord:3d}" for coord in make_tensor((4,), low=0, dtype=torch.int).tolist()]] + for name in image_file_names + ] + cls._make_ann_file(root, "list_bbox_celeba.txt", data, field_names=field_names) + + @classmethod + def _make_landmarks_file(cls, root, image_file_names): + field_names = ("lefteye_x", "lefteye_y", "rightmouth_x", "rightmouth_y") + data = [ + [ + name, + *[ + f"{coord:4d}" if idx else coord + for idx, coord in enumerate(make_tensor((len(field_names),), low=0, dtype=torch.int).tolist()) + ], + ] + for name in image_file_names + ] + cls._make_ann_file(root, "list_landmarks_align_celeba.txt", data, field_names=field_names) + + @classmethod + def generate(cls, root): + image_file_names, num_samples_map = cls._make_split_file(root) + + image_files = create_image_folder( + root, "img_align_celeba", file_name_fn=lambda idx: image_file_names[idx], num_examples=len(image_file_names) + ) + make_zip(root, image_files[0].parent.with_suffix(".zip").name) + + for make_ann_file_fn in ( + cls._make_identity_file, + cls._make_attributes_file, + cls._make_bounding_boxes_file, + cls._make_landmarks_file, + ): + make_ann_file_fn(root, image_file_names) + + return num_samples_map + + +@register_mock(configs=combinations_grid(split=("train", "val", "test"))) +def celeba(root, config): + return CelebAMockData.generate(root)[config["split"]] + + +@register_mock(configs=combinations_grid(split=("train", "val", "test"))) +def country211(root, config): + split_folder = pathlib.Path(root, "country211", "valid" if config["split"] == "val" else config["split"]) + split_folder.mkdir(parents=True, exist_ok=True) + + num_examples = { + "train": 3, + "val": 4, + "test": 5, + }[config["split"]] + + classes = ("AD", "BS", "GR") + for cls in classes: + create_image_folder( + split_folder, + name=cls, + file_name_fn=lambda idx: f"{idx}.jpg", + num_examples=num_examples, + ) + make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz") + return num_examples * len(classes) + + +@register_mock(configs=combinations_grid(split=("train", "test"))) +def food101(root, config): + data_folder = root / "food-101" + + num_images_per_class = 3 + image_folder = data_folder / "images" + categories = ["apple_pie", "baby_back_ribs", "waffles"] + image_ids = [] + for category in categories: + image_files = create_image_folder( + image_folder, + category, + file_name_fn=lambda idx: f"{idx:04d}.jpg", + num_examples=num_images_per_class, + ) + image_ids.extend(path.relative_to(path.parents[1]).with_suffix("").as_posix() for path in image_files) + + meta_folder = data_folder / "meta" + meta_folder.mkdir() + + with open(meta_folder / "classes.txt", "w") as file: + for category in categories: + file.write(f"{category}\n") + + splits = ["train", "test"] + num_samples_map = {} + for offset, split in 
enumerate(splits): + image_ids_in_split = image_ids[offset :: len(splits)] + num_samples_map[split] = len(image_ids_in_split) + with open(meta_folder / f"{split}.txt", "w") as file: + for image_id in image_ids_in_split: + file.write(f"{image_id}\n") + + make_tar(root, f"{data_folder.name}.tar.gz", compression="gz") + + return num_samples_map[config["split"]] + + +@register_mock(configs=combinations_grid(split=("train", "val", "test"), fold=(1, 4, 10))) +def dtd(root, config): + data_folder = root / "dtd" + + num_images_per_class = 3 + image_folder = data_folder / "images" + categories = {"banded", "marbled", "zigzagged"} + image_ids_per_category = { + category: [ + str(path.relative_to(path.parents[1]).as_posix()) + for path in create_image_folder( + image_folder, + category, + file_name_fn=lambda idx: f"{category}_{idx:04d}.jpg", + num_examples=num_images_per_class, + ) + ] + for category in categories + } + + meta_folder = data_folder / "labels" + meta_folder.mkdir() + + with open(meta_folder / "labels_joint_anno.txt", "w") as file: + for cls, image_ids in image_ids_per_category.items(): + for image_id in image_ids: + joint_categories = random.choices( + list(categories - {cls}), k=int(torch.randint(len(categories) - 1, ())) + ) + file.write(" ".join([image_id, *sorted([cls, *joint_categories])]) + "\n") + + image_ids = list(itertools.chain(*image_ids_per_category.values())) + splits = ("train", "val", "test") + num_samples_map = {} + for fold in range(1, 11): + random.shuffle(image_ids) + for offset, split in enumerate(splits): + image_ids_in_config = image_ids[offset :: len(splits)] + with open(meta_folder / f"{split}{fold}.txt", "w") as file: + file.write("\n".join(image_ids_in_config) + "\n") + + num_samples_map[(split, fold)] = len(image_ids_in_config) + + make_tar(root, "dtd-r1.0.1.tar.gz", data_folder, compression="gz") + + return num_samples_map[config["split"], config["fold"]] + + +@register_mock(configs=combinations_grid(split=("train", "test"))) +def fer2013(root, config): + split = config["split"] + num_samples = 5 if split == "train" else 3 + + path = root / f"{split}.csv" + with open(path, "w", newline="") as file: + field_names = ["emotion"] if split == "train" else [] + field_names.append("pixels") + + file.write(",".join(field_names) + "\n") + + writer = csv.DictWriter(file, fieldnames=field_names, quotechar='"', quoting=csv.QUOTE_NONNUMERIC) + for _ in range(num_samples): + rowdict = { + "pixels": " ".join([str(int(pixel)) for pixel in torch.randint(256, (48 * 48,), dtype=torch.uint8)]) + } + if split == "train": + rowdict["emotion"] = int(torch.randint(7, ())) + writer.writerow(rowdict) + + make_zip(root, f"{path.name}.zip", path) + + return num_samples + + +@register_mock(configs=combinations_grid(split=("train", "test"))) +def gtsrb(root, config): + num_examples_per_class = 5 if config["split"] == "train" else 3 + classes = ("00000", "00042", "00012") + num_examples = num_examples_per_class * len(classes) + + csv_columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"] + + def _make_ann_file(path, num_examples, class_idx): + if class_idx == "random": + class_idx = torch.randint(1, len(classes) + 1, size=(1,)).item() + + with open(path, "w") as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=";") + writer.writeheader() + for image_idx in range(num_examples): + writer.writerow( + { + "Filename": f"{image_idx:05d}.ppm", + "Width": torch.randint(1, 100, size=()).item(), + "Height": torch.randint(1, 100, 
size=()).item(), + "Roi.X1": torch.randint(1, 100, size=()).item(), + "Roi.Y1": torch.randint(1, 100, size=()).item(), + "Roi.X2": torch.randint(1, 100, size=()).item(), + "Roi.Y2": torch.randint(1, 100, size=()).item(), + "ClassId": class_idx, + } + ) + + archive_folder = root / "GTSRB" + + if config["split"] == "train": + train_folder = archive_folder / "Training" + train_folder.mkdir(parents=True) + + for class_idx in classes: + create_image_folder( + train_folder, + name=class_idx, + file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm", + num_examples=num_examples_per_class, + ) + _make_ann_file( + path=train_folder / class_idx / f"GT-{class_idx}.csv", + num_examples=num_examples_per_class, + class_idx=int(class_idx), + ) + make_zip(root, "GTSRB-Training_fixed.zip", archive_folder) + else: + test_folder = archive_folder / "Final_Test" + test_folder.mkdir(parents=True) + + create_image_folder( + test_folder, + name="Images", + file_name_fn=lambda image_idx: f"{image_idx:05d}.ppm", + num_examples=num_examples, + ) + + make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder) + + _make_ann_file( + path=root / "GT-final_test.csv", + num_examples=num_examples, + class_idx="random", + ) + + make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv") + + return num_examples + + +@register_mock(configs=combinations_grid(split=("train", "val", "test"))) +def clevr(root, config): + data_folder = root / "CLEVR_v1.0" + + num_samples_map = { + "train": 3, + "val": 2, + "test": 1, + } + + images_folder = data_folder / "images" + image_files = { + split: create_image_folder( + images_folder, + split, + file_name_fn=lambda idx: f"CLEVR_{split}_{idx:06d}.jpg", + num_examples=num_samples, + ) + for split, num_samples in num_samples_map.items() + } + + scenes_folder = data_folder / "scenes" + scenes_folder.mkdir() + for split in ["train", "val"]: + with open(scenes_folder / f"CLEVR_{split}_scenes.json", "w") as file: + json.dump( + { + "scenes": [ + { + "image_filename": image_file.name, + # We currently only return the number of objects in a scene. + # Thus, it is sufficient for now to only mock the number of elements. 
+ "objects": [None] * int(torch.randint(1, 5, ())), + } + for image_file in image_files[split] + ] + }, + file, + ) + + make_zip(root, f"{data_folder.name}.zip", data_folder) + + return num_samples_map[config["split"]] + + +class OxfordIIITPetMockData: + @classmethod + def _meta_to_split_and_classification_ann(cls, meta, idx): + image_id = "_".join( + [ + *[(str.title if meta["species"] == "cat" else str.lower)(part) for part in meta["cls"].split()], + str(idx), + ] + ) + class_id = str(meta["label"] + 1) + species = "1" if meta["species"] == "cat" else "2" + breed_id = "-1" + return (image_id, class_id, species, breed_id) + + @classmethod + def generate(self, root): + classification_anns_meta = ( + dict(cls="Abyssinian", label=0, species="cat"), + dict(cls="Keeshond", label=18, species="dog"), + dict(cls="Yorkshire Terrier", label=36, species="dog"), + ) + split_and_classification_anns = [ + self._meta_to_split_and_classification_ann(meta, idx) + for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10)) + ] + image_ids, *_ = zip(*split_and_classification_anns) + + image_files = create_image_folder( + root, "images", file_name_fn=lambda idx: f"{image_ids[idx]}.jpg", num_examples=len(image_ids) + ) + + anns_folder = root / "annotations" + anns_folder.mkdir() + random.shuffle(split_and_classification_anns) + splits = ("trainval", "test") + num_samples_map = {} + for offset, split in enumerate(splits): + split_and_classification_anns_in_split = split_and_classification_anns[offset :: len(splits)] + with open(anns_folder / f"{split}.txt", "w") as file: + writer = csv.writer(file, delimiter=" ") + for split_and_classification_ann in split_and_classification_anns_in_split: + writer.writerow(split_and_classification_ann) + + num_samples_map[split] = len(split_and_classification_anns_in_split) + + segmentation_files = create_image_folder( + anns_folder, "trimaps", file_name_fn=lambda idx: f"{image_ids[idx]}.png", num_examples=len(image_ids) + ) + + # The dataset has some rogue files + for path in image_files[:3]: + path.with_suffix(".mat").touch() + for path in segmentation_files: + path.with_name(f".{path.name}").touch() + + make_tar(root, "images.tar.gz", compression="gz") + make_tar(root, anns_folder.with_suffix(".tar.gz").name, compression="gz") + + return num_samples_map + + +@register_mock(name="oxford-iiit-pet", configs=combinations_grid(split=("trainval", "test"))) +def oxford_iiit_pet(root, config): + return OxfordIIITPetMockData.generate(root)[config["split"]] + + +class _CUB200MockData: + @classmethod + def _category_folder(cls, category, idx): + return f"{idx:03d}.{category}" + + @classmethod + def _file_stem(cls, category, idx): + return f"{category}_{idx:04d}" + + @classmethod + def _make_images(cls, images_folder): + image_files = [] + for category_idx, category in [ + (1, "Black_footed_Albatross"), + (100, "Brown_Pelican"), + (200, "Common_Yellowthroat"), + ]: + image_files.extend( + create_image_folder( + images_folder, + cls._category_folder(category, category_idx), + lambda image_idx: f"{cls._file_stem(category, image_idx)}.jpg", + num_examples=5, + ) + ) + + return image_files + + +class CUB2002011MockData(_CUB200MockData): + @classmethod + def _make_archive(cls, root): + archive_folder = root / "CUB_200_2011" + + images_folder = archive_folder / "images" + image_files = cls._make_images(images_folder) + image_ids = list(range(1, len(image_files) + 1)) + + with open(archive_folder / "images.txt", "w") as file: + file.write( + "\n".join( + f"{id} 
{path.relative_to(images_folder).as_posix()}" for id, path in zip(image_ids, image_files) + ) + ) + + split_ids = torch.randint(2, (len(image_ids),)).tolist() + counts = Counter(split_ids) + num_samples_map = {"train": counts[1], "test": counts[0]} + with open(archive_folder / "train_test_split.txt", "w") as file: + file.write("\n".join(f"{image_id} {split_id}" for image_id, split_id in zip(image_ids, split_ids))) + + with open(archive_folder / "bounding_boxes.txt", "w") as file: + file.write( + "\n".join( + " ".join( + str(item) + for item in [image_id, *make_tensor((4,), dtype=torch.int, low=0).to(torch.float).tolist()] + ) + for image_id in image_ids + ) + ) + + make_tar(root, archive_folder.with_suffix(".tgz").name, compression="gz") + + return image_files, num_samples_map + + @classmethod + def _make_segmentations(cls, root, image_files): + segmentations_folder = root / "segmentations" + for image_file in image_files: + folder = segmentations_folder.joinpath(image_file.relative_to(image_file.parents[1])) + folder.mkdir(exist_ok=True, parents=True) + create_image_file( + folder, + image_file.with_suffix(".png").name, + size=[1, *make_tensor((2,), low=3, dtype=torch.int).tolist()], + ) + + make_tar(root, segmentations_folder.with_suffix(".tgz").name, compression="gz") + + @classmethod + def generate(cls, root): + image_files, num_samples_map = cls._make_archive(root) + cls._make_segmentations(root, image_files) + return num_samples_map + + +class CUB2002010MockData(_CUB200MockData): + @classmethod + def _make_hidden_rouge_file(cls, *files): + for file in files: + (file.parent / f"._{file.name}").touch() + + @classmethod + def _make_splits(cls, root, image_files): + split_folder = root / "lists" + split_folder.mkdir() + random.shuffle(image_files) + splits = ("train", "test") + num_samples_map = {} + for offset, split in enumerate(splits): + image_files_in_split = image_files[offset :: len(splits)] + + split_file = split_folder / f"{split}.txt" + with open(split_file, "w") as file: + file.write( + "\n".join( + sorted( + str(image_file.relative_to(image_file.parents[1]).as_posix()) + for image_file in image_files_in_split + ) + ) + ) + + cls._make_hidden_rouge_file(split_file) + num_samples_map[split] = len(image_files_in_split) + + make_tar(root, split_folder.with_suffix(".tgz").name, compression="gz") + + return num_samples_map + + @classmethod + def _make_anns(cls, root, image_files): + from scipy.io import savemat + + anns_folder = root / "annotations-mat" + for image_file in image_files: + ann_file = anns_folder / image_file.with_suffix(".mat").relative_to(image_file.parents[1]) + ann_file.parent.mkdir(parents=True, exist_ok=True) + + savemat( + ann_file, + { + "seg": torch.randint( + 256, make_tensor((2,), low=3, dtype=torch.int).tolist(), dtype=torch.uint8 + ).numpy(), + "bbox": dict( + zip(("left", "top", "right", "bottom"), make_tensor((4,), dtype=torch.uint8).tolist()) + ), + }, + ) + + readme_file = anns_folder / "README.txt" + readme_file.touch() + cls._make_hidden_rouge_file(readme_file) + + make_tar(root, "annotations.tgz", anns_folder, compression="gz") + + @classmethod + def generate(cls, root): + images_folder = root / "images" + image_files = cls._make_images(images_folder) + cls._make_hidden_rouge_file(*image_files) + make_tar(root, images_folder.with_suffix(".tgz").name, compression="gz") + + num_samples_map = cls._make_splits(root, image_files) + cls._make_anns(root, image_files) + + return num_samples_map + + +@register_mock(configs=combinations_grid(split=("train", 
"test"), year=("2010", "2011"))) +def cub200(root, config): + num_samples_map = (CUB2002011MockData if config["year"] == "2011" else CUB2002010MockData).generate(root) + return num_samples_map[config["split"]] + + +@register_mock(configs=[dict()]) +def eurosat(root, config): + data_folder = root / "2750" + data_folder.mkdir(parents=True) + + num_examples_per_class = 3 + categories = ["AnnualCrop", "Forest"] + for category in categories: + create_image_folder( + root=data_folder, + name=category, + file_name_fn=lambda idx: f"{category}_{idx + 1}.jpg", + num_examples=num_examples_per_class, + ) + make_zip(root, "EuroSAT.zip", data_folder) + return len(categories) * num_examples_per_class + + +@register_mock(configs=combinations_grid(split=("train", "test", "extra"))) +def svhn(root, config): + import scipy.io as sio + + num_samples = { + "train": 2, + "test": 3, + "extra": 4, + }[config["split"]] + + sio.savemat( + root / f"{config['split']}_32x32.mat", + { + "X": np.random.randint(256, size=(32, 32, 3, num_samples), dtype=np.uint8), + "y": np.random.randint(10, size=(num_samples,), dtype=np.uint8), + }, + ) + return num_samples + + +@register_mock(configs=combinations_grid(split=("train", "val", "test"))) +def pcam(root, config): + import h5py + + num_images = {"train": 2, "test": 3, "val": 4}[config["split"]] + + split = "valid" if config["split"] == "val" else config["split"] + + images_io = io.BytesIO() + with h5py.File(images_io, "w") as f: + f["x"] = np.random.randint(0, 256, size=(num_images, 10, 10, 3), dtype=np.uint8) + + targets_io = io.BytesIO() + with h5py.File(targets_io, "w") as f: + f["y"] = np.random.randint(0, 2, size=(num_images, 1, 1, 1), dtype=np.uint8) + + # Create .gz compressed files + images_file = root / f"camelyonpatch_level_2_split_{split}_x.h5.gz" + targets_file = root / f"camelyonpatch_level_2_split_{split}_y.h5.gz" + for compressed_file_name, uncompressed_file_io in ((images_file, images_io), (targets_file, targets_io)): + compressed_data = gzip.compress(uncompressed_file_io.getbuffer()) + with open(compressed_file_name, "wb") as compressed_file: + compressed_file.write(compressed_data) + + return num_images + + +@register_mock(name="stanford-cars", configs=combinations_grid(split=("train", "test"))) +def stanford_cars(root, config): + import scipy.io as io + from numpy.core.records import fromarrays + + split = config["split"] + num_samples = {"train": 5, "test": 7}[split] + num_categories = 3 + + if split == "train": + images_folder_name = "cars_train" + devkit = root / "devkit" + devkit.mkdir() + annotations_mat_path = devkit / "cars_train_annos.mat" + else: + images_folder_name = "cars_test" + annotations_mat_path = root / "cars_test_annos_withlabels.mat" + + create_image_folder( + root=root, + name=images_folder_name, + file_name_fn=lambda image_index: f"{image_index:5d}.jpg", + num_examples=num_samples, + ) + + make_tar(root, f"cars_{split}.tgz", images_folder_name) + bbox = np.random.randint(1, 200, num_samples, dtype=np.uint8) + classes = np.random.randint(1, num_categories + 1, num_samples, dtype=np.uint8) + fnames = [f"{i:5d}.jpg" for i in range(num_samples)] + rec_array = fromarrays( + [bbox, bbox, bbox, bbox, classes, fnames], + names=["bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2", "class", "fname"], + ) + + io.savemat(annotations_mat_path, {"annotations": rec_array}) + if split == "train": + make_tar(root, "car_devkit.tgz", devkit, compression="gz") + + return num_samples + + +@register_mock(configs=combinations_grid(split=("train", "test"))) +def 
usps(root, config): + num_samples = {"train": 15, "test": 7}[config["split"]] + + with bz2.open(root / f"usps{'.t' if not config['split'] == 'train' else ''}.bz2", "wb") as fh: + lines = [] + for _ in range(num_samples): + label = make_tensor(1, low=1, high=11, dtype=torch.int) + values = make_tensor(256, low=-1, high=1, dtype=torch.float) + lines.append( + " ".join([f"{int(label)}", *(f"{idx}:{float(value):.6f}" for idx, value in enumerate(values, 1))]) + ) + + fh.write("\n".join(lines).encode()) + + return num_samples diff --git a/test/common_extended_utils.py b/test/common_extended_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a34e15629bba78961accaddc91be82e1ec08386c --- /dev/null +++ b/test/common_extended_utils.py @@ -0,0 +1,310 @@ +import os +from collections import defaultdict +from numbers import Number +from typing import Any, List + +import torch +from torch.utils._python_dispatch import TorchDispatchMode + +from torch.utils._pytree import tree_map + +from torchvision.models._api import Weights + +aten = torch.ops.aten +quantized = torch.ops.quantized + + +def get_shape(i): + if isinstance(i, torch.Tensor): + return i.shape + elif hasattr(i, "weight"): + return i.weight().shape + else: + raise ValueError(f"Unknown type {type(i)}") + + +def prod(x): + res = 1 + for i in x: + res *= i + return res + + +def matmul_flop(inputs: List[Any], outputs: List[Any]) -> Number: + """ + Count flops for matmul. + """ + # Inputs should be a list of length 2. + # Inputs contains the shapes of two matrices. + input_shapes = [get_shape(v) for v in inputs] + assert len(input_shapes) == 2, input_shapes + assert input_shapes[0][-1] == input_shapes[1][-2], input_shapes + flop = prod(input_shapes[0]) * input_shapes[-1][-1] + return flop + + +def addmm_flop(inputs: List[Any], outputs: List[Any]) -> Number: + """ + Count flops for fully connected layers. + """ + # Count flop for nn.Linear + # inputs is a list of length 3. + input_shapes = [get_shape(v) for v in inputs[1:3]] + # input_shapes[0]: [batch size, input feature dimension] + # input_shapes[1]: [batch size, output feature dimension] + assert len(input_shapes[0]) == 2, input_shapes[0] + assert len(input_shapes[1]) == 2, input_shapes[1] + batch_size, input_dim = input_shapes[0] + output_dim = input_shapes[1][1] + flops = batch_size * input_dim * output_dim + return flops + + +def bmm_flop(inputs: List[Any], outputs: List[Any]) -> Number: + """ + Count flops for the bmm operation. + """ + # Inputs should be a list of length 2. + # Inputs contains the shapes of two tensor. + assert len(inputs) == 2, len(inputs) + input_shapes = [get_shape(v) for v in inputs] + n, c, t = input_shapes[0] + d = input_shapes[-1][-1] + flop = n * c * t * d + return flop + + +def conv_flop_count( + x_shape: List[int], + w_shape: List[int], + out_shape: List[int], + transposed: bool = False, +) -> Number: + """ + Count flops for convolution. Note only multiplication is + counted. Computation for addition and bias is ignored. + Flops for a transposed convolution are calculated as + flops = (x_shape[2:] * prod(w_shape) * batch_size). + Args: + x_shape (list(int)): The input shape before convolution. + w_shape (list(int)): The filter shape. + out_shape (list(int)): The output shape after convolution. 
+ transposed (bool): is the convolution transposed + Returns: + int: the number of flops + """ + batch_size = x_shape[0] + conv_shape = (x_shape if transposed else out_shape)[2:] + flop = batch_size * prod(w_shape) * prod(conv_shape) + return flop + + +def conv_flop(inputs: List[Any], outputs: List[Any]): + """ + Count flops for convolution. + """ + x, w = inputs[:2] + x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), get_shape(outputs[0])) + transposed = inputs[6] + + return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed) + + +def quant_conv_flop(inputs: List[Any], outputs: List[Any]): + """ + Count flops for quantized convolution. + """ + x, w = inputs[:2] + x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), get_shape(outputs[0])) + + return conv_flop_count(x_shape, w_shape, out_shape, transposed=False) + + +def transpose_shape(shape): + return [shape[1], shape[0]] + list(shape[2:]) + + +def conv_backward_flop(inputs: List[Any], outputs: List[Any]): + grad_out_shape, x_shape, w_shape = [get_shape(i) for i in inputs[:3]] + output_mask = inputs[-1] + fwd_transposed = inputs[7] + flop_count = 0 + + if output_mask[0]: + grad_input_shape = get_shape(outputs[0]) + flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not fwd_transposed) + if output_mask[1]: + grad_weight_shape = get_shape(outputs[1]) + flop_count += conv_flop_count(transpose_shape(x_shape), grad_out_shape, grad_weight_shape, fwd_transposed) + + return flop_count + + +def scaled_dot_product_flash_attention_flop(inputs: List[Any], outputs: List[Any]): + # FIXME: this needs to count the flops of this kernel + # https://github.com/pytorch/pytorch/blob/207b06d099def9d9476176a1842e88636c1f714f/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp#L52-L267 + return 0 + + +flop_mapping = { + aten.mm: matmul_flop, + aten.matmul: matmul_flop, + aten.addmm: addmm_flop, + aten.bmm: bmm_flop, + aten.convolution: conv_flop, + aten._convolution: conv_flop, + aten.convolution_backward: conv_backward_flop, + quantized.conv2d: quant_conv_flop, + quantized.conv2d_relu: quant_conv_flop, + aten._scaled_dot_product_flash_attention: scaled_dot_product_flash_attention_flop, +} + +unmapped_ops = set() + + +def normalize_tuple(x): + if not isinstance(x, tuple): + return (x,) + return x + + +class FlopCounterMode(TorchDispatchMode): + def __init__(self, model=None): + self.flop_counts = defaultdict(lambda: defaultdict(int)) + self.parents = ["Global"] + # global mod + if model is not None: + for name, module in dict(model.named_children()).items(): + module.register_forward_pre_hook(self.enter_module(name)) + module.register_forward_hook(self.exit_module(name)) + + def enter_module(self, name): + def f(module, inputs): + self.parents.append(name) + inputs = normalize_tuple(inputs) + out = self.create_backwards_pop(name)(*inputs) + return out + + return f + + def exit_module(self, name): + def f(module, inputs, outputs): + assert self.parents[-1] == name + self.parents.pop() + outputs = normalize_tuple(outputs) + return self.create_backwards_push(name)(*outputs) + + return f + + def create_backwards_push(self, name): + class PushState(torch.autograd.Function): + @staticmethod + def forward(ctx, *args): + args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args) + if len(args) == 1: + return args[0] + return args + + @staticmethod + def backward(ctx, *grad_outs): + self.parents.append(name) + return grad_outs + + return PushState.apply + + def create_backwards_pop(self, name): + 
class PopState(torch.autograd.Function): + @staticmethod + def forward(ctx, *args): + args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args) + if len(args) == 1: + return args[0] + return args + + @staticmethod + def backward(ctx, *grad_outs): + assert self.parents[-1] == name + self.parents.pop() + return grad_outs + + return PopState.apply + + def __enter__(self): + self.flop_counts.clear() + super().__enter__() + + def __exit__(self, *args): + # print(f"Total: {sum(self.flop_counts['Global'].values()) / 1e9} GFLOPS") + # for mod in self.flop_counts.keys(): + # print(f"Module: ", mod) + # for k, v in self.flop_counts[mod].items(): + # print(f"{k}: {v / 1e9} GFLOPS") + # print() + super().__exit__(*args) + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + kwargs = kwargs if kwargs else {} + + out = func(*args, **kwargs) + func_packet = func._overloadpacket + if func_packet in flop_mapping: + flop_count = flop_mapping[func_packet](args, normalize_tuple(out)) + for par in self.parents: + self.flop_counts[par][func_packet] += flop_count + else: + unmapped_ops.add(func_packet) + + return out + + def get_flops(self): + return sum(self.flop_counts["Global"].values()) / 1e9 + + +def get_dims(module_name, height, width): + # detection models have curated input sizes + if module_name == "detection": + # we can feed a batch of 1 for detection model instead of a list of 1 image + dims = (3, height, width) + elif module_name == "video": + # hard-coding the time dimension to size 16 + dims = (1, 16, 3, height, width) + else: + dims = (1, 3, height, width) + + return dims + + +def get_ops(model: torch.nn.Module, weight: Weights, height=512, width=512): + module_name = model.__module__.split(".")[-2] + dims = get_dims(module_name=module_name, height=height, width=width) + + input_tensor = torch.randn(dims) + + # try: + preprocess = weight.transforms() + if module_name == "optical_flow": + inp = preprocess(input_tensor, input_tensor) + else: + # hack to enable mod(*inp) for optical_flow models + inp = [preprocess(input_tensor)] + + model.eval() + + flop_counter = FlopCounterMode(model) + with flop_counter: + # detection models expect a list of 3d tensors as inputs + if module_name == "detection": + model(inp) + else: + model(*inp) + + flops = flop_counter.get_flops() + + return round(flops, 3) + + +def get_file_size_mb(weight): + weights_path = os.path.join(os.getenv("HOME"), ".cache/torch/hub/checkpoints", weight.url.split("/")[-1]) + weights_size_mb = os.path.getsize(weights_path) / 1024 / 1024 + + return round(weights_size_mb, 3) diff --git a/test/common_utils.py b/test/common_utils.py index 8f07e91d144e6a5495ecf4ab33b3174c5d27e3b0..a1d188efdaed686e129bef2844f1f24e4a1d5abe 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -1,23 +1,35 @@ import contextlib import functools +import itertools import os +import pathlib import random +import re import shutil +import sys import tempfile +import warnings +from subprocess import CalledProcessError, check_output, STDOUT import numpy as np +import PIL.Image +import pytest import torch +import torch.testing from PIL import Image -from torchvision import io -import __main__ # noqa: 401 +from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair +from torchvision import io, tv_tensors +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.v2.functional import to_image, to_pil_image -IN_CIRCLE_CI = 
os.getenv("CIRCLECI", False) == "true" +IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"]) IN_RE_WORKER = os.environ.get("INSIDE_RE_WORKER") is not None IN_FBCODE = os.environ.get("IN_FBCODE_TORCHVISION") == "1" CUDA_NOT_AVAILABLE_MSG = "CUDA device not available" -CIRCLECI_GPU_NO_CUDA_MSG = "We're in a CircleCI GPU machine, and this test doesn't need cuda." +MPS_NOT_AVAILABLE_MSG = "MPS device not available" +OSS_CI_GPU_NO_CUDA_MSG = "We're in an OSS GPU machine, and this test doesn't need cuda." @contextlib.contextmanager @@ -107,18 +119,28 @@ def disable_console_output(): yield -def cpu_and_gpu(): +def cpu_and_cuda(): import pytest # noqa return ("cpu", pytest.param("cuda", marks=pytest.mark.needs_cuda)) +def cpu_and_cuda_and_mps(): + return cpu_and_cuda() + (pytest.param("mps", marks=pytest.mark.needs_mps),) + + def needs_cuda(test_func): import pytest # noqa return pytest.mark.needs_cuda(test_func) +def needs_mps(test_func): + import pytest # noqa + + return pytest.mark.needs_mps(test_func) + + def _create_data(height=3, width=3, channels=3, device="cpu"): # TODO: When all relevant tests are ported to pytest, turn this into a module-level fixture tensor = torch.randint(0, 256, (channels, height, width), dtype=torch.uint8, device=device) @@ -137,9 +159,6 @@ def _create_data_batch(height=3, width=3, channels=3, num_samples=4, device="cpu return batch_tensor -assert_equal = functools.partial(torch.testing.assert_close, rtol=0, atol=0) - - def get_list_of_videos(tmpdir, num_videos=5, sizes=None, fps=None): names = [] for i in range(num_videos): @@ -160,6 +179,7 @@ def get_list_of_videos(tmpdir, num_videos=5, sizes=None, fps=None): def _assert_equal_tensor_to_pil(tensor, pil_image, msg=None): + # FIXME: this is handled automatically by `assert_equal` below. Let's remove this in favor of it np_pil_image = np.array(pil_image) if np_pil_image.ndim == 2: np_pil_image = np_pil_image[:, :, None] @@ -172,6 +192,7 @@ def _assert_equal_tensor_to_pil(tensor, pil_image, msg=None): def _assert_approx_equal_tensor_to_pil( tensor, pil_image, tol=1e-5, msg=None, agg_method="mean", allowed_percentage_diff=None ): + # FIXME: this is handled automatically by `assert_close` below. Let's remove this in favor of it # TODO: we could just merge this into _assert_equal_tensor_to_pil np_pil_image = np.array(pil_image) if np_pil_image.ndim == 2: @@ -210,7 +231,7 @@ def cache(fn): """ sentinel = object() out_cache = {} - exc_cache = {} + exc_tb_cache = {} @functools.wraps(fn) def wrapper(*args, **kwargs): @@ -220,17 +241,280 @@ def cache(fn): if out is not sentinel: return out - exc = exc_cache.get(key, sentinel) - if exc is not sentinel: - raise exc + exc_tb = exc_tb_cache.get(key, sentinel) + if exc_tb is not sentinel: + raise exc_tb[0].with_traceback(exc_tb[1]) try: out = fn(*args, **kwargs) except Exception as exc: - exc_cache[key] = exc + # We need to cache the traceback here as well. Otherwise, each re-raise will add the internal pytest + # traceback frames anew, but they will only be removed once. Thus, the traceback will be ginormous hiding + # the actual information in the noise. See https://github.com/pytest-dev/pytest/issues/10363 for details. + exc_tb_cache[key] = exc, exc.__traceback__ raise exc out_cache[key] = out return out return wrapper + + +def combinations_grid(**kwargs): + """Creates a grid of input combinations. + + Each element in the returned sequence is a dictionary containing one possible combination as values. 
+ + Example: + >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) + [ + {'foo': 'bar', 'spam': 'eggs'}, + {'foo': 'bar', 'spam': 'ham'}, + {'foo': 'baz', 'spam': 'eggs'}, + {'foo': 'baz', 'spam': 'ham'} + ] + """ + return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] + + +class ImagePair(TensorLikePair): + def __init__( + self, + actual, + expected, + *, + mae=False, + **other_parameters, + ): + if all(isinstance(input, PIL.Image.Image) for input in [actual, expected]): + actual, expected = [to_image(input) for input in [actual, expected]] + + super().__init__(actual, expected, **other_parameters) + self.mae = mae + + def compare(self) -> None: + actual, expected = self.actual, self.expected + + self._compare_attributes(actual, expected) + actual, expected = self._equalize_attributes(actual, expected) + + if self.mae: + if actual.dtype is torch.uint8: + actual, expected = actual.to(torch.int), expected.to(torch.int) + mae = float(torch.abs(actual - expected).float().mean()) + if mae > self.atol: + self._fail( + AssertionError, + f"The MAE of the images is {mae}, but only {self.atol} is allowed.", + ) + else: + super()._compare_values(actual, expected) + + +def assert_close( + actual, + expected, + *, + allow_subclasses=True, + rtol=None, + atol=None, + equal_nan=False, + check_device=True, + check_dtype=True, + check_layout=True, + check_stride=False, + msg=None, + **kwargs, +): + """Superset of :func:`torch.testing.assert_close` with support for PIL vs. tensor image comparison""" + __tracebackhide__ = True + + error_metas = not_close_error_metas( + actual, + expected, + pair_types=( + NonePair, + BooleanPair, + NumberPair, + ImagePair, + TensorLikePair, + ), + allow_subclasses=allow_subclasses, + rtol=rtol, + atol=atol, + equal_nan=equal_nan, + check_device=check_device, + check_dtype=check_dtype, + check_layout=check_layout, + check_stride=check_stride, + **kwargs, + ) + + if error_metas: + raise error_metas[0].to_error(msg) + + +assert_equal = functools.partial(assert_close, rtol=0, atol=0) + + +DEFAULT_SIZE = (17, 11) + + +NUM_CHANNELS_MAP = { + "GRAY": 1, + "GRAY_ALPHA": 2, + "RGB": 3, + "RGBA": 4, +} + + +def make_image( + size=DEFAULT_SIZE, + *, + color_space="RGB", + batch_dims=(), + dtype=None, + device="cpu", + memory_format=torch.contiguous_format, +): + num_channels = NUM_CHANNELS_MAP[color_space] + dtype = dtype or torch.uint8 + max_value = get_max_value(dtype) + data = torch.testing.make_tensor( + (*batch_dims, num_channels, *size), + low=0, + high=max_value, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + if color_space in {"GRAY_ALPHA", "RGBA"}: + data[..., -1, :, :] = max_value + + return tv_tensors.Image(data) + + +def make_image_tensor(*args, **kwargs): + return make_image(*args, **kwargs).as_subclass(torch.Tensor) + + +def make_image_pil(*args, **kwargs): + return to_pil_image(make_image(*args, **kwargs)) + + +def make_bounding_boxes( + canvas_size=DEFAULT_SIZE, + *, + format=tv_tensors.BoundingBoxFormat.XYXY, + dtype=None, + device="cpu", +): + def sample_position(values, max_value): + # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. + # However, if we have batch_dims, we need tensors as limits. 
+ return torch.stack([torch.randint(max_value - v, ()) for v in values.tolist()]) + + if isinstance(format, str): + format = tv_tensors.BoundingBoxFormat[format] + + dtype = dtype or torch.float32 + + num_objects = 1 + h, w = [torch.randint(1, c, (num_objects,)) for c in canvas_size] + y = sample_position(h, canvas_size[0]) + x = sample_position(w, canvas_size[1]) + + if format is tv_tensors.BoundingBoxFormat.XYWH: + parts = (x, y, w, h) + elif format is tv_tensors.BoundingBoxFormat.XYXY: + x1, y1 = x, y + x2 = x1 + w + y2 = y1 + h + parts = (x1, y1, x2, y2) + elif format is tv_tensors.BoundingBoxFormat.CXCYWH: + cx = x + w / 2 + cy = y + h / 2 + parts = (cx, cy, w, h) + else: + raise ValueError(f"Format {format} is not supported") + + return tv_tensors.BoundingBoxes( + torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size + ) + + +def make_detection_mask(size=DEFAULT_SIZE, *, dtype=None, device="cpu"): + """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" + num_objects = 1 + return tv_tensors.Mask( + torch.testing.make_tensor( + (num_objects, *size), + low=0, + high=2, + dtype=dtype or torch.bool, + device=device, + ) + ) + + +def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): + """Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value""" + return tv_tensors.Mask( + torch.testing.make_tensor( + (*batch_dims, *size), + low=0, + high=num_categories, + dtype=dtype or torch.uint8, + device=device, + ) + ) + + +def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): + return tv_tensors.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) + + +def make_video_tensor(*args, **kwargs): + return make_video(*args, **kwargs).as_subclass(torch.Tensor) + + +def assert_run_python_script(source_code): + """Utility to check assertions in an independent Python subprocess. + + The script provided in the source code should return 0 and not print + anything on stderr or stdout. Modified from scikit-learn test utils. + + Args: + source_code (str): The Python source code to execute. + """ + with get_tmp_dir() as root: + path = pathlib.Path(root) / "main.py" + with open(path, "w") as file: + file.write(source_code) + + try: + out = check_output([sys.executable, str(path)], stderr=STDOUT) + except CalledProcessError as e: + raise RuntimeError(f"script errored with output:\n{e.output.decode()}") + if out != b"": + raise AssertionError(out.decode()) + + +@contextlib.contextmanager +def assert_no_warnings(): + # The name `catch_warnings` is a misnomer as the context manager does **not** catch any warnings, but rather scopes + # the warning filters. All changes that are made to the filters while in this context, will be reset upon exit. + with warnings.catch_warnings(): + warnings.simplefilter("error") + yield + + +@contextlib.contextmanager +def ignore_jit_no_profile_information_warning(): + # Calling a scripted object often triggers a warning like + # `UserWarning: operator() profile_node %$INT1 : int[] = prim::profile_ivalue($INT2) does not have profile information` + # with varying `INT1` and `INT2`. Since these are uninteresting for us and only clutter the test summary, we ignore + # them. 
+ with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message=re.escape("operator() profile_node %"), category=UserWarning) + yield diff --git a/test/conftest.py b/test/conftest.py index 1a9b2db7f5cd2f0e2513beb84d9d977e861b3f5f..ea73b09b906d6373e28f9f03fa4c082d54df2809 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,12 +3,21 @@ import random import numpy as np import pytest import torch -from common_utils import CIRCLECI_GPU_NO_CUDA_MSG, CUDA_NOT_AVAILABLE_MSG, IN_CIRCLE_CI, IN_FBCODE, IN_RE_WORKER + +from common_utils import ( + CUDA_NOT_AVAILABLE_MSG, + IN_FBCODE, + IN_OSS_CI, + IN_RE_WORKER, + MPS_NOT_AVAILABLE_MSG, + OSS_CI_GPU_NO_CUDA_MSG, +) def pytest_configure(config): # register an additional marker (see pytest_collection_modifyitems) config.addinivalue_line("markers", "needs_cuda: mark for tests that rely on a CUDA device") + config.addinivalue_line("markers", "needs_mps: mark for tests that rely on a MPS device") config.addinivalue_line("markers", "dont_collect: mark for tests that should not be collected") @@ -16,9 +25,9 @@ def pytest_collection_modifyitems(items): # This hook is called by pytest after it has collected the tests (google its name to check out its doc!) # We can ignore some tests as we see fit here, or add marks, such as a skip mark. # - # Typically here, we try to optimize CI time. In particular, the GPU CI instances don't need to run the + # Typically, here, we try to optimize CI time. In particular, the GPU CI instances don't need to run the # tests that don't need CUDA, because those tests are extensively tested in the CPU CI instances already. - # This is true for both CircleCI and the fbcode internal CI. + # This is true for both OSS CI and the fbcode internal CI. # In the fbcode CI, we have an additional constraint: we try to avoid skipping tests. So instead of relying on # pytest.mark.skip, in fbcode we literally just remove those tests from the `items` list, and it's as if # these tests never existed. @@ -28,16 +37,20 @@ def pytest_collection_modifyitems(items): # The needs_cuda mark will exist if the test was explicitly decorated with # the @needs_cuda decorator. It will also exist if it was parametrized with a # parameter that has the mark: for example if a test is parametrized with - # @pytest.mark.parametrize('device', cpu_and_gpu()) + # @pytest.mark.parametrize('device', cpu_and_cuda()) # the "instances" of the tests where device == 'cuda' will have the 'needs_cuda' mark, # and the ones with device == 'cpu' won't have the mark. needs_cuda = item.get_closest_marker("needs_cuda") is not None + needs_mps = item.get_closest_marker("needs_mps") is not None if needs_cuda and not torch.cuda.is_available(): # In general, we skip cuda tests on machines without a GPU # There are special cases though, see below item.add_marker(pytest.mark.skip(reason=CUDA_NOT_AVAILABLE_MSG)) + if needs_mps and not torch.backends.mps.is_available(): + item.add_marker(pytest.mark.skip(reason=MPS_NOT_AVAILABLE_MSG)) + if IN_FBCODE: # fbcode doesn't like skipping tests, so instead we just don't collect the test # so that they don't even "exist", hence the continue statements. 
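To make the marker plumbing above concrete: the needs_cuda/needs_mps marks that pytest_collection_modifyitems inspects are attached either by the decorators or by parametrizing over the device helpers from common_utils. A minimal sketch follows (the test names and bodies are hypothetical; only cpu_and_cuda_and_mps, needs_cuda, and the markers registered in pytest_configure come from this change):

import pytest
import torch

from common_utils import cpu_and_cuda_and_mps, needs_cuda


# The "cuda" and "mps" instances carry the corresponding marker, so the
# collection hook can skip them (or drop them entirely in fbcode) when the
# device is unavailable; the "cpu" instance carries no marker.
@pytest.mark.parametrize("device", cpu_and_cuda_and_mps())
def test_add_runs_on_device(device):
    x = torch.ones(2, device=device)
    assert (x + x).sum().item() == 4


# Explicitly decorated tests are skipped wholesale when CUDA is missing.
@needs_cuda
def test_cuda_only_behavior():
    assert torch.cuda.is_available()
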
@@ -49,15 +62,18 @@ def pytest_collection_modifyitems(items): # TODO: something more robust would be to do that only in a sandcastle instance, # so that we can still see the test being skipped when testing locally from a devvm continue - elif IN_CIRCLE_CI: + if needs_mps and not torch.backends.mps.is_available(): + # Same as above, but for MPS + continue + elif IN_OSS_CI: # Here we're not in fbcode, so we can safely collect and skip tests. if not needs_cuda and torch.cuda.is_available(): - # Similar to what happens in RE workers: we don't need the CircleCI GPU machines + # Similar to what happens in RE workers: we don't need the OSS CI GPU machines # to run the CPU-only tests. - item.add_marker(pytest.mark.skip(reason=CIRCLECI_GPU_NO_CUDA_MSG)) + item.add_marker(pytest.mark.skip(reason=OSS_CI_GPU_NO_CUDA_MSG)) if item.get_closest_marker("dont_collect") is not None: - # currently, this is only used for some tests we're sure we dont want to run on fbcode + # currently, this is only used for some tests we're sure we don't want to run on fbcode continue out_items.append(item) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index c232e7132b4beb3a138b704e32687b7b749a5f0a..bd9f7ea3a0f8cd1b0ef5211b4cd4667475fb9b62 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -5,6 +5,7 @@ import inspect import itertools import os import pathlib +import platform import random import shutil import string @@ -25,6 +26,7 @@ import torch import torchvision.datasets import torchvision.io from common_utils import disable_console_output, get_tmp_dir +from torch.utils._pytree import tree_any from torchvision.transforms.functional import get_dimensions @@ -137,7 +139,7 @@ def test_all_configs(test): .. note:: - This will try to remove duplicate configurations. During this process it will not not preserve a potential + This will try to remove duplicate configurations. During this process it will not preserve a potential ordering of the configurations or an inner ordering of a configuration. """ @@ -146,7 +148,7 @@ def test_all_configs(test): return [dict(config_) for config_ in {tuple(sorted(config.items())) for config in configs}] except TypeError: # A TypeError will be raised if a value of any config is not hashable, e.g. a list. In that case duplicate - # removal would be a lot more elaborate and we simply bail out. + # removal would be a lot more elaborate, and we simply bail out. return configs @functools.wraps(test) @@ -169,23 +171,6 @@ def test_all_configs(test): return wrapper -def combinations_grid(**kwargs): - """Creates a grid of input combinations. - - Each element in the returned sequence is a dictionary containing one possible combination as values. - - Example: - >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) - [ - {'foo': 'bar', 'spam': 'eggs'}, - {'foo': 'bar', 'spam': 'ham'}, - {'foo': 'baz', 'spam': 'eggs'}, - {'foo': 'baz', 'spam': 'ham'} - ] - """ - return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] - - class DatasetTestCase(unittest.TestCase): """Abstract base class for all dataset testcases. @@ -297,7 +282,7 @@ class DatasetTestCase(unittest.TestCase): .. note:: The default behavior is only valid if the dataset to be tested has ``root`` as the only required parameter. - Otherwise you need to overwrite this method. + Otherwise, you need to overwrite this method. Args: tmpdir (str): Path to a temporary directory. 
For most cases this acts as root directory for the dataset @@ -564,7 +549,7 @@ class DatasetTestCase(unittest.TestCase): @test_all_configs def test_num_examples(self, config): with self.create_dataset(config) as (dataset, info): - assert len(dataset) == info["num_examples"] + assert len(list(dataset)) == len(dataset) == info["num_examples"] @test_all_configs def test_transforms(self, config): @@ -581,6 +566,42 @@ class DatasetTestCase(unittest.TestCase): mock.assert_called() + @test_all_configs + def test_transforms_v2_wrapper(self, config): + from torchvision import tv_tensors + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + try: + with self.create_dataset(config) as (dataset, info): + for target_keys in [None, "all"]: + if target_keys is not None and self.DATASET_CLASS not in { + torchvision.datasets.CocoDetection, + torchvision.datasets.VOCDetection, + torchvision.datasets.Kitti, + torchvision.datasets.WIDERFace, + }: + with self.assertRaisesRegex(ValueError, "`target_keys` is currently only supported for"): + wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + continue + + wrapped_dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + assert isinstance(wrapped_dataset, self.DATASET_CLASS) + assert len(wrapped_dataset) == info["num_examples"] + + wrapped_sample = wrapped_dataset[0] + assert tree_any( + lambda item: isinstance(item, (tv_tensors.TVTensor, PIL.Image.Image)), wrapped_sample + ) + except TypeError as error: + msg = f"No wrapper exists for dataset class {type(dataset).__name__}" + if str(error).startswith(msg): + pytest.skip(msg) + raise error + except RuntimeError as error: + if "currently not supported by this wrapper" in str(error): + pytest.skip("Config is currently not supported by this wrapper") + raise error + class ImageDatasetTestCase(DatasetTestCase): """Abstract base class for image dataset testcases. @@ -604,7 +625,7 @@ class ImageDatasetTestCase(DatasetTestCase): patch_checks=patch_checks, **kwargs, ) as (dataset, info): - # PIL.Image.open() only loads the image meta data upfront and keeps the file open until the first access + # PIL.Image.open() only loads the image metadata upfront and keeps the file open until the first access # to the pixel data occurs. Trying to delete such a file results in an PermissionError on Windows. Thus, we # force-load opened images. # This problem only occurs during testing since some tests, e.g. 
DatasetTestCase.test_feature_types open an @@ -641,27 +662,73 @@ class VideoDatasetTestCase(DatasetTestCase): FEATURE_TYPES = (torch.Tensor, torch.Tensor, int) REQUIRED_PACKAGES = ("av",) - DEFAULT_FRAMES_PER_CLIP = 1 + FRAMES_PER_CLIP = 1 def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.dataset_args = self._set_default_frames_per_clip(self.dataset_args) - def _set_default_frames_per_clip(self, inject_fake_data): + def _set_default_frames_per_clip(self, dataset_args): argspec = inspect.getfullargspec(self.DATASET_CLASS.__init__) args_without_default = argspec.args[1 : (-len(argspec.defaults) if argspec.defaults else None)] frames_per_clip_last = args_without_default[-1] == "frames_per_clip" - @functools.wraps(inject_fake_data) + @functools.wraps(dataset_args) def wrapper(tmpdir, config): - args = inject_fake_data(tmpdir, config) + args = dataset_args(tmpdir, config) if frames_per_clip_last and len(args) == len(args_without_default) - 1: - args = (*args, self.DEFAULT_FRAMES_PER_CLIP) + args = (*args, self.FRAMES_PER_CLIP) return args return wrapper + def test_output_format(self): + for output_format in ["TCHW", "THWC"]: + with self.create_dataset(output_format=output_format) as (dataset, _): + for video, *_ in dataset: + if output_format == "TCHW": + num_frames, num_channels, *_ = video.shape + else: # output_format == "THWC": + num_frames, *_, num_channels = video.shape + + assert num_frames == self.FRAMES_PER_CLIP + assert num_channels == 3 + + @test_all_configs + def test_transforms_v2_wrapper(self, config): + # `output_format == "THWC"` is not supported by the wrapper. Thus, we skip the `config` if it is set explicitly + # or use the supported `"TCHW"` + if config.setdefault("output_format", "TCHW") == "THWC": + return + + super().test_transforms_v2_wrapper.__wrapped__(self, config) + + +def _no_collate(batch): + return batch + + +def check_transforms_v2_wrapper_spawn(dataset): + # On Linux and Windows, the DataLoader forks the main process by default. This is not available on macOS, so new + # subprocesses are spawned. This requires the whole pipeline including the dataset to be pickleable, which is what + # we are enforcing here. + if platform.system() != "Darwin": + pytest.skip("Multiprocessing spawning is only checked on macOS.") + + from torch.utils.data import DataLoader + from torchvision import tv_tensors + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + wrapped_dataset = wrap_dataset_for_transforms_v2(dataset) + + dataloader = DataLoader(wrapped_dataset, num_workers=2, multiprocessing_context="spawn", collate_fn=_no_collate) + + for wrapped_sample in dataloader: + assert tree_any( + lambda item: isinstance(item, (tv_tensors.Image, tv_tensors.Video, PIL.Image.Image)), wrapped_sample + ) + def create_image_or_video_tensor(size: Sequence[int]) -> torch.Tensor: r"""Create a random uint8 tensor. @@ -786,7 +853,7 @@ def create_video_file( fps: float = 25, **kwargs: Any, ) -> pathlib.Path: - """Create an video file from random data. + """Create a video file from random data. Args: root (Union[str, pathlib.Path]): Root directory the video file will be placed in. @@ -951,7 +1018,7 @@ def create_random_string(length: int, *digits: str) -> str: Args: length (int): Number of characters in the generated string. - *characters (str): Characters to sample from. If omitted defaults to :attr:`string.ascii_lowercase`. + *digits (str): Characters to sample from. If omitted defaults to :attr:`string.ascii_lowercase`. 
""" if not digits: digits = string.ascii_lowercase diff --git a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl index e95ba5f53985e3773c6a625bed929b406350aa90..862af2185c75bd90734b068981e298cf94d11cc8 100644 Binary files a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl index c2875679efd98e7d3454084ddc67054a8dec047e..1d317eb791515686c7294d8c0663f798df6fb71c 100644 Binary files a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl and b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl differ diff --git a/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl index 0657261d96cefe0d09b93efcd81c21b4cb56b3da..3d4e3e63f280c79044706fa5ac4e9c1c448fdefe 100644 Binary files a/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl index 2f1ff941abae5994144c73dcfd361e963ab28cb9..54dfb7cd206f1e420915bb5703f13971a4055cbe 100644 Binary files a/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl index 36b680816726017ffafc262ca38861df2737087a..f52b77a8dd8eb18ec2d4b0c85a52968bf6d7d92b 100644 Binary files a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl index c6d1fd14081505a25ffea7ddc8e279078a917b3b..23e841bf8749504030baca953e351dd9b7f146b0 100644 Binary files a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl and b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl differ diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl index 7fb8d66b080dfdcebb4bed386cd752b99398b779..f188ee7b911cc7a024563f7572eb71062a0f97e7 100644 Binary files a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl index 9c74f2e9b9940de50adb68253c2b3d2bf9b41ba2..beaf6c8e84b1dee9a3748c0cc08dcaab2cf15c07 100644 Binary files a/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl and b/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl differ diff --git a/test/expect/ModelTester.test_swin3d_b_expect.pkl b/test/expect/ModelTester.test_swin3d_b_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1efc513c91166243925d9f32cc2ae2d35de2f019 Binary files /dev/null and b/test/expect/ModelTester.test_swin3d_b_expect.pkl differ diff --git 
a/test/expect/ModelTester.test_swin3d_s_expect.pkl b/test/expect/ModelTester.test_swin3d_s_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0c1e594993e01c3610c395608fe46ec6bde16214 Binary files /dev/null and b/test/expect/ModelTester.test_swin3d_s_expect.pkl differ diff --git a/test/expect/ModelTester.test_swin3d_t_expect.pkl b/test/expect/ModelTester.test_swin3d_t_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5e658ff16b7352da3748eebeabd03a7c4fb5a8dc Binary files /dev/null and b/test/expect/ModelTester.test_swin3d_t_expect.pkl differ diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b26bcff32466b33004883b0acaca9b124e819485 --- /dev/null +++ b/test/prototype_common_utils.py @@ -0,0 +1,82 @@ +import collections.abc +import dataclasses +from typing import Optional, Sequence + +import pytest +import torch +from torch.nn.functional import one_hot + +from torchvision.prototype import tv_tensors + +from transforms_v2_legacy_utils import combinations_grid, DEFAULT_EXTRA_DIMS, from_loader, from_loaders, TensorLoader + + +@dataclasses.dataclass +class LabelLoader(TensorLoader): + categories: Optional[Sequence[str]] + + +def _parse_categories(categories): + if categories is None: + num_categories = int(torch.randint(1, 11, ())) + elif isinstance(categories, int): + num_categories = categories + categories = [f"category{idx}" for idx in range(num_categories)] + elif isinstance(categories, collections.abc.Sequence) and all(isinstance(category, str) for category in categories): + categories = list(categories) + num_categories = len(categories) + else: + raise pytest.UsageError( + f"`categories` can either be `None` (default), an integer, or a sequence of strings, " + f"but got '{categories}' instead." + ) + return categories, num_categories + + +def make_label_loader(*, extra_dims=(), categories=None, dtype=torch.int64): + categories, num_categories = _parse_categories(categories) + + def fn(shape, dtype, device): + # The idiom `make_tensor(..., dtype=torch.int64).to(dtype)` is intentional to only get integer values, + # regardless of the requested dtype, e.g. 
0 or 0.0 rather than 0 or 0.123 + data = torch.testing.make_tensor(shape, low=0, high=num_categories, dtype=torch.int64, device=device).to(dtype) + return tv_tensors.Label(data, categories=categories) + + return LabelLoader(fn, shape=extra_dims, dtype=dtype, categories=categories) + + +make_label = from_loader(make_label_loader) + + +@dataclasses.dataclass +class OneHotLabelLoader(TensorLoader): + categories: Optional[Sequence[str]] + + +def make_one_hot_label_loader(*, categories=None, extra_dims=(), dtype=torch.int64): + categories, num_categories = _parse_categories(categories) + + def fn(shape, dtype, device): + if num_categories == 0: + data = torch.empty(shape, dtype=dtype, device=device) + else: + # The idiom `make_label_loader(..., dtype=torch.int64); ...; one_hot(...).to(dtype)` is intentional + # since `one_hot` only supports int64 + label = make_label_loader(extra_dims=extra_dims, categories=num_categories, dtype=torch.int64).load(device) + data = one_hot(label, num_classes=num_categories).to(dtype) + return tv_tensors.OneHotLabel(data, categories=categories) + + return OneHotLabelLoader(fn, shape=(*extra_dims, num_categories), dtype=dtype, categories=categories) + + +def make_one_hot_label_loaders( + *, + categories=(1, 0, None), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.int64, torch.float32), +): + for params in combinations_grid(categories=categories, extra_dims=extra_dims, dtype=dtypes): + yield make_one_hot_label_loader(**params) + + +make_one_hot_labels = from_loaders(make_one_hot_label_loaders) diff --git a/test/smoke_test.py b/test/smoke_test.py index c3a4bdd19d6431250591c8376bf1d2c785c2cb10..6cc07c00aedcb72c4476a335e37e726af576eb5a 100644 --- a/test/smoke_test.py +++ b/test/smoke_test.py @@ -1,4 +1,102 @@ +"""Run smoke tests""" + +import sys +from pathlib import Path + import torch import torchvision -import torchvision.datasets as dset -import torchvision.transforms +from torchvision.io import decode_jpeg, read_file, read_image +from torchvision.models import resnet50, ResNet50_Weights + +SCRIPT_DIR = Path(__file__).parent + + +def smoke_test_torchvision() -> None: + print( + "Is torchvision usable?", + all(x is not None for x in [torch.ops.image.decode_png, torch.ops.torchvision.roi_align]), + ) + + +def smoke_test_torchvision_read_decode() -> None: + img_jpg = read_image(str(SCRIPT_DIR / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg")) + if img_jpg.shape != (3, 606, 517): + raise RuntimeError(f"Unexpected shape of img_jpg: {img_jpg.shape}") + img_png = read_image(str(SCRIPT_DIR / "assets" / "interlaced_png" / "wizard_low.png")) + if img_png.shape != (4, 471, 354): + raise RuntimeError(f"Unexpected shape of img_png: {img_png.shape}") + + +def smoke_test_torchvision_decode_jpeg(device: str = "cpu"): + img_jpg_data = read_file(str(SCRIPT_DIR / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg")) + img_jpg = decode_jpeg(img_jpg_data, device=device) + if img_jpg.shape != (3, 606, 517): + raise RuntimeError(f"Unexpected shape of img_jpg: {img_jpg.shape}") + + +def smoke_test_compile() -> None: + try: + model = resnet50().cuda() + model = torch.compile(model) + x = torch.randn(1, 3, 224, 224, device="cuda") + out = model(x) + print(f"torch.compile model output: {out.shape}") + except RuntimeError: + if sys.platform == "win32": + print("Successfully caught torch.compile RuntimeError on win") + elif sys.version_info >= (3, 11, 0): + print("Successfully caught torch.compile RuntimeError on Python 3.11") + else: + raise + + +def 
smoke_test_torchvision_resnet50_classify(device: str = "cpu") -> None: + img = read_image(str(SCRIPT_DIR / ".." / "gallery" / "assets" / "dog2.jpg")).to(device) + + # Step 1: Initialize model with the best available weights + weights = ResNet50_Weights.DEFAULT + model = resnet50(weights=weights).to(device) + model.eval() + + # Step 2: Initialize the inference transforms + preprocess = weights.transforms() + + # Step 3: Apply inference preprocessing transforms + batch = preprocess(img).unsqueeze(0) + + # Step 4: Use the model and print the predicted category + prediction = model(batch).squeeze(0).softmax(0) + class_id = prediction.argmax().item() + score = prediction[class_id].item() + category_name = weights.meta["categories"][class_id] + expected_category = "German shepherd" + print(f"{category_name} ({device}): {100 * score:.1f}%") + if category_name != expected_category: + raise RuntimeError(f"Failed ResNet50 classify {category_name} Expected: {expected_category}") + + +def main() -> None: + print(f"torchvision: {torchvision.__version__}") + print(f"torch.cuda.is_available: {torch.cuda.is_available()}") + + # Turn 1.11.0aHASH into 1.11 (major.minor only) + version = ".".join(torchvision.__version__.split(".")[:2]) + if version >= "0.16": + print(f"{torch.ops.image._jpeg_version() = }") + assert torch.ops.image._is_compiled_against_turbo() + + smoke_test_torchvision() + smoke_test_torchvision_read_decode() + smoke_test_torchvision_resnet50_classify() + smoke_test_torchvision_decode_jpeg() + if torch.cuda.is_available(): + smoke_test_torchvision_decode_jpeg("cuda") + smoke_test_torchvision_resnet50_classify("cuda") + smoke_test_compile() + + if torch.backends.mps.is_available(): + smoke_test_torchvision_resnet50_classify("mps") + + +if __name__ == "__main__": + main() diff --git a/test/test_architecture_ops.py b/test/test_architecture_ops.py index 9f254c7942bd9cd33ec5d71904addbbcd4d6a63b..32ad1a32f897e11a3c1e05050f1c1f691b7a6936 100644 --- a/test/test_architecture_ops.py +++ b/test/test_architecture_ops.py @@ -20,7 +20,7 @@ class MaxvitTester(unittest.TestCase): x_hat = partition(x, partition_size) x_hat = departition(x_hat, partition_size, n_partitions, n_partitions) - assert torch.allclose(x, x_hat) + torch.testing.assert_close(x, x_hat) def test_maxvit_grid_partition(self): input_shape = (1, 3, 224, 224) @@ -39,7 +39,7 @@ class MaxvitTester(unittest.TestCase): x_hat = post_swap(x_hat) x_hat = departition(x_hat, n_partitions, partition_size, partition_size) - assert torch.allclose(x, x_hat) + torch.testing.assert_close(x, x_hat) if __name__ == "__main__": diff --git a/test/test_backbone_utils.py b/test/test_backbone_utils.py index 4fba3c3d09838661e0886a5e1bd1faa45ad2c67c..befceca020e0b8d0d9b8608ca161c114c7b762ba 100644 --- a/test/test_backbone_utils.py +++ b/test/test_backbone_utils.py @@ -194,7 +194,7 @@ class TestFxFeatureExtraction: assert n1 == n2 assert p1.equal(p2) - # And that ouputs match + # And that outputs match with torch.no_grad(): ilg_out = ilg_model(self.inp) fgn_out = fx_model(self.inp) diff --git a/test/test_datasets.py b/test/test_datasets.py index dbce7853effa27f595a0c76f71aaddaacedf311e..1270201d53e059437560dce28239a5ab93305e6d 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -8,6 +8,7 @@ import os import pathlib import pickle import random +import re import shutil import string import unittest @@ -21,12 +22,13 @@ import PIL import pytest import torch import torch.nn.functional as F +from common_utils import combinations_grid from torchvision import 
datasets class STL10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.STL10 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) @staticmethod def _make_binary_file(num_elements, root, name): @@ -112,9 +114,7 @@ class Caltech101TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Caltech101 FEATURE_TYPES = (PIL.Image.Image, (int, np.ndarray, tuple)) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - target_type=("category", "annotation", ["category", "annotation"]) - ) + ADDITIONAL_CONFIGS = combinations_grid(target_type=("category", "annotation", ["category", "annotation"])) REQUIRED_PACKAGES = ("scipy",) def inject_fake_data(self, tmpdir, config): @@ -183,6 +183,10 @@ class Caltech101TestCase(datasets_utils.ImageDatasetTestCase): ), "Type of the combined target does not match the type of the corresponding individual target: " f"{actual} is not {expected}", + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset(target_type="category") as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Caltech256 @@ -190,7 +194,7 @@ class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): def inject_fake_data(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) / "caltech256" / "256_ObjectCategories" - categories = ((1, "ak47"), (127, "laptop-101"), (257, "clutter")) + categories = ((1, "ak47"), (2, "american-flag"), (3, "backpack")) num_images_per_category = 2 for idx, category in categories: @@ -207,7 +211,7 @@ class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): class WIDERFaceTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.WIDERFace FEATURE_TYPES = (PIL.Image.Image, (dict, type(None))) # test split returns None as target - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) def inject_fake_data(self, tmpdir, config): widerface_dir = pathlib.Path(tmpdir) / "widerface" @@ -258,6 +262,10 @@ class WIDERFaceTestCase(datasets_utils.ImageDatasetTestCase): return split_to_num_examples[config["split"]] + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Cityscapes @@ -268,8 +276,8 @@ class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): "color", ) ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES), - *datasets_utils.combinations_grid( + *combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES), + *combinations_grid( mode=("coarse",), split=("train", "train_extra", "val"), target_type=TARGET_TYPES, @@ -382,11 +390,16 @@ class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): assert isinstance(polygon_img, PIL.Image.Image) (polygon_target, info["expected_polygon_target"]) + def test_transforms_v2_wrapper_spawn(self): + for target_type in ["instance", "semantic", ["instance", "semantic"]]: + with self.create_dataset(target_type=target_type) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + 
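The test_transforms_v2_wrapper_spawn hooks added throughout this file all delegate to datasets_utils.check_transforms_v2_wrapper_spawn. For orientation, outside of the test harness the wrapper is used roughly as follows; this is a minimal sketch, the dataset paths are placeholders, and the choice of target_keys is illustrative only:

from torchvision import datasets, tv_tensors
from torchvision.datasets import wrap_dataset_for_transforms_v2

# Placeholder paths -- substitute a real COCO-style download.
dataset = datasets.CocoDetection("path/to/images", "path/to/instances.json")
wrapped = wrap_dataset_for_transforms_v2(dataset, target_keys={"boxes", "labels"})

img, target = wrapped[0]
# The wrapper re-packages the raw annotation dicts into transforms-v2 friendly
# containers, e.g. target["boxes"] comes back as tv_tensors.BoundingBoxes.
print(type(target["boxes"]), type(target["labels"]))
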
class ImageNetTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.ImageNet REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val")) def inject_fake_data(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) @@ -413,10 +426,14 @@ class ImageNetTestCase(datasets_utils.ImageDatasetTestCase): torch.save((wnid_to_classes, None), tmpdir / "meta.bin") return num_examples + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class CIFAR10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CIFAR10 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) _VERSION_CONFIG = dict( base_folder="cifar-10-batches-py", @@ -489,7 +506,7 @@ class CelebATestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CelebA FEATURE_TYPES = (PIL.Image.Image, (torch.Tensor, int, tuple, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "valid", "test", "all"), target_type=("attr", "identity", "bbox", "landmarks", ["attr", "identity"]), ) @@ -607,15 +624,18 @@ class CelebATestCase(datasets_utils.ImageDatasetTestCase): assert merged_imgs_names == all_imgs_names + def test_transforms_v2_wrapper_spawn(self): + for target_type in ["identity", "bbox", ["identity", "bbox"]]: + with self.create_dataset(target_type=target_type) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class VOCSegmentationTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.VOCSegmentation FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image) ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid( - year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval") - ), + *combinations_grid(year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval")), dict(year="2007", image_set="test"), ) @@ -696,6 +716,10 @@ class VOCSegmentationTestCase(datasets_utils.ImageDatasetTestCase): return data + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class VOCDetectionTestCase(VOCSegmentationTestCase): DATASET_CLASS = datasets.VOCDetection @@ -716,6 +740,10 @@ class VOCDetectionTestCase(VOCSegmentationTestCase): assert object == info["annotation"] + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class CocoDetectionTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CocoDetection @@ -763,11 +791,21 @@ class CocoDetectionTestCase(datasets_utils.ImageDatasetTestCase): return info def _create_annotations(self, image_ids, num_annotations_per_image): - annotations = datasets_utils.combinations_grid( - image_id=image_ids, bbox=([1.0, 2.0, 3.0, 4.0],) * num_annotations_per_image - ) - for id, annotation in enumerate(annotations): - annotation["id"] = id + annotations = [] + annotion_id = 0 + for image_id in itertools.islice(itertools.cycle(image_ids), len(image_ids) * num_annotations_per_image): + annotations.append( + dict( + image_id=image_id, + id=annotion_id, + bbox=torch.rand(4).tolist(), + 
segmentation=[torch.rand(8).tolist()], + category_id=int(torch.randint(91, ())), + area=float(torch.rand(1)), + iscrowd=int(torch.randint(2, size=(1,))), + ) + ) + annotion_id += 1 return annotations, dict() def _create_json(self, root, name, content): @@ -776,13 +814,17 @@ class CocoDetectionTestCase(datasets_utils.ImageDatasetTestCase): json.dump(content, fh) return file + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class CocoCaptionsTestCase(CocoDetectionTestCase): DATASET_CLASS = datasets.CocoCaptions def _create_annotations(self, image_ids, num_annotations_per_image): captions = [str(idx) for idx in range(num_annotations_per_image)] - annotations = datasets_utils.combinations_grid(image_id=image_ids, caption=captions) + annotations = combinations_grid(image_id=image_ids, caption=captions) for id, annotation in enumerate(annotations): annotation["id"] = id return annotations, dict(captions=captions) @@ -792,11 +834,16 @@ class CocoCaptionsTestCase(CocoDetectionTestCase): _, captions = dataset[0] assert tuple(captions) == tuple(info["captions"]) + def test_transforms_v2_wrapper_spawn(self): + # We need to define this method, because otherwise the test from the super class will + # be run + pytest.skip("CocoCaptions is currently not supported by the v2 wrapper.") + class UCF101TestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.UCF101 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(fold=(1, 2, 3), train=(True, False)) _VIDEO_FOLDER = "videos" _ANNOTATIONS_FOLDER = "annotations" @@ -857,9 +904,7 @@ class LSUNTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.LSUN REQUIRED_PACKAGES = ("lmdb",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"]) - ) + ADDITIONAL_CONFIGS = combinations_grid(classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"])) _CATEGORIES = ( "bedroom", @@ -944,7 +989,7 @@ class LSUNTestCase(datasets_utils.ImageDatasetTestCase): class KineticsTestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.Kinetics - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"), num_classes=("400", "600", "700")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val"), num_classes=("400", "600", "700")) def inject_fake_data(self, tmpdir, config): classes = ("Abseiling", "Zumba") @@ -960,11 +1005,15 @@ class KineticsTestCase(datasets_utils.VideoDatasetTestCase): ) return num_videos_per_class * len(classes) + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset(output_format="TCHW") as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class HMDB51TestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.HMDB51 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(fold=(1, 2, 3), train=(True, False)) _VIDEO_FOLDER = "videos" _SPLITS_FOLDER = "splits" @@ -1024,7 +1073,7 @@ class HMDB51TestCase(datasets_utils.VideoDatasetTestCase): class OmniglotTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Omniglot - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(background=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(background=(True, False)) def 
inject_fake_data(self, tmpdir, config): target_folder = ( @@ -1104,7 +1153,7 @@ class SEMEIONTestCase(datasets_utils.ImageDatasetTestCase): class USPSTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.USPS - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) def inject_fake_data(self, tmpdir, config): num_images = 2 if config["train"] else 1 @@ -1126,7 +1175,7 @@ class SBDatasetTestCase(datasets_utils.ImageDatasetTestCase): REQUIRED_PACKAGES = ("scipy.io", "scipy.sparse") - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( image_set=("train", "val", "train_noval"), mode=("boundaries", "segmentation") ) @@ -1187,6 +1236,10 @@ class SBDatasetTestCase(datasets_utils.ImageDatasetTestCase): def _file_stem(self, idx): return f"2008_{idx:06d}" + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset(mode="segmentation") as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class FakeDataTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FakeData @@ -1212,7 +1265,7 @@ class PhotoTourTestCase(datasets_utils.ImageDatasetTestCase): _TRAIN_FEATURE_TYPES = (torch.Tensor,) _TEST_FEATURE_TYPES = (torch.Tensor, torch.Tensor, torch.Tensor) - datasets_utils.combinations_grid(train=(True, False)) + combinations_grid(train=(True, False)) _NAME = "liberty" @@ -1371,7 +1424,7 @@ class Flickr30kTestCase(Flickr8kTestCase): class MNISTTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.MNIST - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) _MAGIC_DTYPES = { torch.uint8: 8, @@ -1441,7 +1494,7 @@ class EMNISTTestCase(MNISTTestCase): DATASET_CLASS = datasets.EMNIST DEFAULT_CONFIG = dict(split="byclass") - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("byclass", "bymerge", "balanced", "letters", "digits", "mnist"), train=(True, False) ) @@ -1452,7 +1505,7 @@ class EMNISTTestCase(MNISTTestCase): class QMNISTTestCase(MNISTTestCase): DATASET_CLASS = datasets.QMNIST - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(what=("train", "test", "test10k", "nist")) + ADDITIONAL_CONFIGS = combinations_grid(what=("train", "test", "test10k", "nist")) _LABELS_SIZE = (8,) _LABELS_DTYPE = torch.int32 @@ -1494,30 +1547,51 @@ class QMNISTTestCase(MNISTTestCase): assert len(dataset) == info["num_examples"] - 10000 +class MovingMNISTTestCase(datasets_utils.DatasetTestCase): + DATASET_CLASS = datasets.MovingMNIST + FEATURE_TYPES = (torch.Tensor,) + + ADDITIONAL_CONFIGS = combinations_grid(split=(None, "train", "test"), split_ratio=(10, 1, 19)) + + _NUM_FRAMES = 20 + + def inject_fake_data(self, tmpdir, config): + base_folder = os.path.join(tmpdir, self.DATASET_CLASS.__name__) + os.makedirs(base_folder, exist_ok=True) + num_samples = 5 + data = np.concatenate( + [ + np.zeros((config["split_ratio"], num_samples, 64, 64)), + np.ones((self._NUM_FRAMES - config["split_ratio"], num_samples, 64, 64)), + ] + ) + np.save(os.path.join(base_folder, "mnist_test_seq.npy"), data) + return num_samples + + @datasets_utils.test_all_configs + def test_split(self, config): + with self.create_dataset(config) as (dataset, _): + if config["split"] == "train": + assert (dataset.data == 0).all() + elif config["split"] == "test": + assert (dataset.data == 1).all() + else: + assert 
dataset.data.size()[1] == self._NUM_FRAMES + + class DatasetFolderTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.DatasetFolder - # The dataset has no fixed return type since it is defined by the loader parameter. For testing, we use a loader - # that simply returns the path as type 'str' instead of loading anything. See the 'dataset_args()' method. - FEATURE_TYPES = (str, int) - - _IMAGE_EXTENSIONS = ("jpg", "png") - _VIDEO_EXTENSIONS = ("avi", "mp4") - _EXTENSIONS = (*_IMAGE_EXTENSIONS, *_VIDEO_EXTENSIONS) + _EXTENSIONS = ("jpg", "png") # DatasetFolder has two mutually exclusive parameters: 'extensions' and 'is_valid_file'. One of both is required. # We only iterate over different 'extensions' here and handle the tests for 'is_valid_file' in the # 'test_is_valid_file()' method. DEFAULT_CONFIG = dict(extensions=_EXTENSIONS) - ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid(extensions=[(ext,) for ext in _IMAGE_EXTENSIONS]), - dict(extensions=_IMAGE_EXTENSIONS), - *datasets_utils.combinations_grid(extensions=[(ext,) for ext in _VIDEO_EXTENSIONS]), - dict(extensions=_VIDEO_EXTENSIONS), - ) + ADDITIONAL_CONFIGS = combinations_grid(extensions=[(ext,) for ext in _EXTENSIONS]) def dataset_args(self, tmpdir, config): - return tmpdir, lambda x: x + return tmpdir, datasets.folder.pil_loader def inject_fake_data(self, tmpdir, config): extensions = config["extensions"] or self._is_valid_file_to_extensions(config["is_valid_file"]) @@ -1528,14 +1602,8 @@ class DatasetFolderTestCase(datasets_utils.ImageDatasetTestCase): if ext not in extensions: continue - create_example_folder = ( - datasets_utils.create_image_folder - if ext in self._IMAGE_EXTENSIONS - else datasets_utils.create_video_folder - ) - num_examples = torch.randint(1, 3, size=()).item() - create_example_folder(tmpdir, cls, lambda idx: self._file_name_fn(cls, ext, idx), num_examples) + datasets_utils.create_image_folder(tmpdir, cls, lambda idx: self._file_name_fn(cls, ext, idx), num_examples) num_examples_total += num_examples classes.append(cls) @@ -1589,7 +1657,7 @@ class ImageFolderTestCase(datasets_utils.ImageDatasetTestCase): class KittiTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Kitti FEATURE_TYPES = (PIL.Image.Image, (list, type(None))) # test split returns None as target - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) def inject_fake_data(self, tmpdir, config): kitti_dir = os.path.join(tmpdir, "Kitti", "raw") @@ -1621,11 +1689,15 @@ class KittiTestCase(datasets_utils.ImageDatasetTestCase): return split_to_num_examples[config["train"]] + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class SvhnTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SVHN REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "extra")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test", "extra")) def inject_fake_data(self, tmpdir, config): import scipy.io as sio @@ -1646,7 +1718,7 @@ class SvhnTestCase(datasets_utils.ImageDatasetTestCase): class Places365TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Places365 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train-standard", "train-challenge", "val"), small=(False, True), ) @@ -1738,7 
+1810,7 @@ class INaturalistTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.INaturalist FEATURE_TYPES = (PIL.Image.Image, (int, tuple)) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( target_type=("kingdom", "full", "genus", ["kingdom", "phylum", "class", "order", "family", "genus", "full"]), version=("2021_train",), ) @@ -1775,7 +1847,7 @@ class INaturalistTestCase(datasets_utils.ImageDatasetTestCase): class LFWPeopleTestCase(datasets_utils.DatasetTestCase): DATASET_CLASS = datasets.LFWPeople FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("10fold", "train", "test"), image_set=("original", "funneled", "deepfunneled") ) _IMAGES_DIR = {"original": "lfw", "funneled": "lfw_funneled", "deepfunneled": "lfw-deepfunneled"} @@ -1851,7 +1923,7 @@ class LFWPairsTestCase(LFWPeopleTestCase): class SintelTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Sintel - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"), pass_name=("clean", "final", "both")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"), pass_name=("clean", "final", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) FLOW_H, FLOW_W = 3, 4 @@ -1919,7 +1991,7 @@ class SintelTestCase(datasets_utils.ImageDatasetTestCase): class KittiFlowTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.KittiFlow - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -1979,7 +2051,7 @@ class KittiFlowTestCase(datasets_utils.ImageDatasetTestCase): class FlyingChairsTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FlyingChairs - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) FLOW_H, FLOW_W = 3, 4 @@ -2034,7 +2106,7 @@ class FlyingChairsTestCase(datasets_utils.ImageDatasetTestCase): class FlyingThings3DTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FlyingThings3D - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "test"), pass_name=("clean", "final", "both"), camera=("left", "right", "both") ) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @@ -2171,7 +2243,7 @@ class Food101TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Food101 FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) def inject_fake_data(self, tmpdir: str, config): root_folder = pathlib.Path(tmpdir) / "food-101" @@ -2206,7 +2278,7 @@ class Food101TestCase(datasets_utils.ImageDatasetTestCase): class FGVCAircraftTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FGVCAircraft - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "val", "trainval", "test"), annotation_level=("variant", "family", "manufacturer") ) @@ -2289,7 +2361,7 @@ 
class DTDTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.DTD FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "test", "val"), # There is no need to test the whole matrix here, since each fold is treated exactly the same partition=(1, 5, 10), @@ -2323,7 +2395,7 @@ class DTDTestCase(datasets_utils.ImageDatasetTestCase): class FER2013TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FER2013 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) @@ -2358,7 +2430,7 @@ class GTSRBTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.GTSRB FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) def inject_fake_data(self, tmpdir: str, config): root_folder = os.path.join(tmpdir, "gtsrb") @@ -2408,7 +2480,7 @@ class CLEVRClassificationTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CLEVRClassification FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) def inject_fake_data(self, tmpdir, config): data_folder = pathlib.Path(tmpdir) / "clevr" / "CLEVR_v1.0" @@ -2440,7 +2512,7 @@ class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.OxfordIIITPet FEATURE_TYPES = (PIL.Image.Image, (int, PIL.Image.Image, tuple, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("trainval", "test"), target_types=("category", "segmentation", ["category", "segmentation"], []), ) @@ -2495,11 +2567,15 @@ class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase): breed_id = "-1" return (image_id, class_id, species, breed_id) + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.StanfordCars REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) def inject_fake_data(self, tmpdir, config): import scipy.io as io @@ -2543,7 +2619,7 @@ class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase): class Country211TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Country211 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "valid", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "valid", "test")) def inject_fake_data(self, tmpdir: str, config): split_folder = pathlib.Path(tmpdir) / "country211" / config["split"] @@ -2570,7 +2646,7 @@ class Country211TestCase(datasets_utils.ImageDatasetTestCase): class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Flowers102 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) REQUIRED_PACKAGES = ("scipy",) def inject_fake_data(self, tmpdir: str, config): @@ 
-2606,7 +2682,7 @@ class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): class PCAMTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.PCAM - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) REQUIRED_PACKAGES = ("h5py",) def inject_fake_data(self, tmpdir: str, config): @@ -2628,7 +2704,7 @@ class PCAMTestCase(datasets_utils.ImageDatasetTestCase): class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.RenderedSST2 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) SPLIT_TO_FOLDER = {"train": "train", "val": "valid", "test": "test"} def inject_fake_data(self, tmpdir: str, config): @@ -2650,7 +2726,7 @@ class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase): class Kitti2012StereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Kitti2012Stereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -2712,7 +2788,7 @@ class Kitti2012StereoTestCase(datasets_utils.ImageDatasetTestCase): class Kitti2015StereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Kitti2015Stereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -2850,7 +2926,7 @@ class CREStereoTestCase(datasets_utils.ImageDatasetTestCase): class FallingThingsStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FallingThingsStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(variant=("single", "mixed", "both")) + ADDITIONAL_CONFIGS = combinations_grid(variant=("single", "mixed", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @staticmethod @@ -2924,7 +3000,7 @@ class FallingThingsStereoTestCase(datasets_utils.ImageDatasetTestCase): class SceneFlowStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SceneFlowStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( variant=("FlyingThings3D", "Driving", "Monkaa"), pass_name=("clean", "final", "both") ) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @@ -3011,7 +3087,7 @@ class SceneFlowStereoTestCase(datasets_utils.ImageDatasetTestCase): class InStereo2k(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.InStereo2k FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) @staticmethod def _make_scene_folder(root: str, name: str, size: Tuple[int, int]): @@ -3053,7 +3129,7 @@ class InStereo2k(datasets_utils.ImageDatasetTestCase): class SintelStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SintelStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(pass_name=("final", "clean", "both")) + 
ADDITIONAL_CONFIGS = combinations_grid(pass_name=("final", "clean", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -3129,7 +3205,7 @@ class SintelStereoTestCase(datasets_utils.ImageDatasetTestCase): class ETH3DStereoestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.ETH3DStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @staticmethod @@ -3196,7 +3272,7 @@ class ETH3DStereoestCase(datasets_utils.ImageDatasetTestCase): class Middlebury2014StereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Middlebury2014Stereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "additional"), calibration=("perfect", "imperfect", "both"), use_ambient_views=(True, False), @@ -3287,5 +3363,47 @@ class Middlebury2014StereoTestCase(datasets_utils.ImageDatasetTestCase): pass +class TestDatasetWrapper: + def test_unknown_type(self): + unknown_object = object() + with pytest.raises( + TypeError, match=re.escape("is meant for subclasses of `torchvision.datasets.VisionDataset`") + ): + datasets.wrap_dataset_for_transforms_v2(unknown_object) + + def test_unknown_dataset(self): + class MyVisionDataset(datasets.VisionDataset): + pass + + dataset = MyVisionDataset("root") + + with pytest.raises(TypeError, match="No wrapper exist"): + datasets.wrap_dataset_for_transforms_v2(dataset) + + def test_missing_wrapper(self): + dataset = datasets.FakeData() + + with pytest.raises(TypeError, match="please open an issue"): + datasets.wrap_dataset_for_transforms_v2(dataset) + + def test_subclass(self, mocker): + from torchvision import tv_tensors + + sentinel = object() + mocker.patch.dict( + tv_tensors._dataset_wrapper.WRAPPER_FACTORIES, + clear=False, + values={datasets.FakeData: lambda dataset, target_keys: lambda idx, sample: sentinel}, + ) + + class MyFakeData(datasets.FakeData): + pass + + dataset = MyFakeData() + wrapped_dataset = datasets.wrap_dataset_for_transforms_v2(dataset) + + assert wrapped_dataset[0] is sentinel + + if __name__ == "__main__": unittest.main() diff --git a/test/test_datasets_download.py b/test/test_datasets_download.py index b44d954241baad190af69bd20f38c863171d9089..e99017d8b5589360ca2c60283b09fb54e4509f8a 100644 --- a/test/test_datasets_download.py +++ b/test/test_datasets_download.py @@ -2,6 +2,7 @@ import contextlib import itertools import tempfile import time +import traceback import unittest.mock import warnings from datetime import datetime @@ -13,13 +14,7 @@ from urllib.request import Request, urlopen import pytest from torchvision import datasets -from torchvision.datasets.utils import ( - _get_redirect_url, - check_integrity, - download_file_from_google_drive, - download_url, - USER_AGENT, -) +from torchvision.datasets.utils import _get_redirect_url, USER_AGENT def limit_requests_per_time(min_secs_between_requests=2.0): @@ -83,63 +78,65 @@ urlopen = resolve_redirects()(urlopen) @contextlib.contextmanager def log_download_attempts( - urls_and_md5s=None, - file="utils", - patch=True, - mock_auxiliaries=None, + urls, + *, + dataset_module, ): - def add_mock(stack, name, file, **kwargs): + def maybe_add_mock(*, module, name, stack, lst=None): + patcher = 
unittest.mock.patch(f"torchvision.datasets.{module}.{name}") + try: - return stack.enter_context(unittest.mock.patch(f"torchvision.datasets.{file}.{name}", **kwargs)) - except AttributeError as error: - if file != "utils": - return add_mock(stack, name, "utils", **kwargs) - else: - raise pytest.UsageError from error - - if urls_and_md5s is None: - urls_and_md5s = set() - if mock_auxiliaries is None: - mock_auxiliaries = patch + mock = stack.enter_context(patcher) + except AttributeError: + return - with contextlib.ExitStack() as stack: - url_mock = add_mock(stack, "download_url", file, wraps=None if patch else download_url) - google_drive_mock = add_mock( - stack, "download_file_from_google_drive", file, wraps=None if patch else download_file_from_google_drive - ) + if lst is not None: + lst.append(mock) - if mock_auxiliaries: - add_mock(stack, "extract_archive", file) + with contextlib.ExitStack() as stack: + download_url_mocks = [] + download_file_from_google_drive_mocks = [] + for module in [dataset_module, "utils"]: + maybe_add_mock(module=module, name="download_url", stack=stack, lst=download_url_mocks) + maybe_add_mock( + module=module, + name="download_file_from_google_drive", + stack=stack, + lst=download_file_from_google_drive_mocks, + ) + maybe_add_mock(module=module, name="extract_archive", stack=stack) try: - yield urls_and_md5s + yield finally: - for args, kwargs in url_mock.call_args_list: - url = args[0] - md5 = args[-1] if len(args) == 4 else kwargs.get("md5") - urls_and_md5s.add((url, md5)) + for download_url_mock in download_url_mocks: + for args, kwargs in download_url_mock.call_args_list: + urls.append(args[0] if args else kwargs["url"]) - for args, kwargs in google_drive_mock.call_args_list: - id = args[0] - url = f"https://drive.google.com/file/d/{id}" - md5 = args[3] if len(args) == 4 else kwargs.get("md5") - urls_and_md5s.add((url, md5)) + for download_file_from_google_drive_mock in download_file_from_google_drive_mocks: + for args, kwargs in download_file_from_google_drive_mock.call_args_list: + file_id = args[0] if args else kwargs["file_id"] + urls.append(f"https://drive.google.com/file/d/{file_id}") def retry(fn, times=1, wait=5.0): - msgs = [] + tbs = [] for _ in range(times + 1): try: return fn() except AssertionError as error: - msgs.append(str(error)) + tbs.append("".join(traceback.format_exception(type(error), error, error.__traceback__))) time.sleep(wait) else: raise AssertionError( "\n".join( ( - f"Assertion failed {times + 1} times with {wait:.1f} seconds intermediate wait time.\n", - *(f"{idx}: {error}" for idx, error in enumerate(msgs, 1)), + "\n", + *[f"{'_' * 40} {idx:2d} {'_' * 40}\n\n{tb}" for idx, tb in enumerate(tbs, 1)], + ( + f"Assertion failed {times + 1} times with {wait:.1f} seconds intermediate wait time. " + f"You can find the full tracebacks above." + ), ) ) ) @@ -149,10 +146,12 @@ def retry(fn, times=1, wait=5.0): def assert_server_response_ok(): try: yield - except URLError as error: - raise AssertionError("The request timed out.") from error except HTTPError as error: raise AssertionError(f"The server returned {error.code}: {error.reason}.") from error + except URLError as error: + raise AssertionError( + "Connection not possible due to SSL." if "SSL" in str(error) else "The request timed out."
+ ) from error except RecursionError as error: raise AssertionError(str(error)) from error @@ -163,45 +162,14 @@ def assert_url_is_accessible(url, timeout=5.0): urlopen(request, timeout=timeout) -def assert_file_downloads_correctly(url, md5, tmpdir, timeout=5.0): - file = path.join(tmpdir, path.basename(url)) - with assert_server_response_ok(): - with open(file, "wb") as fh: - request = Request(url, headers={"User-Agent": USER_AGENT}) - response = urlopen(request, timeout=timeout) - fh.write(response.read()) - - assert check_integrity(file, md5=md5), "The MD5 checksums mismatch" - - -class DownloadConfig: - def __init__(self, url, md5=None, id=None): - self.url = url - self.md5 = md5 - self.id = id or url +def collect_urls(dataset_cls, *args, **kwargs): + urls = [] + with contextlib.suppress(Exception), log_download_attempts( + urls, dataset_module=dataset_cls.__module__.split(".")[-1] + ): + dataset_cls(*args, **kwargs) - def __repr__(self) -> str: - return self.id - - -def make_download_configs(urls_and_md5s, name=None): - return [ - DownloadConfig(url, md5=md5, id=f"{name}, {url}" if name is not None else None) for url, md5 in urls_and_md5s - ] - - -def collect_download_configs(dataset_loader, name=None, **kwargs): - urls_and_md5s = set() - try: - with log_download_attempts(urls_and_md5s=urls_and_md5s, **kwargs): - dataset = dataset_loader() - except Exception: - dataset = None - - if name is None and dataset is not None: - name = type(dataset).__name__ - - return make_download_configs(urls_and_md5s, name) + return [(url, f"{dataset_cls.__name__}, {url}") for url in urls] # This is a workaround since fixtures, such as the built-in tmp_dir, can only be used within a test but not within a @@ -216,12 +184,14 @@ def root(): def places365(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.Places365(ROOT, split=split, small=small, download=True), - name=f"Places365, {split}, {'small' if small else 'large'}", - file="places365", + return itertools.chain.from_iterable( + [ + collect_urls( + datasets.Places365, + ROOT, + split=split, + small=small, + download=True, ) for split, small in itertools.product(("train-standard", "train-challenge", "val"), (False, True)) ] @@ -229,30 +199,26 @@ def places365(): def caltech101(): - return collect_download_configs(lambda: datasets.Caltech101(ROOT, download=True), name="Caltech101") + return collect_urls(datasets.Caltech101, ROOT, download=True) def caltech256(): - return collect_download_configs(lambda: datasets.Caltech256(ROOT, download=True), name="Caltech256") + return collect_urls(datasets.Caltech256, ROOT, download=True) def cifar10(): - return collect_download_configs(lambda: datasets.CIFAR10(ROOT, download=True), name="CIFAR10") + return collect_urls(datasets.CIFAR10, ROOT, download=True) def cifar100(): - return collect_download_configs(lambda: datasets.CIFAR100(ROOT, download=True), name="CIFAR100") + return collect_urls(datasets.CIFAR100, ROOT, download=True) def voc(): # TODO: Also test the "2007-test" key - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.VOCSegmentation(ROOT, year=year, download=True), - name=f"VOC, {year}", - file="voc", - ) + return itertools.chain.from_iterable( + [ + collect_urls(datasets.VOCSegmentation, ROOT, year=year, download=True) for year in ("2007", "2008", "2009", "2010", "2011", "2012") ] ) @@ -260,55 +226,42 @@ def voc(): def mnist(): with unittest.mock.patch.object(datasets.MNIST, "mirrors", datasets.MNIST.mirrors[-1:]): - return 
collect_download_configs(lambda: datasets.MNIST(ROOT, download=True), name="MNIST") + return collect_urls(datasets.MNIST, ROOT, download=True) def fashion_mnist(): - return collect_download_configs(lambda: datasets.FashionMNIST(ROOT, download=True), name="FashionMNIST") + return collect_urls(datasets.FashionMNIST, ROOT, download=True) def kmnist(): - return collect_download_configs(lambda: datasets.KMNIST(ROOT, download=True), name="KMNIST") + return collect_urls(datasets.KMNIST, ROOT, download=True) def emnist(): # the 'split' argument can be any valid one, since everything is downloaded anyway - return collect_download_configs(lambda: datasets.EMNIST(ROOT, split="byclass", download=True), name="EMNIST") + return collect_urls(datasets.EMNIST, ROOT, split="byclass", download=True) def qmnist(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.QMNIST(ROOT, what=what, download=True), - name=f"QMNIST, {what}", - file="mnist", - ) - for what in ("train", "test", "nist") - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.QMNIST, ROOT, what=what, download=True) for what in ("train", "test", "nist")] ) +def moving_mnist(): + return collect_urls(datasets.MovingMNIST, ROOT, download=True) + + def omniglot(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.Omniglot(ROOT, background=background, download=True), - name=f"Omniglot, {'background' if background else 'evaluation'}", - ) - for background in (True, False) - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.Omniglot, ROOT, background=background, download=True) for background in (True, False)] ) def phototour(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.PhotoTour(ROOT, name=name, download=True), - name=f"PhotoTour, {name}", - file="phototour", - ) + return itertools.chain.from_iterable( + [ + collect_urls(datasets.PhotoTour, ROOT, name=name, download=True) # The names postfixed with '_harris' point to the domain 'matthewalunbrown.com'. For some reason all # requests timeout from within CI. They are disabled until this is resolved. 
for name in ("notredame", "yosemite", "liberty") # "notredame_harris", "yosemite_harris", "liberty_harris" @@ -317,91 +270,51 @@ def phototour(): def sbdataset(): - return collect_download_configs( - lambda: datasets.SBDataset(ROOT, download=True), - name="SBDataset", - file="voc", - ) + return collect_urls(datasets.SBDataset, ROOT, download=True) def sbu(): - return collect_download_configs( - lambda: datasets.SBU(ROOT, download=True), - name="SBU", - file="sbu", - ) + return collect_urls(datasets.SBU, ROOT, download=True) def semeion(): - return collect_download_configs( - lambda: datasets.SEMEION(ROOT, download=True), - name="SEMEION", - file="semeion", - ) + return collect_urls(datasets.SEMEION, ROOT, download=True) def stl10(): - return collect_download_configs( - lambda: datasets.STL10(ROOT, download=True), - name="STL10", - ) + return collect_urls(datasets.STL10, ROOT, download=True) def svhn(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.SVHN(ROOT, split=split, download=True), - name=f"SVHN, {split}", - file="svhn", - ) - for split in ("train", "test", "extra") - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.SVHN, ROOT, split=split, download=True) for split in ("train", "test", "extra")] ) def usps(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.USPS(ROOT, train=train, download=True), - name=f"USPS, {'train' if train else 'test'}", - file="usps", - ) - for train in (True, False) - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.USPS, ROOT, train=train, download=True) for train in (True, False)] ) def celeba(): - return collect_download_configs( - lambda: datasets.CelebA(ROOT, download=True), - name="CelebA", - file="celeba", - ) + return collect_urls(datasets.CelebA, ROOT, download=True) def widerface(): - return collect_download_configs( - lambda: datasets.WIDERFace(ROOT, download=True), - name="WIDERFace", - file="widerface", - ) + return collect_urls(datasets.WIDERFace, ROOT, download=True) def kinetics(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.Kinetics( - path.join(ROOT, f"Kinetics{num_classes}"), - frames_per_clip=1, - num_classes=num_classes, - split=split, - download=True, - ), - name=f"Kinetics, {num_classes}, {split}", - file="kinetics", + return itertools.chain.from_iterable( + [ + collect_urls( + datasets.Kinetics, + path.join(ROOT, f"Kinetics{num_classes}"), + frames_per_clip=1, + num_classes=num_classes, + split=split, + download=True, ) for num_classes, split in itertools.product(("400", "600", "700"), ("train", "val")) ] @@ -409,58 +322,55 @@ def kinetics(): def kitti(): - return itertools.chain( - *[ - collect_download_configs( - lambda train=train: datasets.Kitti(ROOT, train=train, download=True), - name=f"Kitti, {'train' if train else 'test'}", - file="kitti", - ) - for train in (True, False) - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.Kitti, ROOT, train=train, download=True) for train in (True, False)] ) -def make_parametrize_kwargs(download_configs): - argvalues = [] - ids = [] - for config in download_configs: - argvalues.append((config.url, config.md5)) - ids.append(config.id) - - return dict(argnames=("url", "md5"), argvalues=argvalues, ids=ids) - - -@pytest.mark.parametrize( - **make_parametrize_kwargs( - itertools.chain( - caltech101(), - caltech256(), - cifar10(), - cifar100(), - # The VOC download server is unstable. See https://github.com/pytorch/vision/issues/2953 for details. 
- # voc(), - mnist(), - fashion_mnist(), - kmnist(), - emnist(), - qmnist(), - omniglot(), - phototour(), - sbdataset(), - sbu(), - semeion(), - stl10(), - svhn(), - usps(), - celeba(), - widerface(), - kinetics(), - kitti(), - ) +def stanford_cars(): + return itertools.chain.from_iterable( + [collect_urls(datasets.StanfordCars, ROOT, split=split, download=True) for split in ["train", "test"]] + ) + + +def url_parametrization(*dataset_urls_and_ids_fns): + return pytest.mark.parametrize( + "url", + [ + pytest.param(url, id=id) + for dataset_urls_and_ids_fn in dataset_urls_and_ids_fns + for url, id in sorted(set(dataset_urls_and_ids_fn())) + ], ) + + +@url_parametrization( + caltech101, + caltech256, + cifar10, + cifar100, + # The VOC download server is unstable. See https://github.com/pytorch/vision/issues/2953 for details. + # voc, + mnist, + fashion_mnist, + kmnist, + emnist, + qmnist, + omniglot, + phototour, + sbdataset, + semeion, + stl10, + svhn, + usps, + celeba, + widerface, + kinetics, + kitti, + places365, + sbu, ) -def test_url_is_accessible(url, md5): +def test_url_is_accessible(url): """ If you see this test failing, find the offending dataset in the parametrization and move it to ``test_url_is_not_accessible`` and link an issue detailing the problem. @@ -468,15 +378,11 @@ def test_url_is_accessible(url, md5): retry(lambda: assert_url_is_accessible(url)) -@pytest.mark.parametrize( - **make_parametrize_kwargs( - itertools.chain( - places365(), # https://github.com/pytorch/vision/issues/6268 - ) - ) +@url_parametrization( + stanford_cars, # https://github.com/pytorch/vision/issues/7545 ) @pytest.mark.xfail -def test_url_is_not_accessible(url, md5): +def test_url_is_not_accessible(url): """ As the name implies, this test is the 'inverse' of ``test_url_is_accessible``. Since the download servers are beyond our control, some files might not be accessible for longer stretches of time. Still, we want to know if they @@ -486,8 +392,3 @@ def test_url_is_not_accessible(url, md5): ``test_url_is_accessible``. 
""" retry(lambda: assert_url_is_accessible(url)) - - -@pytest.mark.parametrize(**make_parametrize_kwargs(itertools.chain())) -def test_file_downloads_correctly(url, md5): - retry(lambda: assert_file_downloads_correctly(url, md5)) diff --git a/test/test_datasets_utils.py b/test/test_datasets_utils.py index ec68fd72a5be464c0b3fb7d2b1ef1e48e98f66fc..4e30dfab2cc6e99424da8f66d87ceccfa1acbbd4 100644 --- a/test/test_datasets_utils.py +++ b/test/test_datasets_utils.py @@ -7,7 +7,9 @@ import tarfile import zipfile import pytest +import torch import torchvision.datasets.utils as utils +from common_utils import assert_equal from torch._utils_internal import get_file_path_2 from torchvision.datasets.folder import make_dataset from torchvision.datasets.utils import _COMPRESSED_FILE_OPENERS @@ -215,6 +217,24 @@ class TestDatasetsUtils: pytest.raises(ValueError, utils.verify_str_arg, 0, ("a",), "arg") pytest.raises(ValueError, utils.verify_str_arg, "b", ("a",), "arg") + @pytest.mark.parametrize( + ("dtype", "actual_hex", "expected_hex"), + [ + (torch.uint8, "01 23 45 67 89 AB CD EF", "01 23 45 67 89 AB CD EF"), + (torch.float16, "01 23 45 67 89 AB CD EF", "23 01 67 45 AB 89 EF CD"), + (torch.int32, "01 23 45 67 89 AB CD EF", "67 45 23 01 EF CD AB 89"), + (torch.float64, "01 23 45 67 89 AB CD EF", "EF CD AB 89 67 45 23 01"), + ], + ) + def test_flip_byte_order(self, dtype, actual_hex, expected_hex): + def to_tensor(hex): + return torch.frombuffer(bytes.fromhex(hex), dtype=dtype) + + assert_equal( + utils._flip_byte_order(to_tensor(actual_hex)), + to_tensor(expected_hex), + ) + @pytest.mark.parametrize( ("kwargs", "expected_error_msg"), diff --git a/test/test_extended_models.py b/test/test_extended_models.py index c467564c9c4a46af9e546430770f3ff38d2d511f..0c918c0afd1a64c3762e6d816b991d5a4f726f88 100644 --- a/test/test_extended_models.py +++ b/test/test_extended_models.py @@ -1,12 +1,15 @@ +import copy import os +import pickle import pytest import test_models as TM import torch +from common_extended_utils import get_file_size_mb, get_ops from torchvision import models -from torchvision.models._api import get_model_weights, Weights, WeightsEnum +from torchvision.models import get_model_weights, Weights, WeightsEnum from torchvision.models._utils import handle_legacy_interface - +from torchvision.models.detection.backbone_utils import mobilenet_backbone, resnet_fpn_backbone run_if_test_with_extended = pytest.mark.skipif( os.getenv("PYTORCH_TEST_WITH_EXTENDED", "0") != "1", @@ -59,17 +62,59 @@ def test_get_model_weights(name, weight): assert models.get_model_weights(name) == weight +@pytest.mark.parametrize("copy_fn", [copy.copy, copy.deepcopy]) +@pytest.mark.parametrize( + "name", + [ + "resnet50", + "retinanet_resnet50_fpn_v2", + "raft_large", + "quantized_resnet50", + "lraspp_mobilenet_v3_large", + "mvit_v1_b", + ], +) +def test_weights_copyable(copy_fn, name): + for weights in list(models.get_model_weights(name)): + # It is somewhat surprising that (deep-)copying is an identity operation here, but this is the default behavior + # of enums: https://docs.python.org/3/howto/enum.html#enum-members-aka-instances + # Checking for equality, i.e. `==`, is sufficient (and even preferable) for our use case, should we need to drop + # support for the identity operation in the future. 
+ assert copy_fn(weights) is weights + + +@pytest.mark.parametrize( + "name", + [ + "resnet50", + "retinanet_resnet50_fpn_v2", + "raft_large", + "quantized_resnet50", + "lraspp_mobilenet_v3_large", + "mvit_v1_b", + ], +) +def test_weights_deserializable(name): + for weights in list(models.get_model_weights(name)): + # It is somewhat surprising that deserialization is an identity operation here, but this is the default behavior + # of enums: https://docs.python.org/3/howto/enum.html#enum-members-aka-instances + # Checking for equality, i.e. `==`, is sufficient (and even preferable) for our use case, should we need to drop + # support for the identity operation in the future. + assert pickle.loads(pickle.dumps(weights)) is weights + + +def get_models_from_module(module): + return [ + v.__name__ + for k, v in module.__dict__.items() + if callable(v) and k[0].islower() and k[0] != "_" and k not in models._api.__all__ + ] + + @pytest.mark.parametrize( "module", [models, models.detection, models.quantization, models.segmentation, models.video, models.optical_flow] ) def test_list_models(module): - def get_models_from_module(module): - return [ - v.__name__ - for k, v in module.__dict__.items() - if callable(v) and k[0].islower() and k[0] != "_" and k not in models._api.__all__ - ] - a = set(get_models_from_module(module)) b = set(x.replace("quantized_", "") for x in models.list_models(module)) @@ -77,6 +122,65 @@ def test_list_models(module): assert a == b +@pytest.mark.parametrize( + "include_filters", + [ + None, + [], + (), + "", + "*resnet*", + ["*alexnet*"], + "*not-existing-model-for-test?", + ["*resnet*", "*alexnet*"], + ["*resnet*", "*alexnet*", "*not-existing-model-for-test?"], + ("*resnet*", "*alexnet*"), + set(["*resnet*", "*alexnet*"]), + ], +) +@pytest.mark.parametrize( + "exclude_filters", + [ + None, + [], + (), + "", + "*resnet*", + ["*alexnet*"], + ["*not-existing-model-for-test?"], + ["resnet34", "*not-existing-model-for-test?"], + ["resnet34", "*resnet1*"], + ("resnet34", "*resnet1*"), + set(["resnet34", "*resnet1*"]), + ], +) +def test_list_models_filters(include_filters, exclude_filters): + actual = set(models.list_models(models, include=include_filters, exclude=exclude_filters)) + classification_models = set(get_models_from_module(models)) + + if isinstance(include_filters, str): + include_filters = [include_filters] + if isinstance(exclude_filters, str): + exclude_filters = [exclude_filters] + + if include_filters: + expected = set() + for include_f in include_filters: + include_f = include_f.strip("*?") + expected = expected | set(x for x in classification_models if include_f in x) + else: + expected = classification_models + + if exclude_filters: + for exclude_f in exclude_filters: + exclude_f = exclude_f.strip("*?") + if exclude_f != "": + a_exclude = set(x for x in classification_models if exclude_f in x) + expected = expected - a_exclude + + assert expected == actual + + @pytest.mark.parametrize( "name, weight", [ @@ -111,6 +215,22 @@ def test_naming_conventions(model_fn): assert len(weights_enum) == 0 or hasattr(weights_enum, "DEFAULT") +detection_models_input_dims = { + "fasterrcnn_mobilenet_v3_large_320_fpn": (320, 320), + "fasterrcnn_mobilenet_v3_large_fpn": (800, 800), + "fasterrcnn_resnet50_fpn": (800, 800), + "fasterrcnn_resnet50_fpn_v2": (800, 800), + "fcos_resnet50_fpn": (800, 800), + "keypointrcnn_resnet50_fpn": (1333, 1333), + "maskrcnn_resnet50_fpn": (800, 800), + "maskrcnn_resnet50_fpn_v2": (800, 800), + "retinanet_resnet50_fpn": (800, 800), + 
"retinanet_resnet50_fpn_v2": (800, 800), + "ssd300_vgg16": (300, 300), + "ssdlite320_mobilenet_v3_large": (320, 320), +} + + @pytest.mark.parametrize( "model_fn", TM.list_model_fns(models) @@ -122,6 +242,9 @@ def test_naming_conventions(model_fn): ) @run_if_test_with_extended def test_schema_meta_validation(model_fn): + if model_fn.__name__ == "maskrcnn_resnet50_fpn_v2": + pytest.skip(reason="FIXME https://github.com/pytorch/vision/issues/7349") + # list of all possible supported high-level fields for weights meta-data permitted_fields = { "backend", @@ -135,11 +258,13 @@ def test_schema_meta_validation(model_fn): "recipe", "unquantized", "_docs", + "_ops", + "_file_size", } # mandatory fields for each computer vision task classification_fields = {"categories", ("_metrics", "ImageNet-1K", "acc@1"), ("_metrics", "ImageNet-1K", "acc@5")} defaults = { - "all": {"_metrics", "min_size", "num_params", "recipe", "_docs"}, + "all": {"_metrics", "min_size", "num_params", "recipe", "_docs", "_file_size", "_ops"}, "models": classification_fields, "detection": {"categories", ("_metrics", "COCO-val2017", "box_map")}, "quantization": classification_fields | {"backend", "unquantized"}, @@ -160,7 +285,7 @@ def test_schema_meta_validation(model_fn): pytest.skip(f"Model '{model_name}' doesn't have any pre-trained weights.") problematic_weights = {} - incorrect_params = [] + incorrect_meta = [] bad_names = [] for w in weights_enum: actual_fields = set(w.meta.keys()) @@ -173,24 +298,47 @@ def test_schema_meta_validation(model_fn): unsupported_fields = set(w.meta.keys()) - permitted_fields if missing_fields or unsupported_fields: problematic_weights[w] = {"missing": missing_fields, "unsupported": unsupported_fields} - if w == weights_enum.DEFAULT: + + if w == weights_enum.DEFAULT or any(w.meta[k] != weights_enum.DEFAULT.meta[k] for k in ["num_params", "_ops"]): if module_name == "quantization": # parameters() count doesn't work well with quantization, so we check against the non-quantized unquantized_w = w.meta.get("unquantized") - if unquantized_w is not None and w.meta.get("num_params") != unquantized_w.meta.get("num_params"): - incorrect_params.append(w) + if unquantized_w is not None: + if w.meta.get("num_params") != unquantized_w.meta.get("num_params"): + incorrect_meta.append((w, "num_params")) + + # the methodology for quantized ops count doesn't work as well, so we take unquantized FLOPs + # instead + if w.meta["_ops"] != unquantized_w.meta.get("_ops"): + incorrect_meta.append((w, "_ops")) + else: - if w.meta.get("num_params") != sum(p.numel() for p in model_fn(weights=w).parameters()): - incorrect_params.append(w) - else: - if w.meta.get("num_params") != weights_enum.DEFAULT.meta.get("num_params"): - if w.meta.get("num_params") != sum(p.numel() for p in model_fn(weights=w).parameters()): - incorrect_params.append(w) + # loading the model and using it for parameter and ops verification + model = model_fn(weights=w) + + if w.meta.get("num_params") != sum(p.numel() for p in model.parameters()): + incorrect_meta.append((w, "num_params")) + + kwargs = {} + if model_name in detection_models_input_dims: + # detection models have non default height and width + height, width = detection_models_input_dims[model_name] + kwargs = {"height": height, "width": width} + + if not model_fn.__name__.startswith("vit"): + # FIXME: https://github.com/pytorch/vision/issues/7871 + calculated_ops = get_ops(model=model, weight=w, **kwargs) + if calculated_ops != w.meta["_ops"]: + incorrect_meta.append((w, "_ops")) + if not 
w.name.isupper(): bad_names.append(w) + if get_file_size_mb(w) != w.meta.get("_file_size"): + incorrect_meta.append((w, "_file_size")) + assert not problematic_weights - assert not incorrect_params + assert not incorrect_meta assert not bad_names @@ -343,7 +491,11 @@ class TestHandleLegacyInterface: + TM.list_model_fns(models.quantization) + TM.list_model_fns(models.segmentation) + TM.list_model_fns(models.video) - + TM.list_model_fns(models.optical_flow), + + TM.list_model_fns(models.optical_flow) + + [ + lambda pretrained: resnet_fpn_backbone(backbone_name="resnet50", pretrained=pretrained), + lambda pretrained: mobilenet_backbone(backbone_name="mobilenet_v2", fpn=False, pretrained=pretrained), + ], ) @run_if_test_with_extended def test_pretrained_deprecation(self, model_fn): diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py index 9bdd4ab83a54e377da116bad73e8b1ee753d67e8..fb3f5744e54875561ec2981de1a35f1c792931ad 100644 --- a/test/test_functional_tensor.py +++ b/test/test_functional_tensor.py @@ -2,17 +2,18 @@ import colorsys import itertools import math import os -import re +import warnings from functools import partial from typing import Sequence import numpy as np +import PIL.Image import pytest import torch import torchvision.transforms as T +import torchvision.transforms._functional_pil as F_pil +import torchvision.transforms._functional_tensor as F_t import torchvision.transforms.functional as F -import torchvision.transforms.functional_pil as F_pil -import torchvision.transforms.functional_tensor as F_t from common_utils import ( _assert_approx_equal_tensor_to_pil, _assert_equal_tensor_to_pil, @@ -20,15 +21,20 @@ from common_utils import ( _create_data_batch, _test_fn_on_batch, assert_equal, - cpu_and_gpu, + cpu_and_cuda, needs_cuda, ) from torchvision.transforms import InterpolationMode -NEAREST, BILINEAR, BICUBIC = InterpolationMode.NEAREST, InterpolationMode.BILINEAR, InterpolationMode.BICUBIC +NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC = ( + InterpolationMode.NEAREST, + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR, + InterpolationMode.BICUBIC, +) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("fn", [F.get_image_size, F.get_image_num_channels, F.get_dimensions]) def test_image_sizes(device, fn): script_F = torch.jit.script(fn) @@ -66,7 +72,7 @@ class TestRotate: scripted_rotate = torch.jit.script(F.rotate) IMG_W = 26 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(7, 33), (26, IMG_W), (32, IMG_W)]) @pytest.mark.parametrize( "center", @@ -125,7 +131,7 @@ class TestRotate: f"{out_pil_tensor[0, :7, :7]}" ) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", ALL_DTYPES) def test_rotate_batch(self, device, dt): if dt == torch.float16 and device == "cpu": @@ -141,17 +147,9 @@ class TestRotate: def test_rotate_interpolation_type(self): tensor, _ = _create_data(26, 26) - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - res1 = F.rotate(tensor, 45, interpolation=2) - res2 = F.rotate(tensor, 45, interpolation=BILINEAR) - assert_equal(res1, res2) + res1 = F.rotate(tensor, 45, interpolation=PIL.Image.BILINEAR) + res2 = F.rotate(tensor, 45, interpolation=BILINEAR) + assert_equal(res1, res2) class TestAffine: @@ -159,7 +157,7 @@ class TestAffine: ALL_DTYPES = [None, torch.float32, torch.float64, torch.float16] scripted_affine = torch.jit.script(F.affine) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26), (32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) def test_identity_map(self, device, height, width, dt): @@ -182,7 +180,7 @@ class TestAffine: ) assert_equal(tensor, out_tensor, msg=f"{out_tensor[0, :5, :5]} vs {tensor[0, :5, :5]}") - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize( @@ -226,7 +224,7 @@ class TestAffine: # Tolerance : less than 6% of different pixels assert ratio_diff_pixels < 0.06 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize("angle", [90, 45, 15, -30, -60, -120]) @@ -260,7 +258,7 @@ class TestAffine: # Tolerance : less than 3% of different pixels assert ratio_diff_pixels < 0.03 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26), (32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize("t", [[10, 12], (-12, -13)]) @@ -285,7 +283,7 @@ class TestAffine: _assert_equal_tensor_to_pil(out_tensor, out_pil_img) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26), (32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize( @@ -295,24 +293,8 @@ class TestAffine: (33, (5, -4), 1.0, [0.0, 0.0], [0, 0, 0]), (45, [-5, 4], 1.2, [0.0, 0.0], (1, 2, 3)), (33, (-4, -8), 2.0, [0.0, 0.0], [255, 255, 255]), - ( - 85, - (10, -10), - 0.7, - [0.0, 0.0], - [ - 1, - ], - ), - ( - 0, - [0, 0], - 1.0, - [ - 35.0, - ], - (2.0,), - ), + (85, (10, -10), 0.7, [0.0, 0.0], [1]), + (0, [0, 0], 1.0, [35.0], (2.0,)), (-25, [0, 0], 1.2, [0.0, 15.0], None), (-45, [-10, 0], 0.7, [2.0, 5.0], None), (-45, [-10, -10], 1.2, [4.0, 5.0], None), @@ -346,7 +328,7 @@ class TestAffine: tol = 0.06 if device == "cuda" else 0.05 assert ratio_diff_pixels < tol - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", ALL_DTYPES) def test_batches(self, device, dt): if dt == torch.float16 and device == "cpu": @@ -359,21 +341,13 @@ class TestAffine: _test_fn_on_batch(batch_tensors, F.affine, angle=-43, translate=[-3, 4], scale=1.2, shear=[4.0, 5.0]) - @pytest.mark.parametrize("device", cpu_and_gpu()) - def test_warnings(self, device): + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_interpolation_type(self, device): tensor, pil_img = _create_data(26, 26, device=device) - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed 
in 0.15. " - "Please use InterpolationMode enum." - ), - ): - res1 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=2) - res2 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=BILINEAR) - assert_equal(res1, res2) + res1 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=PIL.Image.BILINEAR) + res2 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=BILINEAR) + assert_equal(res1, res2) def _get_data_dims_and_points_for_perspective(): @@ -399,22 +373,10 @@ def _get_data_dims_and_points_for_perspective(): return dims_and_points -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dims_and_points", _get_data_dims_and_points_for_perspective()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize( - "fill", - ( - None, - [0, 0, 0], - [1, 2, 3], - [255, 255, 255], - [ - 1, - ], - (2.0,), - ), -) +@pytest.mark.parametrize("fill", (None, [0, 0, 0], [1, 2, 3], [255, 255, 255], [1], (2.0,))) @pytest.mark.parametrize("fn", [F.perspective, torch.jit.script(F.perspective)]) def test_perspective_pil_vs_tensor(device, dims_and_points, dt, fill, fn): @@ -445,7 +407,7 @@ def test_perspective_pil_vs_tensor(device, dims_and_points, dt, fill, fn): assert ratio_diff_pixels < 0.05 -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dims_and_points", _get_data_dims_and_points_for_perspective()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) def test_perspective_batch(device, dims_and_points, dt): @@ -473,40 +435,21 @@ def test_perspective_batch(device, dims_and_points, dt): ) -def test_perspective_interpolation_warning(): - # assert changed type warning +def test_perspective_interpolation_type(): spoints = [[0, 0], [33, 0], [33, 25], [0, 25]] epoints = [[3, 2], [32, 3], [30, 24], [2, 25]] tensor = torch.randint(0, 256, (3, 26, 26)) - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - res1 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=2) - res2 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=BILINEAR) - assert_equal(res1, res2) + + res1 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=PIL.Image.BILINEAR) + res2 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=BILINEAR) + assert_equal(res1, res2) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize( - "size", - [ - 32, - 26, - [ - 32, - ], - [32, 32], - (32, 32), - [26, 35], - ], -) +@pytest.mark.parametrize("size", [32, 26, [32], [32, 32], (32, 32), [26, 35]]) @pytest.mark.parametrize("max_size", [None, 34, 40, 1000]) -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST]) +@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST, NEAREST_EXACT]) def test_resize(device, dt, size, max_size, interpolation): if dt == torch.float16 and device == "cpu": @@ -526,14 +469,12 @@ def test_resize(device, dt, size, max_size, interpolation): tensor = tensor.to(dt) batch_tensors = batch_tensors.to(dt) - resized_tensor = F.resize(tensor, size=size, interpolation=interpolation, max_size=max_size) - resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation, max_size=max_size) + resized_tensor = F.resize(tensor, size=size, interpolation=interpolation, max_size=max_size, antialias=True) + resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation, max_size=max_size, antialias=True) assert resized_tensor.size()[1:] == resized_pil_img.size[::-1] - if interpolation not in [ - NEAREST, - ]: + if interpolation != NEAREST: # We can not check values if mode = NEAREST, as results are different # E.g. resized_tensor = [[a, a, b, c, d, d, e, ...]] # E.g. resized_pil_img = [[a, b, c, c, d, e, f, ...]] @@ -543,36 +484,27 @@ def test_resize(device, dt, size, max_size, interpolation): resized_tensor_f = resized_tensor_f.to(torch.float) # Pay attention to high tolerance for MAE - _assert_approx_equal_tensor_to_pil(resized_tensor_f, resized_pil_img, tol=8.0) + _assert_approx_equal_tensor_to_pil(resized_tensor_f, resized_pil_img, tol=3.0) if isinstance(size, int): - script_size = [ - size, - ] + script_size = [size] else: script_size = size - resize_result = script_fn(tensor, size=script_size, interpolation=interpolation, max_size=max_size) + resize_result = script_fn(tensor, size=script_size, interpolation=interpolation, max_size=max_size, antialias=True) assert_equal(resized_tensor, resize_result) - _test_fn_on_batch(batch_tensors, F.resize, size=script_size, interpolation=interpolation, max_size=max_size) + _test_fn_on_batch( + batch_tensors, F.resize, size=script_size, interpolation=interpolation, max_size=max_size, antialias=True + ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_resize_asserts(device): tensor, pil_img = _create_data(26, 36, device=device) - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - res1 = F.resize(tensor, size=32, interpolation=2) - + res1 = F.resize(tensor, size=32, interpolation=PIL.Image.BILINEAR) res2 = F.resize(tensor, size=32, interpolation=BILINEAR) assert_equal(res1, res2) @@ -584,7 +516,7 @@ def test_resize_asserts(device): F.resize(img, size=32, max_size=32) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("size", [[96, 72], [96, 420], [420, 72]]) @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC]) @@ -603,7 +535,7 @@ def test_resize_antialias(device, dt, size, interpolation): tensor = tensor.to(dt) resized_tensor = F.resize(tensor, size=size, interpolation=interpolation, antialias=True) - resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation) + resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation, antialias=True) assert resized_tensor.size()[1:] == resized_pil_img.size[::-1] @@ -637,38 +569,21 @@ def test_resize_antialias(device, dt, size, interpolation): assert_equal(resized_tensor, resize_result) -@needs_cuda -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC]) -def test_assert_resize_antialias(interpolation): - - # Checks implementation on very large scales - # and catch TORCH_CHECK inside PyTorch implementation - torch.manual_seed(12) - tensor, _ = _create_data(1000, 1000, device="cuda") - - # Error message is not yet updated in pytorch nightly - # with pytest.raises(RuntimeError, match=r"Provided interpolation parameters can not be handled"): - with pytest.raises(RuntimeError, match=r"Too much shared memory required"): - F.resize(tensor, size=(5, 5), interpolation=interpolation, antialias=True) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("dt", [torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize("size", [[10, 7], [10, 42], [42, 7]]) -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC]) -def test_interpolate_antialias_backward(device, dt, size, interpolation): +def test_resize_antialias_default_warning(): - if dt == torch.float16 and device == "cpu": - # skip float16 on CPU case - return + img = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8) - torch.manual_seed(12) - x = (torch.rand(1, 32, 29, 3, dtype=torch.double, device=device).permute(0, 3, 1, 2).requires_grad_(True),) - resize = partial(F.resize, size=size, interpolation=interpolation, antialias=True) - assert torch.autograd.gradcheck(resize, x, eps=1e-8, atol=1e-6, rtol=1e-6, fast_mode=False) + match = "The default value of the antialias" + with pytest.warns(UserWarning, match=match): + F.resize(img, size=(20, 20)) + with pytest.warns(UserWarning, match=match): + F.resized_crop(img, 0, 0, 10, 10, size=(20, 20)) - x = (torch.rand(1, 3, 32, 29, dtype=torch.double, device=device, requires_grad=True),) - assert torch.autograd.gradcheck(resize, x, eps=1e-8, atol=1e-6, rtol=1e-6, fast_mode=False) + # For modes that aren't bicubic or bilinear, don't throw a warning + with warnings.catch_warnings(): + warnings.simplefilter("error") + F.resize(img, size=(20, 20), interpolation=NEAREST) + F.resized_crop(img, 0, 0, 10, 10, size=(20, 20), interpolation=NEAREST) def check_functional_vs_PIL_vs_scripted( @@ -708,7 +623,7 @@ def check_functional_vs_PIL_vs_scripted( _test_fn_on_batch(batch_tensors, fn, scripted_fn_atol=atol, **config) -@pytest.mark.parametrize("device", cpu_and_gpu()) 
+@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"brightness_factor": f} for f in (0.1, 0.5, 1.0, 1.34, 2.5)]) @pytest.mark.parametrize("channels", [1, 3]) @@ -724,7 +639,7 @@ def test_adjust_brightness(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("channels", [1, 3]) def test_invert(device, dtype, channels): @@ -733,7 +648,7 @@ def test_invert(device, dtype, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("config", [{"bits": bits} for bits in range(0, 8)]) @pytest.mark.parametrize("channels", [1, 3]) def test_posterize(device, config, channels): @@ -750,7 +665,7 @@ def test_posterize(device, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("config", [{"threshold": threshold} for threshold in [0, 64, 128, 192, 255]]) @pytest.mark.parametrize("channels", [1, 3]) def test_solarize1(device, config, channels): @@ -767,7 +682,7 @@ def test_solarize1(device, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"threshold": threshold} for threshold in [0.0, 0.25, 0.5, 0.75, 1.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -785,37 +700,45 @@ def test_solarize2(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [0.0, 0.25, 0.5, 0.75, 1.0]) -def test_solarize_threshold1_bound(threshold, device): - img = torch.rand((3, 12, 23)).to(device) - F_t.solarize(img, threshold) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [1.5]) -def test_solarize_threshold1_upper_bound(threshold, device): - img = torch.rand((3, 12, 23)).to(device) - with pytest.raises(TypeError, match="Threshold should be less than bound of img."): - F_t.solarize(img, threshold) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [0, 64, 128, 192, 255]) -def test_solarize_threshold2_bound(threshold, device): - img = torch.randint(0, 256, (3, 12, 23)).to(device) +@pytest.mark.parametrize( + ("dtype", "threshold"), + [ + *[ + (dtype, threshold) + for dtype, threshold in itertools.product( + [torch.float32, torch.float16], + [0.0, 0.25, 0.5, 0.75, 1.0], + ) + ], + *[(torch.uint8, threshold) for threshold in [0, 64, 128, 192, 255]], + *[(torch.int64, threshold) for threshold in [0, 2**32, 2**63 - 1]], + ], +) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_solarize_threshold_within_bound(threshold, dtype, device): + make_img = torch.rand if dtype.is_floating_point else partial(torch.randint, 0, torch.iinfo(dtype).max) + img = make_img((3, 12, 23), dtype=dtype, device=device) F_t.solarize(img, threshold) -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [260]) -def test_solarize_threshold2_upper_bound(threshold, device): - img = torch.randint(0, 256, (3, 12, 23)).to(device) +@pytest.mark.parametrize( + ("dtype", "threshold"), + [ + (torch.float32, 1.5), + 
(torch.float16, 1.5), + (torch.uint8, 260), + (torch.int64, 2**64), + ], +) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_solarize_threshold_above_bound(threshold, dtype, device): + make_img = torch.rand if dtype.is_floating_point else partial(torch.randint, 0, torch.iinfo(dtype).max) + img = make_img((3, 12, 23), dtype=dtype, device=device) with pytest.raises(TypeError, match="Threshold should be less than bound of img."): F_t.solarize(img, threshold) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"sharpness_factor": f} for f in [0.2, 0.5, 1.0, 1.5, 2.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -831,7 +754,7 @@ def test_adjust_sharpness(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("channels", [1, 3]) def test_autocontrast(device, dtype, channels): @@ -840,7 +763,7 @@ def test_autocontrast(device, dtype, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("channels", [1, 3]) def test_autocontrast_equal_minmax(device, dtype, channels): @@ -852,7 +775,7 @@ def test_autocontrast_equal_minmax(device, dtype, channels): assert (F.autocontrast(a)[0] == F.autocontrast(a[0])).all() -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("channels", [1, 3]) def test_equalize(device, channels): torch.use_deterministic_algorithms(False) @@ -869,7 +792,7 @@ def test_equalize(device, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"contrast_factor": f} for f in [0.2, 0.5, 1.0, 1.5, 2.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -879,7 +802,7 @@ def test_adjust_contrast(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"saturation_factor": f} for f in [0.5, 0.75, 1.0, 1.5, 2.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -889,7 +812,7 @@ def test_adjust_saturation(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"hue_factor": f} for f in [-0.45, -0.25, 0.0, 0.25, 0.45]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -899,7 +822,7 @@ def test_adjust_hue(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"gamma": g1, "gain": g2} for g1, g2 in zip([0.8, 1.0, 1.2], [0.7, 1.0, 1.3])]) @pytest.mark.parametrize("channels", [1, 3]) @@ -915,7 +838,7 @@ def test_adjust_gamma(device, dtype, config, channels): ) 
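Most of the parametrization changes in this file swap the old `cpu_and_gpu()` helper for `cpu_and_cuda()`, which is imported from `common_utils` and is not itself part of this diff. A plausible sketch of what such a device-parametrization helper could look like (the skipif condition below is an assumption for illustration; the real helper may use a custom mark instead):

import pytest
import torch

def cpu_and_cuda():
    # "cpu" always runs; "cuda" is skipped on machines without a CUDA device.
    return (
        "cpu",
        pytest.param(
            "cuda",
            marks=pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available"),
        ),
    )

# Typical use: @pytest.mark.parametrize("device", cpu_and_cuda())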
-@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("pad", [2, [3], [0, 3], (3, 3), [4, 2, 4, 3]]) @pytest.mark.parametrize( @@ -965,14 +888,16 @@ def test_pad(device, dt, pad, config): _test_fn_on_batch(batch_tensors, F.pad, padding=script_pad, **config) -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("mode", [NEAREST, BILINEAR, BICUBIC]) +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("mode", [NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC]) def test_resized_crop(device, mode): # test values of F.resized_crop in several cases: # 1) resize to the same size, crop to the same size => should be identity tensor, _ = _create_data(26, 36, device=device) - out_tensor = F.resized_crop(tensor, top=0, left=0, height=26, width=36, size=[26, 36], interpolation=mode) + out_tensor = F.resized_crop( + tensor, top=0, left=0, height=26, width=36, size=[26, 36], interpolation=mode, antialias=True + ) assert_equal(tensor, out_tensor, msg=f"{out_tensor[0, :5, :5]} vs {tensor[0, :5, :5]}") # 2) resize by half and crop a TL corner @@ -987,11 +912,18 @@ def test_resized_crop(device, mode): batch_tensors = _create_data_batch(26, 36, num_samples=4, device=device) _test_fn_on_batch( - batch_tensors, F.resized_crop, top=1, left=2, height=20, width=30, size=[10, 15], interpolation=NEAREST + batch_tensors, + F.resized_crop, + top=1, + left=2, + height=20, + width=30, + size=[10, 15], + interpolation=NEAREST, ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "func, args", [ @@ -1024,7 +956,7 @@ def test_assert_image_tensor(device, func, args): func(tensor, *args) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_vflip(device): script_vflip = torch.jit.script(F.vflip) @@ -1041,7 +973,7 @@ def test_vflip(device): _test_fn_on_batch(batch_tensors, F.vflip) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_hflip(device): script_hflip = torch.jit.script(F.hflip) @@ -1058,7 +990,7 @@ def test_hflip(device): _test_fn_on_batch(batch_tensors, F.hflip) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "top, left, height, width", [ @@ -1087,7 +1019,7 @@ def test_crop(device, top, left, height, width): _test_fn_on_batch(batch_tensors, F.crop, top=top, left=left, height=height, width=width) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("image_size", ("small", "large")) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) @@ -1141,7 +1073,7 @@ def test_gaussian_blur(device, image_size, dt, ksize, sigma, fn): torch.testing.assert_close(out, true_out, rtol=0.0, atol=1.0, msg=f"{ksize}, {sigma}") -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_hsv2rgb(device): scripted_fn = torch.jit.script(F_t._hsv2rgb) shape = (3, 100, 150) @@ -1172,7 +1104,7 @@ def test_hsv2rgb(device): _test_fn_on_batch(batch_tensors, F_t._hsv2rgb) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", 
cpu_and_cuda()) def test_rgb2hsv(device): scripted_fn = torch.jit.script(F_t._rgb2hsv) shape = (3, 150, 100) @@ -1211,7 +1143,7 @@ def test_rgb2hsv(device): _test_fn_on_batch(batch_tensors, F_t._rgb2hsv) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("num_output_channels", (3, 1)) def test_rgb_to_grayscale(device, num_output_channels): script_rgb_to_grayscale = torch.jit.script(F.rgb_to_grayscale) @@ -1230,7 +1162,7 @@ def test_rgb_to_grayscale(device, num_output_channels): _test_fn_on_batch(batch_tensors, F.rgb_to_grayscale, num_output_channels=num_output_channels) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_center_crop(device): script_center_crop = torch.jit.script(F.center_crop) @@ -1248,7 +1180,7 @@ def test_center_crop(device): _test_fn_on_batch(batch_tensors, F.center_crop, output_size=[10, 11]) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_five_crop(device): script_five_crop = torch.jit.script(F.five_crop) @@ -1282,7 +1214,7 @@ def test_five_crop(device): assert_equal(transformed_batch, s_transformed_batch) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_ten_crop(device): script_ten_crop = torch.jit.script(F.ten_crop) @@ -1328,7 +1260,7 @@ def test_elastic_transform_asserts(): _ = F.elastic_transform(img_tensor, displacement=torch.rand(1, 2)) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC]) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize( diff --git a/test/test_hub.py b/test/test_hub.py deleted file mode 100644 index d88c6fa2cd25b2a866748c21f45e5f4d6199d564..0000000000000000000000000000000000000000 --- a/test/test_hub.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -import shutil -import sys -import tempfile - -import pytest -import torch.hub as hub - - -def sum_of_model_parameters(model): - s = 0 - for p in model.parameters(): - s += p.sum() - return s - - -SUM_OF_PRETRAINED_RESNET18_PARAMS = -12703.9931640625 - - -@pytest.mark.skipif("torchvision" in sys.modules, reason="TestHub must start without torchvision imported") -class TestHub: - # Only run this check ONCE before all tests start. - # - If torchvision is imported before all tests start, e.g. we might find _C.so - # which doesn't exist in downloaded zip but in the installed wheel. - # - After the first test is run, torchvision is already in sys.modules due to - # Python cache as we run all hub tests in the same python process. 
- - def test_load_from_github(self): - hub_model = hub.load("pytorch/vision", "resnet18", weights="DEFAULT", progress=False) - assert sum_of_model_parameters(hub_model).item() == pytest.approx(SUM_OF_PRETRAINED_RESNET18_PARAMS) - - def test_set_dir(self): - temp_dir = tempfile.gettempdir() - hub.set_dir(temp_dir) - hub_model = hub.load("pytorch/vision", "resnet18", weights="DEFAULT", progress=False) - assert sum_of_model_parameters(hub_model).item() == pytest.approx(SUM_OF_PRETRAINED_RESNET18_PARAMS) - assert os.path.exists(temp_dir + "/pytorch_vision_master") - shutil.rmtree(temp_dir + "/pytorch_vision_master") - - def test_list_entrypoints(self): - entry_lists = hub.list("pytorch/vision", force_reload=True) - assert "resnet18" in entry_lists - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/test_image.py b/test/test_image.py index 7fcd54c9c8f43277517605025c96a34df1f2f62b..a87f5fa2d1e3ee7b5a0e2c94aab4772dd39645da 100644 --- a/test/test_image.py +++ b/test/test_image.py @@ -32,6 +32,7 @@ DAMAGED_JPEG = os.path.join(IMAGE_ROOT, "damaged_jpeg") DAMAGED_PNG = os.path.join(IMAGE_ROOT, "damaged_png") ENCODE_JPEG = os.path.join(IMAGE_ROOT, "encode_jpeg") INTERLACED_PNG = os.path.join(IMAGE_ROOT, "interlaced_png") +TOOSMALL_PNG = os.path.join(IMAGE_ROOT, "toosmall_png") IS_WINDOWS = sys.platform in ("win32", "cygwin") PILLOW_VERSION = tuple(int(x) for x in PILLOW_VERSION.split(".")) @@ -82,12 +83,9 @@ def test_decode_jpeg(img_path, pil_mode, mode): with Image.open(img_path) as img: is_cmyk = img.mode == "CMYK" if pil_mode is not None: - if is_cmyk: - # libjpeg does not support the conversion - pytest.xfail("Decoding a CMYK jpeg isn't supported") img = img.convert(pil_mode) img_pil = torch.from_numpy(np.array(img)) - if is_cmyk: + if is_cmyk and mode == ImageReadMode.UNCHANGED: # flip the colors to match libjpeg img_pil = 255 - img_pil @@ -193,6 +191,8 @@ def test_decode_png_errors(): decode_png(torch.randint(3, 5, (300,), dtype=torch.uint8)) with pytest.raises(RuntimeError, match="Out of bound read in decode_png"): decode_png(read_file(os.path.join(DAMAGED_PNG, "sigsegv.png"))) + with pytest.raises(RuntimeError, match="Content is too small for png"): + decode_png(read_file(os.path.join(TOOSMALL_PNG, "heapbof.png"))) @pytest.mark.parametrize( @@ -369,6 +369,13 @@ def test_decode_jpeg_cuda(mode, img_path, scripted): assert (img.float() - img_nvjpeg.cpu().float()).abs().mean() < 2 +@needs_cuda +def test_decode_image_cuda_raises(): + data = torch.randint(0, 127, size=(255,), device="cuda", dtype=torch.uint8) + with pytest.raises(RuntimeError): + decode_image(data) + + @needs_cuda @pytest.mark.parametrize("cuda_device", ("cuda", "cuda:0", torch.device("cuda"))) def test_decode_jpeg_cuda_device_param(cuda_device): @@ -412,77 +419,6 @@ def test_encode_jpeg_errors(): encode_jpeg(torch.empty((100, 100), dtype=torch.uint8)) -def _collect_if(cond): - # TODO: remove this once test_encode_jpeg_reference and test_write_jpeg_reference - # are removed - def _inner(test_func): - if cond: - return test_func - else: - return pytest.mark.dont_collect(test_func) - - return _inner - - -@_collect_if(cond=False) -@pytest.mark.parametrize( - "img_path", - [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], -) -def test_encode_jpeg_reference(img_path): - # This test is *wrong*. 
- # It compares a torchvision-encoded jpeg with a PIL-encoded jpeg (the reference), but it - # starts encoding the torchvision version from an image that comes from - # decode_jpeg, which can yield different results from pil.decode (see - # test_decode... which uses a high tolerance). - # Instead, we should start encoding from the exact same decoded image, for a - # valid comparison. This is done in test_encode_jpeg, but unfortunately - # these more correct tests fail on windows (probably because of a difference - # in libjpeg) between torchvision and PIL. - # FIXME: make the correct tests pass on windows and remove this. - dirname = os.path.dirname(img_path) - filename, _ = os.path.splitext(os.path.basename(img_path)) - write_folder = os.path.join(dirname, "jpeg_write") - expected_file = os.path.join(write_folder, f"{filename}_pil.jpg") - img = decode_jpeg(read_file(img_path)) - - with open(expected_file, "rb") as f: - pil_bytes = f.read() - pil_bytes = torch.as_tensor(list(pil_bytes), dtype=torch.uint8) - for src_img in [img, img.contiguous()]: - # PIL sets jpeg quality to 75 by default - jpeg_bytes = encode_jpeg(src_img, quality=75) - assert_equal(jpeg_bytes, pil_bytes) - - -@_collect_if(cond=False) -@pytest.mark.parametrize( - "img_path", - [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], -) -def test_write_jpeg_reference(img_path, tmpdir): - # FIXME: Remove this eventually, see test_encode_jpeg_reference - data = read_file(img_path) - img = decode_jpeg(data) - - basedir = os.path.dirname(img_path) - filename, _ = os.path.splitext(os.path.basename(img_path)) - torch_jpeg = os.path.join(tmpdir, f"{filename}_torch.jpg") - pil_jpeg = os.path.join(basedir, "jpeg_write", f"{filename}_pil.jpg") - - write_jpeg(img, torch_jpeg, quality=75) - - with open(torch_jpeg, "rb") as f: - torch_bytes = f.read() - - with open(pil_jpeg, "rb") as f: - pil_bytes = f.read() - - assert_equal(torch_bytes, pil_bytes) - - -# TODO: Remove the skip. See https://github.com/pytorch/vision/issues/5162. -@pytest.mark.skip("this test fails because PIL uses libjpeg-turbo") @pytest.mark.parametrize( "img_path", [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], @@ -501,8 +437,6 @@ def test_encode_jpeg(img_path): assert_equal(encoded_jpeg_torch, encoded_jpeg_pil) -# TODO: Remove the skip. See https://github.com/pytorch/vision/issues/5162. 
-@pytest.mark.skip("this test fails because PIL uses libjpeg-turbo") @pytest.mark.parametrize( "img_path", [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], diff --git a/test/test_models.py b/test/test_models.py index 5d2b5565a9e6bb913f275cb2ef653d7b693c8304..76bddebefe4ee479b1c218a128b5eb9631bbfa20 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -15,8 +15,9 @@ import torch import torch.fx import torch.nn as nn from _utils_internal import get_relative_path -from common_utils import cpu_and_gpu, freeze_rng_state, map_nested_tensor_object, needs_cuda, set_rng_seed -from torchvision import models +from common_utils import cpu_and_cuda, freeze_rng_state, map_nested_tensor_object, needs_cuda, set_rng_seed +from PIL import Image +from torchvision import models, transforms from torchvision.models import get_model_builder, list_models @@ -24,10 +25,57 @@ ACCEPT = os.getenv("EXPECTTEST_ACCEPT", "0") == "1" SKIP_BIG_MODEL = os.getenv("SKIP_BIG_MODEL", "1") == "1" +@contextlib.contextmanager +def disable_tf32(): + previous = torch.backends.cudnn.allow_tf32 + torch.backends.cudnn.allow_tf32 = False + try: + yield + finally: + torch.backends.cudnn.allow_tf32 = previous + + def list_model_fns(module): return [get_model_builder(name) for name in list_models(module)] +def _get_image(input_shape, real_image, device, dtype=None): + """This routine loads a real or random image based on `real_image` argument. + Currently, the real image is utilized for the following list of models: + - `retinanet_resnet50_fpn`, + - `retinanet_resnet50_fpn_v2`, + - `keypointrcnn_resnet50_fpn`, + - `fasterrcnn_resnet50_fpn`, + - `fasterrcnn_resnet50_fpn_v2`, + - `fcos_resnet50_fpn`, + - `maskrcnn_resnet50_fpn`, + - `maskrcnn_resnet50_fpn_v2`, + in `test_classification_model` and `test_detection_model`. 
+ To do so, a keyword argument `real_image` was added to the abovelisted models in `_model_params` + """ + if real_image: + # TODO: Maybe unify file discovery logic with test_image.py + GRACE_HOPPER = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "assets", "encode_jpeg", "grace_hopper_517x606.jpg" + ) + + img = Image.open(GRACE_HOPPER) + + original_width, original_height = img.size + + # make the image square + img = img.crop((0, 0, original_width, original_width)) + img = img.resize(input_shape[1:3]) + + convert_tensor = transforms.ToTensor() + image = convert_tensor(img) + assert tuple(image.size()) == input_shape + return image.to(device=device, dtype=dtype) + + # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests + return torch.rand(input_shape).to(device=device, dtype=dtype) + + @pytest.fixture def disable_weight_loading(mocker): """When testing models, the two slowest operations are the downloading of the weights to a file and loading them @@ -129,6 +177,7 @@ def _check_jit_scriptable(nn_module, args, unwrapper=None, eager_out=None): return imported sm = torch.jit.script(nn_module) + sm.eval() if eager_out is None: with torch.no_grad(), freeze_rng_state(): @@ -154,7 +203,8 @@ def _check_fx_compatible(model, inputs, eager_out=None): model_fx = torch.fx.symbolic_trace(model) if eager_out is None: eager_out = model(inputs) - fx_out = model_fx(inputs) + with torch.no_grad(), freeze_rng_state(): + fx_out = model_fx(inputs) torch.testing.assert_close(eager_out, fx_out) @@ -231,7 +281,6 @@ autocast_flaky_numerics = ( "maskrcnn_resnet50_fpn", "maskrcnn_resnet50_fpn_v2", "keypointrcnn_resnet50_fpn", - "fasterrcnn_resnet50_fpn", # See: https://github.com/pytorch/vision/issues/6655 ) # The tests for the following quantized models are flaky possibly due to inconsistent @@ -239,6 +288,11 @@ autocast_flaky_numerics = ( # tests under test_quantized_classification_model will be skipped for the following models. quantized_flaky_models = ("inception_v3", "resnet50") +# The tests for the following detection models are flaky. +# We run those tests on float64 to avoid floating point errors. +# FIXME: we shouldn't have to do that :'/ +detection_flaky_models = ("keypointrcnn_resnet50_fpn", "maskrcnn_resnet50_fpn", "maskrcnn_resnet50_fpn_v2") + # The following contains configuration parameters for all models which are used by # the _test_*_model methods. 
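Note on the helper added just above: disable_tf32() is built with contextlib.contextmanager, so the object it returns is also a ContextDecorator; that is why it can be applied as @disable_tf32() to test_classification_model further down. A minimal, self-contained sketch of the same save/flip/restore pattern (no_tf32 and flaky_matmul below are illustrative names, not part of the patch):

import contextlib

import torch


@contextlib.contextmanager
def no_tf32():
    # Save the global cuDNN flag, force convolutions to run in full float32
    # instead of TF32, and restore the previous value even if the body raises.
    previous = torch.backends.cudnn.allow_tf32
    torch.backends.cudnn.allow_tf32 = False
    try:
        yield
    finally:
        torch.backends.cudnn.allow_tf32 = previous


@no_tf32()          # usable as a decorator, like @disable_tf32() in this patch
def flaky_matmul(a, b):
    return a @ b


with no_tf32():     # or as an ordinary context manager
    _ = flaky_matmul(torch.rand(8, 8), torch.rand(8, 8))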
@@ -250,6 +304,7 @@ _model_params = { "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "retinanet_resnet50_fpn_v2": { "num_classes": 20, @@ -257,6 +312,7 @@ _model_params = { "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "keypointrcnn_resnet50_fpn": { "num_classes": 2, @@ -264,18 +320,21 @@ _model_params = { "max_size": 224, "box_score_thresh": 0.17, "input_shape": (3, 224, 224), + "real_image": True, }, "fasterrcnn_resnet50_fpn": { "num_classes": 20, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "fasterrcnn_resnet50_fpn_v2": { "num_classes": 20, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "fcos_resnet50_fpn": { "num_classes": 2, @@ -283,18 +342,21 @@ _model_params = { "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "maskrcnn_resnet50_fpn": { "num_classes": 10, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "maskrcnn_resnet50_fpn_v2": { "num_classes": 10, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "fasterrcnn_mobilenet_v3_large_fpn": { "box_score_thresh": 0.02076, @@ -614,13 +676,14 @@ def vitc_b_16(**kwargs: Any): @pytest.mark.parametrize("model_fn", [vitc_b_16]) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_vitc_models(model_fn, dev): test_classification_model(model_fn, dev) +@disable_tf32() # see: https://github.com/pytorch/vision/issues/7618 @pytest.mark.parametrize("model_fn", list_model_fns(models)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_classification_model(model_fn, dev): set_rng_seed(0) defaults = { @@ -633,13 +696,20 @@ def test_classification_model(model_fn, dev): kwargs = {**defaults, **_model_params.get(model_name, {})} num_classes = kwargs.get("num_classes") input_shape = kwargs.pop("input_shape") + real_image = kwargs.pop("real_image", False) model = model_fn(**kwargs) model.eval().to(device=dev) - # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests - x = torch.rand(input_shape).to(device=dev) + x = _get_image(input_shape=input_shape, real_image=real_image, device=dev) out = model(x) - _assert_expected(out.cpu(), model_name, prec=1e-3) + # FIXME: this if/else is nasty and only here to please our CI prior to the + # release. We rethink these tests altogether. + if model_name == "resnet101": + prec = 0.2 + else: + # FIXME: this is probably still way too high. 
+ prec = 0.1 + _assert_expected(out.cpu(), model_name, prec=prec) assert out.shape[-1] == num_classes _check_jit_scriptable(model, (x,), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) _check_fx_compatible(model, x, eager_out=out) @@ -656,7 +726,7 @@ def test_classification_model(model_fn, dev): @pytest.mark.parametrize("model_fn", list_model_fns(models.segmentation)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_segmentation_model(model_fn, dev): set_rng_seed(0) defaults = { @@ -672,7 +742,8 @@ def test_segmentation_model(model_fn, dev): model.eval().to(device=dev) # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests x = torch.rand(input_shape).to(device=dev) - out = model(x) + with torch.no_grad(), freeze_rng_state(): + out = model(x) def check_out(out): prec = 0.01 @@ -700,7 +771,7 @@ def test_segmentation_model(model_fn, dev): _check_fx_compatible(model, x, eager_out=out) if dev == "cuda": - with torch.cuda.amp.autocast(): + with torch.cuda.amp.autocast(), torch.no_grad(), freeze_rng_state(): out = model(x) # See autocast_flaky_numerics comment at top of file. if model_name not in autocast_flaky_numerics: @@ -720,7 +791,7 @@ def test_segmentation_model(model_fn, dev): @pytest.mark.parametrize("model_fn", list_model_fns(models.detection)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_detection_model(model_fn, dev): set_rng_seed(0) defaults = { @@ -729,15 +800,20 @@ def test_detection_model(model_fn, dev): "input_shape": (3, 300, 300), } model_name = model_fn.__name__ + if model_name in detection_flaky_models: + dtype = torch.float64 + else: + dtype = torch.get_default_dtype() kwargs = {**defaults, **_model_params.get(model_name, {})} input_shape = kwargs.pop("input_shape") + real_image = kwargs.pop("real_image", False) model = model_fn(**kwargs) - model.eval().to(device=dev) - # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests - x = torch.rand(input_shape).to(device=dev) + model.eval().to(device=dev, dtype=dtype) + x = _get_image(input_shape=input_shape, real_image=real_image, device=dev, dtype=dtype) model_input = [x] - out = model(model_input) + with torch.no_grad(), freeze_rng_state(): + out = model(model_input) assert model_input[0] is x def check_out(out): @@ -798,7 +874,7 @@ def test_detection_model(model_fn, dev): _check_jit_scriptable(model, ([x],), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) if dev == "cuda": - with torch.cuda.amp.autocast(): + with torch.cuda.amp.autocast(), torch.no_grad(), freeze_rng_state(): out = model(model_input) # See autocast_flaky_numerics comment at top of file. 
if model_name not in autocast_flaky_numerics: @@ -847,7 +923,7 @@ def test_detection_model_validation(model_fn): @pytest.mark.parametrize("model_fn", list_model_fns(models.video)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_video_model(model_fn, dev): set_rng_seed(0) # the default input shape is @@ -868,7 +944,7 @@ def test_video_model(model_fn, dev): # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests x = torch.rand(input_shape).to(device=dev) out = model(x) - _assert_expected(out.cpu(), model_name, prec=1e-5) + _assert_expected(out.cpu(), model_name, prec=0.1) assert out.shape[-1] == num_classes _check_jit_scriptable(model, (x,), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) _check_fx_compatible(model, x, eager_out=out) @@ -961,7 +1037,7 @@ def test_raft(model_fn, scripted): torch.manual_seed(0) # We need very small images, otherwise the pickle size would exceed the 50KB - # As a resut we need to override the correlation pyramid to not downsample + # As a result we need to override the correlation pyramid to not downsample # too much, otherwise we would get nan values (effective H and W would be # reduced to 1) corr_block = models.optical_flow.raft.CorrBlock(num_levels=2, radius=2) @@ -977,9 +1053,29 @@ def test_raft(model_fn, scripted): preds = model(img1, img2) flow_pred = preds[-1] # Tolerance is fairly high, but there are 2 * H * W outputs to check - # The .pkl were generated on the AWS cluter, on the CI it looks like the resuts are slightly different + # The .pkl were generated on the AWS cluter, on the CI it looks like the results are slightly different _assert_expected(flow_pred.cpu(), name=model_fn.__name__, atol=1e-2, rtol=1) +def test_presets_antialias(): + + img = torch.randint(0, 256, size=(1, 3, 224, 224), dtype=torch.uint8) + + match = "The default value of the antialias parameter" + with pytest.warns(UserWarning, match=match): + models.ResNet18_Weights.DEFAULT.transforms()(img) + with pytest.warns(UserWarning, match=match): + models.segmentation.DeepLabV3_ResNet50_Weights.DEFAULT.transforms()(img) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + models.ResNet18_Weights.DEFAULT.transforms(antialias=True)(img) + models.segmentation.DeepLabV3_ResNet50_Weights.DEFAULT.transforms(antialias=True)(img) + + models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT.transforms()(img) + models.video.R3D_18_Weights.DEFAULT.transforms()(img) + models.optical_flow.Raft_Small_Weights.DEFAULT.transforms()(img, img) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/test_models_detection_utils.py b/test/test_models_detection_utils.py index 09895057a9a0052b900287a40534986a7f5dc4a7..69703ab5817cd2e10067f3d4c4bdc5653874fe25 100644 --- a/test/test_models_detection_utils.py +++ b/test/test_models_detection_utils.py @@ -38,7 +38,7 @@ class TestModelsDetectionUtils: def test_resnet_fpn_backbone_frozen_layers(self, train_layers, exp_froz_params): # we know how many initial layers and parameters of the network should # be frozen for each trainable_backbone_layers parameter value - # i.e all 53 params are frozen if trainable_backbone_layers=0 + # i.e. 
all 53 params are frozen if trainable_backbone_layers=0 # ad first 24 params are frozen if trainable_backbone_layers=2 model = backbone_utils.resnet_fpn_backbone("resnet50", weights=None, trainable_layers=train_layers) # boolean list that is true if the param at that index is frozen diff --git a/test/test_onnx.py b/test/test_onnx.py index d5dae64b4d09aa4b5595057dc6dbda2fe25dae5b..19ed13b1a6d2b321e2db45d7f3ed4c2ee33e2504 100644 --- a/test/test_onnx.py +++ b/test/test_onnx.py @@ -1,6 +1,6 @@ import io from collections import OrderedDict -from typing import List, Tuple +from typing import List, Optional, Tuple import pytest import torch @@ -11,7 +11,7 @@ from torchvision.models.detection.image_list import ImageList from torchvision.models.detection.roi_heads import RoIHeads from torchvision.models.detection.rpn import AnchorGenerator, RegionProposalNetwork, RPNHead from torchvision.models.detection.transform import GeneralizedRCNNTransform -from torchvision.ops._register_onnx_ops import _onnx_opset_version +from torchvision.ops import _register_onnx_ops # In environments without onnxruntime we prefer to # invoke all tests in the repo and have this one skipped rather than fail. @@ -27,12 +27,15 @@ class TestONNXExporter: self, model, inputs_list, - tolerate_small_mismatch=False, do_constant_folding=True, dynamic_axes=None, output_names=None, input_names=None, + opset_version: Optional[int] = None, ): + if opset_version is None: + opset_version = _register_onnx_ops.BASE_ONNX_OPSET_VERSION + model.eval() onnx_io = io.BytesIO() @@ -46,10 +49,11 @@ class TestONNXExporter: torch_onnx_input, onnx_io, do_constant_folding=do_constant_folding, - opset_version=_onnx_opset_version, + opset_version=opset_version, dynamic_axes=dynamic_axes, input_names=input_names, output_names=output_names, + verbose=True, ) # validate the exported model with onnx runtime for test_inputs in inputs_list: @@ -59,9 +63,9 @@ class TestONNXExporter: test_ouputs = model(*test_inputs) if isinstance(test_ouputs, torch.Tensor): test_ouputs = (test_ouputs,) - self.ort_validate(onnx_io, test_inputs, test_ouputs, tolerate_small_mismatch) + self.ort_validate(onnx_io, test_inputs, test_ouputs) - def ort_validate(self, onnx_io, inputs, outputs, tolerate_small_mismatch=False): + def ort_validate(self, onnx_io, inputs, outputs): inputs, _ = torch.jit._flatten(inputs) outputs, _ = torch.jit._flatten(outputs) @@ -81,13 +85,7 @@ class TestONNXExporter: ort_outs = ort_session.run(None, ort_inputs) for i in range(0, len(outputs)): - try: - torch.testing.assert_allclose(outputs[i], ort_outs[i], rtol=1e-03, atol=1e-05) - except AssertionError as error: - if tolerate_small_mismatch: - assert "(0.00%)" in str(error), str(error) - else: - raise + torch.testing.assert_close(outputs[i], ort_outs[i], rtol=1e-03, atol=1e-05) def test_nms(self): num_boxes = 100 @@ -140,39 +138,39 @@ class TestONNXExporter: model = ops.RoIAlign((5, 5), 1, -1) self.run_model(model, [(x, single_roi)]) - @pytest.mark.skip(reason="ROIAlign with aligned=True is not supported in ONNX, but will be supported in opset 16.") def test_roi_align_aligned(self): + supported_onnx_version = _register_onnx_ops._ONNX_OPSET_VERSION_16 x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 1.5, 1.5, 3, 3]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1, 2, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = 
torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 0.5, 3, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1.8, 2, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((2, 2), 2.5, 0, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((2, 2), 2.5, -1, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) - @pytest.mark.skip(reason="Issue in exporting ROIAlign with aligned = True for malformed boxes") def test_roi_align_malformed_boxes(self): + supported_onnx_version = _register_onnx_ops._ONNX_OPSET_VERSION_16 x = torch.randn(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 2, 0.3, 1.5, 1.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1, 1, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) def test_roi_pool(self): x = torch.rand(1, 1, 10, 10, dtype=torch.float32) @@ -320,7 +318,6 @@ class TestONNXExporter: self.run_model( model, [(images, features), (images2, test_features)], - tolerate_small_mismatch=True, input_names=["input1", "input2", "input3", "input4", "input5", "input6"], dynamic_axes={ "input1": [0, 1, 2, 3], @@ -396,7 +393,6 @@ class TestONNXExporter: self.run_model( model, [(images, features), (images2, test_features)], - tolerate_small_mismatch=True, input_names=["input1", "input2", "input3", "input4", "input5", "input6"], dynamic_axes={ "input1": [0, 1, 2, 3], @@ -411,13 +407,12 @@ class TestONNXExporter: def get_image(self, rel_path: str, size: Tuple[int, int]) -> torch.Tensor: import os - import torchvision.transforms._pil_constants as _pil_constants from PIL import Image from torchvision.transforms import functional as F data_dir = os.path.join(os.path.dirname(__file__), "assets") path = os.path.join(data_dir, *rel_path.split("/")) - image = Image.open(path).convert("RGB").resize(size, _pil_constants.BILINEAR) + image = Image.open(path).convert("RGB").resize(size, Image.BILINEAR) return F.convert_image_dtype(F.pil_to_tensor(image)) @@ -442,7 +437,6 @@ class TestONNXExporter: input_names=["images_tensors"], output_names=["outputs"], dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, - tolerate_small_mismatch=True, ) # Test exported model for an image with no detections on other images self.run_model( @@ -451,7 +445,6 @@ class TestONNXExporter: input_names=["images_tensors"], output_names=["outputs"], dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, - tolerate_small_mismatch=True, ) # Verify that paste_mask_in_image beahves the same in tracing. 
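The test_onnx.py changes above un-skip the aligned RoIAlign tests by threading an explicit opset_version through run_model: per the removed skip reason, aligned=True is only representable in ONNX from opset 16 onwards, which is what _register_onnx_ops._ONNX_OPSET_VERSION_16 refers to. A hedged, standalone sketch of the export those tests end up performing (tensor values and the buffer are made up for illustration; it assumes a torch build whose ONNX exporter supports opset 16):

import io

import torch
from torchvision import ops

model = ops.RoIAlign((5, 5), spatial_scale=1.0, sampling_ratio=2, aligned=True)
model.eval()

x = torch.rand(1, 1, 10, 10, dtype=torch.float32)
rois = torch.tensor([[0, 1.5, 1.5, 3.0, 3.0]], dtype=torch.float32)  # (batch_idx, x1, y1, x2, y2)

onnx_io = io.BytesIO()
# Exporting with the default (lower) opset would not preserve aligned=True,
# hence the explicit opset_version in the updated tests.
torch.onnx.export(model, (x, rois), onnx_io, opset_version=16)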
@@ -506,7 +499,6 @@ class TestONNXExporter: "scores": [0], "masks": [0, 1, 2], }, - tolerate_small_mismatch=True, ) # Test exported model for an image with no detections on other images self.run_model( @@ -521,7 +513,6 @@ class TestONNXExporter: "scores": [0], "masks": [0, 1, 2], }, - tolerate_small_mismatch=True, ) # Verify that heatmaps_to_keypoints behaves the same in tracing. @@ -563,7 +554,6 @@ class TestONNXExporter: input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], dynamic_axes={"images_tensors": [0, 1, 2]}, - tolerate_small_mismatch=True, ) self.run_model( @@ -572,7 +562,6 @@ class TestONNXExporter: input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], dynamic_axes={"images_tensors": [0, 1, 2]}, - tolerate_small_mismatch=True, ) def test_shufflenet_v2_dynamic_axes(self): @@ -586,7 +575,6 @@ class TestONNXExporter: input_names=["input_images"], output_names=["output"], dynamic_axes={"input_images": {0: "batch_size"}, "output": {0: "batch_size"}}, - tolerate_small_mismatch=True, ) diff --git a/test/test_ops.py b/test/test_ops.py index b34fbe7f2546e8da0dd7d11b6043b550ef85b82d..743fe159e37a895e8a140b0b5321e1e5a918c9e2 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -10,7 +10,7 @@ import pytest import torch import torch.fx import torch.nn.functional as F -from common_utils import assert_equal, cpu_and_gpu, needs_cuda +from common_utils import assert_equal, cpu_and_cuda, cpu_and_cuda_and_mps, needs_cuda, needs_mps from PIL import Image from torch import nn, Tensor from torch.autograd import gradcheck @@ -19,6 +19,22 @@ from torchvision import models, ops from torchvision.models.feature_extraction import get_graph_node_names +# Context manager for setting deterministic flag and automatically +# resetting it to its original value +class DeterministicGuard: + def __init__(self, deterministic, *, warn_only=False): + self.deterministic = deterministic + self.warn_only = warn_only + + def __enter__(self): + self.deterministic_restore = torch.are_deterministic_algorithms_enabled() + self.warn_only_restore = torch.is_deterministic_algorithms_warn_only_enabled() + torch.use_deterministic_algorithms(self.deterministic, warn_only=self.warn_only) + + def __exit__(self, exception_type, exception_value, traceback): + torch.use_deterministic_algorithms(self.deterministic_restore, warn_only=self.warn_only_restore) + + class RoIOpTesterModuleWrapper(nn.Module): def __init__(self, obj): super().__init__() @@ -80,14 +96,35 @@ class PoolWrapper(nn.Module): class RoIOpTester(ABC): dtype = torch.float64 + mps_dtype = torch.float32 + mps_backward_atol = 2e-2 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @pytest.mark.parametrize("contiguous", (True, False)) - def test_forward(self, device, contiguous, x_dtype=None, rois_dtype=None, **kwargs): - x_dtype = self.dtype if x_dtype is None else x_dtype - rois_dtype = self.dtype if rois_dtype is None else rois_dtype + @pytest.mark.parametrize( + "x_dtype", + ( + torch.float16, + torch.float32, + torch.float64, + ), + ids=str, + ) + def test_forward(self, device, contiguous, x_dtype, rois_dtype=None, deterministic=False, **kwargs): + if device == "mps" and x_dtype is torch.float64: + pytest.skip("MPS does not support float64") + + rois_dtype = x_dtype if rois_dtype is None else rois_dtype + + tol = 1e-5 + if x_dtype is torch.half: + if device == "mps": + tol = 5e-3 + else: + tol = 4e-3 + pool_size = 5 - # 
n_channels % (pool_size ** 2) == 0 required for PS opeartions. + # n_channels % (pool_size ** 2) == 0 required for PS operations. n_channels = 2 * (pool_size**2) x = torch.rand(2, n_channels, 10, 10, dtype=x_dtype, device=device) if not contiguous: @@ -99,17 +136,17 @@ class RoIOpTester(ABC): ) pool_h, pool_w = pool_size, pool_size - y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs) + with DeterministicGuard(deterministic): + y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs) # the following should be true whether we're running an autocast test or not. assert y.dtype == x.dtype gt_y = self.expected_fn( - x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, device=device, dtype=self.dtype, **kwargs + x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, device=device, dtype=x_dtype, **kwargs ) - tol = 1e-3 if (x_dtype is torch.half or rois_dtype is torch.half) else 1e-5 torch.testing.assert_close(gt_y.to(y), y, rtol=tol, atol=tol) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_is_leaf_node(self, device): op_obj = self.make_obj(wrap=True).to(device=device) graph_node_names = get_graph_node_names(op_obj) @@ -118,7 +155,7 @@ class RoIOpTester(ABC): assert len(graph_node_names[0]) == len(graph_node_names[1]) assert len(graph_node_names[0]) == 1 + op_obj.n_inputs - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_torch_fx_trace(self, device, x_dtype=torch.float, rois_dtype=torch.float): op_obj = self.make_obj().to(device=device) graph_module = torch.fx.symbolic_trace(op_obj) @@ -138,16 +175,19 @@ class RoIOpTester(ABC): torch.testing.assert_close(output_gt, output_fx, rtol=tol, atol=tol) @pytest.mark.parametrize("seed", range(10)) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @pytest.mark.parametrize("contiguous", (True, False)) - def test_backward(self, seed, device, contiguous): + def test_backward(self, seed, device, contiguous, deterministic=False): + atol = self.mps_backward_atol if device == "mps" else 1e-05 + dtype = self.mps_dtype if device == "mps" else self.dtype + torch.random.manual_seed(seed) pool_size = 2 - x = torch.rand(1, 2 * (pool_size**2), 5, 5, dtype=self.dtype, device=device, requires_grad=True) + x = torch.rand(1, 2 * (pool_size**2), 5, 5, dtype=dtype, device=device, requires_grad=True) if not contiguous: x = x.permute(0, 1, 3, 2) rois = torch.tensor( - [[0, 0, 0, 4, 4], [0, 0, 2, 3, 4], [0, 2, 2, 4, 4]], dtype=self.dtype, device=device # format is (xyxy) + [[0, 0, 0, 4, 4], [0, 0, 2, 3, 4], [0, 2, 2, 4, 4]], dtype=dtype, device=device # format is (xyxy) ) def func(z): @@ -155,8 +195,26 @@ class RoIOpTester(ABC): script_func = self.get_script_fn(rois, pool_size) - gradcheck(func, (x,)) - gradcheck(script_func, (x,)) + with DeterministicGuard(deterministic): + gradcheck(func, (x,), atol=atol) + + gradcheck(script_func, (x,), atol=atol) + + @needs_mps + def test_mps_error_inputs(self): + pool_size = 2 + x = torch.rand(1, 2 * (pool_size**2), 5, 5, dtype=torch.float16, device="mps", requires_grad=True) + rois = torch.tensor( + [[0, 0, 0, 4, 4], [0, 0, 2, 3, 4], [0, 2, 2, 4, 4]], dtype=torch.float16, device="mps" # format is (xyxy) + ) + + def func(z): + return self.fn(z, rois, pool_size, pool_size, spatial_scale=1, sampling_ratio=1) + + with pytest.raises( + RuntimeError, match="MPS does not support 
(?:ps_)?roi_(?:align|pool)? backward with float16 inputs." + ): + gradcheck(func, (x,)) @needs_cuda @pytest.mark.parametrize("x_dtype", (torch.float, torch.half)) @@ -252,6 +310,8 @@ class TestRoiPool(RoIOpTester): class TestPSRoIPool(RoIOpTester): + mps_backward_atol = 5e-2 + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): return ops.PSRoIPool((pool_h, pool_w), 1)(x, rois) @@ -333,6 +393,8 @@ def bilinear_interpolate(data, y, x, snap_border=False): class TestRoIAlign(RoIOpTester): + mps_backward_atol = 6e-2 + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, aligned=False, **kwargs): return ops.RoIAlign( (pool_h, pool_w), spatial_scale=spatial_scale, sampling_ratio=sampling_ratio, aligned=aligned @@ -384,7 +446,6 @@ class TestRoIAlign(RoIOpTester): grid_w = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_w)) for channel in range(0, n_channels): - val = 0 for iy in range(0, grid_h): y = start_h + (iy + 0.5) * bin_h / grid_h @@ -400,23 +461,47 @@ class TestRoIAlign(RoIOpTester): self._helper_boxes_shape(ops.roi_align) @pytest.mark.parametrize("aligned", (True, False)) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) + @pytest.mark.parametrize("x_dtype", (torch.float16, torch.float32, torch.float64), ids=str) @pytest.mark.parametrize("contiguous", (True, False)) - def test_forward(self, device, contiguous, aligned, x_dtype=None, rois_dtype=None): + @pytest.mark.parametrize("deterministic", (True, False)) + def test_forward(self, device, contiguous, deterministic, aligned, x_dtype, rois_dtype=None): + if deterministic and device == "cpu": + pytest.skip("cpu is always deterministic, don't retest") super().test_forward( - device=device, contiguous=contiguous, x_dtype=x_dtype, rois_dtype=rois_dtype, aligned=aligned + device=device, + contiguous=contiguous, + deterministic=deterministic, + x_dtype=x_dtype, + rois_dtype=rois_dtype, + aligned=aligned, ) @needs_cuda @pytest.mark.parametrize("aligned", (True, False)) + @pytest.mark.parametrize("deterministic", (True, False)) @pytest.mark.parametrize("x_dtype", (torch.float, torch.half)) @pytest.mark.parametrize("rois_dtype", (torch.float, torch.half)) - def test_autocast(self, aligned, x_dtype, rois_dtype): + def test_autocast(self, aligned, deterministic, x_dtype, rois_dtype): with torch.cuda.amp.autocast(): self.test_forward( - torch.device("cuda"), contiguous=False, aligned=aligned, x_dtype=x_dtype, rois_dtype=rois_dtype + torch.device("cuda"), + contiguous=False, + deterministic=deterministic, + aligned=aligned, + x_dtype=x_dtype, + rois_dtype=rois_dtype, ) + @pytest.mark.parametrize("seed", range(10)) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) + @pytest.mark.parametrize("contiguous", (True, False)) + @pytest.mark.parametrize("deterministic", (True, False)) + def test_backward(self, seed, device, contiguous, deterministic): + if deterministic and device == "cpu": + pytest.skip("cpu is always deterministic, don't retest") + super().test_backward(seed, device, contiguous, deterministic) + def _make_rois(self, img_size, num_imgs, dtype, num_rois=1000): rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype) rois[:, 0] = torch.randint(0, num_imgs, size=(num_rois,)) # set batch index @@ -496,6 +581,8 @@ class TestRoIAlign(RoIOpTester): class TestPSRoIAlign(RoIOpTester): + mps_backward_atol = 5e-2 + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): return 
ops.PSRoIAlign((pool_h, pool_w), spatial_scale=spatial_scale, sampling_ratio=sampling_ratio)(x, rois) @@ -571,7 +658,7 @@ class TestMultiScaleRoIAlign: ) assert repr(t) == expected_string - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_is_leaf_node(self, device): op_obj = self.make_obj(wrap=True).to(device=device) graph_node_names = get_graph_node_names(op_obj) @@ -585,8 +672,9 @@ class TestNMS: def _reference_nms(self, boxes, scores, iou_threshold): """ Args: - box_scores (N, 5): boxes in corner-form and probabilities. - iou_threshold: intersection over union threshold. + boxes: boxes in corner-form + scores: probabilities + iou_threshold: intersection over union threshold Returns: picked: a list of indexes of the kept boxes """ @@ -630,7 +718,7 @@ class TestNMS: boxes, scores = self._create_tensors_with_iou(1000, iou) keep_ref = self._reference_nms(boxes, scores, iou) keep = ops.nms(boxes, scores, iou) - assert torch.allclose(keep, keep_ref), err_msg.format(iou) + torch.testing.assert_close(keep, keep_ref, msg=err_msg.format(iou)) def test_nms_input_errors(self): with pytest.raises(RuntimeError): @@ -646,11 +734,11 @@ class TestNMS: @pytest.mark.parametrize("scale, zero_point", ((1, 0), (2, 50), (3, 10))) def test_qnms(self, iou, scale, zero_point): # Note: we compare qnms vs nms instead of qnms vs reference implementation. - # This is because with the int convertion, the trick used in _create_tensors_with_iou + # This is because with the int conversion, the trick used in _create_tensors_with_iou # doesn't really work (in fact, nms vs reference implem will also fail with ints) err_msg = "NMS and QNMS give different results for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) - scores *= 100 # otherwise most scores would be 0 or 1 after int convertion + scores *= 100 # otherwise most scores would be 0 or 1 after int conversion qboxes = torch.quantize_per_tensor(boxes, scale=scale, zero_point=zero_point, dtype=torch.quint8) qscores = torch.quantize_per_tensor(scores, scale=scale, zero_point=zero_point, dtype=torch.quint8) @@ -661,23 +749,30 @@ class TestNMS: keep = ops.nms(boxes, scores, iou) qkeep = ops.nms(qboxes, qscores, iou) - assert torch.allclose(qkeep, keep), err_msg.format(iou) + torch.testing.assert_close(qkeep, keep, msg=err_msg.format(iou)) - @needs_cuda + @pytest.mark.parametrize( + "device", + ( + pytest.param("cuda", marks=pytest.mark.needs_cuda), + pytest.param("mps", marks=pytest.mark.needs_mps), + ), + ) @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) - def test_nms_cuda(self, iou, dtype=torch.float64): + def test_nms_gpu(self, iou, device, dtype=torch.float64): + dtype = torch.float32 if device == "mps" else dtype tol = 1e-3 if dtype is torch.half else 1e-5 err_msg = "NMS incompatible between CPU and CUDA for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) r_cpu = ops.nms(boxes, scores, iou) - r_cuda = ops.nms(boxes.cuda(), scores.cuda(), iou) + r_gpu = ops.nms(boxes.to(device), scores.to(device), iou) - is_eq = torch.allclose(r_cpu, r_cuda.cpu()) + is_eq = torch.allclose(r_cpu, r_gpu.cpu()) if not is_eq: # if the indices are not the same, ensure that it's because the scores # are duplicate - is_eq = torch.allclose(scores[r_cpu], scores[r_cuda.cpu()], rtol=tol, atol=tol) + is_eq = torch.allclose(scores[r_cpu], scores[r_gpu.cpu()], rtol=tol, atol=tol) assert is_eq, err_msg.format(iou) @needs_cuda @@ -685,18 +780,24 @@ class TestNMS: @pytest.mark.parametrize("dtype", 
(torch.float, torch.half)) def test_autocast(self, iou, dtype): with torch.cuda.amp.autocast(): - self.test_nms_cuda(iou=iou, dtype=dtype) + self.test_nms_gpu(iou=iou, dtype=dtype, device="cuda") - @needs_cuda - def test_nms_cuda_float16(self): + @pytest.mark.parametrize( + "device", + ( + pytest.param("cuda", marks=pytest.mark.needs_cuda), + pytest.param("mps", marks=pytest.mark.needs_mps), + ), + ) + def test_nms_float16(self, device): boxes = torch.tensor( [ [285.3538, 185.5758, 1193.5110, 851.4551], [285.1472, 188.7374, 1192.4984, 851.0669], [279.2440, 197.9812, 1189.4746, 849.2019], ] - ).cuda() - scores = torch.tensor([0.6370, 0.7569, 0.3966]).cuda() + ).to(device) + scores = torch.tensor([0.6370, 0.7569, 0.3966]).to(device) iou_thres = 0.2 keep32 = ops.nms(boxes, scores, iou_thres) @@ -843,7 +944,7 @@ class TestDeformConv: ) return DeformConvModuleWrapper(obj) if wrap else obj - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_is_leaf_node(self, device): op_obj = self.make_obj(wrap=True).to(device=device) graph_node_names = get_graph_node_names(op_obj) @@ -852,7 +953,7 @@ class TestDeformConv: assert len(graph_node_names[0]) == len(graph_node_names[1]) assert len(graph_node_names[0]) == 1 + op_obj.n_inputs - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("contiguous", (True, False)) @pytest.mark.parametrize("batch_sz", (0, 33)) def test_forward(self, device, contiguous, batch_sz, dtype=None): @@ -904,7 +1005,7 @@ class TestDeformConv: wrong_mask = torch.rand_like(mask[:, :2]) layer(x, offset, wrong_mask) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("contiguous", (True, False)) @pytest.mark.parametrize("batch_sz", (0, 33)) def test_backward(self, device, contiguous, batch_sz): @@ -977,7 +1078,6 @@ class TestDeformConv: weight = init_weight for d in ["cpu", "cuda"]: - out = ops.deform_conv2d(img.to(d), offset.to(d), weight.to(d), padding=1, mask=mask.to(d)) out.mean().backward() if true_cpu_grads is None: @@ -1237,7 +1337,7 @@ class TestIouBase: boxes2 = gen_box(7) a = TestIouBase._cartesian_product(boxes1, boxes2, target_fn) b = target_fn(boxes1, boxes2) - assert torch.allclose(a, b) + torch.testing.assert_close(a, b) class TestBoxIou(TestIouBase): @@ -1370,10 +1470,9 @@ def assert_empty_loss(iou_fn, dtype, device): class TestGeneralizedBoxIouLoss: # We refer to original test: https://github.com/facebookresearch/fvcore/blob/main/tests/test_giou_loss.py - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_giou_loss(self, dtype, device): - box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device) # Identical boxes should have loss of 0 @@ -1394,7 +1493,12 @@ class TestGeneralizedBoxIouLoss: assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 2.5, device=device, reduction="sum") assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 1.25, device=device, reduction="mean") - @pytest.mark.parametrize("device", cpu_and_gpu()) + # Test reduction value + # reduction value other than ["none", "mean", "sum"] should raise a ValueError + with pytest.raises(ValueError, match="Invalid"): + ops.generalized_box_iou_loss(box1s, box2s, reduction="xyz") + + @pytest.mark.parametrize("device", cpu_and_cuda()) 
@pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_inputs(self, dtype, device): assert_empty_loss(ops.generalized_box_iou_loss, dtype, device) @@ -1402,7 +1506,7 @@ class TestGeneralizedBoxIouLoss: class TestCompleteBoxIouLoss: @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_ciou_loss(self, dtype, device): box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device) @@ -1413,14 +1517,17 @@ class TestCompleteBoxIouLoss: assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 1.2250, device=device, reduction="mean") assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 2.4500, device=device, reduction="sum") - @pytest.mark.parametrize("device", cpu_and_gpu()) + with pytest.raises(ValueError, match="Invalid"): + ops.complete_box_iou_loss(box1s, box2s, reduction="xyz") + + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_inputs(self, dtype, device): assert_empty_loss(ops.complete_box_iou_loss, dtype, device) class TestDistanceBoxIouLoss: - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_distance_iou_loss(self, dtype, device): box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device) @@ -1432,7 +1539,10 @@ class TestDistanceBoxIouLoss: assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 1.2250, device=device, reduction="mean") assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 2.4500, device=device, reduction="sum") - @pytest.mark.parametrize("device", cpu_and_gpu()) + with pytest.raises(ValueError, match="Invalid"): + ops.distance_box_iou_loss(box1s, box2s, reduction="xyz") + + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_distance_iou_inputs(self, dtype, device): assert_empty_loss(ops.distance_box_iou_loss, dtype, device) @@ -1477,7 +1587,7 @@ class TestFocalLoss: @pytest.mark.parametrize("alpha", [-1.0, 0.0, 0.58, 1.0]) @pytest.mark.parametrize("gamma", [0, 2]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) @pytest.mark.parametrize("seed", [0, 1]) def test_correct_ratio(self, alpha, gamma, device, dtype, seed): @@ -1506,7 +1616,7 @@ class TestFocalLoss: torch.testing.assert_close(correct_ratio, loss_ratio, atol=tol, rtol=tol) @pytest.mark.parametrize("reduction", ["mean", "sum"]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) @pytest.mark.parametrize("seed", [2, 3]) def test_equal_ce_loss(self, reduction, device, dtype, seed): @@ -1533,7 +1643,7 @@ class TestFocalLoss: @pytest.mark.parametrize("alpha", [-1.0, 0.0, 0.58, 1.0]) @pytest.mark.parametrize("gamma", [0, 2]) @pytest.mark.parametrize("reduction", ["none", "mean", "sum"]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) @pytest.mark.parametrize("seed", [4, 5]) def test_jit(self, alpha, gamma, reduction, device, dtype, seed): @@ -1543,17 +1653,22 @@ class TestFocalLoss: torch.random.manual_seed(seed) inputs, targets = 
self._generate_diverse_input_target_pair(dtype=dtype, device=device) focal_loss = ops.sigmoid_focal_loss(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) - if device == "cpu": - scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) - else: - with torch.jit.fuser("fuser2"): - # Use fuser2 to prevent a bug on fuser: https://github.com/pytorch/pytorch/issues/75476 - # We may remove this condition once the bug is resolved - scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) + scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) tol = 1e-3 if dtype is torch.half else 1e-5 torch.testing.assert_close(focal_loss, scripted_focal_loss, rtol=tol, atol=tol) + # Raise ValueError for anonymous reduction mode + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) + def test_reduction_mode(self, device, dtype, reduction="xyz"): + if device == "cpu" and dtype is torch.half: + pytest.skip("Currently torch.half is not fully supported on cpu") + torch.random.manual_seed(0) + inputs, targets = self._generate_diverse_input_target_pair(device=device, dtype=dtype) + with pytest.raises(ValueError, match="Invalid"): + ops.sigmoid_focal_loss(inputs, targets, 0.25, 2, reduction) + class TestMasksToBoxes: def test_masks_box(self): @@ -1623,7 +1738,7 @@ class TestStochasticDepth: counts += batch_size - non_zero_count num_samples += batch_size - p_value = stats.binom_test(counts, num_samples, p=p) + p_value = stats.binomtest(counts, num_samples, p=p).pvalue assert p_value > 0.01 @pytest.mark.parametrize("seed", range(10)) diff --git a/test/test_transforms.py b/test/test_transforms.py index e0f8d4a5927e8e23e0a701426a66c49c3ea4f608..7581bf33220db307b90f2435673ae0698128d452 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -2,15 +2,16 @@ import math import os import random import re +import textwrap +import warnings from functools import partial import numpy as np import pytest import torch import torchvision.transforms as transforms -import torchvision.transforms._pil_constants as _pil_constants +import torchvision.transforms._functional_tensor as F_t import torchvision.transforms.functional as F -import torchvision.transforms.functional_tensor as F_t from PIL import Image from torch._utils_internal import get_file_path_2 @@ -24,7 +25,7 @@ try: except ImportError: stats = None -from common_utils import assert_equal, cycle_over, float_dtypes, int_dtypes +from common_utils import assert_equal, assert_run_python_script, cycle_over, float_dtypes, int_dtypes GRACE_HOPPER = get_file_path_2( @@ -174,7 +175,7 @@ class TestAccImage: def test_accimage_resize(self): trans = transforms.Compose( [ - transforms.Resize(256, interpolation=_pil_constants.LINEAR), + transforms.Resize(256, interpolation=Image.LINEAR), transforms.PILToTensor(), transforms.ConvertImageDtype(dtype=torch.float), ] @@ -319,7 +320,7 @@ def test_randomresized_params(): scale_range = (scale_min, scale_min + round(random.random(), 2)) aspect_min = max(round(random.random(), 2), epsilon) aspect_ratio_range = (aspect_min, aspect_min + round(random.random(), 2)) - randresizecrop = transforms.RandomResizedCrop(size, scale_range, aspect_ratio_range) + randresizecrop = transforms.RandomResizedCrop(size, scale_range, aspect_ratio_range, antialias=True) i, j, h, w = randresizecrop.get_params(img, scale_range, aspect_ratio_range) aspect_ratio_obtained 
= w / h assert ( @@ -366,7 +367,7 @@ def test_randomresized_params(): def test_resize(height, width, osize, max_size): img = Image.new("RGB", size=(width, height), color=127) - t = transforms.Resize(osize, max_size=max_size) + t = transforms.Resize(osize, max_size=max_size, antialias=True) result = t(img) msg = f"{height}, {width} - {osize} - {max_size}" @@ -424,7 +425,7 @@ def test_resize_sequence_output(height, width, osize): img = Image.new("RGB", size=(width, height), color=127) oheight, owidth = osize - t = transforms.Resize(osize) + t = transforms.Resize(osize, antialias=True) result = t(img) assert (owidth, oheight) == result.size @@ -439,6 +440,16 @@ def test_resize_antialias_error(): t(img) +def test_resize_antialias_default_warning(): + + img = Image.new("RGB", size=(10, 10), color=127) + # We make sure we don't warn for PIL images since the default behaviour doesn't change + with warnings.catch_warnings(): + warnings.simplefilter("error") + transforms.Resize((20, 20))(img) + transforms.RandomResizedCrop((20, 20))(img) + + @pytest.mark.parametrize("height, width", ((32, 64), (64, 32))) def test_resize_size_equals_small_edge_size(height, width): # Non-regression test for https://github.com/pytorch/vision/issues/5405 @@ -447,11 +458,21 @@ def test_resize_size_equals_small_edge_size(height, width): img = Image.new("RGB", size=(width, height), color=127) small_edge = min(height, width) - t = transforms.Resize(small_edge, max_size=max_size) + t = transforms.Resize(small_edge, max_size=max_size, antialias=True) result = t(img) assert max(result.size) == max_size +def test_resize_equal_input_output_sizes(): + # Regression test for https://github.com/pytorch/vision/issues/7518 + height, width = 28, 27 + img = Image.new("RGB", size=(width, height)) + + t = transforms.Resize((height, width), antialias=True) + result = t(img) + assert result is img + + class TestPad: @pytest.mark.parametrize("fill", [85, 85.0]) def test_pad(self, fill): @@ -931,33 +952,6 @@ def test_adjust_contrast(): torch.testing.assert_close(y_np, y_ans) -@pytest.mark.skipif(Image.__version__ >= "7", reason="Temporarily disabled") -def test_adjust_saturation(): - x_shape = [2, 2, 3] - x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] - x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") - - # test 0 - y_pil = F.adjust_saturation(x_pil, 1) - y_np = np.array(y_pil) - torch.testing.assert_close(y_np, x_np) - - # test 1 - y_pil = F.adjust_saturation(x_pil, 0.5) - y_np = np.array(y_pil) - y_ans = [2, 4, 8, 87, 128, 173, 39, 25, 138, 133, 215, 88] - y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape) - torch.testing.assert_close(y_np, y_ans) - - # test 2 - y_pil = F.adjust_saturation(x_pil, 2) - y_np = np.array(y_pil) - y_ans = [0, 6, 22, 0, 149, 255, 32, 0, 255, 4, 255, 0] - y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape) - torch.testing.assert_close(y_np, y_ans) - - def test_adjust_hue(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] @@ -1424,17 +1418,17 @@ def test_random_choice(proba_passthrough, seed): def test_random_order(): random_state = random.getstate() random.seed(42) - random_order_transform = transforms.RandomOrder([transforms.Resize(20), transforms.CenterCrop(10)]) + random_order_transform = transforms.RandomOrder([transforms.Resize(20, antialias=True), transforms.CenterCrop(10)]) img = transforms.ToPILImage()(torch.rand(3, 25, 25)) num_samples = 250 num_normal_order = 0 - resize_crop_out = 
transforms.CenterCrop(10)(transforms.Resize(20)(img)) + resize_crop_out = transforms.CenterCrop(10)(transforms.Resize(20, antialias=True)(img)) for _ in range(num_samples): out = random_order_transform(img) if out == resize_crop_out: num_normal_order += 1 - p_value = stats.binom_test(num_normal_order, num_samples, p=0.5) + p_value = stats.binomtest(num_normal_order, num_samples, p=0.5).pvalue random.setstate(random_state) assert p_value > 0.0001 @@ -1522,10 +1516,10 @@ def test_ten_crop(should_vflip, single_dim): five_crop.__repr__() if should_vflip: - vflipped_img = img.transpose(_pil_constants.FLIP_TOP_BOTTOM) + vflipped_img = img.transpose(Image.FLIP_TOP_BOTTOM) expected_output += five_crop(vflipped_img) else: - hflipped_img = img.transpose(_pil_constants.FLIP_LEFT_RIGHT) + hflipped_img = img.transpose(Image.FLIP_LEFT_RIGHT) expected_output += five_crop(hflipped_img) assert len(results) == 10 @@ -1798,6 +1792,12 @@ def test_color_jitter(): color_jitter.__repr__() +@pytest.mark.parametrize("hue", [1, (-1, 1)]) +def test_color_jitter_hue_out_of_bounds(hue): + with pytest.raises(ValueError, match=re.escape("hue values should be between (-0.5, 0.5)")): + transforms.ColorJitter(hue=hue) + + @pytest.mark.parametrize("seed", range(10)) @pytest.mark.skipif(stats is None, reason="scipy.stats not available") def test_random_erasing(seed): @@ -1818,7 +1818,7 @@ def test_random_erasing(seed): tol = 0.05 assert 1 / 3 - tol <= aspect_ratio <= 3 + tol - # Make sure that h > w and h < w are equaly likely (log-scale sampling) + # Make sure that h > w and h < w are equally likely (log-scale sampling) aspect_ratios = [] random.seed(42) trial = 1000 @@ -1834,7 +1834,7 @@ def test_random_erasing(seed): aspect_ratios.append(h / w) count_bigger_then_ones = len([1 for aspect_ratio in aspect_ratios if aspect_ratio > 1]) - p_value = stats.binom_test(count_bigger_then_ones, trial, p=0.5) + p_value = stats.binomtest(count_bigger_then_ones, trial, p=0.5).pvalue assert p_value > 0.0001 # Checking if RandomErasing can be printed as string @@ -1866,16 +1866,8 @@ def test_random_rotation(): # Checking if RandomRotation can be printed as string t.__repr__() - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." - ), - ): - t = transforms.RandomRotation((-10, 10), interpolation=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR + t = transforms.RandomRotation((-10, 10), interpolation=Image.BILINEAR) + assert t.interpolation == transforms.InterpolationMode.BILINEAR def test_random_rotation_error(): @@ -2067,7 +2059,7 @@ class TestAffine: # https://github.com/python-pillow/Pillow/blob/71f8ec6a0cfc1008076a023c0756542539d057ab/ # src/libImaging/Geometry.c#L1060 input_pt = np.array([x + 0.5, y + 0.5, 1.0]) - res = np.floor(np.dot(inv_true_matrix, input_pt)).astype(np.int) + res = np.floor(np.dot(inv_true_matrix, input_pt)).astype(int) _x, _y = res[:2] if 0 <= _x < input_img.shape[1] and 0 <= _y < input_img.shape[0]: true_result[y, x, :] = input_img[_y, _x, :] @@ -2206,16 +2198,8 @@ def test_random_affine(): t = transforms.RandomAffine(10, interpolation=transforms.InterpolationMode.BILINEAR) assert "bilinear" in t.__repr__() - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - t = transforms.RandomAffine(10, interpolation=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR + t = transforms.RandomAffine(10, interpolation=Image.BILINEAR) + assert t.interpolation == transforms.InterpolationMode.BILINEAR def test_elastic_transformation(): @@ -2233,9 +2217,8 @@ def test_elastic_transformation(): with pytest.raises(ValueError, match=r"sigma is a sequence its length should be 2"): transforms.ElasticTransform(alpha=2.0, sigma=[1.0, 0.0, 1.0]) - with pytest.warns(UserWarning, match=r"Argument interpolation should be of type InterpolationMode"): - t = transforms.transforms.ElasticTransform(alpha=2.0, sigma=2.0, interpolation=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR + t = transforms.transforms.ElasticTransform(alpha=2.0, sigma=2.0, interpolation=Image.BILINEAR) + assert t.interpolation == transforms.InterpolationMode.BILINEAR with pytest.raises(TypeError, match=r"fill should be int or float"): transforms.ElasticTransform(alpha=1.0, sigma=1.0, fill={}) @@ -2267,5 +2250,35 @@ def test_random_grayscale_with_grayscale_input(): torch.testing.assert_close(F.pil_to_tensor(output_pil), image_tensor) +# TODO: remove in 0.17 when we can delete functional_pil.py and functional_tensor.py +@pytest.mark.parametrize( + "import_statement", + ( + "from torchvision.transforms import functional_pil", + "from torchvision.transforms import functional_tensor", + "from torchvision.transforms.functional_tensor import resize", + "from torchvision.transforms.functional_pil import resize", + ), +) +@pytest.mark.parametrize("from_private", (True, False)) +def test_functional_deprecation_warning(import_statement, from_private): + if from_private: + import_statement = import_statement.replace("functional", "_functional") + source = f""" + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("error") + {import_statement} + """ + else: + source = f""" + import pytest + with pytest.warns(UserWarning, match="removed in 0.17"): + {import_statement} + """ + assert_run_python_script(textwrap.dedent(source)) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/test_transforms_tensor.py b/test/test_transforms_tensor.py index f4ca544deb844142629fc76ea33274821f2433a9..e2ab5673f1efe8aac2a2f041e1599a4c97cc9c83 100644 --- a/test/test_transforms_tensor.py +++ b/test/test_transforms_tensor.py @@ -1,17 +1,18 @@ import os import sys +import warnings import numpy as np +import PIL.Image import pytest import torch -import torchvision.transforms._pil_constants as _pil_constants from common_utils import ( _assert_approx_equal_tensor_to_pil, _assert_equal_tensor_to_pil, _create_data, _create_data_batch, assert_equal, - cpu_and_gpu, + cpu_and_cuda, float_dtypes, get_tmp_dir, int_dtypes, @@ -20,7 +21,12 @@ from torchvision import transforms as T from torchvision.transforms import functional as F, InterpolationMode from torchvision.transforms.autoaugment import _apply_op -NEAREST, BILINEAR, BICUBIC = InterpolationMode.NEAREST, InterpolationMode.BILINEAR, InterpolationMode.BICUBIC +NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC = ( + InterpolationMode.NEAREST, + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR, + InterpolationMode.BICUBIC, +) def _test_transform_vs_scripted(transform, s_transform, tensor, msg=None): @@ -94,12 +100,12 @@ def _test_op(func, method, device, channels=3, fn_kwargs=None, meth_kwargs=None, def _test_fn_save_load(fn, tmpdir): scripted_fn = torch.jit.script(fn) - p = os.path.join(tmpdir, 
f"t_op_list_{fn.__name__ if hasattr(fn, '__name__') else fn.__class__.__name__}.pt") + p = os.path.join(tmpdir, f"t_op_list_{getattr(fn, '__name__', fn.__class__.__name__)}.pt") scripted_fn.save(p) _ = torch.jit.load(p) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "func,method,fn_kwargs,match_kwargs", [ @@ -124,7 +130,7 @@ def test_random(func, method, device, channels, fn_kwargs, match_kwargs): @pytest.mark.parametrize("seed", range(10)) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("channels", [1, 3]) class TestColorJitter: @pytest.fixture(autouse=True) @@ -200,7 +206,7 @@ class TestColorJitter: ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("m", ["constant", "edge", "reflect", "symmetric"]) @pytest.mark.parametrize("mul", [1, -1]) def test_pad(m, mul, device): @@ -223,7 +229,7 @@ def test_pad(m, mul, device): _test_op(F.pad, T.Pad, device=device, fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_crop(device): fn_kwargs = {"top": 2, "left": 3, "height": 4, "width": 5} # Test transforms.RandomCrop with size and padding as tuple @@ -251,7 +257,7 @@ def test_crop(device): _test_functional_op(F.crop, fn_kwargs=fn_kwargs, device=device) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "padding_config", [ @@ -277,7 +283,7 @@ def test_random_crop_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_center_crop(device, tmpdir): fn_kwargs = {"output_size": (4, 5)} meth_kwargs = {"size": (4, 5)} @@ -307,7 +313,7 @@ def test_center_crop_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "fn, method, out_length", [ @@ -366,7 +372,7 @@ class TestResize: def test_resize_int(self, size): # TODO: Minimal check for bug-fix, improve this later x = torch.rand(3, 32, 46) - t = T.Resize(size=size) + t = T.Resize(size=size, antialias=True) y = t(x) # If size is an int, smaller edge of the image will be matched to this number. # i.e, if height > width, then image will be rescaled to (size * height / width, size). 
@@ -374,11 +380,11 @@ class TestResize: assert y.shape[1] == size assert y.shape[2] == int(size * 46 / 32) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64]) @pytest.mark.parametrize("size", [[32], [32, 32], (32, 32), [34, 35]]) @pytest.mark.parametrize("max_size", [None, 35, 1000]) - @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST]) + @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST, NEAREST_EXACT]) def test_resize_scripted(self, dt, size, max_size, interpolation, device): tensor, _ = _create_data(height=34, width=36, device=device) batch_tensors = torch.randint(0, 256, size=(4, 3, 44, 56), dtype=torch.uint8, device=device) @@ -389,25 +395,25 @@ class TestResize: if max_size is not None and len(size) != 1: pytest.skip("Size should be an int or a sequence of length 1 if max_size is specified") - transform = T.Resize(size=size, interpolation=interpolation, max_size=max_size) + transform = T.Resize(size=size, interpolation=interpolation, max_size=max_size, antialias=True) s_transform = torch.jit.script(transform) _test_transform_vs_scripted(transform, s_transform, tensor) _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) def test_resize_save_load(self, tmpdir): - fn = T.Resize(size=[32]) + fn = T.Resize(size=[32], antialias=True) _test_fn_save_load(fn, tmpdir) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("scale", [(0.7, 1.2), [0.7, 1.2]]) @pytest.mark.parametrize("ratio", [(0.75, 1.333), [0.75, 1.333]]) @pytest.mark.parametrize("size", [(32,), [44], [32], [32, 32], (32, 32), [44, 55]]) - @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC]) + @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC, NEAREST_EXACT]) @pytest.mark.parametrize("antialias", [None, True, False]) def test_resized_crop(self, scale, ratio, size, interpolation, antialias, device): - if antialias and interpolation == NEAREST: - pytest.skip("Can not resize if interpolation mode is NEAREST and antialias=True") + if antialias and interpolation in {NEAREST, NEAREST_EXACT}: + pytest.skip(f"Can not resize if interpolation mode is {interpolation} and antialias=True") tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) batch_tensors = torch.randint(0, 256, size=(4, 3, 44, 56), dtype=torch.uint8, device=device) @@ -419,9 +425,25 @@ class TestResize: _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) def test_resized_crop_save_load(self, tmpdir): - fn = T.RandomResizedCrop(size=[32]) + fn = T.RandomResizedCrop(size=[32], antialias=True) _test_fn_save_load(fn, tmpdir) + def test_antialias_default_warning(self): + + img = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8) + + match = "The default value of the antialias" + with pytest.warns(UserWarning, match=match): + T.Resize((20, 20))(img) + with pytest.warns(UserWarning, match=match): + T.RandomResizedCrop((20, 20))(img) + + # For modes that aren't bicubic or bilinear, don't throw a warning + with warnings.catch_warnings(): + warnings.simplefilter("error") + T.Resize((20, 20), interpolation=NEAREST)(img) + T.RandomResizedCrop((20, 20), interpolation=NEAREST)(img) + def _test_random_affine_helper(device, **kwargs): tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) @@ -438,42 
+460,42 @@ def test_random_affine_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("shear", [15, 10.0, (5.0, 10.0), [-15, 15], [-10.0, 10.0, -11.0, 11.0]]) def test_random_affine_shear(device, interpolation, shear): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, shear=shear) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("scale", [(0.7, 1.2), [0.7, 1.2]]) def test_random_affine_scale(device, interpolation, scale): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, scale=scale) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("translate", [(0.1, 0.2), [0.2, 0.1]]) def test_random_affine_translate(device, interpolation, translate): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, translate=translate) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("degrees", [45, 35.0, (-45, 45), [-90.0, 90.0]]) def test_random_affine_degrees(device, interpolation, degrees): _test_random_affine_helper(device, degrees=degrees, interpolation=interpolation) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("fill", [85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_random_affine_fill(device, interpolation, fill): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, fill=fill) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("center", [(0, 0), [10, 10], None, (56, 44)]) @pytest.mark.parametrize("expand", [True, False]) @pytest.mark.parametrize("degrees", [45, 35.0, (-45, 45), [-90.0, 90.0]]) @@ -495,7 +517,7 @@ def test_random_rotate_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("distortion_scale", np.linspace(0.1, 1.0, num=20)) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("fill", [85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) @@ -515,7 +537,7 @@ def test_random_perspective_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "Klass, meth_kwargs", [(T.Grayscale, {"num_output_channels": 1}), (T.Grayscale, {"num_output_channels": 3}), (T.RandomGrayscale, {})], @@ -525,7 +547,7 @@ def test_to_grayscale(device, Klass, meth_kwargs): _test_class_op(Klass, meth_kwargs=meth_kwargs, test_exact_match=False, device=device, tol=tol, agg_method="max") -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("in_dtype", int_dtypes() + float_dtypes()) @pytest.mark.parametrize("out_dtype", int_dtypes() + float_dtypes()) 
def test_convert_image_dtype(device, in_dtype, out_dtype): @@ -556,7 +578,7 @@ def test_convert_image_dtype_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("policy", [policy for policy in T.AutoAugmentPolicy]) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_autoaugment(device, policy, fill): @@ -570,7 +592,7 @@ def test_autoaugment(device, policy, fill): _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("num_ops", [1, 2, 3]) @pytest.mark.parametrize("magnitude", [7, 9, 11]) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) @@ -585,7 +607,7 @@ def test_randaugment(device, num_ops, magnitude, fill): _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_trivialaugmentwide(device, fill): tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) @@ -598,7 +620,7 @@ def test_trivialaugmentwide(device, fill): _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_augmix(device, fill): tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) @@ -635,13 +657,13 @@ def test_autoaugment__op_apply_shear(interpolation, mode): matrix = (1, level, 0, 0, 1, 0) elif mode == "Y": matrix = (1, 0, 0, level, 1, 0) - return pil_img.transform((image_size, image_size), _pil_constants.AFFINE, matrix, resample=resample) + return pil_img.transform((image_size, image_size), PIL.Image.AFFINE, matrix, resample=resample) t_img, pil_img = _create_data(image_size, image_size) resample_pil = { - F.InterpolationMode.NEAREST: _pil_constants.NEAREST, - F.InterpolationMode.BILINEAR: _pil_constants.BILINEAR, + F.InterpolationMode.NEAREST: PIL.Image.NEAREST, + F.InterpolationMode.BILINEAR: PIL.Image.BILINEAR, }[interpolation] level = 0.3 @@ -664,10 +686,20 @@ def test_autoaugment__op_apply_shear(interpolation, mode): _assert_approx_equal_tensor_to_pil(out, expected_out) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "config", - [{"value": 0.2}, {"value": "random"}, {"value": (0.2, 0.2, 0.2)}, {"value": "random", "ratio": (0.1, 0.2)}], + [ + {}, + {"value": 1}, + {"value": 0.2}, + {"value": "random"}, + {"value": (1, 1, 1)}, + {"value": (0.2, 0.2, 0.2)}, + {"value": [1, 1, 1]}, + {"value": [0.2, 0.2, 0.2]}, + {"value": "random", "ratio": (0.1, 0.2)}, + ], ) def test_random_erasing(device, config): tensor, _ = _create_data(24, 32, channels=3, device=device) @@ -692,7 +724,7 @@ def test_random_erasing_with_invalid_data(): random_erasing(img) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_normalize(device, tmpdir): fn = T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) tensor, _ = _create_data(26, 34, device=device) @@ -711,7 +743,7 @@ 
def test_normalize(device, tmpdir): scripted_fn.save(os.path.join(tmpdir, "t_norm.pt")) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_linear_transformation(device, tmpdir): c, h, w = 3, 24, 32 @@ -737,7 +769,7 @@ def test_linear_transformation(device, tmpdir): scripted_fn.save(os.path.join(tmpdir, "t_norm.pt")) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_compose(device): tensor, _ = _create_data(26, 34, device=device) tensor = tensor.to(dtype=torch.float32) / 255.0 @@ -765,7 +797,7 @@ def test_compose(device): torch.jit.script(t) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_random_apply(device): tensor, _ = _create_data(26, 34, device=device) tensor = tensor.to(dtype=torch.float32) / 255.0 @@ -807,7 +839,7 @@ def test_random_apply(device): torch.jit.script(transforms) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "meth_kwargs", [ @@ -843,3 +875,35 @@ def test_gaussian_blur(device, channels, meth_kwargs): agg_method="max", tol=tol, ) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize( + "fill", + [ + 1, + 1.0, + [1], + [1.0], + (1,), + (1.0,), + [1, 2, 3], + [1.0, 2.0, 3.0], + (1, 2, 3), + (1.0, 2.0, 3.0), + ], +) +@pytest.mark.parametrize("channels", [1, 3]) +def test_elastic_transform(device, channels, fill): + if isinstance(fill, (list, tuple)) and len(fill) > 1 and channels == 1: + # For this the test would correctly fail, since the number of channels in the image does not match `fill`. + # Thus, this is not an issue in the transform, but rather a problem of parametrization that just gives the + # product of `fill` and `channels`. 
+ return + + _test_class_op( + T.ElasticTransform, + meth_kwargs=dict(fill=fill), + channels=channels, + device=device, + ) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..3f0056e96ab67352dce71d80a72a6c47290d7315 --- /dev/null +++ b/test/test_transforms_v2.py @@ -0,0 +1,1185 @@ +import itertools +import pathlib +import pickle +import random +import warnings + +import numpy as np + +import PIL.Image +import pytest +import torch +import torchvision.transforms.v2 as transforms + +from common_utils import assert_equal, cpu_and_cuda +from torch.utils._pytree import tree_flatten, tree_unflatten +from torchvision import tv_tensors +from torchvision.ops.boxes import box_iou +from torchvision.transforms.functional import to_pil_image +from torchvision.transforms.v2 import functional as F +from torchvision.transforms.v2._utils import check_type, is_pure_tensor, query_chw +from transforms_v2_legacy_utils import ( + make_bounding_boxes, + make_detection_mask, + make_image, + make_images, + make_multiple_bounding_boxes, + make_segmentation_mask, + make_video, + make_videos, +) + + +def make_vanilla_tensor_images(*args, **kwargs): + for image in make_images(*args, **kwargs): + if image.ndim > 3: + continue + yield image.data + + +def make_pil_images(*args, **kwargs): + for image in make_vanilla_tensor_images(*args, **kwargs): + yield to_pil_image(image) + + +def make_vanilla_tensor_bounding_boxes(*args, **kwargs): + for bounding_boxes in make_multiple_bounding_boxes(*args, **kwargs): + yield bounding_boxes.data + + +def parametrize(transforms_with_inputs): + return pytest.mark.parametrize( + ("transform", "input"), + [ + pytest.param( + transform, + input, + id=f"{type(transform).__name__}-{type(input).__module__}.{type(input).__name__}-{idx}", + ) + for transform, inputs in transforms_with_inputs + for idx, input in enumerate(inputs) + ], + ) + + +def auto_augment_adapter(transform, input, device): + adapted_input = {} + image_or_video_found = False + for key, value in input.items(): + if isinstance(value, (tv_tensors.BoundingBoxes, tv_tensors.Mask)): + # AA transforms don't support bounding boxes or masks + continue + elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor, PIL.Image.Image)): + if image_or_video_found: + # AA transforms only support a single image or video + continue + image_or_video_found = True + adapted_input[key] = value + return adapted_input + + +def linear_transformation_adapter(transform, input, device): + flat_inputs = list(input.values()) + c, h, w = query_chw( + [ + item + for item, needs_transform in zip(flat_inputs, transforms.Transform()._needs_transform_list(flat_inputs)) + if needs_transform + ] + ) + num_elements = c * h * w + transform.transformation_matrix = torch.randn((num_elements, num_elements), device=device) + transform.mean_vector = torch.randn((num_elements,), device=device) + return {key: value for key, value in input.items() if not isinstance(value, PIL.Image.Image)} + + +def normalize_adapter(transform, input, device): + adapted_input = {} + for key, value in input.items(): + if isinstance(value, PIL.Image.Image): + # normalize doesn't support PIL images + continue + elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor)): + # normalize doesn't support integer images + value = F.to_dtype(value, torch.float32, scale=True) + adapted_input[key] = value + return adapted_input + + +class TestSmoke: + @pytest.mark.parametrize( + 
("transform", "adapter"), + [ + (transforms.RandomErasing(p=1.0), None), + (transforms.AugMix(), auto_augment_adapter), + (transforms.AutoAugment(), auto_augment_adapter), + (transforms.RandAugment(), auto_augment_adapter), + (transforms.TrivialAugmentWide(), auto_augment_adapter), + (transforms.ColorJitter(brightness=0.1, contrast=0.2, saturation=0.3, hue=0.15), None), + (transforms.Grayscale(), None), + (transforms.RandomAdjustSharpness(sharpness_factor=0.5, p=1.0), None), + (transforms.RandomAutocontrast(p=1.0), None), + (transforms.RandomEqualize(p=1.0), None), + (transforms.RandomGrayscale(p=1.0), None), + (transforms.RandomInvert(p=1.0), None), + (transforms.RandomChannelPermutation(), None), + (transforms.RandomPhotometricDistort(p=1.0), None), + (transforms.RandomPosterize(bits=4, p=1.0), None), + (transforms.RandomSolarize(threshold=0.5, p=1.0), None), + (transforms.CenterCrop([16, 16]), None), + (transforms.ElasticTransform(sigma=1.0), None), + (transforms.Pad(4), None), + (transforms.RandomAffine(degrees=30.0), None), + (transforms.RandomCrop([16, 16], pad_if_needed=True), None), + (transforms.RandomHorizontalFlip(p=1.0), None), + (transforms.RandomPerspective(p=1.0), None), + (transforms.RandomResize(min_size=10, max_size=20, antialias=True), None), + (transforms.RandomResizedCrop([16, 16], antialias=True), None), + (transforms.RandomRotation(degrees=30), None), + (transforms.RandomShortestSize(min_size=10, antialias=True), None), + (transforms.RandomVerticalFlip(p=1.0), None), + (transforms.RandomZoomOut(p=1.0), None), + (transforms.Resize([16, 16], antialias=True), None), + (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2), antialias=True), None), + (transforms.ClampBoundingBoxes(), None), + (transforms.ConvertBoundingBoxFormat(tv_tensors.BoundingBoxFormat.CXCYWH), None), + (transforms.ConvertImageDtype(), None), + (transforms.GaussianBlur(kernel_size=3), None), + ( + transforms.LinearTransformation( + # These are just dummy values that will be filled by the adapter. 
We can't define them upfront, + # because for we neither know the spatial size nor the device at this point + transformation_matrix=torch.empty((1, 1)), + mean_vector=torch.empty((1,)), + ), + linear_transformation_adapter, + ), + (transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), normalize_adapter), + (transforms.ToDtype(torch.float64), None), + (transforms.UniformTemporalSubsample(num_samples=2), None), + ], + ids=lambda transform: type(transform).__name__, + ) + @pytest.mark.parametrize("container_type", [dict, list, tuple]) + @pytest.mark.parametrize( + "image_or_video", + [ + make_image(), + make_video(), + next(make_pil_images(color_spaces=["RGB"])), + next(make_vanilla_tensor_images()), + ], + ) + @pytest.mark.parametrize("de_serialize", [lambda t: t, lambda t: pickle.loads(pickle.dumps(t))]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_common(self, transform, adapter, container_type, image_or_video, de_serialize, device): + transform = de_serialize(transform) + + canvas_size = F.get_size(image_or_video) + input = dict( + image_or_video=image_or_video, + image_tv_tensor=make_image(size=canvas_size), + video_tv_tensor=make_video(size=canvas_size), + image_pil=next(make_pil_images(sizes=[canvas_size], color_spaces=["RGB"])), + bounding_boxes_xyxy=make_bounding_boxes( + format=tv_tensors.BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(3,) + ), + bounding_boxes_xywh=make_bounding_boxes( + format=tv_tensors.BoundingBoxFormat.XYWH, canvas_size=canvas_size, batch_dims=(4,) + ), + bounding_boxes_cxcywh=make_bounding_boxes( + format=tv_tensors.BoundingBoxFormat.CXCYWH, canvas_size=canvas_size, batch_dims=(5,) + ), + bounding_boxes_degenerate_xyxy=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [2, 0, 1, 1], # x1 > x2, y1 < y2 + [0, 2, 1, 1], # x1 < x2, y1 > y2 + [2, 2, 1, 1], # x1 > x2, y1 > y2 + ], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=canvas_size, + ), + bounding_boxes_degenerate_xywh=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [0, 0, 1, -1], # negative height + [0, 0, -1, 1], # negative width + [0, 0, -1, -1], # negative height and width + ], + format=tv_tensors.BoundingBoxFormat.XYWH, + canvas_size=canvas_size, + ), + bounding_boxes_degenerate_cxcywh=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [0, 0, 1, -1], # negative height + [0, 0, -1, 1], # negative width + [0, 0, -1, -1], # negative height and width + ], + format=tv_tensors.BoundingBoxFormat.CXCYWH, + canvas_size=canvas_size, + ), + detection_mask=make_detection_mask(size=canvas_size), + segmentation_mask=make_segmentation_mask(size=canvas_size), + int=0, + float=0.0, + bool=True, + none=None, + str="str", + path=pathlib.Path.cwd(), + object=object(), + tensor=torch.empty(5), + array=np.empty(5), + ) + if adapter is not None: + input = adapter(transform, input, device) + + if container_type in {tuple, list}: + input = container_type(input.values()) + + input_flat, input_spec = tree_flatten(input) + input_flat = [item.to(device) if isinstance(item, torch.Tensor) else item for item in input_flat] + input = tree_unflatten(input_flat, input_spec) + + torch.manual_seed(0) + output = transform(input) + output_flat, output_spec = tree_flatten(output) + + assert output_spec == input_spec + + for output_item, input_item, 
should_be_transformed in zip( + output_flat, input_flat, transforms.Transform()._needs_transform_list(input_flat) + ): + if should_be_transformed: + assert type(output_item) is type(input_item) + else: + assert output_item is input_item + + if isinstance(input_item, tv_tensors.BoundingBoxes) and not isinstance( + transform, transforms.ConvertBoundingBoxFormat + ): + assert output_item.format == input_item.format + + # Enforce that the transform does not turn a degenerate box marked by RandomIoUCrop (or any other future + # transform that does this), back into a valid one. + # TODO: we should test that against all degenerate boxes above + for format in list(tv_tensors.BoundingBoxFormat): + sample = dict( + boxes=tv_tensors.BoundingBoxes([[0, 0, 0, 0]], format=format, canvas_size=(224, 244)), + labels=torch.tensor([3]), + ) + assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4) + + @parametrize( + [ + ( + transform, + itertools.chain.from_iterable( + fn( + color_spaces=[ + "GRAY", + "RGB", + ], + dtypes=[torch.uint8], + extra_dims=[(), (4,)], + **(dict(num_frames=[3]) if fn is make_videos else dict()), + ) + for fn in [ + make_images, + make_vanilla_tensor_images, + make_pil_images, + make_videos, + ] + ), + ) + for transform in ( + transforms.RandAugment(), + transforms.TrivialAugmentWide(), + transforms.AutoAugment(), + transforms.AugMix(), + ) + ] + ) + def test_auto_augment(self, transform, input): + transform(input) + + @parametrize( + [ + ( + transforms.Normalize(mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0]), + itertools.chain.from_iterable( + fn(color_spaces=["RGB"], dtypes=[torch.float32]) + for fn in [ + make_images, + make_vanilla_tensor_images, + make_videos, + ] + ), + ), + ] + ) + def test_normalize(self, transform, input): + transform(input) + + @parametrize( + [ + ( + transforms.RandomResizedCrop([16, 16], antialias=True), + itertools.chain( + make_images(extra_dims=[(4,)]), + make_vanilla_tensor_images(), + make_pil_images(), + make_videos(extra_dims=[()]), + ), + ) + ] + ) + def test_random_resized_crop(self, transform, input): + transform(input) + + +@pytest.mark.parametrize( + "flat_inputs", + itertools.permutations( + [ + next(make_vanilla_tensor_images()), + next(make_vanilla_tensor_images()), + next(make_pil_images()), + make_image(), + next(make_videos()), + ], + 3, + ), +) +def test_pure_tensor_heuristic(flat_inputs): + def split_on_pure_tensor(to_split): + # This takes a sequence that is structurally aligned with `flat_inputs` and splits its items into three parts: + # 1. The first pure tensor. If none is present, this will be `None` + # 2. A list of the remaining pure tensors + # 3. A list of all other items + pure_tensors = [] + others = [] + # Splitting always happens on the original `flat_inputs` to avoid any erroneous type changes by the transform to + # affect the splitting. 
+ for item, inpt in zip(to_split, flat_inputs): + (pure_tensors if is_pure_tensor(inpt) else others).append(item) + return pure_tensors[0] if pure_tensors else None, pure_tensors[1:], others + + class CopyCloneTransform(transforms.Transform): + def _transform(self, inpt, params): + return inpt.clone() if isinstance(inpt, torch.Tensor) else inpt.copy() + + @staticmethod + def was_applied(output, inpt): + identity = output is inpt + if identity: + return False + + # Make sure nothing fishy is going on + assert_equal(output, inpt) + return True + + first_pure_tensor_input, other_pure_tensor_inputs, other_inputs = split_on_pure_tensor(flat_inputs) + + transform = CopyCloneTransform() + transformed_sample = transform(flat_inputs) + + first_pure_tensor_output, other_pure_tensor_outputs, other_outputs = split_on_pure_tensor(transformed_sample) + + if first_pure_tensor_input is not None: + if other_inputs: + assert not transform.was_applied(first_pure_tensor_output, first_pure_tensor_input) + else: + assert transform.was_applied(first_pure_tensor_output, first_pure_tensor_input) + + for output, inpt in zip(other_pure_tensor_outputs, other_pure_tensor_inputs): + assert not transform.was_applied(output, inpt) + + for input, output in zip(other_inputs, other_outputs): + assert transform.was_applied(output, input) + + +class TestPad: + def test_assertions(self): + with pytest.raises(TypeError, match="Got inappropriate padding arg"): + transforms.Pad("abc") + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.Pad([-0.7, 0, 0.7]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.Pad(12, fill="abc") + + with pytest.raises(ValueError, match="Padding mode should be either"): + transforms.Pad(12, padding_mode="abc") + + +class TestRandomZoomOut: + def test_assertions(self): + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomZoomOut(fill="abc") + + with pytest.raises(TypeError, match="should be a sequence of length"): + transforms.RandomZoomOut(0, side_range=0) + + with pytest.raises(ValueError, match="Invalid canvas side range"): + transforms.RandomZoomOut(0, side_range=[4.0, 1.0]) + + @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) + @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) + def test__get_params(self, fill, side_range): + transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) + + h, w = size = (24, 32) + image = make_image(size) + + params = transform._get_params([image]) + + assert len(params["padding"]) == 4 + assert 0 <= params["padding"][0] <= (side_range[1] - 1) * w + assert 0 <= params["padding"][1] <= (side_range[1] - 1) * h + assert 0 <= params["padding"][2] <= (side_range[1] - 1) * w + assert 0 <= params["padding"][3] <= (side_range[1] - 1) * h + + +class TestRandomPerspective: + def test_assertions(self): + with pytest.raises(ValueError, match="Argument distortion_scale value should be between 0 and 1"): + transforms.RandomPerspective(distortion_scale=-1.0) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomPerspective(0.5, fill="abc") + + def test__get_params(self): + dscale = 0.5 + transform = transforms.RandomPerspective(dscale) + + image = make_image((24, 32)) + + params = transform._get_params([image]) + + assert "coefficients" in params + assert len(params["coefficients"]) == 8 + + +class TestElasticTransform: + def test_assertions(self): + + with pytest.raises(TypeError, match="alpha 
should be a number or a sequence of numbers"): + transforms.ElasticTransform({}) + + with pytest.raises(ValueError, match="alpha is a sequence its length should be 1 or 2"): + transforms.ElasticTransform([1.0, 2.0, 3.0]) + + with pytest.raises(TypeError, match="sigma should be a number or a sequence of numbers"): + transforms.ElasticTransform(1.0, {}) + + with pytest.raises(ValueError, match="sigma is a sequence its length should be 1 or 2"): + transforms.ElasticTransform(1.0, [1.0, 2.0, 3.0]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.ElasticTransform(1.0, 2.0, fill="abc") + + def test__get_params(self): + alpha = 2.0 + sigma = 3.0 + transform = transforms.ElasticTransform(alpha, sigma) + + h, w = size = (24, 32) + image = make_image(size) + + params = transform._get_params([image]) + + displacement = params["displacement"] + assert displacement.shape == (1, h, w, 2) + assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() + assert (-alpha / h <= displacement[0, ..., 1]).all() and (displacement[0, ..., 1] <= alpha / h).all() + + +class TestTransform: + @pytest.mark.parametrize( + "inpt_type", + [torch.Tensor, PIL.Image.Image, tv_tensors.Image, np.ndarray, tv_tensors.BoundingBoxes, str, int], + ) + def test_check_transformed_types(self, inpt_type, mocker): + # This test ensures that we correctly handle which types to transform and which to bypass + t = transforms.Transform() + inpt = mocker.MagicMock(spec=inpt_type) + + if inpt_type in (np.ndarray, str, int): + output = t(inpt) + assert output is inpt + else: + with pytest.raises(NotImplementedError): + t(inpt) + + +class TestToImage: + @pytest.mark.parametrize( + "inpt_type", + [torch.Tensor, PIL.Image.Image, tv_tensors.Image, np.ndarray, tv_tensors.BoundingBoxes, str, int], + ) + def test__transform(self, inpt_type, mocker): + fn = mocker.patch( + "torchvision.transforms.v2.functional.to_image", + return_value=torch.rand(1, 3, 8, 8), + ) + + inpt = mocker.MagicMock(spec=inpt_type) + transform = transforms.ToImage() + transform(inpt) + if inpt_type in (tv_tensors.BoundingBoxes, tv_tensors.Image, str, int): + assert fn.call_count == 0 + else: + fn.assert_called_once_with(inpt) + + +class TestToPILImage: + @pytest.mark.parametrize( + "inpt_type", + [torch.Tensor, PIL.Image.Image, tv_tensors.Image, np.ndarray, tv_tensors.BoundingBoxes, str, int], + ) + def test__transform(self, inpt_type, mocker): + fn = mocker.patch("torchvision.transforms.v2.functional.to_pil_image") + + inpt = mocker.MagicMock(spec=inpt_type) + transform = transforms.ToPILImage() + transform(inpt) + if inpt_type in (PIL.Image.Image, tv_tensors.BoundingBoxes, str, int): + assert fn.call_count == 0 + else: + fn.assert_called_once_with(inpt, mode=transform.mode) + + +class TestToTensor: + @pytest.mark.parametrize( + "inpt_type", + [torch.Tensor, PIL.Image.Image, tv_tensors.Image, np.ndarray, tv_tensors.BoundingBoxes, str, int], + ) + def test__transform(self, inpt_type, mocker): + fn = mocker.patch("torchvision.transforms.functional.to_tensor") + + inpt = mocker.MagicMock(spec=inpt_type) + with pytest.warns(UserWarning, match="deprecated and will be removed"): + transform = transforms.ToTensor() + transform(inpt) + if inpt_type in (tv_tensors.Image, torch.Tensor, tv_tensors.BoundingBoxes, str, int): + assert fn.call_count == 0 + else: + fn.assert_called_once_with(inpt) + + +class TestContainers: + @pytest.mark.parametrize("transform_cls", [transforms.Compose, transforms.RandomChoice, 
transforms.RandomOrder]) + def test_assertions(self, transform_cls): + with pytest.raises(TypeError, match="Argument transforms should be a sequence of callables"): + transform_cls(transforms.RandomCrop(28)) + + @pytest.mark.parametrize("transform_cls", [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder]) + @pytest.mark.parametrize( + "trfms", + [ + [transforms.Pad(2), transforms.RandomCrop(28)], + [lambda x: 2.0 * x, transforms.Pad(2), transforms.RandomCrop(28)], + [transforms.Pad(2), lambda x: 2.0 * x, transforms.RandomCrop(28)], + ], + ) + def test_ctor(self, transform_cls, trfms): + c = transform_cls(trfms) + inpt = torch.rand(1, 3, 32, 32) + output = c(inpt) + assert isinstance(output, torch.Tensor) + assert output.ndim == 4 + + +class TestRandomChoice: + def test_assertions(self): + with pytest.raises(ValueError, match="Length of p doesn't match the number of transforms"): + transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1]) + + +class TestRandomIoUCrop: + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("options", [[0.5, 0.9], [2.0]]) + def test__get_params(self, device, options): + orig_h, orig_w = size = (24, 32) + image = make_image(size) + bboxes = tv_tensors.BoundingBoxes( + torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), + format="XYXY", + canvas_size=size, + device=device, + ) + sample = [image, bboxes] + + transform = transforms.RandomIoUCrop(sampler_options=options) + + n_samples = 5 + for _ in range(n_samples): + + params = transform._get_params(sample) + + if options == [2.0]: + assert len(params) == 0 + return + + assert len(params["is_within_crop_area"]) > 0 + assert params["is_within_crop_area"].dtype == torch.bool + + assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) + assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) + + left, top = params["left"], params["top"] + new_h, new_w = params["height"], params["width"] + ious = box_iou( + bboxes, + torch.tensor([[left, top, left + new_w, top + new_h]], dtype=bboxes.dtype, device=bboxes.device), + ) + assert ious.max() >= options[0] or ious.max() >= options[1], f"{ious} vs {options}" + + def test__transform_empty_params(self, mocker): + transform = transforms.RandomIoUCrop(sampler_options=[2.0]) + image = tv_tensors.Image(torch.rand(1, 3, 4, 4)) + bboxes = tv_tensors.BoundingBoxes(torch.tensor([[1, 1, 2, 2]]), format="XYXY", canvas_size=(4, 4)) + label = torch.tensor([1]) + sample = [image, bboxes, label] + # Let's mock transform._get_params to control the output: + transform._get_params = mocker.MagicMock(return_value={}) + output = transform(sample) + torch.testing.assert_close(output, sample) + + def test_forward_assertion(self): + transform = transforms.RandomIoUCrop() + with pytest.raises( + TypeError, + match="requires input sample to contain tensor or PIL images and bounding boxes", + ): + transform(torch.tensor(0)) + + def test__transform(self, mocker): + transform = transforms.RandomIoUCrop() + + size = (32, 24) + image = make_image(size) + bboxes = make_bounding_boxes(format="XYXY", canvas_size=size, batch_dims=(6,)) + masks = make_detection_mask(size, num_objects=6) + + sample = [image, bboxes, masks] + + is_within_crop_area = torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool) + + params = dict(top=1, left=2, height=12, width=12, is_within_crop_area=is_within_crop_area) + transform._get_params = 
mocker.MagicMock(return_value=params) + output = transform(sample) + + # check number of bboxes vs number of labels: + output_bboxes = output[1] + assert isinstance(output_bboxes, tv_tensors.BoundingBoxes) + assert (output_bboxes[~is_within_crop_area] == 0).all() + + output_masks = output[2] + assert isinstance(output_masks, tv_tensors.Mask) + + +class TestScaleJitter: + def test__get_params(self): + canvas_size = (24, 32) + target_size = (16, 12) + scale_range = (0.5, 1.5) + + transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) + + sample = make_image(canvas_size) + + n_samples = 5 + for _ in range(n_samples): + + params = transform._get_params([sample]) + + assert "size" in params + size = params["size"] + + assert isinstance(size, tuple) and len(size) == 2 + height, width = size + + r_min = min(target_size[1] / canvas_size[0], target_size[0] / canvas_size[1]) * scale_range[0] + r_max = min(target_size[1] / canvas_size[0], target_size[0] / canvas_size[1]) * scale_range[1] + + assert int(canvas_size[0] * r_min) <= height <= int(canvas_size[0] * r_max) + assert int(canvas_size[1] * r_min) <= width <= int(canvas_size[1] * r_max) + + +class TestRandomShortestSize: + @pytest.mark.parametrize("min_size,max_size", [([5, 9], 20), ([5, 9], None)]) + def test__get_params(self, min_size, max_size): + canvas_size = (3, 10) + + transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size, antialias=True) + + sample = make_image(canvas_size) + params = transform._get_params([sample]) + + assert "size" in params + size = params["size"] + + assert isinstance(size, tuple) and len(size) == 2 + + longer = max(size) + shorter = min(size) + if max_size is not None: + assert longer <= max_size + assert shorter <= max_size + else: + assert shorter in min_size + + +class TestLinearTransformation: + def test_assertions(self): + with pytest.raises(ValueError, match="transformation_matrix should be square"): + transforms.LinearTransformation(torch.rand(2, 3), torch.rand(5)) + + with pytest.raises(ValueError, match="mean_vector should have the same length"): + transforms.LinearTransformation(torch.rand(3, 3), torch.rand(5)) + + @pytest.mark.parametrize( + "inpt", + [ + 122 * torch.ones(1, 3, 8, 8), + 122.0 * torch.ones(1, 3, 8, 8), + tv_tensors.Image(122 * torch.ones(1, 3, 8, 8)), + PIL.Image.new("RGB", (8, 8), (122, 122, 122)), + ], + ) + def test__transform(self, inpt): + + v = 121 * torch.ones(3 * 8 * 8) + m = torch.ones(3 * 8 * 8, 3 * 8 * 8) + transform = transforms.LinearTransformation(m, v) + + if isinstance(inpt, PIL.Image.Image): + with pytest.raises(TypeError, match="does not support PIL images"): + transform(inpt) + else: + output = transform(inpt) + assert isinstance(output, torch.Tensor) + assert output.unique() == 3 * 8 * 8 + assert output.dtype == inpt.dtype + + +class TestRandomResize: + def test__get_params(self): + min_size = 3 + max_size = 6 + + transform = transforms.RandomResize(min_size=min_size, max_size=max_size, antialias=True) + + for _ in range(10): + params = transform._get_params([]) + + assert isinstance(params["size"], list) and len(params["size"]) == 1 + size = params["size"][0] + + assert min_size <= size < max_size + + +class TestUniformTemporalSubsample: + @pytest.mark.parametrize( + "inpt", + [ + torch.zeros(10, 3, 8, 8), + torch.zeros(1, 10, 3, 8, 8), + tv_tensors.Video(torch.zeros(1, 10, 3, 8, 8)), + ], + ) + def test__transform(self, inpt): + num_samples = 5 + transform = transforms.UniformTemporalSubsample(num_samples) + + 
output = transform(inpt) + assert type(output) is type(inpt) + assert output.shape[-4] == num_samples + assert output.dtype == inpt.dtype + + +# TODO: remove this test in 0.17 when the default of antialias changes to True +def test_antialias_warning(): + pil_img = PIL.Image.new("RGB", size=(10, 10), color=127) + tensor_img = torch.randint(0, 256, size=(3, 10, 10), dtype=torch.uint8) + tensor_video = torch.randint(0, 256, size=(2, 3, 10, 10), dtype=torch.uint8) + + match = "The default value of the antialias parameter" + with pytest.warns(UserWarning, match=match): + transforms.RandomResizedCrop((20, 20))(tensor_img) + with pytest.warns(UserWarning, match=match): + transforms.ScaleJitter((20, 20))(tensor_img) + with pytest.warns(UserWarning, match=match): + transforms.RandomShortestSize((20, 20))(tensor_img) + with pytest.warns(UserWarning, match=match): + transforms.RandomResize(10, 20)(tensor_img) + + with pytest.warns(UserWarning, match=match): + F.resized_crop(tv_tensors.Image(tensor_img), 0, 0, 10, 10, (20, 20)) + + with pytest.warns(UserWarning, match=match): + F.resize(tv_tensors.Video(tensor_video), (20, 20)) + with pytest.warns(UserWarning, match=match): + F.resized_crop(tv_tensors.Video(tensor_video), 0, 0, 10, 10, (20, 20)) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + transforms.RandomResizedCrop((20, 20))(pil_img) + transforms.ScaleJitter((20, 20))(pil_img) + transforms.RandomShortestSize((20, 20))(pil_img) + transforms.RandomResize(10, 20)(pil_img) + + transforms.RandomResizedCrop((20, 20), antialias=True)(tensor_img) + transforms.ScaleJitter((20, 20), antialias=True)(tensor_img) + transforms.RandomShortestSize((20, 20), antialias=True)(tensor_img) + transforms.RandomResize(10, 20, antialias=True)(tensor_img) + + F.resized_crop(tv_tensors.Image(tensor_img), 0, 0, 10, 10, (20, 20), antialias=True) + F.resized_crop(tv_tensors.Video(tensor_video), 0, 0, 10, 10, (20, 20), antialias=True) + + +@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, tv_tensors.Image)) +@pytest.mark.parametrize("label_type", (torch.Tensor, int)) +@pytest.mark.parametrize("dataset_return_type", (dict, tuple)) +@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImage)) +def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): + + image = tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 250, 250), dtype=torch.uint8)) + if image_type is PIL.Image: + image = to_pil_image(image[0]) + elif image_type is torch.Tensor: + image = image.as_subclass(torch.Tensor) + assert is_pure_tensor(image) + + label = 1 if label_type is int else torch.tensor([1]) + + if dataset_return_type is dict: + sample = { + "image": image, + "label": label, + } + else: + sample = image, label + + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + + t = transforms.Compose( + [ + transforms.RandomResizedCrop((224, 224), antialias=True), + transforms.RandomHorizontalFlip(p=1), + transforms.RandAugment(), + transforms.TrivialAugmentWide(), + transforms.AugMix(), + transforms.AutoAugment(), + to_tensor, + # TODO: ConvertImageDtype is a pass-through on PIL images, is that + # intended? This results in a failure if we convert to tensor after + # it, because the image would still be uint8 which make Normalize + # fail. 
+ transforms.ConvertImageDtype(torch.float), + transforms.Normalize(mean=[0, 0, 0], std=[1, 1, 1]), + transforms.RandomErasing(p=1), + ] + ) + + out = t(sample) + + assert type(out) == type(sample) + + if dataset_return_type is tuple: + out_image, out_label = out + else: + assert out.keys() == sample.keys() + out_image, out_label = out.values() + + assert out_image.shape[-2:] == (224, 224) + assert out_label == label + + +@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, tv_tensors.Image)) +@pytest.mark.parametrize("data_augmentation", ("hflip", "lsj", "multiscale", "ssd", "ssdlite")) +@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImage)) +@pytest.mark.parametrize("sanitize", (True, False)) +def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): + torch.manual_seed(0) + + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + + if data_augmentation == "hflip": + t = [ + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "lsj": + t = [ + transforms.ScaleJitter(target_size=(1024, 1024), antialias=True), + # Note: replaced FixedSizeCrop with RandomCrop, because we're + # leaving FixedSizeCrop in prototype for now, and it expects Label + # classes which we won't release yet. + # transforms.FixedSizeCrop( + # size=(1024, 1024), fill=defaultdict(lambda: (123.0, 117.0, 104.0), {tv_tensors.Mask: 0}) + # ), + transforms.RandomCrop((1024, 1024), pad_if_needed=True), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "multiscale": + t = [ + transforms.RandomShortestSize( + min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333, antialias=True + ), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "ssd": + t = [ + transforms.RandomPhotometricDistort(p=1), + transforms.RandomZoomOut(fill={"others": (123.0, 117.0, 104.0), tv_tensors.Mask: 0}, p=1), + transforms.RandomIoUCrop(), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "ssdlite": + t = [ + transforms.RandomIoUCrop(), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + if sanitize: + t += [transforms.SanitizeBoundingBoxes()] + t = transforms.Compose(t) + + num_boxes = 5 + H = W = 250 + + image = tv_tensors.Image(torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8)) + if image_type is PIL.Image: + image = to_pil_image(image[0]) + elif image_type is torch.Tensor: + image = image.as_subclass(torch.Tensor) + assert is_pure_tensor(image) + + label = torch.randint(0, 10, size=(num_boxes,)) + + boxes = torch.randint(0, min(H, W) // 2, size=(num_boxes, 4)) + boxes[:, 2:] += boxes[:, :2] + boxes = boxes.clamp(min=0, max=min(H, W)) + boxes = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=(H, W)) + + masks = tv_tensors.Mask(torch.randint(0, 2, size=(num_boxes, H, W), dtype=torch.uint8)) + + sample = { + "image": image, + "label": label, + "boxes": boxes, + "masks": masks, + } + + out = t(sample) + + if isinstance(to_tensor, transforms.ToTensor) and image_type is not tv_tensors.Image: + assert is_pure_tensor(out["image"]) + else: + assert isinstance(out["image"],
tv_tensors.Image) + assert isinstance(out["label"], type(sample["label"])) + + num_boxes_expected = { + # ssd and ssdlite contain RandomIoUCrop which may "remove" some bbox. It + # doesn't remove them strictly speaking, it just marks some boxes as + # degenerate and those boxes will be later removed by + # SanitizeBoundingBoxes(), which we add to the pipelines if the sanitize + # param is True. + # Note that the values below are probably specific to the random seed + # set above (which is fine). + (True, "ssd"): 5, + (True, "ssdlite"): 4, + }.get((sanitize, data_augmentation), num_boxes) + + assert out["boxes"].shape[0] == out["masks"].shape[0] == out["label"].shape[0] == num_boxes_expected + + +@pytest.mark.parametrize("min_size", (1, 10)) +@pytest.mark.parametrize("labels_getter", ("default", lambda inputs: inputs["labels"], None, lambda inputs: None)) +@pytest.mark.parametrize("sample_type", (tuple, dict)) +def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): + + if sample_type is tuple and not isinstance(labels_getter, str): + # The "lambda inputs: inputs["labels"]" labels_getter used in this test + # doesn't work if the input is a tuple. + return + + H, W = 256, 128 + + boxes_and_validity = [ + ([0, 1, 10, 1], False), # Y1 == Y2 + ([0, 1, 0, 20], False), # X1 == X2 + ([0, 0, min_size - 1, 10], False), # H < min_size + ([0, 0, 10, min_size - 1], False), # W < min_size + ([0, 0, 10, H + 1], False), # Y2 > H + ([0, 0, W + 1, 10], False), # X2 > W + ([-1, 1, 10, 20], False), # any < 0 + ([0, 0, -1, 20], False), # any < 0 + ([0, 0, -10, -1], False), # any < 0 + ([0, 0, min_size, 10], True), # H < min_size + ([0, 0, 10, min_size], True), # W < min_size + ([0, 0, W, H], True), # TODO: Is that actually OK?? Should it be -1? + ([1, 1, 30, 20], True), + ([0, 0, 10, 10], True), + ([1, 1, 30, 20], True), + ] + + random.shuffle(boxes_and_validity) # For test robustness: mix order of wrong and correct cases + boxes, is_valid_mask = zip(*boxes_and_validity) + valid_indices = [i for (i, is_valid) in enumerate(is_valid_mask) if is_valid] + + boxes = torch.tensor(boxes) + labels = torch.arange(boxes.shape[0]) + + boxes = tv_tensors.BoundingBoxes( + boxes, + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=(H, W), + ) + + masks = tv_tensors.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) + whatever = torch.rand(10) + input_img = torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8) + sample = { + "image": input_img, + "labels": labels, + "boxes": boxes, + "whatever": whatever, + "None": None, + "masks": masks, + } + + if sample_type is tuple: + img = sample.pop("image") + sample = (img, sample) + + out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample) + + if sample_type is tuple: + out_image = out[0] + out_labels = out[1]["labels"] + out_boxes = out[1]["boxes"] + out_masks = out[1]["masks"] + out_whatever = out[1]["whatever"] + else: + out_image = out["image"] + out_labels = out["labels"] + out_boxes = out["boxes"] + out_masks = out["masks"] + out_whatever = out["whatever"] + + assert out_image is input_img + assert out_whatever is whatever + + assert isinstance(out_boxes, tv_tensors.BoundingBoxes) + assert isinstance(out_masks, tv_tensors.Mask) + + if labels_getter is None or (callable(labels_getter) and labels_getter({"labels": "blah"}) is None): + assert out_labels is labels + else: + assert isinstance(out_labels, torch.Tensor) + assert out_boxes.shape[0] == out_labels.shape[0] == out_masks.shape[0] + # This works because 
we conveniently set labels to arange(num_boxes) + assert out_labels.tolist() == valid_indices + + +def test_sanitize_bounding_boxes_no_label(): + # Non-regression test for https://github.com/pytorch/vision/issues/7878 + + img = make_image() + boxes = make_bounding_boxes() + + with pytest.raises(ValueError, match="or a two-tuple whose second item is a dict"): + transforms.SanitizeBoundingBoxes()(img, boxes) + + out_img, out_boxes = transforms.SanitizeBoundingBoxes(labels_getter=None)(img, boxes) + assert isinstance(out_img, tv_tensors.Image) + assert isinstance(out_boxes, tv_tensors.BoundingBoxes) + + +def test_sanitize_bounding_boxes_errors(): + + good_bbox = tv_tensors.BoundingBoxes( + [[0, 0, 10, 10]], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=(20, 20), + ) + + with pytest.raises(ValueError, match="min_size must be >= 1"): + transforms.SanitizeBoundingBoxes(min_size=0) + with pytest.raises(ValueError, match="labels_getter should either be 'default'"): + transforms.SanitizeBoundingBoxes(labels_getter=12) + + with pytest.raises(ValueError, match="Could not infer where the labels are"): + bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])} + transforms.SanitizeBoundingBoxes()(bad_labels_key) + + with pytest.raises(ValueError, match="must be a tensor"): + not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()} + transforms.SanitizeBoundingBoxes()(not_a_tensor) + + with pytest.raises(ValueError, match="Number of boxes"): + different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)} + transforms.SanitizeBoundingBoxes()(different_sizes) + + +class TestLambda: + inputs = pytest.mark.parametrize("input", [object(), torch.empty(()), np.empty(()), "string", 1, 0.0]) + + @inputs + def test_default(self, input): + was_applied = False + + def was_applied_fn(input): + nonlocal was_applied + was_applied = True + return input + + transform = transforms.Lambda(was_applied_fn) + + transform(input) + + assert was_applied + + @inputs + def test_with_types(self, input): + was_applied = False + + def was_applied_fn(input): + nonlocal was_applied + was_applied = True + return input + + types = (torch.Tensor, np.ndarray) + transform = transforms.Lambda(was_applied_fn, *types) + + transform(input) + + assert was_applied is isinstance(input, types) diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py new file mode 100644 index 0000000000000000000000000000000000000000..1f47eb2117f7fef887d179ae08663d82c739eddd --- /dev/null +++ b/test/test_transforms_v2_consistency.py @@ -0,0 +1,1254 @@ +import importlib.machinery +import importlib.util +import inspect +import random +import re +from pathlib import Path + +import numpy as np +import PIL.Image +import pytest + +import torch +import torchvision.transforms.v2 as v2_transforms +from common_utils import assert_close, assert_equal, set_rng_seed +from torch import nn +from torchvision import transforms as legacy_transforms, tv_tensors +from torchvision._utils import sequence_to_str + +from torchvision.transforms import functional as legacy_F +from torchvision.transforms.v2 import functional as prototype_F +from torchvision.transforms.v2._utils import _get_fill, query_size +from torchvision.transforms.v2.functional import to_pil_image +from transforms_v2_legacy_utils import ( + ArgsKwargs, + make_bounding_boxes, + make_detection_mask, + make_image, + make_images, + make_segmentation_mask, +) + +DEFAULT_MAKE_IMAGES_KWARGS = 
dict(color_spaces=["RGB"], extra_dims=[(4,)]) + + +@pytest.fixture(autouse=True) +def fix_rng_seed(): + set_rng_seed(0) + yield + + +class NotScriptableArgsKwargs(ArgsKwargs): + """ + This class is used to mark parameters that render the transform non-scriptable. They still work in eager mode and + thus will be tested there, but will be skipped by the JIT tests. + """ + + pass + + +class ConsistencyConfig: + def __init__( + self, + prototype_cls, + legacy_cls, + # If no args_kwargs is passed, only the signature will be checked + args_kwargs=(), + make_images_kwargs=None, + supports_pil=True, + removed_params=(), + closeness_kwargs=None, + ): + self.prototype_cls = prototype_cls + self.legacy_cls = legacy_cls + self.args_kwargs = args_kwargs + self.make_images_kwargs = make_images_kwargs or DEFAULT_MAKE_IMAGES_KWARGS + self.supports_pil = supports_pil + self.removed_params = removed_params + self.closeness_kwargs = closeness_kwargs or dict(rtol=0, atol=0) + + +# These are here since both the prototype and legacy transform need to be constructed with the same random parameters +LINEAR_TRANSFORMATION_MEAN = torch.rand(36) +LINEAR_TRANSFORMATION_MATRIX = torch.rand([LINEAR_TRANSFORMATION_MEAN.numel()] * 2) + +CONSISTENCY_CONFIGS = [ + ConsistencyConfig( + v2_transforms.Normalize, + legacy_transforms.Normalize, + [ + ArgsKwargs(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), + ], + supports_pil=False, + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[torch.float]), + ), + ConsistencyConfig( + v2_transforms.CenterCrop, + legacy_transforms.CenterCrop, + [ + ArgsKwargs(18), + ArgsKwargs((18, 13)), + ], + ), + ConsistencyConfig( + v2_transforms.FiveCrop, + legacy_transforms.FiveCrop, + [ + ArgsKwargs(18), + ArgsKwargs((18, 13)), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(20, 19)]), + ), + ConsistencyConfig( + v2_transforms.TenCrop, + legacy_transforms.TenCrop, + [ + ArgsKwargs(18), + ArgsKwargs((18, 13)), + ArgsKwargs(18, vertical_flip=True), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(20, 19)]), + ), + ConsistencyConfig( + v2_transforms.Pad, + legacy_transforms.Pad, + [ + NotScriptableArgsKwargs(3), + ArgsKwargs([3]), + ArgsKwargs([2, 3]), + ArgsKwargs([3, 2, 1, 4]), + NotScriptableArgsKwargs(5, fill=1, padding_mode="constant"), + ArgsKwargs([5], fill=1, padding_mode="constant"), + NotScriptableArgsKwargs(5, padding_mode="edge"), + NotScriptableArgsKwargs(5, padding_mode="reflect"), + NotScriptableArgsKwargs(5, padding_mode="symmetric"), + ], + ), + *[ + ConsistencyConfig( + v2_transforms.LinearTransformation, + legacy_transforms.LinearTransformation, + [ + ArgsKwargs(LINEAR_TRANSFORMATION_MATRIX.to(matrix_dtype), LINEAR_TRANSFORMATION_MEAN.to(matrix_dtype)), + ], + # Make sure that the product of the height, width and number of channels matches the number of elements in + # `LINEAR_TRANSFORMATION_MEAN`. For example 2 * 6 * 3 == 4 * 3 * 3 == 36. 
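+            # LinearTransformation flattens every image to a vector of C * H * W elements before
+            # multiplying it with the (36, 36) matrix, which is why the sizes are constrained here.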
+ make_images_kwargs=dict( + DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(2, 6), (4, 3)], color_spaces=["RGB"], dtypes=[image_dtype] + ), + supports_pil=False, + ) + for matrix_dtype, image_dtype in [ + (torch.float32, torch.float32), + (torch.float64, torch.float64), + (torch.float32, torch.uint8), + (torch.float64, torch.float32), + (torch.float32, torch.float64), + ] + ], + ConsistencyConfig( + v2_transforms.Grayscale, + legacy_transforms.Grayscale, + [ + ArgsKwargs(num_output_channels=1), + ArgsKwargs(num_output_channels=3), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, color_spaces=["RGB", "GRAY"]), + # Use default tolerances of `torch.testing.assert_close` + closeness_kwargs=dict(rtol=None, atol=None), + ), + ConsistencyConfig( + v2_transforms.ToPILImage, + legacy_transforms.ToPILImage, + [NotScriptableArgsKwargs()], + make_images_kwargs=dict( + color_spaces=[ + "GRAY", + "GRAY_ALPHA", + "RGB", + "RGBA", + ], + extra_dims=[()], + ), + supports_pil=False, + ), + ConsistencyConfig( + v2_transforms.Lambda, + legacy_transforms.Lambda, + [ + NotScriptableArgsKwargs(lambda image: image / 2), + ], + # Technically, this also supports PIL, but it is overkill to write a function here that supports tensor and PIL + # images given that the transform does nothing but call it anyway. + supports_pil=False, + ), + ConsistencyConfig( + v2_transforms.RandomEqualize, + legacy_transforms.RandomEqualize, + [ + ArgsKwargs(p=0), + ArgsKwargs(p=1), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[torch.uint8]), + ), + ConsistencyConfig( + v2_transforms.RandomInvert, + legacy_transforms.RandomInvert, + [ + ArgsKwargs(p=0), + ArgsKwargs(p=1), + ], + ), + ConsistencyConfig( + v2_transforms.RandomPosterize, + legacy_transforms.RandomPosterize, + [ + ArgsKwargs(p=0, bits=5), + ArgsKwargs(p=1, bits=1), + ArgsKwargs(p=1, bits=3), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[torch.uint8]), + ), + ConsistencyConfig( + v2_transforms.RandomSolarize, + legacy_transforms.RandomSolarize, + [ + ArgsKwargs(p=0, threshold=0.5), + ArgsKwargs(p=1, threshold=0.3), + ArgsKwargs(p=1, threshold=0.99), + ], + ), + *[ + ConsistencyConfig( + v2_transforms.RandomAutocontrast, + legacy_transforms.RandomAutocontrast, + [ + ArgsKwargs(p=0), + ArgsKwargs(p=1), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[dt]), + closeness_kwargs=ckw, + ) + for dt, ckw in [(torch.uint8, dict(atol=1, rtol=0)), (torch.float32, dict(rtol=None, atol=None))] + ], + ConsistencyConfig( + v2_transforms.RandomAdjustSharpness, + legacy_transforms.RandomAdjustSharpness, + [ + ArgsKwargs(p=0, sharpness_factor=0.5), + ArgsKwargs(p=1, sharpness_factor=0.2), + ArgsKwargs(p=1, sharpness_factor=0.99), + ], + closeness_kwargs={"atol": 1e-6, "rtol": 1e-6}, + ), + ConsistencyConfig( + v2_transforms.RandomGrayscale, + legacy_transforms.RandomGrayscale, + [ + ArgsKwargs(p=0), + ArgsKwargs(p=1), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, color_spaces=["RGB", "GRAY"]), + # Use default tolerances of `torch.testing.assert_close` + closeness_kwargs=dict(rtol=None, atol=None), + ), + ConsistencyConfig( + v2_transforms.RandomResizedCrop, + legacy_transforms.RandomResizedCrop, + [ + ArgsKwargs(16), + ArgsKwargs(17, scale=(0.3, 0.7)), + ArgsKwargs(25, ratio=(0.5, 1.5)), + ArgsKwargs((31, 28), interpolation=v2_transforms.InterpolationMode.NEAREST), + ArgsKwargs((31, 28), interpolation=PIL.Image.NEAREST), + ArgsKwargs((29, 32), antialias=False), + ArgsKwargs((28, 31), antialias=True), + ], + # atol=1 due to 
Resize v2 is using native uint8 interpolate path for bilinear and nearest modes + closeness_kwargs=dict(rtol=0, atol=1), + ), + ConsistencyConfig( + v2_transforms.RandomResizedCrop, + legacy_transforms.RandomResizedCrop, + [ + ArgsKwargs((33, 26), interpolation=v2_transforms.InterpolationMode.BICUBIC, antialias=True), + ArgsKwargs((33, 26), interpolation=PIL.Image.BICUBIC, antialias=True), + ], + closeness_kwargs=dict(rtol=0, atol=21), + ), + ConsistencyConfig( + v2_transforms.ColorJitter, + legacy_transforms.ColorJitter, + [ + ArgsKwargs(), + ArgsKwargs(brightness=0.1), + ArgsKwargs(brightness=(0.2, 0.3)), + ArgsKwargs(contrast=0.4), + ArgsKwargs(contrast=(0.5, 0.6)), + ArgsKwargs(saturation=0.7), + ArgsKwargs(saturation=(0.8, 0.9)), + ArgsKwargs(hue=0.3), + ArgsKwargs(hue=(-0.1, 0.2)), + ArgsKwargs(brightness=0.1, contrast=0.4, saturation=0.5, hue=0.3), + ], + closeness_kwargs={"atol": 1e-5, "rtol": 1e-5}, + ), + ConsistencyConfig( + v2_transforms.GaussianBlur, + legacy_transforms.GaussianBlur, + [ + ArgsKwargs(kernel_size=3), + ArgsKwargs(kernel_size=(1, 5)), + ArgsKwargs(kernel_size=3, sigma=0.7), + ArgsKwargs(kernel_size=5, sigma=(0.3, 1.4)), + ], + closeness_kwargs={"rtol": 1e-5, "atol": 1e-5}, + ), + ConsistencyConfig( + v2_transforms.RandomPerspective, + legacy_transforms.RandomPerspective, + [ + ArgsKwargs(p=0), + ArgsKwargs(p=1), + ArgsKwargs(p=1, distortion_scale=0.3), + ArgsKwargs(p=1, distortion_scale=0.2, interpolation=v2_transforms.InterpolationMode.NEAREST), + ArgsKwargs(p=1, distortion_scale=0.2, interpolation=PIL.Image.NEAREST), + ArgsKwargs(p=1, distortion_scale=0.1, fill=1), + ArgsKwargs(p=1, distortion_scale=0.4, fill=(1, 2, 3)), + ], + closeness_kwargs={"atol": None, "rtol": None}, + ), + ConsistencyConfig( + v2_transforms.PILToTensor, + legacy_transforms.PILToTensor, + ), + ConsistencyConfig( + v2_transforms.ToTensor, + legacy_transforms.ToTensor, + ), + ConsistencyConfig( + v2_transforms.Compose, + legacy_transforms.Compose, + ), + ConsistencyConfig( + v2_transforms.RandomApply, + legacy_transforms.RandomApply, + ), + ConsistencyConfig( + v2_transforms.RandomChoice, + legacy_transforms.RandomChoice, + ), + ConsistencyConfig( + v2_transforms.RandomOrder, + legacy_transforms.RandomOrder, + ), + ConsistencyConfig( + v2_transforms.AugMix, + legacy_transforms.AugMix, + ), + ConsistencyConfig( + v2_transforms.AutoAugment, + legacy_transforms.AutoAugment, + ), + ConsistencyConfig( + v2_transforms.RandAugment, + legacy_transforms.RandAugment, + ), + ConsistencyConfig( + v2_transforms.TrivialAugmentWide, + legacy_transforms.TrivialAugmentWide, + ), +] + + +@pytest.mark.parametrize("config", CONSISTENCY_CONFIGS, ids=lambda config: config.legacy_cls.__name__) +def test_signature_consistency(config): + legacy_params = dict(inspect.signature(config.legacy_cls).parameters) + prototype_params = dict(inspect.signature(config.prototype_cls).parameters) + + for param in config.removed_params: + legacy_params.pop(param, None) + + missing = legacy_params.keys() - prototype_params.keys() + if missing: + raise AssertionError( + f"The prototype transform does not support the parameters " + f"{sequence_to_str(sorted(missing), separate_last='and ')}, but the legacy transform does. " + f"If that is intentional, e.g. pending deprecation, please add the parameters to the `removed_params` on " + f"the `ConsistencyConfig`." 
+ ) + + extra = prototype_params.keys() - legacy_params.keys() + extra_without_default = { + param + for param in extra + if prototype_params[param].default is inspect.Parameter.empty + and prototype_params[param].kind not in {inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD} + } + if extra_without_default: + raise AssertionError( + f"The prototype transform requires the parameters " + f"{sequence_to_str(sorted(extra_without_default), separate_last='and ')}, but the legacy transform does " + f"not. Please add a default value." + ) + + legacy_signature = list(legacy_params.keys()) + # Since we made sure that we don't have any extra parameters without default above, we clamp the prototype signature + # to the same number of parameters as the legacy one + prototype_signature = list(prototype_params.keys())[: len(legacy_signature)] + + assert prototype_signature == legacy_signature + + +def check_call_consistency( + prototype_transform, legacy_transform, images=None, supports_pil=True, closeness_kwargs=None +): + if images is None: + images = make_images(**DEFAULT_MAKE_IMAGES_KWARGS) + + closeness_kwargs = closeness_kwargs or dict() + + for image in images: + image_repr = f"[{tuple(image.shape)}, {str(image.dtype).rsplit('.')[-1]}]" + + image_tensor = torch.Tensor(image) + try: + torch.manual_seed(0) + output_legacy_tensor = legacy_transform(image_tensor) + except Exception as exc: + raise pytest.UsageError( + f"Transforming a tensor image {image_repr} failed in the legacy transform with the " + f"error above. This means that you need to specify the parameters passed to `make_images` through the " + "`make_images_kwargs` of the `ConsistencyConfig`." + ) from exc + + try: + torch.manual_seed(0) + output_prototype_tensor = prototype_transform(image_tensor) + except Exception as exc: + raise AssertionError( + f"Transforming a tensor image with shape {image_repr} failed in the prototype transform with " + f"the error above. This means there is a consistency bug either in `_get_params` or in the " + f"`is_pure_tensor` path in `_transform`." + ) from exc + + assert_close( + output_prototype_tensor, + output_legacy_tensor, + msg=lambda msg: f"Tensor image consistency check failed with: \n\n{msg}", + **closeness_kwargs, + ) + + try: + torch.manual_seed(0) + output_prototype_image = prototype_transform(image) + except Exception as exc: + raise AssertionError( + f"Transforming a image tv_tensor with shape {image_repr} failed in the prototype transform with " + f"the error above. This means there is a consistency bug either in `_get_params` or in the " + f"`tv_tensors.Image` path in `_transform`." + ) from exc + + assert_close( + output_prototype_image, + output_prototype_tensor, + msg=lambda msg: f"Output for tv_tensor and tensor images is not equal: \n\n{msg}", + **closeness_kwargs, + ) + + if image.ndim == 3 and supports_pil: + image_pil = to_pil_image(image) + + try: + torch.manual_seed(0) + output_legacy_pil = legacy_transform(image_pil) + except Exception as exc: + raise pytest.UsageError( + f"Transforming a PIL image with shape {image_repr} failed in the legacy transform with the " + f"error above. If this transform does not support PIL images, set `supports_pil=False` on the " + "`ConsistencyConfig`. " + ) from exc + + try: + torch.manual_seed(0) + output_prototype_pil = prototype_transform(image_pil) + except Exception as exc: + raise AssertionError( + f"Transforming a PIL image with shape {image_repr} failed in the prototype transform with " + f"the error above. 
This means there is a consistency bug either in `_get_params` or in the " + f"`PIL.Image.Image` path in `_transform`." + ) from exc + + assert_close( + output_prototype_pil, + output_legacy_pil, + msg=lambda msg: f"PIL image consistency check failed with: \n\n{msg}", + **closeness_kwargs, + ) + + +@pytest.mark.parametrize( + ("config", "args_kwargs"), + [ + pytest.param( + config, args_kwargs, id=f"{config.legacy_cls.__name__}-{idx:0{len(str(len(config.args_kwargs)))}d}" + ) + for config in CONSISTENCY_CONFIGS + for idx, args_kwargs in enumerate(config.args_kwargs) + ], +) +@pytest.mark.filterwarnings("ignore") +def test_call_consistency(config, args_kwargs): + args, kwargs = args_kwargs + + try: + legacy_transform = config.legacy_cls(*args, **kwargs) + except Exception as exc: + raise pytest.UsageError( + f"Initializing the legacy transform failed with the error above. " + f"Please correct the `ArgsKwargs({args_kwargs})` in the `ConsistencyConfig`." + ) from exc + + try: + prototype_transform = config.prototype_cls(*args, **kwargs) + except Exception as exc: + raise AssertionError( + "Initializing the prototype transform failed with the error above. " + "This means there is a consistency bug in the constructor." + ) from exc + + check_call_consistency( + prototype_transform, + legacy_transform, + images=make_images(**config.make_images_kwargs), + supports_pil=config.supports_pil, + closeness_kwargs=config.closeness_kwargs, + ) + + +get_params_parametrization = pytest.mark.parametrize( + ("config", "get_params_args_kwargs"), + [ + pytest.param( + next(config for config in CONSISTENCY_CONFIGS if config.prototype_cls is transform_cls), + get_params_args_kwargs, + id=transform_cls.__name__, + ) + for transform_cls, get_params_args_kwargs in [ + (v2_transforms.RandomResizedCrop, ArgsKwargs(make_image(), scale=[0.3, 0.7], ratio=[0.5, 1.5])), + (v2_transforms.ColorJitter, ArgsKwargs(brightness=None, contrast=None, saturation=None, hue=None)), + (v2_transforms.GaussianBlur, ArgsKwargs(0.3, 1.4)), + (v2_transforms.RandomPerspective, ArgsKwargs(23, 17, 0.5)), + (v2_transforms.AutoAugment, ArgsKwargs(5)), + ] + ], +) + + +@get_params_parametrization +def test_get_params_alias(config, get_params_args_kwargs): + assert config.prototype_cls.get_params is config.legacy_cls.get_params + + if not config.args_kwargs: + return + args, kwargs = config.args_kwargs[0] + legacy_transform = config.legacy_cls(*args, **kwargs) + prototype_transform = config.prototype_cls(*args, **kwargs) + + assert prototype_transform.get_params is legacy_transform.get_params + + +@get_params_parametrization +def test_get_params_jit(config, get_params_args_kwargs): + get_params_args, get_params_kwargs = get_params_args_kwargs + + torch.jit.script(config.prototype_cls.get_params)(*get_params_args, **get_params_kwargs) + + if not config.args_kwargs: + return + args, kwargs = config.args_kwargs[0] + transform = config.prototype_cls(*args, **kwargs) + + torch.jit.script(transform.get_params)(*get_params_args, **get_params_kwargs) + + +@pytest.mark.parametrize( + ("config", "args_kwargs"), + [ + pytest.param( + config, args_kwargs, id=f"{config.legacy_cls.__name__}-{idx:0{len(str(len(config.args_kwargs)))}d}" + ) + for config in CONSISTENCY_CONFIGS + for idx, args_kwargs in enumerate(config.args_kwargs) + if not isinstance(args_kwargs, NotScriptableArgsKwargs) + ], +) +def test_jit_consistency(config, args_kwargs): + args, kwargs = args_kwargs + + prototype_transform_eager = config.prototype_cls(*args, **kwargs) + 
legacy_transform_eager = config.legacy_cls(*args, **kwargs) + + legacy_transform_scripted = torch.jit.script(legacy_transform_eager) + prototype_transform_scripted = torch.jit.script(prototype_transform_eager) + + for image in make_images(**config.make_images_kwargs): + image = image.as_subclass(torch.Tensor) + + torch.manual_seed(0) + output_legacy_scripted = legacy_transform_scripted(image) + + torch.manual_seed(0) + output_prototype_scripted = prototype_transform_scripted(image) + + assert_close(output_prototype_scripted, output_legacy_scripted, **config.closeness_kwargs) + + +class TestContainerTransforms: + """ + Since we are testing containers here, we also need some transforms to wrap. Thus, testing a container transform for + consistency automatically tests the wrapped transforms consistency. + + Instead of complicated mocking or creating custom transforms just for these tests, here we use deterministic ones + that were already tested for consistency above. + """ + + def test_compose(self): + prototype_transform = v2_transforms.Compose( + [ + v2_transforms.Resize(256), + v2_transforms.CenterCrop(224), + ] + ) + legacy_transform = legacy_transforms.Compose( + [ + legacy_transforms.Resize(256), + legacy_transforms.CenterCrop(224), + ] + ) + + # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes + check_call_consistency(prototype_transform, legacy_transform, closeness_kwargs=dict(rtol=0, atol=1)) + + @pytest.mark.parametrize("p", [0, 0.1, 0.5, 0.9, 1]) + @pytest.mark.parametrize("sequence_type", [list, nn.ModuleList]) + def test_random_apply(self, p, sequence_type): + prototype_transform = v2_transforms.RandomApply( + sequence_type( + [ + v2_transforms.Resize(256), + v2_transforms.CenterCrop(224), + ] + ), + p=p, + ) + legacy_transform = legacy_transforms.RandomApply( + sequence_type( + [ + legacy_transforms.Resize(256), + legacy_transforms.CenterCrop(224), + ] + ), + p=p, + ) + + # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes + check_call_consistency(prototype_transform, legacy_transform, closeness_kwargs=dict(rtol=0, atol=1)) + + if sequence_type is nn.ModuleList: + # quick and dirty test that it is jit-scriptable + scripted = torch.jit.script(prototype_transform) + scripted(torch.rand(1, 3, 300, 300)) + + # We can't test other values for `p` since the random parameter generation is different + @pytest.mark.parametrize("probabilities", [(0, 1), (1, 0)]) + def test_random_choice(self, probabilities): + prototype_transform = v2_transforms.RandomChoice( + [ + v2_transforms.Resize(256), + legacy_transforms.CenterCrop(224), + ], + p=probabilities, + ) + legacy_transform = legacy_transforms.RandomChoice( + [ + legacy_transforms.Resize(256), + legacy_transforms.CenterCrop(224), + ], + p=probabilities, + ) + + # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes + check_call_consistency(prototype_transform, legacy_transform, closeness_kwargs=dict(rtol=0, atol=1)) + + +class TestToTensorTransforms: + def test_pil_to_tensor(self): + prototype_transform = v2_transforms.PILToTensor() + legacy_transform = legacy_transforms.PILToTensor() + + for image in make_images(extra_dims=[()]): + image_pil = to_pil_image(image) + + assert_equal(prototype_transform(image_pil), legacy_transform(image_pil)) + + def test_to_tensor(self): + with pytest.warns(UserWarning, match=re.escape("The transform `ToTensor()` is deprecated")): + prototype_transform = v2_transforms.ToTensor() 
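+        # Only the v2 ToTensor emits the deprecation warning, so just its construction is wrapped
+        # in pytest.warns; the legacy transform is constructed outside that context below.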
+ legacy_transform = legacy_transforms.ToTensor() + + for image in make_images(extra_dims=[()]): + image_pil = to_pil_image(image) + image_numpy = np.array(image_pil) + + assert_equal(prototype_transform(image_pil), legacy_transform(image_pil)) + assert_equal(prototype_transform(image_numpy), legacy_transform(image_numpy)) + + +class TestAATransforms: + @pytest.mark.parametrize( + "inpt", + [ + torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), + PIL.Image.new("RGB", (256, 256), 123), + tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), + ], + ) + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + PIL.Image.NEAREST, + ], + ) + def test_randaug(self, inpt, interpolation, mocker): + t_ref = legacy_transforms.RandAugment(interpolation=interpolation, num_ops=1) + t = v2_transforms.RandAugment(interpolation=interpolation, num_ops=1) + + le = len(t._AUGMENTATION_SPACE) + keys = list(t._AUGMENTATION_SPACE.keys()) + randint_values = [] + for i in range(le): + # Stable API, op_index random call + randint_values.append(i) + # Stable API, if signed there is another random call + if t._AUGMENTATION_SPACE[keys[i]][1]: + randint_values.append(0) + # New API, _get_random_item + randint_values.append(i) + randint_values = iter(randint_values) + + mocker.patch("torch.randint", side_effect=lambda *arg, **kwargs: torch.tensor(next(randint_values))) + mocker.patch("torch.rand", return_value=1.0) + + for i in range(le): + expected_output = t_ref(inpt) + output = t(inpt) + + assert_close(expected_output, output, atol=1, rtol=0.1) + + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + ], + ) + @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) + def test_randaug_jit(self, interpolation, fill): + inpt = torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8) + t_ref = legacy_transforms.RandAugment(interpolation=interpolation, num_ops=1, fill=fill) + t = v2_transforms.RandAugment(interpolation=interpolation, num_ops=1, fill=fill) + + tt_ref = torch.jit.script(t_ref) + tt = torch.jit.script(t) + + torch.manual_seed(12) + expected_output = tt_ref(inpt) + + torch.manual_seed(12) + scripted_output = tt(inpt) + + assert_equal(scripted_output, expected_output) + + @pytest.mark.parametrize( + "inpt", + [ + torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), + PIL.Image.new("RGB", (256, 256), 123), + tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), + ], + ) + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + PIL.Image.NEAREST, + ], + ) + def test_trivial_aug(self, inpt, interpolation, mocker): + t_ref = legacy_transforms.TrivialAugmentWide(interpolation=interpolation) + t = v2_transforms.TrivialAugmentWide(interpolation=interpolation) + + le = len(t._AUGMENTATION_SPACE) + keys = list(t._AUGMENTATION_SPACE.keys()) + randint_values = [] + for i in range(le): + # Stable API, op_index random call + randint_values.append(i) + key = keys[i] + # Stable API, random magnitude + aug_op = t._AUGMENTATION_SPACE[key] + magnitudes = aug_op[0](2, 0, 0) + if magnitudes is not None: + randint_values.append(5) + # Stable API, if signed there is another random call + if aug_op[1]: + randint_values.append(0) + # New API, _get_random_item + 
randint_values.append(i) + # New API, random magnitude + if magnitudes is not None: + randint_values.append(5) + + randint_values = iter(randint_values) + + mocker.patch("torch.randint", side_effect=lambda *arg, **kwargs: torch.tensor(next(randint_values))) + mocker.patch("torch.rand", return_value=1.0) + + for _ in range(le): + expected_output = t_ref(inpt) + output = t(inpt) + + assert_close(expected_output, output, atol=1, rtol=0.1) + + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + ], + ) + @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) + def test_trivial_aug_jit(self, interpolation, fill): + inpt = torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8) + t_ref = legacy_transforms.TrivialAugmentWide(interpolation=interpolation, fill=fill) + t = v2_transforms.TrivialAugmentWide(interpolation=interpolation, fill=fill) + + tt_ref = torch.jit.script(t_ref) + tt = torch.jit.script(t) + + torch.manual_seed(12) + expected_output = tt_ref(inpt) + + torch.manual_seed(12) + scripted_output = tt(inpt) + + assert_equal(scripted_output, expected_output) + + @pytest.mark.parametrize( + "inpt", + [ + torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), + PIL.Image.new("RGB", (256, 256), 123), + tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), + ], + ) + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + PIL.Image.NEAREST, + ], + ) + def test_augmix(self, inpt, interpolation, mocker): + t_ref = legacy_transforms.AugMix(interpolation=interpolation, mixture_width=1, chain_depth=1) + t_ref._sample_dirichlet = lambda t: t.softmax(dim=-1) + t = v2_transforms.AugMix(interpolation=interpolation, mixture_width=1, chain_depth=1) + t._sample_dirichlet = lambda t: t.softmax(dim=-1) + + le = len(t._AUGMENTATION_SPACE) + keys = list(t._AUGMENTATION_SPACE.keys()) + randint_values = [] + for i in range(le): + # Stable API, op_index random call + randint_values.append(i) + key = keys[i] + # Stable API, random magnitude + aug_op = t._AUGMENTATION_SPACE[key] + magnitudes = aug_op[0](2, 0, 0) + if magnitudes is not None: + randint_values.append(5) + # Stable API, if signed there is another random call + if aug_op[1]: + randint_values.append(0) + # New API, _get_random_item + randint_values.append(i) + # New API, random magnitude + if magnitudes is not None: + randint_values.append(5) + + randint_values = iter(randint_values) + + mocker.patch("torch.randint", side_effect=lambda *arg, **kwargs: torch.tensor(next(randint_values))) + mocker.patch("torch.rand", return_value=1.0) + + expected_output = t_ref(inpt) + output = t(inpt) + + assert_equal(expected_output, output) + + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + ], + ) + @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) + def test_augmix_jit(self, interpolation, fill): + inpt = torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8) + + t_ref = legacy_transforms.AugMix(interpolation=interpolation, mixture_width=1, chain_depth=1, fill=fill) + t = v2_transforms.AugMix(interpolation=interpolation, mixture_width=1, chain_depth=1, fill=fill) + + tt_ref = torch.jit.script(t_ref) + tt = torch.jit.script(t) + + torch.manual_seed(12) + expected_output = tt_ref(inpt) 
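+        # Re-seeding before the second call makes both scripted transforms draw the same random
+        # parameters, so their outputs can be compared exactly afterwards.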
+ + torch.manual_seed(12) + scripted_output = tt(inpt) + + assert_equal(scripted_output, expected_output) + + @pytest.mark.parametrize( + "inpt", + [ + torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), + PIL.Image.new("RGB", (256, 256), 123), + tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), + ], + ) + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + PIL.Image.NEAREST, + ], + ) + def test_aa(self, inpt, interpolation): + aa_policy = legacy_transforms.AutoAugmentPolicy("imagenet") + t_ref = legacy_transforms.AutoAugment(aa_policy, interpolation=interpolation) + t = v2_transforms.AutoAugment(aa_policy, interpolation=interpolation) + + torch.manual_seed(12) + expected_output = t_ref(inpt) + + torch.manual_seed(12) + output = t(inpt) + + assert_equal(expected_output, output) + + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + ], + ) + def test_aa_jit(self, interpolation): + inpt = torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8) + aa_policy = legacy_transforms.AutoAugmentPolicy("imagenet") + t_ref = legacy_transforms.AutoAugment(aa_policy, interpolation=interpolation) + t = v2_transforms.AutoAugment(aa_policy, interpolation=interpolation) + + tt_ref = torch.jit.script(t_ref) + tt = torch.jit.script(t) + + torch.manual_seed(12) + expected_output = tt_ref(inpt) + + torch.manual_seed(12) + scripted_output = tt(inpt) + + assert_equal(scripted_output, expected_output) + + +def import_transforms_from_references(reference): + HERE = Path(__file__).parent + PROJECT_ROOT = HERE.parent + + loader = importlib.machinery.SourceFileLoader( + "transforms", str(PROJECT_ROOT / "references" / reference / "transforms.py") + ) + spec = importlib.util.spec_from_loader("transforms", loader) + module = importlib.util.module_from_spec(spec) + loader.exec_module(module) + return module + + +det_transforms = import_transforms_from_references("detection") + + +class TestRefDetTransforms: + def make_tv_tensors(self, with_mask=True): + size = (600, 800) + num_objects = 22 + + def make_label(extra_dims, categories): + return torch.randint(categories, extra_dims, dtype=torch.int64) + + pil_image = to_pil_image(make_image(size=size, color_space="RGB")) + target = { + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "labels": make_label(extra_dims=(num_objects,), categories=80), + } + if with_mask: + target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) + + yield (pil_image, target) + + tensor_image = torch.Tensor(make_image(size=size, color_space="RGB", dtype=torch.float32)) + target = { + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "labels": make_label(extra_dims=(num_objects,), categories=80), + } + if with_mask: + target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) + + yield (tensor_image, target) + + tv_tensor_image = make_image(size=size, color_space="RGB", dtype=torch.float32) + target = { + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "labels": make_label(extra_dims=(num_objects,), categories=80), + } + if with_mask: + target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) + 
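+        # Together with the two yields above, the final yield below covers PIL, pure tensor, and
+        # tv_tensors.Image inputs, so each reference/v2 transform pair in `test_transform` is
+        # exercised with every supported image type.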
+ yield (tv_tensor_image, target) + + @pytest.mark.parametrize( + "t_ref, t, data_kwargs", + [ + (det_transforms.RandomHorizontalFlip(p=1.0), v2_transforms.RandomHorizontalFlip(p=1.0), {}), + ( + det_transforms.RandomIoUCrop(), + v2_transforms.Compose( + [ + v2_transforms.RandomIoUCrop(), + v2_transforms.SanitizeBoundingBoxes(labels_getter=lambda sample: sample[1]["labels"]), + ] + ), + {"with_mask": False}, + ), + (det_transforms.RandomZoomOut(), v2_transforms.RandomZoomOut(), {"with_mask": False}), + (det_transforms.ScaleJitter((1024, 1024)), v2_transforms.ScaleJitter((1024, 1024), antialias=True), {}), + ( + det_transforms.RandomShortestSize( + min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 + ), + v2_transforms.RandomShortestSize( + min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 + ), + {}, + ), + ], + ) + def test_transform(self, t_ref, t, data_kwargs): + for dp in self.make_tv_tensors(**data_kwargs): + + # We should use prototype transform first as reference transform performs inplace target update + torch.manual_seed(12) + output = t(dp) + + torch.manual_seed(12) + expected_output = t_ref(*dp) + + assert_equal(expected_output, output) + + +seg_transforms = import_transforms_from_references("segmentation") + + +# We need this transform for two reasons: +# 1. transforms.RandomCrop uses a different scheme to pad images and masks of insufficient size than its name +# counterpart in the detection references. Thus, we cannot use it with `pad_if_needed=True` +# 2. transforms.Pad only supports a fixed padding, but the segmentation datasets don't have a fixed image size. +class PadIfSmaller(v2_transforms.Transform): + def __init__(self, size, fill=0): + super().__init__() + self.size = size + self.fill = v2_transforms._geometry._setup_fill_arg(fill) + + def _get_params(self, sample): + height, width = query_size(sample) + padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] + needs_padding = any(padding) + return dict(padding=padding, needs_padding=needs_padding) + + def _transform(self, inpt, params): + if not params["needs_padding"]: + return inpt + + fill = _get_fill(self.fill, type(inpt)) + return prototype_F.pad(inpt, padding=params["padding"], fill=fill) + + +class TestRefSegTransforms: + def make_tv_tensors(self, supports_pil=True, image_dtype=torch.uint8): + size = (256, 460) + num_categories = 21 + + conv_fns = [] + if supports_pil: + conv_fns.append(to_pil_image) + conv_fns.extend([torch.Tensor, lambda x: x]) + + for conv_fn in conv_fns: + tv_tensor_image = make_image(size=size, color_space="RGB", dtype=image_dtype) + tv_tensor_mask = make_segmentation_mask(size=size, num_categories=num_categories, dtype=torch.uint8) + + dp = (conv_fn(tv_tensor_image), tv_tensor_mask) + dp_ref = ( + to_pil_image(tv_tensor_image) if supports_pil else tv_tensor_image.as_subclass(torch.Tensor), + to_pil_image(tv_tensor_mask), + ) + + yield dp, dp_ref + + def set_seed(self, seed=12): + torch.manual_seed(seed) + random.seed(seed) + + def check(self, t, t_ref, data_kwargs=None): + for dp, dp_ref in self.make_tv_tensors(**data_kwargs or dict()): + + self.set_seed() + actual = actual_image, actual_mask = t(dp) + + self.set_seed() + expected_image, expected_mask = t_ref(*dp_ref) + if isinstance(actual_image, torch.Tensor) and not isinstance(expected_image, torch.Tensor): + expected_image = legacy_F.pil_to_tensor(expected_image) + expected_mask = legacy_F.pil_to_tensor(expected_mask).squeeze(0) + expected = (expected_image, 
expected_mask) + + assert_equal(actual, expected) + + @pytest.mark.parametrize( + ("t_ref", "t", "data_kwargs"), + [ + ( + seg_transforms.RandomHorizontalFlip(flip_prob=1.0), + v2_transforms.RandomHorizontalFlip(p=1.0), + dict(), + ), + ( + seg_transforms.RandomHorizontalFlip(flip_prob=0.0), + v2_transforms.RandomHorizontalFlip(p=0.0), + dict(), + ), + ( + seg_transforms.RandomCrop(size=480), + v2_transforms.Compose( + [ + PadIfSmaller(size=480, fill={tv_tensors.Mask: 255, "others": 0}), + v2_transforms.RandomCrop(size=480), + ] + ), + dict(), + ), + ( + seg_transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), + v2_transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), + dict(supports_pil=False, image_dtype=torch.float), + ), + ], + ) + def test_common(self, t_ref, t, data_kwargs): + self.check(t, t_ref, data_kwargs) + + +@pytest.mark.parametrize( + ("legacy_dispatcher", "name_only_params"), + [ + (legacy_F.get_dimensions, {}), + (legacy_F.get_image_size, {}), + (legacy_F.get_image_num_channels, {}), + (legacy_F.to_tensor, {}), + (legacy_F.pil_to_tensor, {}), + (legacy_F.convert_image_dtype, {}), + (legacy_F.to_pil_image, {}), + (legacy_F.normalize, {}), + (legacy_F.resize, {"interpolation"}), + (legacy_F.pad, {"padding", "fill"}), + (legacy_F.crop, {}), + (legacy_F.center_crop, {}), + (legacy_F.resized_crop, {"interpolation"}), + (legacy_F.hflip, {}), + (legacy_F.perspective, {"startpoints", "endpoints", "fill", "interpolation"}), + (legacy_F.vflip, {}), + (legacy_F.five_crop, {}), + (legacy_F.ten_crop, {}), + (legacy_F.adjust_brightness, {}), + (legacy_F.adjust_contrast, {}), + (legacy_F.adjust_saturation, {}), + (legacy_F.adjust_hue, {}), + (legacy_F.adjust_gamma, {}), + (legacy_F.rotate, {"center", "fill", "interpolation"}), + (legacy_F.affine, {"angle", "translate", "center", "fill", "interpolation"}), + (legacy_F.to_grayscale, {}), + (legacy_F.rgb_to_grayscale, {}), + (legacy_F.to_tensor, {}), + (legacy_F.erase, {}), + (legacy_F.gaussian_blur, {}), + (legacy_F.invert, {}), + (legacy_F.posterize, {}), + (legacy_F.solarize, {}), + (legacy_F.adjust_sharpness, {}), + (legacy_F.autocontrast, {}), + (legacy_F.equalize, {}), + (legacy_F.elastic_transform, {"fill", "interpolation"}), + ], +) +def test_dispatcher_signature_consistency(legacy_dispatcher, name_only_params): + legacy_signature = inspect.signature(legacy_dispatcher) + legacy_params = list(legacy_signature.parameters.values())[1:] + + try: + prototype_dispatcher = getattr(prototype_F, legacy_dispatcher.__name__) + except AttributeError: + raise AssertionError( + f"Legacy dispatcher `F.{legacy_dispatcher.__name__}` has no prototype equivalent" + ) from None + + prototype_signature = inspect.signature(prototype_dispatcher) + prototype_params = list(prototype_signature.parameters.values())[1:] + + # Some dispatchers got extra parameters. This makes sure they have a default argument and thus are BC. We don't + # need to check if parameters were added in the middle rather than at the end, since that will be caught by the + # regular check below. + prototype_params, new_prototype_params = ( + prototype_params[: len(legacy_params)], + prototype_params[len(legacy_params) :], + ) + for param in new_prototype_params: + assert param.default is not param.empty + + # Some annotations were changed mostly to supersets of what was there before. Plus, some legacy dispatchers had no + # annotations. 
In these cases we simply drop the annotation and default argument from the comparison + for prototype_param, legacy_param in zip(prototype_params, legacy_params): + if legacy_param.name in name_only_params: + prototype_param._annotation = prototype_param._default = inspect.Parameter.empty + legacy_param._annotation = legacy_param._default = inspect.Parameter.empty + elif legacy_param.annotation is inspect.Parameter.empty: + prototype_param._annotation = inspect.Parameter.empty + + assert prototype_params == legacy_params diff --git a/test/test_transforms_v2_functional.py b/test/test_transforms_v2_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..23f06475cf144bb73858155bdf9aef7021e4a5f4 --- /dev/null +++ b/test/test_transforms_v2_functional.py @@ -0,0 +1,958 @@ +import inspect +import math +import os +import re + +import numpy as np +import PIL.Image +import pytest +import torch + +from common_utils import assert_close, cache, cpu_and_cuda, needs_cuda, set_rng_seed +from torch.utils._pytree import tree_map +from torchvision import tv_tensors +from torchvision.transforms.functional import _get_perspective_coeffs +from torchvision.transforms.v2 import functional as F +from torchvision.transforms.v2._utils import is_pure_tensor +from torchvision.transforms.v2.functional._geometry import _center_crop_compute_padding +from torchvision.transforms.v2.functional._meta import clamp_bounding_boxes, convert_bounding_box_format +from transforms_v2_dispatcher_infos import DISPATCHER_INFOS +from transforms_v2_kernel_infos import KERNEL_INFOS +from transforms_v2_legacy_utils import ( + DEFAULT_SQUARE_SPATIAL_SIZE, + make_multiple_bounding_boxes, + parametrized_error_message, +) + + +KERNEL_INFOS_MAP = {info.kernel: info for info in KERNEL_INFOS} +DISPATCHER_INFOS_MAP = {info.dispatcher: info for info in DISPATCHER_INFOS} + + +@cache +def script(fn): + try: + return torch.jit.script(fn) + except Exception as error: + raise AssertionError(f"Trying to `torch.jit.script` '{fn.__name__}' raised the error above.") from error + + +# Scripting a function often triggers a warning like +# `UserWarning: operator() profile_node %$INT1 : int[] = prim::profile_ivalue($INT2) does not have profile information` +# with varying `INT1` and `INT2`. Since these are uninteresting for us and only clutter the test summary, we ignore +# them. 
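+# The mark below uses pytest's "action:message:category" filter syntax. It is roughly equivalent
+# to running each decorated test under a filter like (sketch, assuming `warnings` is imported):
+#
+#   warnings.filterwarnings(
+#       "ignore", message=re.escape("operator() profile_node %"), category=UserWarning
+#   )
+#
+# The message part is interpreted as a regular expression, hence the `re.escape` around the
+# parentheses in the warning text.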
+ignore_jit_warning_no_profile = pytest.mark.filterwarnings( + f"ignore:{re.escape('operator() profile_node %')}:UserWarning" +) + + +def make_info_args_kwargs_params(info, *, args_kwargs_fn, test_id=None): + args_kwargs = list(args_kwargs_fn(info)) + if not args_kwargs: + raise pytest.UsageError( + f"Couldn't collect a single `ArgsKwargs` for `{info.id}`{f' in {test_id}' if test_id else ''}" + ) + idx_field_len = len(str(len(args_kwargs))) + return [ + pytest.param( + info, + args_kwargs_, + marks=info.get_marks(test_id, args_kwargs_) if test_id else [], + id=f"{info.id}-{idx:0{idx_field_len}}", + ) + for idx, args_kwargs_ in enumerate(args_kwargs) + ] + + +def make_info_args_kwargs_parametrization(infos, *, args_kwargs_fn): + def decorator(test_fn): + parts = test_fn.__qualname__.split(".") + if len(parts) == 1: + test_class_name = None + test_function_name = parts[0] + elif len(parts) == 2: + test_class_name, test_function_name = parts + else: + raise pytest.UsageError("Unable to parse the test class name and test function name from test function") + test_id = (test_class_name, test_function_name) + + argnames = ("info", "args_kwargs") + argvalues = [] + for info in infos: + argvalues.extend(make_info_args_kwargs_params(info, args_kwargs_fn=args_kwargs_fn, test_id=test_id)) + + return pytest.mark.parametrize(argnames, argvalues)(test_fn) + + return decorator + + +@pytest.fixture(autouse=True) +def fix_rng_seed(): + set_rng_seed(0) + yield + + +@pytest.fixture() +def test_id(request): + test_class_name = request.cls.__name__ if request.cls is not None else None + test_function_name = request.node.originalname + return test_class_name, test_function_name + + +class TestKernels: + sample_inputs = make_info_args_kwargs_parametrization( + KERNEL_INFOS, + args_kwargs_fn=lambda kernel_info: kernel_info.sample_inputs_fn(), + ) + reference_inputs = make_info_args_kwargs_parametrization( + [info for info in KERNEL_INFOS if info.reference_fn is not None], + args_kwargs_fn=lambda info: info.reference_inputs_fn(), + ) + + @make_info_args_kwargs_parametrization( + [info for info in KERNEL_INFOS if info.logs_usage], + args_kwargs_fn=lambda info: info.sample_inputs_fn(), + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_logging(self, spy_on, info, args_kwargs, device): + spy = spy_on(torch._C._log_api_usage_once) + + (input, *other_args), kwargs = args_kwargs.load(device) + info.kernel(input.as_subclass(torch.Tensor), *other_args, **kwargs) + + spy.assert_any_call(f"{info.kernel.__module__}.{info.id}") + + @ignore_jit_warning_no_profile + @sample_inputs + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_scripted_vs_eager(self, test_id, info, args_kwargs, device): + kernel_eager = info.kernel + kernel_scripted = script(kernel_eager) + + (input, *other_args), kwargs = args_kwargs.load(device) + input = input.as_subclass(torch.Tensor) + + actual = kernel_scripted(input, *other_args, **kwargs) + expected = kernel_eager(input, *other_args, **kwargs) + + assert_close( + actual, + expected, + **info.get_closeness_kwargs(test_id, dtype=input.dtype, device=input.device), + msg=parametrized_error_message(input, other_args, **kwargs), + ) + + def _unbatch(self, batch, *, data_dims): + if isinstance(batch, torch.Tensor): + batched_tensor = batch + metadata = () + else: + batched_tensor, *metadata = batch + + if batched_tensor.ndim == data_dims: + return batch + + return [ + self._unbatch(unbatched, data_dims=data_dims) + for unbatched in ( + batched_tensor.unbind(0) if not metadata 
else [(t, *metadata) for t in batched_tensor.unbind(0)] + ) + ] + + @sample_inputs + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_batched_vs_single(self, test_id, info, args_kwargs, device): + (batched_input, *other_args), kwargs = args_kwargs.load(device) + + tv_tensor_type = tv_tensors.Image if is_pure_tensor(batched_input) else type(batched_input) + # This dictionary contains the number of rightmost dimensions that contain the actual data. + # Everything to the left is considered a batch dimension. + data_dims = { + tv_tensors.Image: 3, + tv_tensors.BoundingBoxes: 1, + # `Mask`'s are special in the sense that the data dimensions depend on the type of mask. For detection masks + # it is 3 `(*, N, H, W)`, but for segmentation masks it is 2 `(*, H, W)`. Since both a grouped under one + # type all kernels should also work without differentiating between the two. Thus, we go with 2 here as + # common ground. + tv_tensors.Mask: 2, + tv_tensors.Video: 4, + }.get(tv_tensor_type) + if data_dims is None: + raise pytest.UsageError( + f"The number of data dimensions cannot be determined for input of type {tv_tensor_type.__name__}." + ) from None + elif batched_input.ndim <= data_dims: + pytest.skip("Input is not batched.") + elif not all(batched_input.shape[:-data_dims]): + pytest.skip("Input has a degenerate batch shape.") + + batched_input = batched_input.as_subclass(torch.Tensor) + batched_output = info.kernel(batched_input, *other_args, **kwargs) + actual = self._unbatch(batched_output, data_dims=data_dims) + + single_inputs = self._unbatch(batched_input, data_dims=data_dims) + expected = tree_map(lambda single_input: info.kernel(single_input, *other_args, **kwargs), single_inputs) + + assert_close( + actual, + expected, + **info.get_closeness_kwargs(test_id, dtype=batched_input.dtype, device=batched_input.device), + msg=parametrized_error_message(batched_input, *other_args, **kwargs), + ) + + @sample_inputs + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_no_inplace(self, info, args_kwargs, device): + (input, *other_args), kwargs = args_kwargs.load(device) + input = input.as_subclass(torch.Tensor) + + if input.numel() == 0: + pytest.skip("The input has a degenerate shape.") + + input_version = input._version + info.kernel(input, *other_args, **kwargs) + + assert input._version == input_version + + @sample_inputs + @needs_cuda + def test_cuda_vs_cpu(self, test_id, info, args_kwargs): + (input_cpu, *other_args), kwargs = args_kwargs.load("cpu") + input_cpu = input_cpu.as_subclass(torch.Tensor) + input_cuda = input_cpu.to("cuda") + + output_cpu = info.kernel(input_cpu, *other_args, **kwargs) + output_cuda = info.kernel(input_cuda, *other_args, **kwargs) + + assert_close( + output_cuda, + output_cpu, + check_device=False, + **info.get_closeness_kwargs(test_id, dtype=input_cuda.dtype, device=input_cuda.device), + msg=parametrized_error_message(input_cpu, *other_args, **kwargs), + ) + + @sample_inputs + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_dtype_and_device_consistency(self, info, args_kwargs, device): + (input, *other_args), kwargs = args_kwargs.load(device) + input = input.as_subclass(torch.Tensor) + + output = info.kernel(input, *other_args, **kwargs) + # Most kernels just return a tensor, but some also return some additional metadata + if not isinstance(output, torch.Tensor): + output, *_ = output + + assert output.dtype == input.dtype + assert output.device == input.device + + @reference_inputs + def test_against_reference(self, test_id, 
info, args_kwargs): + (input, *other_args), kwargs = args_kwargs.load("cpu") + + actual = info.kernel(input.as_subclass(torch.Tensor), *other_args, **kwargs) + # We intnetionally don't unwrap the input of the reference function in order for it to have access to all + # metadata regardless of whether the kernel takes it explicitly or not + expected = info.reference_fn(input, *other_args, **kwargs) + + assert_close( + actual, + expected, + **info.get_closeness_kwargs(test_id, dtype=input.dtype, device=input.device), + msg=parametrized_error_message(input, *other_args, **kwargs), + ) + + @make_info_args_kwargs_parametrization( + [info for info in KERNEL_INFOS if info.float32_vs_uint8], + args_kwargs_fn=lambda info: info.reference_inputs_fn(), + ) + def test_float32_vs_uint8(self, test_id, info, args_kwargs): + (input, *other_args), kwargs = args_kwargs.load("cpu") + input = input.as_subclass(torch.Tensor) + + if input.dtype != torch.uint8: + pytest.skip(f"Input dtype is {input.dtype}.") + + adapted_other_args, adapted_kwargs = info.float32_vs_uint8(other_args, kwargs) + + actual = info.kernel( + F.to_dtype_image(input, dtype=torch.float32, scale=True), + *adapted_other_args, + **adapted_kwargs, + ) + + expected = F.to_dtype_image(info.kernel(input, *other_args, **kwargs), dtype=torch.float32, scale=True) + + assert_close( + actual, + expected, + **info.get_closeness_kwargs(test_id, dtype=torch.float32, device=input.device), + msg=parametrized_error_message(input, *other_args, **kwargs), + ) + + +@pytest.fixture +def spy_on(mocker): + def make_spy(fn, *, module=None, name=None): + # TODO: we can probably get rid of the non-default modules and names if we eliminate aliasing + module = module or fn.__module__ + name = name or fn.__name__ + spy = mocker.patch(f"{module}.{name}", wraps=fn) + return spy + + return make_spy + + +class TestDispatchers: + image_sample_inputs = make_info_args_kwargs_parametrization( + [info for info in DISPATCHER_INFOS if tv_tensors.Image in info.kernels], + args_kwargs_fn=lambda info: info.sample_inputs(tv_tensors.Image), + ) + + @make_info_args_kwargs_parametrization( + DISPATCHER_INFOS, + args_kwargs_fn=lambda info: info.sample_inputs(), + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_logging(self, spy_on, info, args_kwargs, device): + spy = spy_on(torch._C._log_api_usage_once) + + args, kwargs = args_kwargs.load(device) + info.dispatcher(*args, **kwargs) + + spy.assert_any_call(f"{info.dispatcher.__module__}.{info.id}") + + @ignore_jit_warning_no_profile + @image_sample_inputs + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_scripted_smoke(self, info, args_kwargs, device): + dispatcher = script(info.dispatcher) + + (image_tv_tensor, *other_args), kwargs = args_kwargs.load(device) + image_pure_tensor = torch.Tensor(image_tv_tensor) + + dispatcher(image_pure_tensor, *other_args, **kwargs) + + # TODO: We need this until the dispatchers below also have `DispatcherInfo`'s. If they do, `test_scripted_smoke` + # replaces this test for them. 
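+    # Until then, the test below only checks that the listed dispatchers can be scripted at all;
+    # unlike `test_scripted_smoke` above, it does not invoke the scripted functions on any inputs.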
+ @ignore_jit_warning_no_profile + @pytest.mark.parametrize( + "dispatcher", + [ + F.get_dimensions, + F.get_image_num_channels, + F.get_image_size, + F.get_num_channels, + F.get_num_frames, + F.get_size, + F.rgb_to_grayscale, + F.uniform_temporal_subsample, + ], + ids=lambda dispatcher: dispatcher.__name__, + ) + def test_scriptable(self, dispatcher): + script(dispatcher) + + @image_sample_inputs + def test_pure_tensor_output_type(self, info, args_kwargs): + (image_tv_tensor, *other_args), kwargs = args_kwargs.load() + image_pure_tensor = image_tv_tensor.as_subclass(torch.Tensor) + + output = info.dispatcher(image_pure_tensor, *other_args, **kwargs) + + # We cannot use `isinstance` here since all tv_tensors are instances of `torch.Tensor` as well + assert type(output) is torch.Tensor + + @make_info_args_kwargs_parametrization( + [info for info in DISPATCHER_INFOS if info.pil_kernel_info is not None], + args_kwargs_fn=lambda info: info.sample_inputs(tv_tensors.Image), + ) + def test_pil_output_type(self, info, args_kwargs): + (image_tv_tensor, *other_args), kwargs = args_kwargs.load() + + if image_tv_tensor.ndim > 3: + pytest.skip("Input is batched") + + image_pil = F.to_pil_image(image_tv_tensor) + + output = info.dispatcher(image_pil, *other_args, **kwargs) + + assert isinstance(output, PIL.Image.Image) + + @make_info_args_kwargs_parametrization( + DISPATCHER_INFOS, + args_kwargs_fn=lambda info: info.sample_inputs(), + ) + def test_tv_tensor_output_type(self, info, args_kwargs): + (tv_tensor, *other_args), kwargs = args_kwargs.load() + + output = info.dispatcher(tv_tensor, *other_args, **kwargs) + + assert isinstance(output, type(tv_tensor)) + + if isinstance(tv_tensor, tv_tensors.BoundingBoxes) and info.dispatcher is not F.convert_bounding_box_format: + assert output.format == tv_tensor.format + + @pytest.mark.parametrize( + ("dispatcher_info", "tv_tensor_type", "kernel_info"), + [ + pytest.param( + dispatcher_info, tv_tensor_type, kernel_info, id=f"{dispatcher_info.id}-{tv_tensor_type.__name__}" + ) + for dispatcher_info in DISPATCHER_INFOS + for tv_tensor_type, kernel_info in dispatcher_info.kernel_infos.items() + ], + ) + def test_dispatcher_kernel_signatures_consistency(self, dispatcher_info, tv_tensor_type, kernel_info): + dispatcher_signature = inspect.signature(dispatcher_info.dispatcher) + dispatcher_params = list(dispatcher_signature.parameters.values())[1:] + + kernel_signature = inspect.signature(kernel_info.kernel) + kernel_params = list(kernel_signature.parameters.values())[1:] + + # We filter out metadata that is implicitly passed to the dispatcher through the input tv_tensor, but has to be + # explicitly passed to the kernel. + input_type = {v: k for k, v in dispatcher_info.kernels.items()}.get(kernel_info.kernel) + explicit_metadata = { + tv_tensors.BoundingBoxes: {"format", "canvas_size"}, + } + kernel_params = [param for param in kernel_params if param.name not in explicit_metadata.get(input_type, set())] + + dispatcher_params = iter(dispatcher_params) + for dispatcher_param, kernel_param in zip(dispatcher_params, kernel_params): + try: + # In general, the dispatcher parameters are a superset of the kernel parameters. Thus, we filter out + # dispatcher parameters that have no kernel equivalent while keeping the order intact. 
+ while dispatcher_param.name != kernel_param.name: + dispatcher_param = next(dispatcher_params) + except StopIteration: + raise AssertionError( + f"Parameter `{kernel_param.name}` of kernel `{kernel_info.id}` " + f"has no corresponding parameter on the dispatcher `{dispatcher_info.id}`." + ) from None + + assert dispatcher_param == kernel_param + + @pytest.mark.parametrize("info", DISPATCHER_INFOS, ids=lambda info: info.id) + def test_unkown_type(self, info): + unkown_input = object() + (_, *other_args), kwargs = next(iter(info.sample_inputs())).load("cpu") + + with pytest.raises(TypeError, match=re.escape(str(type(unkown_input)))): + info.dispatcher(unkown_input, *other_args, **kwargs) + + @make_info_args_kwargs_parametrization( + [ + info + for info in DISPATCHER_INFOS + if tv_tensors.BoundingBoxes in info.kernels and info.dispatcher is not F.convert_bounding_box_format + ], + args_kwargs_fn=lambda info: info.sample_inputs(tv_tensors.BoundingBoxes), + ) + def test_bounding_boxes_format_consistency(self, info, args_kwargs): + (bounding_boxes, *other_args), kwargs = args_kwargs.load() + format = bounding_boxes.format + + output = info.dispatcher(bounding_boxes, *other_args, **kwargs) + + assert output.format == format + + +@pytest.mark.parametrize( + ("alias", "target"), + [ + pytest.param(alias, target, id=alias.__name__) + for alias, target in [ + (F.hflip, F.horizontal_flip), + (F.vflip, F.vertical_flip), + (F.get_image_num_channels, F.get_num_channels), + (F.to_pil_image, F.to_pil_image), + (F.elastic_transform, F.elastic), + (F.to_grayscale, F.rgb_to_grayscale), + ] + ], +) +def test_alias(alias, target): + assert alias is target + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("num_channels", [1, 3]) +def test_normalize_image_tensor_stats(device, num_channels): + stats = pytest.importorskip("scipy.stats", reason="SciPy is not available") + + def assert_samples_from_standard_normal(t): + p_value = stats.kstest(t.flatten(), cdf="norm", args=(0, 1)).pvalue + return p_value > 1e-4 + + image = torch.rand(num_channels, DEFAULT_SQUARE_SPATIAL_SIZE, DEFAULT_SQUARE_SPATIAL_SIZE) + mean = image.mean(dim=(1, 2)).tolist() + std = image.std(dim=(1, 2)).tolist() + + assert_samples_from_standard_normal(F.normalize_image(image, mean, std)) + + +class TestClampBoundingBoxes: + @pytest.mark.parametrize( + "metadata", + [ + dict(), + dict(format=tv_tensors.BoundingBoxFormat.XYXY), + dict(canvas_size=(1, 1)), + ], + ) + def test_pure_tensor_insufficient_metadata(self, metadata): + pure_tensor = next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor) + + with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` has to be passed")): + F.clamp_bounding_boxes(pure_tensor, **metadata) + + @pytest.mark.parametrize( + "metadata", + [ + dict(format=tv_tensors.BoundingBoxFormat.XYXY), + dict(canvas_size=(1, 1)), + dict(format=tv_tensors.BoundingBoxFormat.XYXY, canvas_size=(1, 1)), + ], + ) + def test_tv_tensor_explicit_metadata(self, metadata): + tv_tensor = next(make_multiple_bounding_boxes()) + + with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` must not be passed")): + F.clamp_bounding_boxes(tv_tensor, **metadata) + + +class TestConvertFormatBoundingBoxes: + @pytest.mark.parametrize( + ("inpt", "old_format"), + [ + (next(make_multiple_bounding_boxes()), None), + (next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor), tv_tensors.BoundingBoxFormat.XYXY), + ], + ) + def test_missing_new_format(self, inpt, old_format): + 
with pytest.raises(TypeError, match=re.escape("missing 1 required argument: 'new_format'")): + F.convert_bounding_box_format(inpt, old_format) + + def test_pure_tensor_insufficient_metadata(self): + pure_tensor = next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor) + + with pytest.raises(ValueError, match=re.escape("`old_format` has to be passed")): + F.convert_bounding_box_format(pure_tensor, new_format=tv_tensors.BoundingBoxFormat.CXCYWH) + + def test_tv_tensor_explicit_metadata(self): + tv_tensor = next(make_multiple_bounding_boxes()) + + with pytest.raises(ValueError, match=re.escape("`old_format` must not be passed")): + F.convert_bounding_box_format( + tv_tensor, old_format=tv_tensor.format, new_format=tv_tensors.BoundingBoxFormat.CXCYWH + ) + + +# TODO: All correctness checks below this line should be ported to be references on a `KernelInfo` in +# `transforms_v2_kernel_infos.py` + + +def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): + rot = math.radians(angle_) + cx, cy = center_ + tx, ty = translate_ + sx, sy = [math.radians(sh_) for sh_ in shear_] + + c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]]) + t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]]) + c_matrix_inv = np.linalg.inv(c_matrix) + rs_matrix = np.array( + [ + [scale_ * math.cos(rot), -scale_ * math.sin(rot), 0], + [scale_ * math.sin(rot), scale_ * math.cos(rot), 0], + [0, 0, 1], + ] + ) + shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]]) + shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]]) + rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix)) + true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv))) + return true_matrix + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_correctness_vertical_flip_segmentation_mask_on_fixed_input(device): + mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) + mask[:, 0, :] = 1 + + out_mask = F.vertical_flip_mask(mask) + + expected_mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) + expected_mask[:, -1, :] = 1 + torch.testing.assert_close(out_mask, expected_mask) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize( + "format", + [tv_tensors.BoundingBoxFormat.XYXY, tv_tensors.BoundingBoxFormat.XYWH, tv_tensors.BoundingBoxFormat.CXCYWH], +) +@pytest.mark.parametrize( + "top, left, height, width, size", + [ + [0, 0, 30, 30, (60, 60)], + [-5, 5, 35, 45, (32, 34)], + ], +) +def test_correctness_resized_crop_bounding_boxes(device, format, top, left, height, width, size): + def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): + # bbox should be xyxy + bbox[0] = (bbox[0] - left_) * size_[1] / width_ + bbox[1] = (bbox[1] - top_) * size_[0] / height_ + bbox[2] = (bbox[2] - left_) * size_[1] / width_ + bbox[3] = (bbox[3] - top_) * size_[0] / height_ + return bbox + + format = tv_tensors.BoundingBoxFormat.XYXY + canvas_size = (100, 100) + in_boxes = [ + [10.0, 10.0, 20.0, 20.0], + [5.0, 10.0, 15.0, 20.0], + ] + expected_bboxes = [] + for in_box in in_boxes: + expected_bboxes.append(_compute_expected_bbox(list(in_box), top, left, height, width, size)) + expected_bboxes = torch.tensor(expected_bboxes, device=device) + + in_boxes = tv_tensors.BoundingBoxes( + in_boxes, format=tv_tensors.BoundingBoxFormat.XYXY, canvas_size=canvas_size, device=device + ) + if format != tv_tensors.BoundingBoxFormat.XYXY: + in_boxes = convert_bounding_box_format(in_boxes, 
tv_tensors.BoundingBoxFormat.XYXY, format) + + output_boxes, output_canvas_size = F.resized_crop_bounding_boxes(in_boxes, format, top, left, height, width, size) + + if format != tv_tensors.BoundingBoxFormat.XYXY: + output_boxes = convert_bounding_box_format(output_boxes, format, tv_tensors.BoundingBoxFormat.XYXY) + + torch.testing.assert_close(output_boxes, expected_bboxes) + torch.testing.assert_close(output_canvas_size, size) + + +def _parse_padding(padding): + if isinstance(padding, int): + return [padding] * 4 + if isinstance(padding, list): + if len(padding) == 1: + return padding * 4 + if len(padding) == 2: + return padding * 2 # [left, up, right, down] + + return padding + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("padding", [[1], [1, 1], [1, 1, 2, 2]]) +def test_correctness_pad_bounding_boxes(device, padding): + def _compute_expected_bbox(bbox, format, padding_): + pad_left, pad_up, _, _ = _parse_padding(padding_) + + dtype = bbox.dtype + bbox = ( + bbox.clone() + if format == tv_tensors.BoundingBoxFormat.XYXY + else convert_bounding_box_format(bbox, old_format=format, new_format=tv_tensors.BoundingBoxFormat.XYXY) + ) + + bbox[0::2] += pad_left + bbox[1::2] += pad_up + + bbox = convert_bounding_box_format(bbox, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format) + if bbox.dtype != dtype: + # Temporary cast to original dtype + # e.g. float32 -> int + bbox = bbox.to(dtype) + return bbox + + def _compute_expected_canvas_size(bbox, padding_): + pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_) + height, width = bbox.canvas_size + return height + pad_up + pad_down, width + pad_left + pad_right + + for bboxes in make_multiple_bounding_boxes(extra_dims=((4,),)): + bboxes = bboxes.to(device) + bboxes_format = bboxes.format + bboxes_canvas_size = bboxes.canvas_size + + output_boxes, output_canvas_size = F.pad_bounding_boxes( + bboxes, format=bboxes_format, canvas_size=bboxes_canvas_size, padding=padding + ) + + torch.testing.assert_close(output_canvas_size, _compute_expected_canvas_size(bboxes, padding)) + + expected_bboxes = torch.stack( + [_compute_expected_bbox(b, bboxes_format, padding) for b in bboxes.reshape(-1, 4).unbind()] + ).reshape(bboxes.shape) + + torch.testing.assert_close(output_boxes, expected_bboxes, atol=1, rtol=0) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_correctness_pad_segmentation_mask_on_fixed_input(device): + mask = torch.ones((1, 3, 3), dtype=torch.long, device=device) + + out_mask = F.pad_mask(mask, padding=[1, 1, 1, 1]) + + expected_mask = torch.zeros((1, 5, 5), dtype=torch.long, device=device) + expected_mask[:, 1:-1, 1:-1] = 1 + torch.testing.assert_close(out_mask, expected_mask) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize( + "startpoints, endpoints", + [ + [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]], + [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]], + [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]], + ], +) +def test_correctness_perspective_bounding_boxes(device, startpoints, endpoints): + def _compute_expected_bbox(bbox, format_, canvas_size_, pcoeffs_): + m1 = np.array( + [ + [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]], + [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]], + ] + ) + m2 = np.array( + [ + [pcoeffs_[6], pcoeffs_[7], 1.0], + [pcoeffs_[6], pcoeffs_[7], 1.0], + ] + ) + + bbox_xyxy = convert_bounding_box_format(bbox, old_format=format_, 
new_format=tv_tensors.BoundingBoxFormat.XYXY) + points = np.array( + [ + [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], + [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], + [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], + [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], + ] + ) + numer = np.matmul(points, m1.T) + denom = np.matmul(points, m2.T) + transformed_points = numer / denom + out_bbox = np.array( + [ + np.min(transformed_points[:, 0]), + np.min(transformed_points[:, 1]), + np.max(transformed_points[:, 0]), + np.max(transformed_points[:, 1]), + ] + ) + out_bbox = torch.from_numpy(out_bbox) + out_bbox = convert_bounding_box_format( + out_bbox, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format_ + ) + return clamp_bounding_boxes(out_bbox, format=format_, canvas_size=canvas_size_).to(bbox) + + canvas_size = (32, 38) + + pcoeffs = _get_perspective_coeffs(startpoints, endpoints) + inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) + + for bboxes in make_multiple_bounding_boxes(spatial_size=canvas_size, extra_dims=((4,),)): + bboxes = bboxes.to(device) + + output_bboxes = F.perspective_bounding_boxes( + bboxes.as_subclass(torch.Tensor), + format=bboxes.format, + canvas_size=bboxes.canvas_size, + startpoints=None, + endpoints=None, + coefficients=pcoeffs, + ) + + expected_bboxes = torch.stack( + [ + _compute_expected_bbox(b, bboxes.format, bboxes.canvas_size, inv_pcoeffs) + for b in bboxes.reshape(-1, 4).unbind() + ] + ).reshape(bboxes.shape) + + torch.testing.assert_close(output_bboxes, expected_bboxes, rtol=0, atol=1) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize( + "output_size", + [(18, 18), [18, 15], (16, 19), [12], [46, 48]], +) +def test_correctness_center_crop_bounding_boxes(device, output_size): + def _compute_expected_bbox(bbox, format_, canvas_size_, output_size_): + dtype = bbox.dtype + bbox = convert_bounding_box_format(bbox.float(), format_, tv_tensors.BoundingBoxFormat.XYWH) + + if len(output_size_) == 1: + output_size_.append(output_size_[-1]) + + cy = int(round((canvas_size_[0] - output_size_[0]) * 0.5)) + cx = int(round((canvas_size_[1] - output_size_[1]) * 0.5)) + out_bbox = [ + bbox[0].item() - cx, + bbox[1].item() - cy, + bbox[2].item(), + bbox[3].item(), + ] + out_bbox = torch.tensor(out_bbox) + out_bbox = convert_bounding_box_format(out_bbox, tv_tensors.BoundingBoxFormat.XYWH, format_) + out_bbox = clamp_bounding_boxes(out_bbox, format=format_, canvas_size=output_size) + return out_bbox.to(dtype=dtype, device=bbox.device) + + for bboxes in make_multiple_bounding_boxes(extra_dims=((4,),)): + bboxes = bboxes.to(device) + bboxes_format = bboxes.format + bboxes_canvas_size = bboxes.canvas_size + + output_boxes, output_canvas_size = F.center_crop_bounding_boxes( + bboxes, bboxes_format, bboxes_canvas_size, output_size + ) + + expected_bboxes = torch.stack( + [ + _compute_expected_bbox(b, bboxes_format, bboxes_canvas_size, output_size) + for b in bboxes.reshape(-1, 4).unbind() + ] + ).reshape(bboxes.shape) + + torch.testing.assert_close(output_boxes, expected_bboxes, atol=1, rtol=0) + torch.testing.assert_close(output_canvas_size, output_size) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("output_size", [[4, 2], [4], [7, 6]]) +def test_correctness_center_crop_mask(device, output_size): + def _compute_expected_mask(mask, output_size): + crop_height, crop_width = output_size if len(output_size) > 1 else [output_size[0], output_size[0]] + + _, image_height, image_width = 
mask.shape + if crop_width > image_height or crop_height > image_width: + padding = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) + mask = F.pad_image(mask, padding, fill=0) + + left = round((image_width - crop_width) * 0.5) + top = round((image_height - crop_height) * 0.5) + + return mask[:, top : top + crop_height, left : left + crop_width] + + mask = torch.randint(0, 2, size=(1, 6, 6), dtype=torch.long, device=device) + actual = F.center_crop_mask(mask, output_size) + + expected = _compute_expected_mask(mask, output_size) + torch.testing.assert_close(expected, actual) + + +# Copied from test/test_functional_tensor.py +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("canvas_size", ("small", "large")) +@pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) +@pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) +@pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)]) +def test_correctness_gaussian_blur_image_tensor(device, canvas_size, dt, ksize, sigma): + fn = F.gaussian_blur_image + + # true_cv2_results = { + # # np_img = np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3)) + # # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.8) + # "3_3_0.8": ... + # # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.5) + # "3_3_0.5": ... + # # cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.8) + # "3_5_0.8": ... + # # cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.5) + # "3_5_0.5": ... + # # np_img2 = np.arange(26 * 28, dtype="uint8").reshape((26, 28)) + # # cv2.GaussianBlur(np_img2, ksize=(23, 23), sigmaX=1.7) + # "23_23_1.7": ... + # } + p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") + true_cv2_results = torch.load(p) + + if canvas_size == "small": + tensor = ( + torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device) + ) + else: + tensor = torch.from_numpy(np.arange(26 * 28, dtype="uint8").reshape((1, 26, 28))).to(device) + + if dt == torch.float16 and device == "cpu": + # skip float16 on CPU case + return + + if dt is not None: + tensor = tensor.to(dtype=dt) + + _ksize = (ksize, ksize) if isinstance(ksize, int) else ksize + _sigma = sigma[0] if sigma is not None else None + shape = tensor.shape + gt_key = f"{shape[-2]}_{shape[-1]}_{shape[-3]}__{_ksize[0]}_{_ksize[1]}_{_sigma}" + if gt_key not in true_cv2_results: + return + + true_out = ( + torch.tensor(true_cv2_results[gt_key]).reshape(shape[-2], shape[-1], shape[-3]).permute(2, 0, 1).to(tensor) + ) + + image = tv_tensors.Image(tensor) + + out = fn(image, kernel_size=ksize, sigma=sigma) + torch.testing.assert_close(out, true_out, rtol=0.0, atol=1.0, msg=f"{ksize}, {sigma}") + + +@pytest.mark.parametrize( + "inpt", + [ + 127 * np.ones((32, 32, 3), dtype="uint8"), + PIL.Image.new("RGB", (32, 32), 122), + ], +) +def test_to_image(inpt): + output = F.to_image(inpt) + assert isinstance(output, torch.Tensor) + assert output.shape == (3, 32, 32) + + assert np.asarray(inpt).sum() == output.sum().item() + + +@pytest.mark.parametrize( + "inpt", + [ + torch.randint(0, 256, size=(3, 32, 32), dtype=torch.uint8), + 127 * np.ones((32, 32, 3), dtype="uint8"), + ], +) +@pytest.mark.parametrize("mode", [None, "RGB"]) +def test_to_pil_image(inpt, mode): + output = F.to_pil_image(inpt, mode=mode) + assert isinstance(output, PIL.Image.Image) + + assert np.asarray(inpt).sum() == np.asarray(output).sum() + + +def 
test_equalize_image_tensor_edge_cases(): + inpt = torch.zeros(3, 200, 200, dtype=torch.uint8) + output = F.equalize_image(inpt) + torch.testing.assert_close(inpt, output) + + inpt = torch.zeros(5, 3, 200, 200, dtype=torch.uint8) + inpt[..., 100:, 100:] = 1 + output = F.equalize_image(inpt) + assert output.unique().tolist() == [0, 255] + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_correctness_uniform_temporal_subsample(device): + video = torch.arange(10, device=device)[:, None, None, None].expand(-1, 3, 8, 8) + out_video = F.uniform_temporal_subsample(video, 5) + assert out_video.unique().tolist() == [0, 2, 4, 6, 9] + + out_video = F.uniform_temporal_subsample(video, 8) + assert out_video.unique().tolist() == [0, 1, 2, 3, 5, 6, 7, 9] diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py new file mode 100644 index 0000000000000000000000000000000000000000..e18beb35a484353af52771b1ed7e458b6a5ceeeb --- /dev/null +++ b/test/test_transforms_v2_refactored.py @@ -0,0 +1,2909 @@ +import contextlib +import decimal +import inspect +import math +import pickle +import re +from pathlib import Path +from unittest import mock + +import numpy as np +import PIL.Image +import pytest + +import torch +import torchvision.transforms.v2 as transforms +from common_utils import ( + assert_equal, + assert_no_warnings, + cache, + cpu_and_cuda, + freeze_rng_state, + ignore_jit_no_profile_information_warning, + make_bounding_boxes, + make_detection_mask, + make_image, + make_image_pil, + make_image_tensor, + make_segmentation_mask, + make_video, + make_video_tensor, + needs_cuda, + set_rng_seed, +) + +from torch import nn +from torch.testing import assert_close +from torch.utils._pytree import tree_map +from torch.utils.data import DataLoader, default_collate +from torchvision import tv_tensors + +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.functional import pil_modes_mapping +from torchvision.transforms.v2 import functional as F +from torchvision.transforms.v2.functional._utils import _get_kernel, _register_kernel_internal + + +@pytest.fixture(autouse=True) +def fix_rng_seed(): + set_rng_seed(0) + yield + + +def _to_tolerances(maybe_tolerance_dict): + if not isinstance(maybe_tolerance_dict, dict): + return dict(rtol=None, atol=None) + + tolerances = dict(rtol=0, atol=0) + tolerances.update(maybe_tolerance_dict) + return tolerances + + +def _check_kernel_cuda_vs_cpu(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel produces closes results for inputs on GPU and CPU.""" + if input.device.type != "cuda": + return + + input_cuda = input.as_subclass(torch.Tensor) + input_cpu = input_cuda.to("cpu") + + with freeze_rng_state(): + actual = kernel(input_cuda, *args, **kwargs) + with freeze_rng_state(): + expected = kernel(input_cpu, *args, **kwargs) + + assert_close(actual, expected, check_device=False, rtol=rtol, atol=atol) + + +@cache +def _script(obj): + try: + return torch.jit.script(obj) + except Exception as error: + name = getattr(obj, "__name__", obj.__class__.__name__) + raise AssertionError(f"Trying to `torch.jit.script` '{name}' raised the error above.") from error + + +def _check_kernel_scripted_vs_eager(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel is scriptable and if the scripted output is close to the eager one.""" + if input.device.type != "cpu": + return + + kernel_scripted = _script(kernel) + + input = input.as_subclass(torch.Tensor) + with 
ignore_jit_no_profile_information_warning(): + actual = kernel_scripted(input, *args, **kwargs) + expected = kernel(input, *args, **kwargs) + + assert_close(actual, expected, rtol=rtol, atol=atol) + + +def _check_kernel_batched_vs_unbatched(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel produces close results for batched and unbatched inputs.""" + unbatched_input = input.as_subclass(torch.Tensor) + + for batch_dims in [(2,), (2, 1)]: + repeats = [*batch_dims, *[1] * input.ndim] + + actual = kernel(unbatched_input.repeat(repeats), *args, **kwargs) + + expected = kernel(unbatched_input, *args, **kwargs) + # We can't directly call `.repeat()` on the output, since some kernel also return some additional metadata + if isinstance(expected, torch.Tensor): + expected = expected.repeat(repeats) + else: + tensor, *metadata = expected + expected = (tensor.repeat(repeats), *metadata) + + assert_close(actual, expected, rtol=rtol, atol=atol) + + for degenerate_batch_dims in [(0,), (5, 0), (0, 5)]: + degenerate_batched_input = torch.empty( + degenerate_batch_dims + input.shape, dtype=input.dtype, device=input.device + ) + + output = kernel(degenerate_batched_input, *args, **kwargs) + # Most kernels just return a tensor, but some also return some additional metadata + if not isinstance(output, torch.Tensor): + output, *_ = output + + assert output.shape[: -input.ndim] == degenerate_batch_dims + + +def check_kernel( + kernel, + input, + *args, + check_cuda_vs_cpu=True, + check_scripted_vs_eager=True, + check_batched_vs_unbatched=True, + expect_same_dtype=True, + **kwargs, +): + initial_input_version = input._version + + output = kernel(input.as_subclass(torch.Tensor), *args, **kwargs) + # Most kernels just return a tensor, but some also return some additional metadata + if not isinstance(output, torch.Tensor): + output, *_ = output + + # check that no inplace operation happened + assert input._version == initial_input_version + + if expect_same_dtype: + assert output.dtype == input.dtype + assert output.device == input.device + + if check_cuda_vs_cpu: + _check_kernel_cuda_vs_cpu(kernel, input, *args, **kwargs, **_to_tolerances(check_cuda_vs_cpu)) + + if check_scripted_vs_eager: + _check_kernel_scripted_vs_eager(kernel, input, *args, **kwargs, **_to_tolerances(check_scripted_vs_eager)) + + if check_batched_vs_unbatched: + _check_kernel_batched_vs_unbatched(kernel, input, *args, **kwargs, **_to_tolerances(check_batched_vs_unbatched)) + + +def _check_functional_scripted_smoke(functional, input, *args, **kwargs): + """Checks if the functional can be scripted and the scripted version can be called without error.""" + if not isinstance(input, tv_tensors.Image): + return + + functional_scripted = _script(functional) + with ignore_jit_no_profile_information_warning(): + functional_scripted(input.as_subclass(torch.Tensor), *args, **kwargs) + + +def check_functional(functional, input, *args, check_scripted_smoke=True, **kwargs): + unknown_input = object() + with pytest.raises(TypeError, match=re.escape(str(type(unknown_input)))): + functional(unknown_input, *args, **kwargs) + + with mock.patch("torch._C._log_api_usage_once", wraps=torch._C._log_api_usage_once) as spy: + output = functional(input, *args, **kwargs) + + spy.assert_any_call(f"{functional.__module__}.{functional.__name__}") + + assert isinstance(output, type(input)) + + if isinstance(input, tv_tensors.BoundingBoxes): + assert output.format == input.format + + if check_scripted_smoke: + _check_functional_scripted_smoke(functional, 
input, *args, **kwargs) + + +def check_functional_kernel_signature_match(functional, *, kernel, input_type): + """Checks if the signature of the functional matches the kernel signature.""" + functional_params = list(inspect.signature(functional).parameters.values())[1:] + kernel_params = list(inspect.signature(kernel).parameters.values())[1:] + + if issubclass(input_type, tv_tensors.TVTensor): + # We filter out metadata that is implicitly passed to the functional through the input tv_tensor, but has to be + # explicitly passed to the kernel. + explicit_metadata = { + tv_tensors.BoundingBoxes: {"format", "canvas_size"}, + } + kernel_params = [param for param in kernel_params if param.name not in explicit_metadata.get(input_type, set())] + + functional_params = iter(functional_params) + for functional_param, kernel_param in zip(functional_params, kernel_params): + try: + # In general, the functional parameters are a superset of the kernel parameters. Thus, we filter out + # functional parameters that have no kernel equivalent while keeping the order intact. + while functional_param.name != kernel_param.name: + functional_param = next(functional_params) + except StopIteration: + raise AssertionError( + f"Parameter `{kernel_param.name}` of kernel `{kernel.__name__}` " + f"has no corresponding parameter on the functional `{functional.__name__}`." + ) from None + + if issubclass(input_type, PIL.Image.Image): + # PIL kernels often have more correct annotations, since they are not limited by JIT. Thus, we don't check + # them in the first place. + functional_param._annotation = kernel_param._annotation = inspect.Parameter.empty + + assert functional_param == kernel_param + + +def _check_transform_v1_compatibility(transform, input, *, rtol, atol): + """If the transform defines the ``_v1_transform_cls`` attribute, checks if the transform has a public, static + ``get_params`` method that is the v1 equivalent, the output is close to v1, is scriptable, and the scripted version + can be called without error.""" + if type(input) is not torch.Tensor or isinstance(input, PIL.Image.Image): + return + + v1_transform_cls = transform._v1_transform_cls + if v1_transform_cls is None: + return + + if hasattr(v1_transform_cls, "get_params"): + assert type(transform).get_params is v1_transform_cls.get_params + + v1_transform = v1_transform_cls(**transform._extract_params_for_v1_transform()) + + with freeze_rng_state(): + output_v2 = transform(input) + + with freeze_rng_state(): + output_v1 = v1_transform(input) + + assert_close(output_v2, output_v1, rtol=rtol, atol=atol) + + if isinstance(input, PIL.Image.Image): + return + + _script(v1_transform)(input) + + +def check_transform(transform, input, check_v1_compatibility=True): + pickle.loads(pickle.dumps(transform)) + + output = transform(input) + assert isinstance(output, type(input)) + + if isinstance(input, tv_tensors.BoundingBoxes): + assert output.format == input.format + + if check_v1_compatibility: + _check_transform_v1_compatibility(transform, input, **_to_tolerances(check_v1_compatibility)) + + +def transform_cls_to_functional(transform_cls, **transform_specific_kwargs): + def wrapper(input, *args, **kwargs): + transform = transform_cls(*args, **transform_specific_kwargs, **kwargs) + return transform(input) + + wrapper.__name__ = transform_cls.__name__ + + return wrapper + + +def param_value_parametrization(**kwargs): + """Helper function to turn + + @pytest.mark.parametrize( + ("param", "value"), + ("a", 1), + ("a", 2), + ("a", 3), + ("b", -1.0) + ("b", 
1.0) + ) + + into + + @param_value_parametrization(a=[1, 2, 3], b=[-1.0, 1.0]) + """ + return pytest.mark.parametrize( + ("param", "value"), + [(param, value) for param, values in kwargs.items() for value in values], + ) + + +def adapt_fill(value, *, dtype): + """Adapt fill values in the range [0.0, 1.0] to the value range of the dtype""" + if value is None: + return value + + max_value = get_max_value(dtype) + value_type = float if dtype.is_floating_point else int + + if isinstance(value, (int, float)): + return value_type(value * max_value) + elif isinstance(value, (list, tuple)): + return type(value)(value_type(v * max_value) for v in value) + else: + raise ValueError(f"fill should be an int or float, or a list or tuple of the former, but got '{value}'.") + + +EXHAUSTIVE_TYPE_FILLS = [ + None, + 1, + 0.5, + [1], + [0.2], + (0,), + (0.7,), + [1, 0, 1], + [0.1, 0.2, 0.3], + (0, 1, 0), + (0.9, 0.234, 0.314), +] +CORRECTNESS_FILLS = [ + v for v in EXHAUSTIVE_TYPE_FILLS if v is None or isinstance(v, float) or (isinstance(v, list) and len(v) > 1) +] + + +# We cannot use `list(transforms.InterpolationMode)` here, since it includes some PIL-only ones as well +INTERPOLATION_MODES = [ + transforms.InterpolationMode.NEAREST, + transforms.InterpolationMode.NEAREST_EXACT, + transforms.InterpolationMode.BILINEAR, + transforms.InterpolationMode.BICUBIC, +] + + +@contextlib.contextmanager +def assert_warns_antialias_default_value(): + with pytest.warns(UserWarning, match="The default value of the antialias parameter of all the resizing transforms"): + yield + + +def reference_affine_bounding_boxes_helper(bounding_boxes, *, affine_matrix, new_canvas_size=None, clamp=True): + format = bounding_boxes.format + canvas_size = new_canvas_size or bounding_boxes.canvas_size + + def affine_bounding_boxes(bounding_boxes): + dtype = bounding_boxes.dtype + device = bounding_boxes.device + + # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 + input_xyxy = F.convert_bounding_box_format( + bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True), + old_format=format, + new_format=tv_tensors.BoundingBoxFormat.XYXY, + inplace=True, + ) + x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist() + + points = np.array( + [ + [x1, y1, 1.0], + [x2, y1, 1.0], + [x1, y2, 1.0], + [x2, y2, 1.0], + ] + ) + transformed_points = np.matmul(points, affine_matrix.astype(points.dtype).T) + + output_xyxy = torch.Tensor( + [ + float(np.min(transformed_points[:, 0])), + float(np.min(transformed_points[:, 1])), + float(np.max(transformed_points[:, 0])), + float(np.max(transformed_points[:, 1])), + ] + ) + + output = F.convert_bounding_box_format( + output_xyxy, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format + ) + + if clamp: + # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 + output = F.clamp_bounding_boxes( + output, + format=format, + canvas_size=canvas_size, + ) + else: + # We leave the bounding box as float64 so the caller gets the full precision to perform any additional + # operation + dtype = output.dtype + + return output.to(dtype=dtype, device=device) + + return tv_tensors.BoundingBoxes( + torch.cat([affine_bounding_boxes(b) for b in bounding_boxes.reshape(-1, 4).unbind()], dim=0).reshape( + bounding_boxes.shape + ), + format=format, + canvas_size=canvas_size, + ) + + +# turns all warnings into errors for this module +pytestmark = pytest.mark.filterwarnings("error") + + +class TestResize: + INPUT_SIZE = (17, 11) + OUTPUT_SIZES = 
[17, [17], (17,), [12, 13], (12, 13)] + + def _make_max_size_kwarg(self, *, use_max_size, size): + if use_max_size: + if not (isinstance(size, int) or len(size) == 1): + # This would result in an `ValueError` + return None + + max_size = (size if isinstance(size, int) else size[0]) + 1 + else: + max_size = None + + return dict(max_size=max_size) + + def _compute_output_size(self, *, input_size, size, max_size): + if not (isinstance(size, int) or len(size) == 1): + return tuple(size) + + if not isinstance(size, int): + size = size[0] + + old_height, old_width = input_size + ratio = old_width / old_height + if ratio > 1: + new_height = size + new_width = int(ratio * new_height) + else: + new_width = size + new_height = int(new_width / ratio) + + if max_size is not None and max(new_height, new_width) > max_size: + # Need to recompute the aspect ratio, since it might have changed due to rounding + ratio = new_width / new_height + if ratio > 1: + new_width = max_size + new_height = int(new_width / ratio) + else: + new_height = max_size + new_width = int(new_height * ratio) + + return new_height, new_width + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("antialias", [True, False]) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, size, interpolation, use_max_size, antialias, dtype, device): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + # In contrast to CPU, there is no native `InterpolationMode.BICUBIC` implementation for uint8 images on CUDA. + # Internally, it uses the float path. Thus, we need to test with an enormous tolerance here to account for that. 
+ atol = 30 if interpolation is transforms.InterpolationMode.BICUBIC and dtype is torch.uint8 else 1
+ check_cuda_vs_cpu_tolerances = dict(rtol=0, atol=atol / 255 if dtype.is_floating_point else atol)
+
+ check_kernel(
+ F.resize_image,
+ make_image(self.INPUT_SIZE, dtype=dtype, device=device),
+ size=size,
+ interpolation=interpolation,
+ **max_size_kwarg,
+ antialias=antialias,
+ check_cuda_vs_cpu=check_cuda_vs_cpu_tolerances,
+ check_scripted_vs_eager=not isinstance(size, int),
+ )
+
+ @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+ @pytest.mark.parametrize("size", OUTPUT_SIZES)
+ @pytest.mark.parametrize("use_max_size", [True, False])
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
+ @pytest.mark.parametrize("device", cpu_and_cuda())
+ def test_kernel_bounding_boxes(self, format, size, use_max_size, dtype, device):
+ if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)):
+ return
+
+ bounding_boxes = make_bounding_boxes(
+ format=format,
+ canvas_size=self.INPUT_SIZE,
+ dtype=dtype,
+ device=device,
+ )
+ check_kernel(
+ F.resize_bounding_boxes,
+ bounding_boxes,
+ canvas_size=bounding_boxes.canvas_size,
+ size=size,
+ **max_size_kwarg,
+ check_scripted_vs_eager=not isinstance(size, int),
+ )
+
+ @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask])
+ def test_kernel_mask(self, make_mask):
+ check_kernel(F.resize_mask, make_mask(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1])
+
+ def test_kernel_video(self):
+ check_kernel(F.resize_video, make_video(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1], antialias=True)
+
+ @pytest.mark.parametrize("size", OUTPUT_SIZES)
+ @pytest.mark.parametrize(
+ "make_input",
+ [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video],
+ )
+ def test_functional(self, size, make_input):
+ check_functional(
+ F.resize,
+ make_input(self.INPUT_SIZE),
+ size=size,
+ antialias=True,
+ check_scripted_smoke=not isinstance(size, int),
+ )
+
+ @pytest.mark.parametrize(
+ ("kernel", "input_type"),
+ [
+ (F.resize_image, torch.Tensor),
+ (F._resize_image_pil, PIL.Image.Image),
+ (F.resize_image, tv_tensors.Image),
+ (F.resize_bounding_boxes, tv_tensors.BoundingBoxes),
+ (F.resize_mask, tv_tensors.Mask),
+ (F.resize_video, tv_tensors.Video),
+ ],
+ )
+ def test_functional_signature(self, kernel, input_type):
+ check_functional_kernel_signature_match(F.resize, kernel=kernel, input_type=input_type)
+
+ @pytest.mark.parametrize("size", OUTPUT_SIZES)
+ @pytest.mark.parametrize("device", cpu_and_cuda())
+ @pytest.mark.parametrize(
+ "make_input",
+ [
+ make_image_tensor,
+ make_image_pil,
+ make_image,
+ make_bounding_boxes,
+ make_segmentation_mask,
+ make_detection_mask,
+ make_video,
+ ],
+ )
+ def test_transform(self, size, device, make_input):
+ check_transform(
+ transforms.Resize(size=size, antialias=True),
+ make_input(self.INPUT_SIZE, device=device),
+ # atol=1 because Resize v2 uses the native uint8 interpolate path for bilinear and nearest modes
+ check_v1_compatibility=dict(rtol=0, atol=1),
+ )
+
+ def _check_output_size(self, input, output, *, size, max_size):
+ assert tuple(F.get_size(output)) == self._compute_output_size(
+ input_size=F.get_size(input), size=size, max_size=max_size
+ )
+
+ @pytest.mark.parametrize("size", OUTPUT_SIZES)
+ # `InterpolationMode.NEAREST` is modeled after the buggy `INTER_NEAREST` interpolation of CV2.
+ # The PIL equivalent of `InterpolationMode.NEAREST` is `InterpolationMode.NEAREST_EXACT` + @pytest.mark.parametrize("interpolation", set(INTERPOLATION_MODES) - {transforms.InterpolationMode.NEAREST}) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) + def test_image_correctness(self, size, interpolation, use_max_size, fn): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + image = make_image(self.INPUT_SIZE, dtype=torch.uint8) + + actual = fn(image, size=size, interpolation=interpolation, **max_size_kwarg, antialias=True) + expected = F.to_image(F.resize(F.to_pil_image(image), size=size, interpolation=interpolation, **max_size_kwarg)) + + self._check_output_size(image, actual, size=size, **max_size_kwarg) + torch.testing.assert_close(actual, expected, atol=1, rtol=0) + + def _reference_resize_bounding_boxes(self, bounding_boxes, *, size, max_size=None): + old_height, old_width = bounding_boxes.canvas_size + new_height, new_width = self._compute_output_size( + input_size=bounding_boxes.canvas_size, size=size, max_size=max_size + ) + + if (old_height, old_width) == (new_height, new_width): + return bounding_boxes + + affine_matrix = np.array( + [ + [new_width / old_width, 0, 0], + [0, new_height / old_height, 0], + ], + ) + + return reference_affine_bounding_boxes_helper( + bounding_boxes, + affine_matrix=affine_matrix, + new_canvas_size=(new_height, new_width), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) + def test_bounding_boxes_correctness(self, format, size, use_max_size, fn): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + bounding_boxes = make_bounding_boxes(format=format, canvas_size=self.INPUT_SIZE) + + actual = fn(bounding_boxes, size=size, **max_size_kwarg) + expected = self._reference_resize_bounding_boxes(bounding_boxes, size=size, **max_size_kwarg) + + self._check_output_size(bounding_boxes, actual, size=size, **max_size_kwarg) + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("interpolation", set(transforms.InterpolationMode) - set(INTERPOLATION_MODES)) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_pil_interpolation_compat_smoke(self, interpolation, make_input): + input = make_input(self.INPUT_SIZE) + + with ( + contextlib.nullcontext() + if isinstance(input, PIL.Image.Image) + # This error is triggered in PyTorch core + else pytest.raises(NotImplementedError, match=f"got {interpolation.value.lower()}") + ): + F.resize( + input, + size=self.OUTPUT_SIZES[0], + interpolation=interpolation, + ) + + def test_functional_pil_antialias_warning(self): + with pytest.warns(UserWarning, match="Anti-alias option is always applied for PIL Image input"): + F.resize(make_image_pil(self.INPUT_SIZE), size=self.OUTPUT_SIZES[0], antialias=False) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_mask, + make_video, + ], + ) + def test_max_size_error(self, size, make_input): + if 
isinstance(size, int) or len(size) == 1: + max_size = (size if isinstance(size, int) else size[0]) - 1 + match = "must be strictly greater than the requested size" + else: + # value can be anything other than None + max_size = -1 + match = "size should be an int or a sequence of length 1" + + with pytest.raises(ValueError, match=match): + F.resize(make_input(self.INPUT_SIZE), size=size, max_size=max_size, antialias=True) + + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_video], + ) + def test_antialias_warning(self, interpolation, make_input): + with ( + assert_warns_antialias_default_value() + if interpolation in {transforms.InterpolationMode.BILINEAR, transforms.InterpolationMode.BICUBIC} + else assert_no_warnings() + ): + F.resize( + make_input(self.INPUT_SIZE), + size=self.OUTPUT_SIZES[0], + interpolation=interpolation, + ) + + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_interpolation_int(self, interpolation, make_input): + input = make_input(self.INPUT_SIZE) + + # `InterpolationMode.NEAREST_EXACT` has no proper corresponding integer equivalent. Internally, we map it to + # `0` to be the same as `InterpolationMode.NEAREST` for PIL. However, for the tensor backend there is a + # difference and thus we don't test it here. + if isinstance(input, torch.Tensor) and interpolation is transforms.InterpolationMode.NEAREST_EXACT: + return + + expected = F.resize(input, size=self.OUTPUT_SIZES[0], interpolation=interpolation, antialias=True) + actual = F.resize( + input, size=self.OUTPUT_SIZES[0], interpolation=pil_modes_mapping[interpolation], antialias=True + ) + + assert_equal(actual, expected) + + def test_transform_unknown_size_error(self): + with pytest.raises(ValueError, match="size can either be an integer or a list or tuple of one or two integers"): + transforms.Resize(size=object()) + + @pytest.mark.parametrize( + "size", [min(INPUT_SIZE), [min(INPUT_SIZE)], (min(INPUT_SIZE),), list(INPUT_SIZE), tuple(INPUT_SIZE)] + ) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_mask, + make_video, + ], + ) + def test_noop(self, size, make_input): + input = make_input(self.INPUT_SIZE) + + output = F.resize(input, size=F.get_size(input), antialias=True) + + # This identity check is not a requirement. It is here to avoid breaking the behavior by accident. If there + # is a good reason to break this, feel free to downgrade to an equality check. + if isinstance(input, tv_tensors.TVTensor): + # We can't test identity directly, since that checks for the identity of the Python object. Since all + # tv_tensors unwrap before a kernel and wrap again afterwards, the Python object changes. 
Thus, we check
+ # that the underlying storage is the same
+ assert output.data_ptr() == input.data_ptr()
+ else:
+ assert output is input
+
+ @pytest.mark.parametrize(
+ "make_input",
+ [
+ make_image_tensor,
+ make_image_pil,
+ make_image,
+ make_bounding_boxes,
+ make_segmentation_mask,
+ make_detection_mask,
+ make_video,
+ ],
+ )
+ def test_no_regression_5405(self, make_input):
+ # Checks that `max_size` is not ignored if `size == small_edge_size`
+ # See https://github.com/pytorch/vision/issues/5405
+
+ input = make_input(self.INPUT_SIZE)
+
+ size = min(F.get_size(input))
+ max_size = size + 1
+ output = F.resize(input, size=size, max_size=max_size, antialias=True)
+
+ assert max(F.get_size(output)) == max_size
+
+ def _make_image(self, *args, batch_dims=(), memory_format=torch.contiguous_format, **kwargs):
+ # torch.channels_last memory_format is only available for 4D tensors, i.e. (B, C, H, W). However, images coming
+ # from PIL or our own I/O functions do not have a batch dimension and are thus 3D, i.e. (C, H, W). Still, the
+ # layout of the data in memory is channels last. To emulate this when a 3D input is requested here, we create
+ # the image as 4D and create a view with the right shape afterwards. With this the layout in memory is channels
+ # last, although PyTorch doesn't recognize it as such.
+ emulate_channels_last = memory_format is torch.channels_last and len(batch_dims) != 1
+
+ image = make_image(
+ *args,
+ batch_dims=(math.prod(batch_dims),) if emulate_channels_last else batch_dims,
+ memory_format=memory_format,
+ **kwargs,
+ )
+
+ if emulate_channels_last:
+ image = tv_tensors.wrap(image.view(*batch_dims, *image.shape[-3:]), like=image)
+
+ return image
+
+ def _check_stride(self, image, *, memory_format):
+ C, H, W = F.get_dimensions(image)
+ if memory_format is torch.contiguous_format:
+ expected_stride = (H * W, W, 1)
+ elif memory_format is torch.channels_last:
+ expected_stride = (1, W * C, C)
+ else:
+ raise ValueError(f"Unknown memory_format: {memory_format}")
+
+ assert image.stride() == expected_stride
+
+ # TODO: We can remove this test and the related torchvision workaround
+ # once we fix the related pytorch issue: https://github.com/pytorch/pytorch/issues/68430
+ @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES)
+ @pytest.mark.parametrize("antialias", [True, False])
+ @pytest.mark.parametrize("memory_format", [torch.contiguous_format, torch.channels_last])
+ @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32])
+ @pytest.mark.parametrize("device", cpu_and_cuda())
+ def test_kernel_image_memory_format_consistency(self, interpolation, antialias, memory_format, dtype, device):
+ size = self.OUTPUT_SIZES[0]
+
+ input = self._make_image(self.INPUT_SIZE, dtype=dtype, device=device, memory_format=memory_format)
+
+ # Smoke test to make sure we aren't starting with wrong assumptions
+ self._check_stride(input, memory_format=memory_format)
+
+ output = F.resize_image(input, size=size, interpolation=interpolation, antialias=antialias)
+
+ self._check_stride(output, memory_format=memory_format)
+
+ def test_float16_no_rounding(self):
+ # Make sure Resize() doesn't round float16 images
+ # Non-regression test for https://github.com/pytorch/vision/issues/7667
+
+ input = make_image_tensor(self.INPUT_SIZE, dtype=torch.float16)
+ output = F.resize_image(input, size=self.OUTPUT_SIZES[0], antialias=True)
+
+ assert output.dtype is torch.float16
+ assert (output.round() - output).abs().sum() > 0
+
+
+class TestHorizontalFlip:
+
@pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.horizontal_flip_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + check_kernel( + F.horizontal_flip_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + check_kernel(F.horizontal_flip_mask, make_mask()) + + def test_kernel_video(self): + check_kernel(F.horizontal_flip_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.horizontal_flip, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.horizontal_flip_image, torch.Tensor), + (F._horizontal_flip_image_pil, PIL.Image.Image), + (F.horizontal_flip_image, tv_tensors.Image), + (F.horizontal_flip_bounding_boxes, tv_tensors.BoundingBoxes), + (F.horizontal_flip_mask, tv_tensors.Mask), + (F.horizontal_flip_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.horizontal_flip, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform(transforms.RandomHorizontalFlip(p=1), make_input(device=device)) + + @pytest.mark.parametrize( + "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] + ) + def test_image_correctness(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.horizontal_flip(F.to_pil_image(image))) + + torch.testing.assert_close(actual, expected) + + def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes): + affine_matrix = np.array( + [ + [-1, 0, bounding_boxes.canvas_size[1]], + [0, 1, 0], + ], + ) + + return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize( + "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] + ) + def test_bounding_boxes_correctness(self, format, fn): + bounding_boxes = make_bounding_boxes(format=format) + + actual = fn(bounding_boxes) + expected = self._reference_horizontal_flip_bounding_boxes(bounding_boxes) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_noop(self, make_input, device): + input = make_input(device=device) + + transform = transforms.RandomHorizontalFlip(p=0) + + 
output = transform(input) + + assert_equal(output, input) + + +class TestAffine: + _EXHAUSTIVE_TYPE_AFFINE_KWARGS = dict( + # float, int + angle=[-10.9, 18], + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + translate=[[6.3, -0.6], [1, -3], (16.6, -6.6), (-2, 4)], + # float + scale=[0.5], + # float, int, + # one-list of float, one-list of int, one-tuple of float, one-tuple of int + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + shear=[35.6, 38, [-37.7], [-23], (5.3,), (-52,), [5.4, 21.8], [-47, 51], (-11.2, 36.7), (8, -53)], + # None + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + center=[None, [1.2, 4.9], [-3, 1], (2.5, -4.7), (3, 2)], + ) + # The special case for shear makes sure we pick a value that is supported while JIT scripting + _MINIMAL_AFFINE_KWARGS = { + k: vs[0] if k != "shear" else next(v for v in vs if isinstance(v, list)) + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + _CORRECTNESS_AFFINE_KWARGS = { + k: [v for v in vs if v is None or isinstance(v, float) or (isinstance(v, list) and len(v) > 1)] + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + + _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES = dict( + degrees=[30, (-15, 20)], + translate=[None, (0.5, 0.5)], + scale=[None, (0.75, 1.25)], + shear=[None, (12, 30, -17, 5), 10, (-5, 12)], + ) + _CORRECTNESS_TRANSFORM_AFFINE_RANGES = { + k: next(v for v in vs if v is not None) for k, vs in _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES.items() + } + + def _check_kernel(self, kernel, input, *args, **kwargs): + kwargs_ = self._MINIMAL_AFFINE_KWARGS.copy() + kwargs_.update(kwargs) + check_kernel(kernel, input, *args, **kwargs_) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], + shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + if param == "fill": + value = adapt_fill(value, dtype=dtype) + self._check_kernel( + F.affine_image, + make_image(dtype=dtype, device=device), + **{param: value}, + check_scripted_vs_eager=not (param in {"shear", "fill"} and isinstance(value, (int, float))), + check_cuda_vs_cpu=dict(atol=1, rtol=0) + if dtype is torch.uint8 and param == "interpolation" and value is transforms.InterpolationMode.BILINEAR + else True, + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], + shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, param, value, format, dtype, device): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + self._check_kernel( + F.affine_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + **{param: value}, + check_scripted_vs_eager=not (param == "shear" and isinstance(value, (int, float))), + ) + + 
+    @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask])
+    def test_kernel_mask(self, make_mask):
+        self._check_kernel(F.affine_mask, make_mask())
+
+    def test_kernel_video(self):
+        self._check_kernel(F.affine_video, make_video())
+
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video],
+    )
+    def test_functional(self, make_input):
+        check_functional(F.affine, make_input(), **self._MINIMAL_AFFINE_KWARGS)
+
+    @pytest.mark.parametrize(
+        ("kernel", "input_type"),
+        [
+            (F.affine_image, torch.Tensor),
+            (F._affine_image_pil, PIL.Image.Image),
+            (F.affine_image, tv_tensors.Image),
+            (F.affine_bounding_boxes, tv_tensors.BoundingBoxes),
+            (F.affine_mask, tv_tensors.Mask),
+            (F.affine_video, tv_tensors.Video),
+        ],
+    )
+    def test_functional_signature(self, kernel, input_type):
+        check_functional_kernel_signature_match(F.affine, kernel=kernel, input_type=input_type)
+
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video],
+    )
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_transform(self, make_input, device):
+        input = make_input(device=device)
+
+        check_transform(transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES), input)
+
+    @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"])
+    @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"])
+    @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"])
+    @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"])
+    @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"])
+    @pytest.mark.parametrize(
+        "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR]
+    )
+    @pytest.mark.parametrize("fill", CORRECTNESS_FILLS)
+    def test_functional_image_correctness(self, angle, translate, scale, shear, center, interpolation, fill):
+        image = make_image(dtype=torch.uint8, device="cpu")
+
+        fill = adapt_fill(fill, dtype=torch.uint8)
+
+        actual = F.affine(
+            image,
+            angle=angle,
+            translate=translate,
+            scale=scale,
+            shear=shear,
+            center=center,
+            interpolation=interpolation,
+            fill=fill,
+        )
+        expected = F.to_image(
+            F.affine(
+                F.to_pil_image(image),
+                angle=angle,
+                translate=translate,
+                scale=scale,
+                shear=shear,
+                center=center,
+                interpolation=interpolation,
+                fill=fill,
+            )
+        )
+
+        mae = (actual.float() - expected.float()).abs().mean()
+        # The tolerance is parenthesized so the assert compares mae against it for both
+        # interpolation modes instead of degenerating to a trivially true expression.
+        assert mae < (2 if interpolation is transforms.InterpolationMode.NEAREST else 8)
+
+    @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"])
+    @pytest.mark.parametrize(
+        "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR]
+    )
+    @pytest.mark.parametrize("fill", CORRECTNESS_FILLS)
+    @pytest.mark.parametrize("seed", list(range(5)))
+    def test_transform_image_correctness(self, center, interpolation, fill, seed):
+        image = make_image(dtype=torch.uint8, device="cpu")
+
+        fill = adapt_fill(fill, dtype=torch.uint8)
+
+        transform = transforms.RandomAffine(
+            **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center, interpolation=interpolation, fill=fill
+        )
+
+        torch.manual_seed(seed)
+        actual = transform(image)
+
+        torch.manual_seed(seed)
+        expected = F.to_image(transform(F.to_pil_image(image)))
+
+        mae = (actual.float() - expected.float()).abs().mean()
+        assert mae < (2 if interpolation is transforms.InterpolationMode.NEAREST else 8)
+
+    def _compute_affine_matrix(self, *, angle, translate, scale, shear, center):
+        rot = math.radians(angle)
+        cx, cy = center
+        tx, ty = translate
+        sx, sy = [math.radians(s) for s in ([shear, 0.0] if isinstance(shear, (int, float)) else shear)]
+
+        c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]])
+        t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]])
+        c_matrix_inv = np.linalg.inv(c_matrix)
+        rs_matrix = np.array(
+            [
+                [scale * math.cos(rot), -scale * math.sin(rot), 0],
+                [scale * math.sin(rot), scale * math.cos(rot), 0],
+                [0, 0, 1],
+            ]
+        )
+        shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]])
+        shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]])
+        rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix))
+        true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv)))
+        return true_matrix[:2, :]
+
+    def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate, scale, shear, center):
+        if center is None:
+            center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]]
+
+        return reference_affine_bounding_boxes_helper(
+            bounding_boxes,
+            affine_matrix=self._compute_affine_matrix(
+                angle=angle, translate=translate, scale=scale, shear=shear, center=center
+            ),
+        )
+
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"])
+    @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"])
+    @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"])
+    @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"])
+    @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"])
+    def test_functional_bounding_boxes_correctness(self, format, angle, translate, scale, shear, center):
+        bounding_boxes = make_bounding_boxes(format=format)
+
+        actual = F.affine(
+            bounding_boxes,
+            angle=angle,
+            translate=translate,
+            scale=scale,
+            shear=shear,
+            center=center,
+        )
+        expected = self._reference_affine_bounding_boxes(
+            bounding_boxes,
+            angle=angle,
+            translate=translate,
+            scale=scale,
+            shear=shear,
+            center=center,
+        )
+
+        torch.testing.assert_close(actual, expected)
+
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"])
+    @pytest.mark.parametrize("seed", list(range(5)))
+    def test_transform_bounding_boxes_correctness(self, format, center, seed):
+        bounding_boxes = make_bounding_boxes(format=format)
+
+        transform = transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center)
+
+        torch.manual_seed(seed)
+        params = transform._get_params([bounding_boxes])
+
+        torch.manual_seed(seed)
+        actual = transform(bounding_boxes)
+
+        expected = self._reference_affine_bounding_boxes(bounding_boxes, **params, center=center)
+
+        torch.testing.assert_close(actual, expected)
+
+    @pytest.mark.parametrize("degrees", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["degrees"])
+    @pytest.mark.parametrize("translate", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["translate"])
+    @pytest.mark.parametrize("scale", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["scale"])
+    @pytest.mark.parametrize("shear", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["shear"])
+    @pytest.mark.parametrize("seed", list(range(10)))
+    def test_transform_get_params_bounds(self, degrees, translate, scale, shear, seed):
+        image = make_image()
+        height, width = 
F.get_size(image) + + transform = transforms.RandomAffine(degrees=degrees, translate=translate, scale=scale, shear=shear) + + torch.manual_seed(seed) + params = transform._get_params([image]) + + if isinstance(degrees, (int, float)): + assert -degrees <= params["angle"] <= degrees + else: + assert degrees[0] <= params["angle"] <= degrees[1] + + if translate is not None: + width_max = int(round(translate[0] * width)) + height_max = int(round(translate[1] * height)) + assert -width_max <= params["translate"][0] <= width_max + assert -height_max <= params["translate"][1] <= height_max + else: + assert params["translate"] == (0, 0) + + if scale is not None: + assert scale[0] <= params["scale"] <= scale[1] + else: + assert params["scale"] == 1.0 + + if shear is not None: + if isinstance(shear, (int, float)): + assert -shear <= params["shear"][0] <= shear + assert params["shear"][1] == 0.0 + elif len(shear) == 2: + assert shear[0] <= params["shear"][0] <= shear[1] + assert params["shear"][1] == 0.0 + elif len(shear) == 4: + assert shear[0] <= params["shear"][0] <= shear[1] + assert shear[2] <= params["shear"][1] <= shear[3] + else: + assert params["shear"] == (0, 0) + + @pytest.mark.parametrize("param", ["degrees", "translate", "scale", "shear", "center"]) + @pytest.mark.parametrize("value", [0, [0], [0, 0, 0]]) + def test_transform_sequence_len_errors(self, param, value): + if param in {"degrees", "shear"} and not isinstance(value, list): + return + + kwargs = {param: value} + if param != "degrees": + kwargs["degrees"] = 0 + + with pytest.raises( + ValueError if isinstance(value, list) else TypeError, match=f"{param} should be a sequence of length 2" + ): + transforms.RandomAffine(**kwargs) + + def test_transform_negative_degrees_error(self): + with pytest.raises(ValueError, match="If degrees is a single number, it must be positive"): + transforms.RandomAffine(degrees=-1) + + @pytest.mark.parametrize("translate", [[-1, 0], [2, 0], [-1, 2]]) + def test_transform_translate_range_error(self, translate): + with pytest.raises(ValueError, match="translation values should be between 0 and 1"): + transforms.RandomAffine(degrees=0, translate=translate) + + @pytest.mark.parametrize("scale", [[-1, 0], [0, -1], [-1, -1]]) + def test_transform_scale_range_error(self, scale): + with pytest.raises(ValueError, match="scale values should be positive"): + transforms.RandomAffine(degrees=0, scale=scale) + + def test_transform_negative_shear_error(self): + with pytest.raises(ValueError, match="If shear is a single number, it must be positive"): + transforms.RandomAffine(degrees=0, shear=-1) + + def test_transform_unknown_fill_error(self): + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomAffine(degrees=0, fill="fill") + + +class TestVerticalFlip: + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.vertical_flip_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + check_kernel( + F.vertical_flip_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + ) + + 
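+    # Editor's note (hedged): _reference_vertical_flip_bounding_boxes below uses the affine matrix
+    #     [[1, 0, 0], [0, -1, H]]   with H = canvas height,
+    # i.e. a vertical flip maps a point (x, y) to (x, H - y). It mirrors the horizontal flip
+    # reference earlier in this file, whose matrix [[-1, 0, W], [0, 1, 0]] maps x to W - x.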
@pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + check_kernel(F.vertical_flip_mask, make_mask()) + + def test_kernel_video(self): + check_kernel(F.vertical_flip_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.vertical_flip, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.vertical_flip_image, torch.Tensor), + (F._vertical_flip_image_pil, PIL.Image.Image), + (F.vertical_flip_image, tv_tensors.Image), + (F.vertical_flip_bounding_boxes, tv_tensors.BoundingBoxes), + (F.vertical_flip_mask, tv_tensors.Mask), + (F.vertical_flip_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.vertical_flip, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform(transforms.RandomVerticalFlip(p=1), make_input(device=device)) + + @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) + def test_image_correctness(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.vertical_flip(F.to_pil_image(image))) + + torch.testing.assert_close(actual, expected) + + def _reference_vertical_flip_bounding_boxes(self, bounding_boxes): + affine_matrix = np.array( + [ + [1, 0, 0], + [0, -1, bounding_boxes.canvas_size[0]], + ], + ) + + return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) + def test_bounding_boxes_correctness(self, format, fn): + bounding_boxes = make_bounding_boxes(format=format) + + actual = fn(bounding_boxes) + expected = self._reference_vertical_flip_bounding_boxes(bounding_boxes) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_noop(self, make_input, device): + input = make_input(device=device) + + transform = transforms.RandomVerticalFlip(p=0) + + output = transform(input) + + assert_equal(output, input) + + +class TestRotate: + _EXHAUSTIVE_TYPE_AFFINE_KWARGS = dict( + # float, int + angle=[-10.9, 18], + # None + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + center=[None, [1.2, 4.9], [-3, 1], (2.5, -4.7), (3, 2)], + ) + _MINIMAL_AFFINE_KWARGS = {k: vs[0] for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items()} + _CORRECTNESS_AFFINE_KWARGS = { + k: [v for v in vs if v is None or isinstance(v, float) or isinstance(v, list)] + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + + _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES = dict( + degrees=[30, (-15, 20)], + ) + _CORRECTNESS_TRANSFORM_AFFINE_RANGES = {k: vs[0] for k, vs in _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES.items()} + 
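+    # Editor's note (hedged reading of the reference helpers below): with a = cos(angle) and
+    # b = sin(angle), _reference_rotate_bounding_boxes applies
+    #     x' = cx + a * (x - cx) + b * (y - cy)
+    #     y' = cy - b * (x - cx) + a * (y - cy)
+    # i.e. a rotation about the given (default: canvas) center; with expand=True the result is
+    # then shifted into the enlarged canvas before clamping.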
+ @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + expand=[False, True], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + kwargs = {param: value} + if param != "angle": + kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] + check_kernel( + F.rotate_image, + make_image(dtype=dtype, device=device), + **kwargs, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + expand=[False, True], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, param, value, format, dtype, device): + kwargs = {param: value} + if param != "angle": + kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] + + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + check_kernel( + F.rotate_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + **kwargs, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + check_kernel(F.rotate_mask, make_mask(), **self._MINIMAL_AFFINE_KWARGS) + + def test_kernel_video(self): + check_kernel(F.rotate_video, make_video(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.rotate, make_input(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.rotate_image, torch.Tensor), + (F._rotate_image_pil, PIL.Image.Image), + (F.rotate_image, tv_tensors.Image), + (F.rotate_bounding_boxes, tv_tensors.BoundingBoxes), + (F.rotate_mask, tv_tensors.Mask), + (F.rotate_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.rotate, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform( + transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES), make_input(device=device) + ) + + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + def test_functional_image_correctness(self, angle, center, interpolation, expand, fill): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + actual = F.rotate(image, 
angle=angle, center=center, interpolation=interpolation, expand=expand, fill=fill)
+        expected = F.to_image(
+            F.rotate(
+                F.to_pil_image(image), angle=angle, center=center, interpolation=interpolation, expand=expand, fill=fill
+            )
+        )
+
+        mae = (actual.float() - expected.float()).abs().mean()
+        # Parenthesized so the assert compares mae against the tolerance for both interpolation modes.
+        assert mae < (1 if interpolation is transforms.InterpolationMode.NEAREST else 6)
+
+    @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"])
+    @pytest.mark.parametrize(
+        "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR]
+    )
+    @pytest.mark.parametrize("expand", [False, True])
+    @pytest.mark.parametrize("fill", CORRECTNESS_FILLS)
+    @pytest.mark.parametrize("seed", list(range(5)))
+    def test_transform_image_correctness(self, center, interpolation, expand, fill, seed):
+        image = make_image(dtype=torch.uint8, device="cpu")
+
+        fill = adapt_fill(fill, dtype=torch.uint8)
+
+        transform = transforms.RandomRotation(
+            **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES,
+            center=center,
+            interpolation=interpolation,
+            expand=expand,
+            fill=fill,
+        )
+
+        torch.manual_seed(seed)
+        actual = transform(image)
+
+        torch.manual_seed(seed)
+        expected = F.to_image(transform(F.to_pil_image(image)))
+
+        mae = (actual.float() - expected.float()).abs().mean()
+        assert mae < (1 if interpolation is transforms.InterpolationMode.NEAREST else 6)
+
+    def _compute_output_canvas_size(self, *, expand, canvas_size, affine_matrix):
+        if not expand:
+            return canvas_size, (0.0, 0.0)
+
+        input_height, input_width = canvas_size
+
+        input_image_frame = np.array(
+            [
+                [0.0, 0.0, 1.0],
+                [0.0, input_height, 1.0],
+                [input_width, input_height, 1.0],
+                [input_width, 0.0, 1.0],
+            ],
+            dtype=np.float64,
+        )
+        output_image_frame = np.matmul(input_image_frame, affine_matrix.astype(input_image_frame.dtype).T)
+
+        recenter_x = float(np.min(output_image_frame[:, 0]))
+        recenter_y = float(np.min(output_image_frame[:, 1]))
+
+        output_width = int(np.max(output_image_frame[:, 0]) - recenter_x)
+        output_height = int(np.max(output_image_frame[:, 1]) - recenter_y)
+
+        return (output_height, output_width), (recenter_x, recenter_y)
+
+    def _recenter_bounding_boxes_after_expand(self, bounding_boxes, *, recenter_xy):
+        x, y = recenter_xy
+        if bounding_boxes.format is tv_tensors.BoundingBoxFormat.XYXY:
+            translate = [x, y, x, y]
+        else:
+            translate = [x, y, 0.0, 0.0]
+        return tv_tensors.wrap(
+            (bounding_boxes.to(torch.float64) - torch.tensor(translate)).to(bounding_boxes.dtype), like=bounding_boxes
+        )
+
+    def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, center):
+        if center is None:
+            center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]]
+        cx, cy = center
+
+        a = np.cos(angle * np.pi / 180.0)
+        b = np.sin(angle * np.pi / 180.0)
+        affine_matrix = np.array(
+            [
+                [a, b, cx - cx * a - b * cy],
+                [-b, a, cy + cx * b - a * cy],
+            ],
+        )
+
+        new_canvas_size, recenter_xy = self._compute_output_canvas_size(
+            expand=expand, canvas_size=bounding_boxes.canvas_size, affine_matrix=affine_matrix
+        )
+
+        output = reference_affine_bounding_boxes_helper(
+            bounding_boxes,
+            affine_matrix=affine_matrix,
+            new_canvas_size=new_canvas_size,
+            clamp=False,
+        )
+
+        return F.clamp_bounding_boxes(self._recenter_bounding_boxes_after_expand(output, recenter_xy=recenter_xy)).to(
+            bounding_boxes
+        )
+
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"])
+    @pytest.mark.parametrize("expand", [False, True])
@pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + def test_functional_bounding_boxes_correctness(self, format, angle, expand, center): + bounding_boxes = make_bounding_boxes(format=format) + + actual = F.rotate(bounding_boxes, angle=angle, expand=expand, center=center) + expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center) + + torch.testing.assert_close(actual, expected) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, format, expand, center, seed): + bounding_boxes = make_bounding_boxes(format=format) + + transform = transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) + + torch.manual_seed(seed) + params = transform._get_params([bounding_boxes]) + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_rotate_bounding_boxes(bounding_boxes, **params, expand=expand, center=center) + + torch.testing.assert_close(actual, expected) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + + @pytest.mark.parametrize("degrees", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["degrees"]) + @pytest.mark.parametrize("seed", list(range(10))) + def test_transform_get_params_bounds(self, degrees, seed): + transform = transforms.RandomRotation(degrees=degrees) + + torch.manual_seed(seed) + params = transform._get_params([]) + + if isinstance(degrees, (int, float)): + assert -degrees <= params["angle"] <= degrees + else: + assert degrees[0] <= params["angle"] <= degrees[1] + + @pytest.mark.parametrize("param", ["degrees", "center"]) + @pytest.mark.parametrize("value", [0, [0], [0, 0, 0]]) + def test_transform_sequence_len_errors(self, param, value): + if param == "degrees" and not isinstance(value, list): + return + + kwargs = {param: value} + if param != "degrees": + kwargs["degrees"] = 0 + + with pytest.raises( + ValueError if isinstance(value, list) else TypeError, match=f"{param} should be a sequence of length 2" + ): + transforms.RandomRotation(**kwargs) + + def test_transform_negative_degrees_error(self): + with pytest.raises(ValueError, match="If degrees is a single number, it must be positive"): + transforms.RandomAffine(degrees=-1) + + def test_transform_unknown_fill_error(self): + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomAffine(degrees=0, fill="fill") + + +class TestCompose: + class BuiltinTransform(transforms.Transform): + def _transform(self, inpt, params): + return inpt + + class PackedInputTransform(nn.Module): + def forward(self, sample): + assert len(sample) == 2 + return sample + + class UnpackedInputTransform(nn.Module): + def forward(self, image, label): + return image, label + + @pytest.mark.parametrize( + "transform_clss", + [ + [BuiltinTransform], + [PackedInputTransform], + [UnpackedInputTransform], + [BuiltinTransform, BuiltinTransform], + [PackedInputTransform, PackedInputTransform], + [UnpackedInputTransform, UnpackedInputTransform], + [BuiltinTransform, PackedInputTransform, BuiltinTransform], + [BuiltinTransform, UnpackedInputTransform, BuiltinTransform], + 
[PackedInputTransform, BuiltinTransform, PackedInputTransform], + [UnpackedInputTransform, BuiltinTransform, UnpackedInputTransform], + ], + ) + @pytest.mark.parametrize("unpack", [True, False]) + def test_packed_unpacked(self, transform_clss, unpack): + needs_packed_inputs = any(issubclass(cls, self.PackedInputTransform) for cls in transform_clss) + needs_unpacked_inputs = any(issubclass(cls, self.UnpackedInputTransform) for cls in transform_clss) + assert not (needs_packed_inputs and needs_unpacked_inputs) + + transform = transforms.Compose([cls() for cls in transform_clss]) + + image = make_image() + label = 3 + packed_input = (image, label) + + def call_transform(): + if unpack: + return transform(*packed_input) + else: + return transform(packed_input) + + if needs_unpacked_inputs and not unpack: + with pytest.raises(TypeError, match="missing 1 required positional argument"): + call_transform() + elif needs_packed_inputs and unpack: + with pytest.raises(TypeError, match="takes 2 positional arguments but 3 were given"): + call_transform() + else: + output = call_transform() + + assert isinstance(output, tuple) and len(output) == 2 + assert output[0] is image + assert output[1] is label + + +class TestToDtype: + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.to_dtype_image, make_image_tensor), + (F.to_dtype_image, make_image), + (F.to_dtype_video, make_video), + ], + ) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_kernel(self, kernel, make_input, input_dtype, output_dtype, device, scale): + check_kernel( + kernel, + make_input(dtype=input_dtype, device=device), + expect_same_dtype=input_dtype is output_dtype, + dtype=output_dtype, + scale=scale, + ) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_functional(self, make_input, input_dtype, output_dtype, device, scale): + check_functional( + F.to_dtype, + make_input(dtype=input_dtype, device=device), + dtype=output_dtype, + scale=scale, + ) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + @pytest.mark.parametrize("as_dict", (True, False)) + def test_transform(self, make_input, input_dtype, output_dtype, device, scale, as_dict): + input = make_input(dtype=input_dtype, device=device) + if as_dict: + output_dtype = {type(input): output_dtype} + check_transform(transforms.ToDtype(dtype=output_dtype, scale=scale), input) + + def reference_convert_dtype_image_tensor(self, image, dtype=torch.float, scale=False): + input_dtype = image.dtype + output_dtype = dtype + + if not scale: + return image.to(dtype) + + if output_dtype == input_dtype: + return image + + def fn(value): + if 
input_dtype.is_floating_point: + if output_dtype.is_floating_point: + return value + else: + return round(decimal.Decimal(value) * torch.iinfo(output_dtype).max) + else: + input_max_value = torch.iinfo(input_dtype).max + + if output_dtype.is_floating_point: + return float(decimal.Decimal(value) / input_max_value) + else: + output_max_value = torch.iinfo(output_dtype).max + + if input_max_value > output_max_value: + factor = (input_max_value + 1) // (output_max_value + 1) + return value / factor + else: + factor = (output_max_value + 1) // (input_max_value + 1) + return value * factor + + return torch.tensor(tree_map(fn, image.tolist()), dtype=dtype, device=image.device) + + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_image_correctness(self, input_dtype, output_dtype, device, scale): + if input_dtype.is_floating_point and output_dtype == torch.int64: + pytest.xfail("float to int64 conversion is not supported") + + input = make_image(dtype=input_dtype, device=device) + + out = F.to_dtype(input, dtype=output_dtype, scale=scale) + expected = self.reference_convert_dtype_image_tensor(input, dtype=output_dtype, scale=scale) + + if input_dtype.is_floating_point and not output_dtype.is_floating_point and scale: + torch.testing.assert_close(out, expected, atol=1, rtol=0) + else: + torch.testing.assert_close(out, expected) + + def was_scaled(self, inpt): + # this assumes the target dtype is float + return inpt.max() <= 1 + + def make_inpt_with_bbox_and_mask(self, make_input): + H, W = 10, 10 + inpt_dtype = torch.uint8 + bbox_dtype = torch.float32 + mask_dtype = torch.bool + sample = { + "inpt": make_input(size=(H, W), dtype=inpt_dtype), + "bbox": make_bounding_boxes(canvas_size=(H, W), dtype=bbox_dtype), + "mask": make_detection_mask(size=(H, W), dtype=mask_dtype), + } + + return sample, inpt_dtype, bbox_dtype, mask_dtype + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + @pytest.mark.parametrize("scale", (True, False)) + def test_dtype_not_a_dict(self, make_input, scale): + # assert only inpt gets transformed when dtype isn't a dict + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype(dtype=torch.float32, scale=scale)(sample) + + assert out["inpt"].dtype != inpt_dtype + assert out["inpt"].dtype == torch.float32 + if scale: + assert self.was_scaled(out["inpt"]) + else: + assert not self.was_scaled(out["inpt"]) + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype == mask_dtype + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_others_catch_all_and_none(self, make_input): + # make sure "others" works as a catch-all and that None means no conversion + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype(dtype={tv_tensors.Mask: torch.int64, "others": None})(sample) + assert out["inpt"].dtype == inpt_dtype + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype != mask_dtype + assert out["mask"].dtype == torch.int64 + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_typical_use_case(self, make_input): + # Typical use-case: want to convert dtype and scale for inpt and just 
dtype for masks. + # This just makes sure we now have a decent API for this + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype( + dtype={type(sample["inpt"]): torch.float32, tv_tensors.Mask: torch.int64, "others": None}, scale=True + )(sample) + assert out["inpt"].dtype != inpt_dtype + assert out["inpt"].dtype == torch.float32 + assert self.was_scaled(out["inpt"]) + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype != mask_dtype + assert out["mask"].dtype == torch.int64 + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_errors_warnings(self, make_input): + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + + with pytest.raises(ValueError, match="No dtype was specified for"): + out = transforms.ToDtype(dtype={tv_tensors.Mask: torch.float32})(sample) + with pytest.warns(UserWarning, match=re.escape("plain `torch.Tensor` will *not* be transformed")): + transforms.ToDtype(dtype={torch.Tensor: torch.float32, tv_tensors.Image: torch.float32}) + with pytest.warns(UserWarning, match="no scaling will be done"): + out = transforms.ToDtype(dtype={"others": None}, scale=True)(sample) + assert out["inpt"].dtype == inpt_dtype + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype == mask_dtype + + +class TestAdjustBrightness: + _CORRECTNESS_BRIGHTNESS_FACTORS = [0.5, 0.0, 1.0, 5.0] + _DEFAULT_BRIGHTNESS_FACTOR = _CORRECTNESS_BRIGHTNESS_FACTORS[0] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.adjust_brightness_image, make_image), + (F.adjust_brightness_video, make_video), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, kernel, make_input, dtype, device): + check_kernel(kernel, make_input(dtype=dtype, device=device), brightness_factor=self._DEFAULT_BRIGHTNESS_FACTOR) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_brightness, make_input(), brightness_factor=self._DEFAULT_BRIGHTNESS_FACTOR) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_brightness_image, torch.Tensor), + (F._adjust_brightness_image_pil, PIL.Image.Image), + (F.adjust_brightness_image, tv_tensors.Image), + (F.adjust_brightness_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_brightness, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("brightness_factor", _CORRECTNESS_BRIGHTNESS_FACTORS) + def test_image_correctness(self, brightness_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_brightness(image, brightness_factor=brightness_factor) + expected = F.to_image(F.adjust_brightness(F.to_pil_image(image), brightness_factor=brightness_factor)) + + torch.testing.assert_close(actual, expected) + + +class TestCutMixMixUp: + class DummyDataset: + def __init__(self, size, num_classes): + self.size = size + self.num_classes = num_classes + assert size < num_classes + + def __getitem__(self, idx): + img = torch.rand(3, 100, 100) + label = idx # This ensures all labels in a batch are unique and makes testing easier + return img, label + + def __len__(self): + return self.size + + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + 
def test_supported_input_structure(self, T): + + batch_size = 32 + num_classes = 100 + + dataset = self.DummyDataset(size=batch_size, num_classes=num_classes) + + cutmix_mixup = T(num_classes=num_classes) + + dl = DataLoader(dataset, batch_size=batch_size) + + # Input sanity checks + img, target = next(iter(dl)) + input_img_size = img.shape[-3:] + assert isinstance(img, torch.Tensor) and isinstance(target, torch.Tensor) + assert target.shape == (batch_size,) + + def check_output(img, target): + assert img.shape == (batch_size, *input_img_size) + assert target.shape == (batch_size, num_classes) + torch.testing.assert_close(target.sum(axis=-1), torch.ones(batch_size)) + num_non_zero_labels = (target != 0).sum(axis=-1) + assert (num_non_zero_labels == 2).all() + + # After Dataloader, as unpacked input + img, target = next(iter(dl)) + assert target.shape == (batch_size,) + img, target = cutmix_mixup(img, target) + check_output(img, target) + + # After Dataloader, as packed input + packed_from_dl = next(iter(dl)) + assert isinstance(packed_from_dl, list) + img, target = cutmix_mixup(packed_from_dl) + check_output(img, target) + + # As collation function. We expect default_collate to be used by users. + def collate_fn_1(batch): + return cutmix_mixup(default_collate(batch)) + + def collate_fn_2(batch): + return cutmix_mixup(*default_collate(batch)) + + for collate_fn in (collate_fn_1, collate_fn_2): + dl = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn) + img, target = next(iter(dl)) + check_output(img, target) + + @needs_cuda + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + def test_cpu_vs_gpu(self, T): + num_classes = 10 + batch_size = 3 + H, W = 12, 12 + + imgs = torch.rand(batch_size, 3, H, W) + labels = torch.randint(0, num_classes, (batch_size,)) + cutmix_mixup = T(alpha=0.5, num_classes=num_classes) + + _check_kernel_cuda_vs_cpu(cutmix_mixup, imgs, labels, rtol=None, atol=None) + + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + def test_error(self, T): + + num_classes = 10 + batch_size = 9 + + imgs = torch.rand(batch_size, 3, 12, 12) + cutmix_mixup = T(alpha=0.5, num_classes=num_classes) + + for input_with_bad_type in ( + F.to_pil_image(imgs[0]), + tv_tensors.Mask(torch.rand(12, 12)), + tv_tensors.BoundingBoxes(torch.rand(2, 4), format="XYXY", canvas_size=12), + ): + with pytest.raises(ValueError, match="does not support PIL images, "): + cutmix_mixup(input_with_bad_type) + + with pytest.raises(ValueError, match="Could not infer where the labels are"): + cutmix_mixup({"img": imgs, "Nothing_else": 3}) + + with pytest.raises(ValueError, match="labels tensor should be of shape"): + # Note: the error message isn't ideal, but that's because the label heuristic found the img as the label + # It's OK, it's an edge-case. 
The important thing is that this fails loudly instead of passing silently + cutmix_mixup(imgs) + + with pytest.raises(ValueError, match="When using the default labels_getter"): + cutmix_mixup(imgs, "not_a_tensor") + + with pytest.raises(ValueError, match="labels tensor should be of shape"): + cutmix_mixup(imgs, torch.randint(0, 2, size=(2, 3))) + + with pytest.raises(ValueError, match="Expected a batched input with 4 dims"): + cutmix_mixup(imgs[None, None], torch.randint(0, num_classes, size=(batch_size,))) + + with pytest.raises(ValueError, match="does not match the batch size of the labels"): + cutmix_mixup(imgs, torch.randint(0, num_classes, size=(batch_size + 1,))) + + with pytest.raises(ValueError, match="labels tensor should be of shape"): + # The purpose of this check is more about documenting the current + # behaviour of what happens on a Compose(), rather than actually + # asserting the expected behaviour. We may support Compose() in the + # future, e.g. for 2 consecutive CutMix? + labels = torch.randint(0, num_classes, size=(batch_size,)) + transforms.Compose([cutmix_mixup, cutmix_mixup])(imgs, labels) + + +@pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT")) +@pytest.mark.parametrize("sample_type", (tuple, list, dict)) +def test_labels_getter_default_heuristic(key, sample_type): + labels = torch.arange(10) + sample = {key: labels, "another_key": "whatever"} + if sample_type is not dict: + sample = sample_type((None, sample, "whatever_again")) + assert transforms._utils._find_labels_default_heuristic(sample) is labels + + if key.lower() != "labels": + # If "labels" is in the dict (case-insensitive), + # it takes precedence over other keys which would otherwise be a match + d = {key: "something_else", "labels": labels} + assert transforms._utils._find_labels_default_heuristic(d) is labels + + +class TestShapeGetters: + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_dimensions_image, make_image_tensor), + (F._get_dimensions_image_pil, make_image_pil), + (F.get_dimensions_image, make_image), + (F.get_dimensions_video, make_video), + ], + ) + def test_get_dimensions(self, kernel, make_input): + size = (10, 10) + color_space, num_channels = "RGB", 3 + + input = make_input(size, color_space=color_space) + + assert kernel(input) == F.get_dimensions(input) == [num_channels, *size] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_num_channels_image, make_image_tensor), + (F._get_num_channels_image_pil, make_image_pil), + (F.get_num_channels_image, make_image), + (F.get_num_channels_video, make_video), + ], + ) + def test_get_num_channels(self, kernel, make_input): + color_space, num_channels = "RGB", 3 + + input = make_input(color_space=color_space) + + assert kernel(input) == F.get_num_channels(input) == num_channels + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_size_image, make_image_tensor), + (F._get_size_image_pil, make_image_pil), + (F.get_size_image, make_image), + (F.get_size_bounding_boxes, make_bounding_boxes), + (F.get_size_mask, make_detection_mask), + (F.get_size_mask, make_segmentation_mask), + (F.get_size_video, make_video), + ], + ) + def test_get_size(self, kernel, make_input): + size = (10, 10) + + input = make_input(size) + + assert kernel(input) == F.get_size(input) == list(size) + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_num_frames_video, make_video_tensor), + (F.get_num_frames_video, make_video), + ], + ) + def 
test_get_num_frames(self, kernel, make_input):
+        num_frames = 4
+
+        input = make_input(num_frames=num_frames)
+
+        assert kernel(input) == F.get_num_frames(input) == num_frames
+
+    @pytest.mark.parametrize(
+        ("functional", "make_input"),
+        [
+            (F.get_dimensions, make_bounding_boxes),
+            (F.get_dimensions, make_detection_mask),
+            (F.get_dimensions, make_segmentation_mask),
+            (F.get_num_channels, make_bounding_boxes),
+            (F.get_num_channels, make_detection_mask),
+            (F.get_num_channels, make_segmentation_mask),
+            (F.get_num_frames, make_image_pil),
+            (F.get_num_frames, make_image),
+            (F.get_num_frames, make_bounding_boxes),
+            (F.get_num_frames, make_detection_mask),
+            (F.get_num_frames, make_segmentation_mask),
+        ],
+    )
+    def test_unsupported_types(self, functional, make_input):
+        input = make_input()
+
+        with pytest.raises(TypeError, match=re.escape(str(type(input)))):
+            functional(input)
+
+
+class TestRegisterKernel:
+    @pytest.mark.parametrize("functional", (F.resize, "resize"))
+    def test_register_kernel(self, functional):
+        class CustomTVTensor(tv_tensors.TVTensor):
+            pass
+
+        kernel_was_called = False
+
+        @F.register_kernel(functional, CustomTVTensor)
+        def new_resize(dp, *args, **kwargs):
+            nonlocal kernel_was_called
+            kernel_was_called = True
+            return dp
+
+        t = transforms.Resize(size=(224, 224), antialias=True)
+
+        my_dp = CustomTVTensor(torch.rand(3, 10, 10))
+        out = t(my_dp)
+        assert out is my_dp
+        assert kernel_was_called
+
+        # Sanity check to make sure we didn't override the kernel of other types
+        assert t(torch.rand(3, 10, 10)).shape == (3, 224, 224)
+        assert t(tv_tensors.Image(torch.rand(3, 10, 10))).shape == (3, 224, 224)
+
+    def test_errors(self):
+        with pytest.raises(ValueError, match="Could not find functional with name"):
+            F.register_kernel("bad_name", tv_tensors.Image)
+
+        with pytest.raises(ValueError, match="Kernels can only be registered on functionals"):
+            F.register_kernel(tv_tensors.Image, F.resize)
+
+        with pytest.raises(ValueError, match="Kernels can only be registered for subclasses"):
+            F.register_kernel(F.resize, object)
+
+        with pytest.raises(ValueError, match="cannot be registered for the builtin tv_tensor classes"):
+            F.register_kernel(F.resize, tv_tensors.Image)(F.resize_image)
+
+        class CustomTVTensor(tv_tensors.TVTensor):
+            pass
+
+        def resize_custom_tv_tensor():
+            pass
+
+        F.register_kernel(F.resize, CustomTVTensor)(resize_custom_tv_tensor)
+
+        with pytest.raises(ValueError, match="already has a kernel registered for type"):
+            F.register_kernel(F.resize, CustomTVTensor)(resize_custom_tv_tensor)
+
+
+class TestGetKernel:
+    # We are using F.resize as functional and the kernels below as proxy. Any other functional / kernels combination
+    # would also be fine
+    KERNELS = {
+        torch.Tensor: F.resize_image,
+        PIL.Image.Image: F._resize_image_pil,
+        tv_tensors.Image: F.resize_image,
+        tv_tensors.BoundingBoxes: F.resize_bounding_boxes,
+        tv_tensors.Mask: F.resize_mask,
+        tv_tensors.Video: F.resize_video,
+    }
+
+    @pytest.mark.parametrize("input_type", [str, int, object])
+    def test_unsupported_types(self, input_type):
+        with pytest.raises(TypeError, match="supports inputs of type"):
+            _get_kernel(F.resize, input_type)
+
+    def test_exact_match(self):
+        # We cannot use F.resize together with the self.KERNELS mapping directly here, since this is only the
+        # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional
+        # here, register the kernels without wrapper, and check the exact matching afterwards.
+ def resize_with_pure_kernels(): + pass + + for input_type, kernel in self.KERNELS.items(): + _register_kernel_internal(resize_with_pure_kernels, input_type, tv_tensor_wrapper=False)(kernel) + + assert _get_kernel(resize_with_pure_kernels, input_type) is kernel + + def test_builtin_tv_tensor_subclass(self): + # We cannot use F.resize together with self.KERNELS mapping here directly here, since this is only the + # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional + # here, register the kernels without wrapper, and check if subclasses of our builtin tv_tensors get dispatched + # to the kernel of the corresponding superclass + def resize_with_pure_kernels(): + pass + + class MyImage(tv_tensors.Image): + pass + + class MyBoundingBoxes(tv_tensors.BoundingBoxes): + pass + + class MyMask(tv_tensors.Mask): + pass + + class MyVideo(tv_tensors.Video): + pass + + for custom_tv_tensor_subclass in [ + MyImage, + MyBoundingBoxes, + MyMask, + MyVideo, + ]: + builtin_tv_tensor_class = custom_tv_tensor_subclass.__mro__[1] + builtin_tv_tensor_kernel = self.KERNELS[builtin_tv_tensor_class] + _register_kernel_internal(resize_with_pure_kernels, builtin_tv_tensor_class, tv_tensor_wrapper=False)( + builtin_tv_tensor_kernel + ) + + assert _get_kernel(resize_with_pure_kernels, custom_tv_tensor_subclass) is builtin_tv_tensor_kernel + + def test_tv_tensor_subclass(self): + class MyTVTensor(tv_tensors.TVTensor): + pass + + with pytest.raises(TypeError, match="supports inputs of type"): + _get_kernel(F.resize, MyTVTensor) + + def resize_my_tv_tensor(): + pass + + _register_kernel_internal(F.resize, MyTVTensor, tv_tensor_wrapper=False)(resize_my_tv_tensor) + + assert _get_kernel(F.resize, MyTVTensor) is resize_my_tv_tensor + + def test_pil_image_subclass(self): + opened_image = PIL.Image.open(Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") + loaded_image = opened_image.convert("RGB") + + # check the assumptions + assert isinstance(opened_image, PIL.Image.Image) + assert type(opened_image) is not PIL.Image.Image + + assert type(loaded_image) is PIL.Image.Image + + size = [17, 11] + for image in [opened_image, loaded_image]: + kernel = _get_kernel(F.resize, type(image)) + + output = kernel(image, size=size) + + assert F.get_size(output) == size + + +class TestPermuteChannels: + _DEFAULT_PERMUTATION = [2, 0, 1] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.permute_channels_image, make_image_tensor), + # FIXME + # check_kernel does not support PIL kernel, but it should + (F.permute_channels_image, make_image), + (F.permute_channels_video, make_video), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, kernel, make_input, dtype, device): + check_kernel(kernel, make_input(dtype=dtype, device=device), permutation=self._DEFAULT_PERMUTATION) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.permute_channels, make_input(), permutation=self._DEFAULT_PERMUTATION) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.permute_channels_image, torch.Tensor), + (F._permute_channels_image_pil, PIL.Image.Image), + (F.permute_channels_image, tv_tensors.Image), + (F.permute_channels_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + 
check_functional_kernel_signature_match(F.permute_channels, kernel=kernel, input_type=input_type) + + def reference_image_correctness(self, image, permutation): + channel_images = image.split(1, dim=-3) + permuted_channel_images = [channel_images[channel_idx] for channel_idx in permutation] + return tv_tensors.Image(torch.concat(permuted_channel_images, dim=-3)) + + @pytest.mark.parametrize("permutation", [[2, 0, 1], [1, 2, 0], [2, 0, 1], [0, 1, 2]]) + @pytest.mark.parametrize("batch_dims", [(), (2,), (2, 1)]) + def test_image_correctness(self, permutation, batch_dims): + image = make_image(batch_dims=batch_dims) + + actual = F.permute_channels(image, permutation=permutation) + expected = self.reference_image_correctness(image, permutation=permutation) + + torch.testing.assert_close(actual, expected) + + +class TestElastic: + def _make_displacement(self, inpt): + return torch.rand( + 1, + *F.get_size(inpt), + 2, + dtype=torch.float32, + device=inpt.device if isinstance(inpt, torch.Tensor) else "cpu", + ) + + @param_value_parametrization( + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + image = make_image_tensor(dtype=dtype, device=device) + + check_kernel( + F.elastic_image, + image, + displacement=self._make_displacement(image), + **{param: value}, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + check_kernel( + F.elastic_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + displacement=self._make_displacement(bounding_boxes), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + mask = make_mask() + check_kernel(F.elastic_mask, mask, displacement=self._make_displacement(mask)) + + def test_kernel_video(self): + video = make_video() + check_kernel(F.elastic_video, video, displacement=self._make_displacement(video)) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + input = make_input() + check_functional(F.elastic, input, displacement=self._make_displacement(input)) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.elastic_image, torch.Tensor), + (F._elastic_image_pil, PIL.Image.Image), + (F.elastic_image, tv_tensors.Image), + (F.elastic_bounding_boxes, tv_tensors.BoundingBoxes), + (F.elastic_mask, tv_tensors.Mask), + (F.elastic_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.elastic, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_displacement_error(self, make_input): + input = make_input() + + with 
pytest.raises(TypeError, match="displacement should be a Tensor"): + F.elastic(input, displacement=None) + + with pytest.raises(ValueError, match="displacement shape should be"): + F.elastic(input, displacement=torch.rand(F.get_size(input))) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image + @pytest.mark.parametrize("size", [(163, 163), (72, 333), (313, 95)]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, size, device): + check_transform( + transforms.ElasticTransform(), + make_input(size, device=device), + # We updated gaussian blur kernel generation with a faster and numerically more stable version + check_v1_compatibility=dict(rtol=0, atol=1), + ) + + +class TestToPureTensor: + def test_correctness(self): + input = { + "img": make_image(), + "img_tensor": make_image_tensor(), + "img_pil": make_image_pil(), + "mask": make_detection_mask(), + "video": make_video(), + "bbox": make_bounding_boxes(), + "str": "str", + } + + out = transforms.ToPureTensor()(input) + + for input_value, out_value in zip(input.values(), out.values()): + if isinstance(input_value, tv_tensors.TVTensor): + assert isinstance(out_value, torch.Tensor) and not isinstance(out_value, tv_tensors.TVTensor) + else: + assert isinstance(out_value, type(input_value)) + + +class TestCrop: + INPUT_SIZE = (21, 11) + + CORRECTNESS_CROP_KWARGS = [ + # center + dict(top=5, left=5, height=10, width=5), + # larger than input, i.e. pad + dict(top=-5, left=-5, height=30, width=20), + # sides: left, right, top, bottom + dict(top=-5, left=-5, height=30, width=10), + dict(top=-5, left=5, height=30, width=10), + dict(top=-5, left=-5, height=20, width=20), + dict(top=5, left=-5, height=20, width=20), + # corners: top-left, top-right, bottom-left, bottom-right + dict(top=-5, left=-5, height=20, width=10), + dict(top=-5, left=5, height=20, width=10), + dict(top=5, left=-5, height=20, width=10), + dict(top=5, left=5, height=20, width=10), + ] + MINIMAL_CROP_KWARGS = CORRECTNESS_CROP_KWARGS[0] + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, kwargs, dtype, device): + check_kernel(F.crop_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **kwargs) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_box(self, kwargs, format, dtype, device): + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + check_kernel(F.crop_bounding_boxes, bounding_boxes, format=format, **kwargs) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + check_kernel(F.crop_mask, make_mask(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + def test_kernel_video(self): + check_kernel(F.crop_video, make_video(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, 
make_video], + ) + def test_functional(self, make_input): + check_functional(F.crop, make_input(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.crop_image, torch.Tensor), + (F._crop_image_pil, PIL.Image.Image), + (F.crop_image, tv_tensors.Image), + (F.crop_bounding_boxes, tv_tensors.BoundingBoxes), + (F.crop_mask, tv_tensors.Mask), + (F.crop_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.crop, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + def test_functional_image_correctness(self, kwargs): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = F.crop(image, **kwargs) + expected = F.to_image(F.crop(F.to_pil_image(image), **kwargs)) + + assert_equal(actual, expected) + + @param_value_parametrization( + size=[(10, 5), (25, 15), (25, 5), (10, 15)], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_transform(self, param, value, make_input): + input = make_input(self.INPUT_SIZE) + + if param == "fill": + if isinstance(input, tv_tensors.Mask) and isinstance(value, (tuple, list)): + pytest.skip("F.pad_mask doesn't support non-scalar fill.") + + kwargs = dict( + # 1. size is required + # 2. the fill parameter only has an affect if we need padding + size=[s + 4 for s in self.INPUT_SIZE], + fill=adapt_fill(value, dtype=input.dtype if isinstance(input, torch.Tensor) else torch.uint8), + ) + else: + kwargs = {param: value} + + check_transform( + transforms.RandomCrop(**kwargs, pad_if_needed=True), + input, + check_v1_compatibility=param != "fill" or isinstance(value, (int, float)), + ) + + @pytest.mark.parametrize("padding", [1, (1, 1), (1, 1, 1, 1)]) + def test_transform_padding(self, padding): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s + 2 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, padding=padding) + + output = transform(inpt) + + assert F.get_size(output) == output_size + + @pytest.mark.parametrize("padding", [None, 1, (1, 1), (1, 1, 1, 1)]) + def test_transform_insufficient_padding(self, padding): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s + 3 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, padding=padding) + + with pytest.raises(ValueError, match="larger than (padded )?input image size"): + transform(inpt) + + def test_transform_pad_if_needed(self): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s * 2 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, pad_if_needed=True) + + output = transform(inpt) + + assert F.get_size(output) == output_size + + @param_value_parametrization( + size=[(10, 5), (25, 15), (25, 5), (10, 15)], + fill=CORRECTNESS_FILLS, + padding_mode=["constant", "edge", "reflect", "symmetric"], + ) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, param, value, seed): + kwargs = {param: value} + if param != "size": + # 1. size is required + # 2. 
the fill / padding_mode parameters only have an affect if we need padding + kwargs["size"] = [s + 4 for s in self.INPUT_SIZE] + if param == "fill": + kwargs["fill"] = adapt_fill(kwargs["fill"], dtype=torch.uint8) + + transform = transforms.RandomCrop(pad_if_needed=True, **kwargs) + + image = make_image(self.INPUT_SIZE) + + with freeze_rng_state(): + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + assert_equal(actual, expected) + + def _reference_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width): + affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + ], + ) + return reference_affine_bounding_boxes_helper( + bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width) + ) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_bounding_box_correctness(self, kwargs, format, dtype, device): + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + + actual = F.crop(bounding_boxes, **kwargs) + expected = self._reference_crop_bounding_boxes(bounding_boxes, **kwargs) + + assert_equal(actual, expected, atol=1, rtol=0) + assert_equal(F.get_size(actual), F.get_size(expected)) + + @pytest.mark.parametrize("output_size", [(17, 11), (11, 17), (11, 11)]) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, output_size, format, dtype, device, seed): + input_size = [s * 2 for s in output_size] + bounding_boxes = make_bounding_boxes(input_size, format=format, dtype=dtype, device=device) + + transform = transforms.RandomCrop(output_size) + + with freeze_rng_state(): + torch.manual_seed(seed) + params = transform._get_params([bounding_boxes]) + assert not params.pop("needs_pad") + del params["padding"] + assert params.pop("needs_crop") + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_crop_bounding_boxes(bounding_boxes, **params) + + assert_equal(actual, expected) + assert_equal(F.get_size(actual), F.get_size(expected)) + + def test_errors(self): + with pytest.raises(ValueError, match="Please provide only two dimensions"): + transforms.RandomCrop([10, 12, 14]) + + with pytest.raises(TypeError, match="Got inappropriate padding arg"): + transforms.RandomCrop([10, 12], padding="abc") + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomCrop([10, 12], padding=1, fill="abc") + + with pytest.raises(ValueError, match="Padding mode should be either"): + transforms.RandomCrop([10, 12], padding=1, padding_mode="abc") + + +class TestErase: + INPUT_SIZE = (17, 11) + FUNCTIONAL_KWARGS = dict( + zip("ijhwv", [2, 2, 10, 8, torch.tensor(0.0, dtype=torch.float32, device="cpu").reshape(-1, 1, 1)]) + ) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): 
+ check_kernel(F.erase_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image_inplace(self, dtype, device): + input = make_image(self.INPUT_SIZE, dtype=dtype, device=device) + input_version = input._version + + output_out_of_place = F.erase_image(input, **self.FUNCTIONAL_KWARGS) + assert output_out_of_place.data_ptr() != input.data_ptr() + assert output_out_of_place is not input + + output_inplace = F.erase_image(input, **self.FUNCTIONAL_KWARGS, inplace=True) + assert output_inplace.data_ptr() == input.data_ptr() + assert output_inplace._version > input_version + assert output_inplace is input + + assert_equal(output_inplace, output_out_of_place) + + def test_kernel_video(self): + check_kernel(F.erase_video, make_video(self.INPUT_SIZE), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_functional(self, make_input): + check_functional(F.erase, make_input(), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.erase_image, torch.Tensor), + (F._erase_image_pil, PIL.Image.Image), + (F.erase_image, tv_tensors.Image), + (F.erase_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.erase, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform(transforms.RandomErasing(p=1), make_input(device=device)) + + def _reference_erase_image(self, image, *, i, j, h, w, v): + mask = torch.zeros_like(image, dtype=torch.bool) + mask[..., i : i + h, j : j + w] = True + + # The broadcasting and type casting logic is handled automagically in the kernel through indexing + value = torch.broadcast_to(v, (*image.shape[:-2], h, w)).to(image) + + erased_image = torch.empty_like(image) + erased_image[mask] = value.flatten() + erased_image[~mask] = image[~mask] + + return erased_image + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_image_correctness(self, dtype, device): + image = make_image(dtype=dtype, device=device) + + actual = F.erase(image, **self.FUNCTIONAL_KWARGS) + expected = self._reference_erase_image(image, **self.FUNCTIONAL_KWARGS) + + assert_equal(actual, expected) + + @param_value_parametrization( + scale=[(0.1, 0.2), [0.0, 1.0]], + ratio=[(0.3, 0.7), [0.1, 5.0]], + value=[0, 0.5, (0, 1, 0), [-0.2, 0.0, 1.3], "random"], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, param, value, dtype, device, seed): + transform = transforms.RandomErasing(**{param: value}, p=1) + + image = make_image(dtype=dtype, device=device) + + with freeze_rng_state(): + torch.manual_seed(seed) + # This emulates the random apply check that happens before _get_params is called + torch.rand(1) + params = transform._get_params([image]) + + torch.manual_seed(seed) + actual = transform(image) + + expected = self._reference_erase_image(image, **params) + + 
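+ # Since the transform was re-seeded with the same seed, it samples the same erase box as
+ # `params` above, so the reference computed from those params should match the output exactly.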
assert_equal(actual, expected) + + def test_transform_errors(self): + with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"): + transforms.RandomErasing(value={}) + + with pytest.raises(ValueError, match="If value is str, it should be 'random'"): + transforms.RandomErasing(value="abc") + + with pytest.raises(TypeError, match="Scale should be a sequence"): + transforms.RandomErasing(scale=123) + + with pytest.raises(TypeError, match="Ratio should be a sequence"): + transforms.RandomErasing(ratio=123) + + with pytest.raises(ValueError, match="Scale should be between 0 and 1"): + transforms.RandomErasing(scale=[-1, 2]) + + transform = transforms.RandomErasing(value=[1, 2, 3, 4]) + + with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"): + transform._get_params([make_image()]) + + @pytest.mark.parametrize("make_input", [make_bounding_boxes, make_detection_mask]) + def test_transform_passthrough(self, make_input): + transform = transforms.RandomErasing(p=1) + + input = make_input(self.INPUT_SIZE) + + with pytest.warns(UserWarning, match="currently passing through inputs of type"): + # RandomErasing requires an image or video to be present + _, output = transform(make_image(self.INPUT_SIZE), input) + + assert output is input + + +class TestGaussianBlur: + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("sigma", [5, (0.5, 2)]) + def test_transform(self, make_input, device, sigma): + check_transform(transforms.GaussianBlur(kernel_size=3, sigma=sigma), make_input(device=device)) + + def test_assertions(self): + with pytest.raises(ValueError, match="Kernel size should be a tuple/list of two integers"): + transforms.GaussianBlur([10, 12, 14]) + + with pytest.raises(ValueError, match="Kernel size value should be an odd and positive number"): + transforms.GaussianBlur(4) + + with pytest.raises(ValueError, match="If sigma is a sequence its length should be 1 or 2. 
Got 3"): + transforms.GaussianBlur(3, sigma=[1, 2, 3]) + + with pytest.raises(ValueError, match="sigma values should be positive and of the form"): + transforms.GaussianBlur(3, sigma=-1.0) + + with pytest.raises(ValueError, match="sigma values should be positive and of the form"): + transforms.GaussianBlur(3, sigma=[2.0, 1.0]) + + with pytest.raises(TypeError, match="sigma should be a number or a sequence of numbers"): + transforms.GaussianBlur(3, sigma={}) + + @pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0], (10, 12.0), [10]]) + def test__get_params(self, sigma): + transform = transforms.GaussianBlur(3, sigma=sigma) + params = transform._get_params([]) + + if isinstance(sigma, float): + assert params["sigma"][0] == params["sigma"][1] == sigma + elif isinstance(sigma, list) and len(sigma) == 1: + assert params["sigma"][0] == params["sigma"][1] == sigma[0] + else: + assert sigma[0] <= params["sigma"][0] <= sigma[1] + assert sigma[0] <= params["sigma"][1] <= sigma[1] diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..246e31485f3c1ba29b193a565af9c7327dc08754 --- /dev/null +++ b/test/test_transforms_v2_utils.py @@ -0,0 +1,92 @@ +import PIL.Image +import pytest + +import torch + +import torchvision.transforms.v2._utils +from common_utils import DEFAULT_SIZE, make_bounding_boxes, make_detection_mask, make_image + +from torchvision import tv_tensors +from torchvision.transforms.v2._utils import has_all, has_any +from torchvision.transforms.v2.functional import to_pil_image + + +IMAGE = make_image(DEFAULT_SIZE, color_space="RGB") +BOUNDING_BOX = make_bounding_boxes(DEFAULT_SIZE, format=tv_tensors.BoundingBoxFormat.XYXY) +MASK = make_detection_mask(DEFAULT_SIZE) + + +@pytest.mark.parametrize( + ("sample", "types", "expected"), + [ + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Image,), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.BoundingBoxes,), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Mask,), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.BoundingBoxes, tv_tensors.Mask), True), + ((MASK,), (tv_tensors.Image, tv_tensors.BoundingBoxes), False), + ((BOUNDING_BOX,), (tv_tensors.Image, tv_tensors.Mask), False), + ((IMAGE,), (tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ( + (IMAGE, BOUNDING_BOX, MASK), + (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), + True, + ), + ((), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ((IMAGE, BOUNDING_BOX, MASK), (lambda obj: isinstance(obj, tv_tensors.Image),), True), + ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), + ((IMAGE, BOUNDING_BOX, MASK), (lambda _: True,), True), + ((IMAGE,), (tv_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), True), + ( + (torch.Tensor(IMAGE),), + (tv_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), + True, + ), + ( + (to_pil_image(IMAGE),), + (tv_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), + True, + ), + ], +) +def test_has_any(sample, types, expected): + assert has_any(sample, *types) is expected + + +@pytest.mark.parametrize( + ("sample", "types", "expected"), + [ + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Image,), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.BoundingBoxes,), True), + 
((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Mask,), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.BoundingBoxes, tv_tensors.Mask), True), + ( + (IMAGE, BOUNDING_BOX, MASK), + (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), + True, + ), + ((BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes), False), + ((BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.Mask), False), + ((IMAGE, MASK), (tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ( + (IMAGE, BOUNDING_BOX, MASK), + (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), + True, + ), + ((BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ((IMAGE, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ((IMAGE, BOUNDING_BOX), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ( + (IMAGE, BOUNDING_BOX, MASK), + (lambda obj: isinstance(obj, (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask)),), + True, + ), + ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), + ((IMAGE, BOUNDING_BOX, MASK), (lambda _: True,), True), + ], +) +def test_has_all(sample, types, expected): + assert has_all(sample, *types) is expected diff --git a/test/test_tv_tensors.py b/test/test_tv_tensors.py new file mode 100644 index 0000000000000000000000000000000000000000..ed75ae35ecd1cbf63ecf3a1b3b67725108d219c6 --- /dev/null +++ b/test/test_tv_tensors.py @@ -0,0 +1,320 @@ +from copy import deepcopy + +import pytest +import torch +from common_utils import assert_equal, make_bounding_boxes, make_image, make_segmentation_mask, make_video +from PIL import Image + +from torchvision import tv_tensors + + +@pytest.fixture(autouse=True) +def restore_tensor_return_type(): + # This is for security, as we should already be restoring the default manually in each test anyway + # (at least at the time of writing...) 
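+ # Being autouse, this fixture wraps every test in the module; the teardown after the yield below
+ # resets the global return type so a test that forgets to restore it cannot leak "TVTensor" mode
+ # into later tests.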
+ yield + tv_tensors.set_return_type("Tensor") + + +@pytest.mark.parametrize("data", [torch.rand(3, 32, 32), Image.new("RGB", (32, 32), color=123)]) +def test_image_instance(data): + image = tv_tensors.Image(data) + assert isinstance(image, torch.Tensor) + assert image.ndim == 3 and image.shape[0] == 3 + + +@pytest.mark.parametrize("data", [torch.randint(0, 10, size=(1, 32, 32)), Image.new("L", (32, 32), color=2)]) +def test_mask_instance(data): + mask = tv_tensors.Mask(data) + assert isinstance(mask, torch.Tensor) + assert mask.ndim == 3 and mask.shape[0] == 1 + + +@pytest.mark.parametrize("data", [torch.randint(0, 32, size=(5, 4)), [[0, 0, 5, 5], [2, 2, 7, 7]], [1, 2, 3, 4]]) +@pytest.mark.parametrize( + "format", ["XYXY", "CXCYWH", tv_tensors.BoundingBoxFormat.XYXY, tv_tensors.BoundingBoxFormat.XYWH] +) +def test_bbox_instance(data, format): + bboxes = tv_tensors.BoundingBoxes(data, format=format, canvas_size=(32, 32)) + assert isinstance(bboxes, torch.Tensor) + assert bboxes.ndim == 2 and bboxes.shape[1] == 4 + if isinstance(format, str): + format = tv_tensors.BoundingBoxFormat[(format.upper())] + assert bboxes.format == format + + +def test_bbox_dim_error(): + data_3d = [[[1, 2, 3, 4]]] + with pytest.raises(ValueError, match="Expected a 1D or 2D tensor, got 3D"): + tv_tensors.BoundingBoxes(data_3d, format="XYXY", canvas_size=(32, 32)) + + +@pytest.mark.parametrize( + ("data", "input_requires_grad", "expected_requires_grad"), + [ + ([[[0.0, 1.0], [0.0, 1.0]]], None, False), + ([[[0.0, 1.0], [0.0, 1.0]]], False, False), + ([[[0.0, 1.0], [0.0, 1.0]]], True, True), + (torch.rand(3, 16, 16, requires_grad=False), None, False), + (torch.rand(3, 16, 16, requires_grad=False), False, False), + (torch.rand(3, 16, 16, requires_grad=False), True, True), + (torch.rand(3, 16, 16, requires_grad=True), None, True), + (torch.rand(3, 16, 16, requires_grad=True), False, False), + (torch.rand(3, 16, 16, requires_grad=True), True, True), + ], +) +def test_new_requires_grad(data, input_requires_grad, expected_requires_grad): + tv_tensor = tv_tensors.Image(data, requires_grad=input_requires_grad) + assert tv_tensor.requires_grad is expected_requires_grad + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +def test_isinstance(make_input): + assert isinstance(make_input(), torch.Tensor) + + +def test_wrapping_no_copy(): + tensor = torch.rand(3, 16, 16) + image = tv_tensors.Image(tensor) + + assert image.data_ptr() == tensor.data_ptr() + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +def test_to_wrapping(make_input): + dp = make_input() + + dp_to = dp.to(torch.float64) + + assert type(dp_to) is type(dp) + assert dp_to.dtype is torch.float64 + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_to_tv_tensor_reference(make_input, return_type): + tensor = torch.rand((3, 16, 16), dtype=torch.float64) + dp = make_input() + + with tv_tensors.set_return_type(return_type): + tensor_to = tensor.to(dp) + + assert type(tensor_to) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + assert tensor_to.dtype is dp.dtype + assert type(tensor) is torch.Tensor + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def 
test_clone_wrapping(make_input, return_type): + dp = make_input() + + with tv_tensors.set_return_type(return_type): + dp_clone = dp.clone() + + assert type(dp_clone) is type(dp) + assert dp_clone.data_ptr() != dp.data_ptr() + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_requires_grad__wrapping(make_input, return_type): + dp = make_input(dtype=torch.float) + + assert not dp.requires_grad + + with tv_tensors.set_return_type(return_type): + dp_requires_grad = dp.requires_grad_(True) + + assert type(dp_requires_grad) is type(dp) + assert dp.requires_grad + assert dp_requires_grad.requires_grad + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_detach_wrapping(make_input, return_type): + dp = make_input(dtype=torch.float).requires_grad_(True) + + with tv_tensors.set_return_type(return_type): + dp_detached = dp.detach() + + assert type(dp_detached) is type(dp) + + +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_force_subclass_with_metadata(return_type): + # Sanity checks for the ops in _FORCE_TORCHFUNCTION_SUBCLASS and tv_tensors with metadata + # Largely the same as above, we additionally check that the metadata is preserved + format, canvas_size = "XYXY", (32, 32) + bbox = tv_tensors.BoundingBoxes([[0, 0, 5, 5], [2, 2, 7, 7]], format=format, canvas_size=canvas_size) + + tv_tensors.set_return_type(return_type) + bbox = bbox.clone() + if return_type == "TVTensor": + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + bbox = bbox.to(torch.float64) + if return_type == "TVTensor": + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + bbox = bbox.detach() + if return_type == "TVTensor": + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + assert not bbox.requires_grad + bbox.requires_grad_(True) + if return_type == "TVTensor": + assert bbox.format, bbox.canvas_size == (format, canvas_size) + assert bbox.requires_grad + tv_tensors.set_return_type("tensor") + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_other_op_no_wrapping(make_input, return_type): + dp = make_input() + + with tv_tensors.set_return_type(return_type): + # any operation besides the ones listed in _FORCE_TORCHFUNCTION_SUBCLASS will do here + output = dp * 2 + + assert type(output) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize( + "op", + [ + lambda t: t.numpy(), + lambda t: t.tolist(), + lambda t: t.max(dim=-1), + ], +) +def test_no_tensor_output_op_no_wrapping(make_input, op): + dp = make_input() + + output = op(dp) + + assert type(output) is not type(dp) + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_inplace_op_no_wrapping(make_input, return_type): + dp = make_input() + original_type = type(dp) + + with tv_tensors.set_return_type(return_type): + output = dp.add_(0) + + assert type(output) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + assert 
type(dp) is original_type + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +def test_wrap(make_input): + dp = make_input() + + # any operation besides the ones listed in _FORCE_TORCHFUNCTION_SUBCLASS will do here + output = dp * 2 + + dp_new = tv_tensors.wrap(output, like=dp) + + assert type(dp_new) is type(dp) + assert dp_new.data_ptr() == output.data_ptr() + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("requires_grad", [False, True]) +def test_deepcopy(make_input, requires_grad): + dp = make_input(dtype=torch.float) + + dp.requires_grad_(requires_grad) + + dp_deepcopied = deepcopy(dp) + + assert dp_deepcopied is not dp + assert dp_deepcopied.data_ptr() != dp.data_ptr() + assert_equal(dp_deepcopied, dp) + + assert type(dp_deepcopied) is type(dp) + assert dp_deepcopied.requires_grad is requires_grad + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +@pytest.mark.parametrize( + "op", + ( + lambda dp: dp + torch.rand(*dp.shape), + lambda dp: torch.rand(*dp.shape) + dp, + lambda dp: dp * torch.rand(*dp.shape), + lambda dp: torch.rand(*dp.shape) * dp, + lambda dp: dp + 3, + lambda dp: 3 + dp, + lambda dp: dp + dp, + lambda dp: dp.sum(), + lambda dp: dp.reshape(-1), + lambda dp: dp.int(), + lambda dp: torch.stack([dp, dp]), + lambda dp: torch.chunk(dp, 2)[0], + lambda dp: torch.unbind(dp)[0], + ), +) +def test_usual_operations(make_input, return_type, op): + + dp = make_input() + with tv_tensors.set_return_type(return_type): + out = op(dp) + assert type(out) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + if isinstance(dp, tv_tensors.BoundingBoxes) and return_type == "TVTensor": + assert hasattr(out, "format") + assert hasattr(out, "canvas_size") + + +def test_subclasses(): + img = make_image() + masks = make_segmentation_mask() + + with pytest.raises(TypeError, match="unsupported operand"): + img + masks + + +def test_set_return_type(): + img = make_image() + + assert type(img + 3) is torch.Tensor + + with tv_tensors.set_return_type("TVTensor"): + assert type(img + 3) is tv_tensors.Image + assert type(img + 3) is torch.Tensor + + tv_tensors.set_return_type("TVTensor") + assert type(img + 3) is tv_tensors.Image + + with tv_tensors.set_return_type("tensor"): + assert type(img + 3) is torch.Tensor + with tv_tensors.set_return_type("TVTensor"): + assert type(img + 3) is tv_tensors.Image + tv_tensors.set_return_type("tensor") + assert type(img + 3) is torch.Tensor + assert type(img + 3) is torch.Tensor + # Exiting a context manager will restore the return type as it was prior to entering it, + # regardless of whether the "global" tv_tensors.set_return_type() was called within the context manager. 
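+ # Here the outer "tensor" context has just been exited, so the return type is restored to the
+ # global "TVTensor" set at the top, even though set_return_type("tensor") was called inside the
+ # nested block.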
+ assert type(img + 3) is tv_tensors.Image + + tv_tensors.set_return_type("tensor") + + +def test_return_type_input(): + img = make_image() + + # Case-insensitive + with tv_tensors.set_return_type("tvtensor"): + assert type(img + 3) is tv_tensors.Image + + with pytest.raises(ValueError, match="return_type must be"): + tv_tensors.set_return_type("typo") + + tv_tensors.set_return_type("tensor") diff --git a/test/test_utils.py b/test/test_utils.py index dde3ee90dc306f8e2a7c1be74c2a85b8855dd1a6..b13bd0f0f5bbfffbd296ccf25f4ad22f990a01de 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -9,7 +9,7 @@ import pytest import torch import torchvision.transforms.functional as F import torchvision.utils as utils -from common_utils import assert_equal +from common_utils import assert_equal, cpu_and_cuda from PIL import __version__ as PILLOW_VERSION, Image, ImageColor @@ -120,6 +120,9 @@ def test_draw_boxes_colors(colors): img = torch.full((3, 100, 100), 0, dtype=torch.uint8) utils.draw_bounding_boxes(img, boxes, fill=False, width=7, colors=colors) + with pytest.raises(ValueError, match="Number of colors must be equal or larger than the number of objects"): + utils.draw_bounding_boxes(image=img, boxes=boxes, colors=[]) + def test_draw_boxes_vanilla(): img = torch.full((3, 100, 100), 0, dtype=torch.uint8) @@ -184,7 +187,7 @@ def test_draw_no_boxes(): boxes = torch.full((0, 4), 0, dtype=torch.float) with pytest.warns(UserWarning, match=re.escape("boxes doesn't contain any box. No box was drawn")): res = utils.draw_bounding_boxes(img, boxes) - # Check that the function didnt change the image + # Check that the function didn't change the image assert res.eq(img).all() @@ -200,16 +203,17 @@ def test_draw_no_boxes(): ], ) @pytest.mark.parametrize("alpha", (0, 0.5, 0.7, 1)) -def test_draw_segmentation_masks(colors, alpha): +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_draw_segmentation_masks(colors, alpha, device): """This test makes sure that masks draw their corresponding color where they should""" num_masks, h, w = 2, 100, 100 dtype = torch.uint8 - img = torch.randint(0, 256, size=(3, h, w), dtype=dtype) - masks = torch.randint(0, 2, (num_masks, h, w), dtype=torch.bool) + img = torch.randint(0, 256, size=(3, h, w), dtype=dtype, device=device) + masks = torch.randint(0, 2, (num_masks, h, w), dtype=torch.bool, device=device) # For testing we enforce that there's no overlap between the masks. 
The # current behaviour is that the last mask's color will take priority when - # masks overlap, but this makes testing slightly harder so we don't really + # masks overlap, but this makes testing slightly harder, so we don't really # care overlap = masks[0] & masks[1] masks[:, overlap] = False @@ -231,7 +235,7 @@ def test_draw_segmentation_masks(colors, alpha): for mask, color in zip(masks, colors): if isinstance(color, str): color = ImageColor.getrgb(color) - color = torch.tensor(color, dtype=dtype) + color = torch.tensor(color, dtype=dtype, device=device) if alpha == 1: assert (out[:, mask] == color[:, None]).all() @@ -242,11 +246,12 @@ def test_draw_segmentation_masks(colors, alpha): torch.testing.assert_close(out[:, mask], interpolated_color, rtol=0.0, atol=1.0) -def test_draw_segmentation_masks_errors(): +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_draw_segmentation_masks_errors(device): h, w = 10, 10 - masks = torch.randint(0, 2, size=(h, w), dtype=torch.bool) - img = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8) + masks = torch.randint(0, 2, size=(h, w), dtype=torch.bool, device=device) + img = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8, device=device) with pytest.raises(TypeError, match="The image must be a tensor"): utils.draw_segmentation_masks(image="Not A Tensor Image", masks=masks) @@ -268,22 +273,23 @@ def test_draw_segmentation_masks_errors(): with pytest.raises(ValueError, match="must have the same height and width"): masks_bad_shape = torch.randint(0, 2, size=(h + 4, w), dtype=torch.bool) utils.draw_segmentation_masks(image=img, masks=masks_bad_shape) - with pytest.raises(ValueError, match="There are more masks"): + with pytest.raises(ValueError, match="Number of colors must be equal or larger than the number of objects"): utils.draw_segmentation_masks(image=img, masks=masks, colors=[]) - with pytest.raises(ValueError, match="colors must be a tuple or a string, or a list thereof"): + with pytest.raises(ValueError, match="`colors` must be a tuple or a string, or a list thereof"): bad_colors = np.array(["red", "blue"]) # should be a list utils.draw_segmentation_masks(image=img, masks=masks, colors=bad_colors) - with pytest.raises(ValueError, match="It seems that you passed a tuple of colors instead of"): + with pytest.raises(ValueError, match="If passed as tuple, colors should be an RGB triplet"): bad_colors = ("red", "blue") # should be a list utils.draw_segmentation_masks(image=img, masks=masks, colors=bad_colors) -def test_draw_no_segmention_mask(): - img = torch.full((3, 100, 100), 0, dtype=torch.uint8) - masks = torch.full((0, 100, 100), 0, dtype=torch.bool) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_draw_no_segmention_mask(device): + img = torch.full((3, 100, 100), 0, dtype=torch.uint8, device=device) + masks = torch.full((0, 100, 100), 0, dtype=torch.bool, device=device) with pytest.warns(UserWarning, match=re.escape("masks doesn't contain any mask. 
No mask was drawn")): res = utils.draw_segmentation_masks(img, masks) - # Check that the function didnt change the image + # Check that the function didn't change the image assert res.eq(img).all() diff --git a/test/test_video_gpu_decoder.py b/test/test_video_gpu_decoder.py index d987db6ddebdb756c98c7d5ed788a28eaa7fb984..aa6d0aee9e04afe5d36227922e2bc589d16d3ee6 100644 --- a/test/test_video_gpu_decoder.py +++ b/test/test_video_gpu_decoder.py @@ -3,6 +3,7 @@ import os import pytest import torch +import torchvision from torchvision.io import _HAS_GPU_VIDEO_DECODER, VideoReader try: @@ -29,8 +30,9 @@ class TestVideoGPUDecoder: ], ) def test_frame_reading(self, video_file): + torchvision.set_video_backend("cuda") full_path = os.path.join(VIDEO_DIR, video_file) - decoder = VideoReader(full_path, device="cuda") + decoder = VideoReader(full_path) with av.open(full_path) as container: for av_frame in container.decode(container.streams.video[0]): av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray()) @@ -54,7 +56,8 @@ class TestVideoGPUDecoder: ], ) def test_seek_reading(self, keyframes, full_path, duration): - decoder = VideoReader(full_path, device="cuda") + torchvision.set_video_backend("cuda") + decoder = VideoReader(full_path) time = duration / 2 decoder.seek(time, keyframes_only=keyframes) with av.open(full_path) as container: @@ -79,8 +82,9 @@ class TestVideoGPUDecoder: ], ) def test_metadata(self, video_file): + torchvision.set_video_backend("cuda") full_path = os.path.join(VIDEO_DIR, video_file) - decoder = VideoReader(full_path, device="cuda") + decoder = VideoReader(full_path) video_metadata = decoder.get_metadata()["video"] with av.open(full_path) as container: video = container.streams.video[0] diff --git a/test/test_video_reader.py b/test/test_video_reader.py index 867923d10d0732a3f144839732250575cfaac722..243aa12fc120ed0102f66d064cf834ec3a26cafb 100644 --- a/test/test_video_reader.py +++ b/test/test_video_reader.py @@ -127,7 +127,7 @@ def _read_from_stream(container, start_pts, end_pts, stream, stream_name, buffer ascending order. We need to decode more frames even when we meet end pts """ - # seeking in the stream is imprecise. Thus, seek to an ealier PTS by a margin + # seeking in the stream is imprecise. Thus, seek to an earlier PTS by a margin margin = 1 seek_offset = max(start_pts - margin, 0) diff --git a/test/test_videoapi.py b/test/test_videoapi.py index 895b9b83555dbc28801d4903bf0f3b996d1e99e8..05fbcbdbff29b1e961226eeea48e3380aac9d761 100644 --- a/test/test_videoapi.py +++ b/test/test_videoapi.py @@ -10,6 +10,12 @@ from torchvision.datasets.utils import download_url from torchvision.io import _HAS_VIDEO_OPT, VideoReader +# WARNING: these tests have been skipped forever on the CI because the video ops +# are never properly available. This is bad, but things have been in a terrible +# state for a long time already as we write this comment, and we'll hopefully be +# able to get rid of this all soon. + + try: import av @@ -25,6 +31,13 @@ CheckerConfig = ["duration", "video_fps", "audio_sample_rate"] GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig)) +def backends(): + backends_ = ["video_reader"] + if av is not None: + backends_.append("pyav") + return backends_ + + def fate(name, path="."): """Download and return a path to a sample from the FFmpeg test suite. 
See the `FFmpeg Automated Test Environment `_ @@ -53,7 +66,9 @@ test_videos = { class TestVideoApi: @pytest.mark.skipif(av is None, reason="PyAV unavailable") @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_frame_reading(self, test_video): + @pytest.mark.parametrize("backend", backends()) + def test_frame_reading(self, test_video, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) with av.open(full_path) as av_reader: if av_reader.streams.video: @@ -77,6 +92,7 @@ class TestVideoApi: # compare the frames and ptss for i in range(len(vr_frames)): assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1) + mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float())) # on average the difference is very small and caused # by decoding (around 1%) @@ -114,12 +130,62 @@ class TestVideoApi: # we assure that there is never more than 1% difference in signal assert max_delta.item() < 0.001 + @pytest.mark.parametrize("stream", ["video", "audio"]) + @pytest.mark.parametrize("test_video", test_videos.keys()) + @pytest.mark.parametrize("backend", backends()) + def test_frame_reading_mem_vs_file(self, test_video, stream, backend): + torchvision.set_video_backend(backend) + full_path = os.path.join(VIDEO_DIR, test_video) + + reader = VideoReader(full_path) + reader_md = reader.get_metadata() + + if stream in reader_md: + # Test video reading from file vs from memory + vr_frames, vr_frames_mem = [], [] + vr_pts, vr_pts_mem = [], [] + # get vr frames + video_reader = VideoReader(full_path, stream) + for vr_frame in video_reader: + vr_frames.append(vr_frame["data"]) + vr_pts.append(vr_frame["pts"]) + + # get vr frames = read from memory + f = open(full_path, "rb") + fbytes = f.read() + f.close() + video_reader_from_mem = VideoReader(fbytes, stream) + + for vr_frame_from_mem in video_reader_from_mem: + vr_frames_mem.append(vr_frame_from_mem["data"]) + vr_pts_mem.append(vr_frame_from_mem["pts"]) + + # same number of frames + assert len(vr_frames) == len(vr_frames_mem) + assert len(vr_pts) == len(vr_pts_mem) + + # compare the frames and ptss + for i in range(len(vr_frames)): + assert vr_pts[i] == vr_pts_mem[i] + mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float())) + # on average the difference is very small and caused + # by decoding (around 1%) + # TODO: asses empirically how to set this? 
atm it's 1% + # averaged over all frames + assert mean_delta.item() < 2.55 + + del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem + else: + del reader, reader_md + @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_metadata(self, test_video, config): + @pytest.mark.parametrize("backend", backends()) + def test_metadata(self, test_video, config, backend): """ Test that the metadata returned via pyav corresponds to the one returned by the new video decoder API """ + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) reader = VideoReader(full_path, "video") reader_md = reader.get_metadata() @@ -127,7 +193,9 @@ class TestVideoApi: assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5) @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_seek_start(self, test_video): + @pytest.mark.parametrize("backend", backends()) + def test_seek_start(self, test_video, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) video_reader = VideoReader(full_path, "video") num_frames = 0 @@ -153,7 +221,9 @@ class TestVideoApi: assert start_num_frames == num_frames @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_accurateseek_middle(self, test_video): + @pytest.mark.parametrize("backend", ["video_reader"]) + def test_accurateseek_middle(self, test_video, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) stream = "video" video_reader = VideoReader(full_path, stream) @@ -192,7 +262,9 @@ class TestVideoApi: @pytest.mark.skipif(av is None, reason="PyAV unavailable") @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_keyframe_reading(self, test_video, config): + @pytest.mark.parametrize("backend", backends()) + def test_keyframe_reading(self, test_video, config, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) av_reader = av.open(full_path) diff --git a/test/tracing/frcnn/CMakeLists.txt b/test/tracing/frcnn/CMakeLists.txt index c79382470bd528e17e38fb01ad3078d77eccf24b..8ede462e34b7b87884a2e6f929d8480930ecd9f8 100644 --- a/test/tracing/frcnn/CMakeLists.txt +++ b/test/tracing/frcnn/CMakeLists.txt @@ -10,4 +10,4 @@ find_package(Python3 COMPONENTS Development) add_executable(test_frcnn_tracing test_frcnn_tracing.cpp) target_compile_features(test_frcnn_tracing PUBLIC cxx_range_for) target_link_libraries(test_frcnn_tracing ${TORCH_LIBRARIES} TorchVision::TorchVision Python3::Python) -set_property(TARGET test_frcnn_tracing PROPERTY CXX_STANDARD 14) +set_property(TARGET test_frcnn_tracing PROPERTY CXX_STANDARD 17) diff --git a/test/transforms_v2_dispatcher_infos.py b/test/transforms_v2_dispatcher_infos.py new file mode 100644 index 0000000000000000000000000000000000000000..b84d87eb7aec6aac904e46357674687db531bbc0 --- /dev/null +++ b/test/transforms_v2_dispatcher_infos.py @@ -0,0 +1,325 @@ +import pytest +import torchvision.transforms.v2.functional as F +from torchvision import tv_tensors +from transforms_v2_kernel_infos import KERNEL_INFOS, pad_xfail_jit_fill_condition +from transforms_v2_legacy_utils import InfoBase, TestMark + +__all__ = ["DispatcherInfo", "DISPATCHER_INFOS"] + + +class PILKernelInfo(InfoBase): + def __init__( + self, + kernel, + *, + # Defaults to `kernel.__name__`. 
Should be set if the function is exposed under a different name + # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then + kernel_name=None, + ): + super().__init__(id=kernel_name or kernel.__name__) + self.kernel = kernel + + +class DispatcherInfo(InfoBase): + _KERNEL_INFO_MAP = {info.kernel: info for info in KERNEL_INFOS} + + def __init__( + self, + dispatcher, + *, + # Dictionary of types that map to the kernel the dispatcher dispatches to. + kernels, + # If omitted, no PIL dispatch test will be performed. + pil_kernel_info=None, + # See InfoBase + test_marks=None, + # See InfoBase + closeness_kwargs=None, + ): + super().__init__(id=dispatcher.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) + self.dispatcher = dispatcher + self.kernels = kernels + self.pil_kernel_info = pil_kernel_info + + kernel_infos = {} + for tv_tensor_type, kernel in self.kernels.items(): + kernel_info = self._KERNEL_INFO_MAP.get(kernel) + if not kernel_info: + raise pytest.UsageError( + f"Can't register {kernel.__name__} for type {tv_tensor_type} since there is no `KernelInfo` for it. " + f"Please add a `KernelInfo` for it in `transforms_v2_kernel_infos.py`." + ) + kernel_infos[tv_tensor_type] = kernel_info + self.kernel_infos = kernel_infos + + def sample_inputs(self, *tv_tensor_types, filter_metadata=True): + for tv_tensor_type in tv_tensor_types or self.kernel_infos.keys(): + kernel_info = self.kernel_infos.get(tv_tensor_type) + if not kernel_info: + raise pytest.UsageError(f"There is no kernel registered for type {type.__name__}") + + sample_inputs = kernel_info.sample_inputs_fn() + + if not filter_metadata: + yield from sample_inputs + return + + import itertools + + for args_kwargs in sample_inputs: + if hasattr(tv_tensor_type, "__annotations__"): + for name in itertools.chain( + tv_tensor_type.__annotations__.keys(), + # FIXME: this seems ok for conversion dispatchers, but we should probably handle this on a + # per-dispatcher level. However, so far there is no option for that. 
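+ # The `old_<name>` variants (e.g. `old_format` for the bounding box conversion kernel) are
+ # presumably dropped for the same reason: the dispatcher reads that metadata from the tv_tensor
+ # itself instead of taking it as a kwarg.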
+ (f"old_{name}" for name in tv_tensor_type.__annotations__.keys()), + ): + if name in args_kwargs.kwargs: + del args_kwargs.kwargs[name] + + yield args_kwargs + + +def xfail_jit(reason, *, condition=None): + return TestMark( + ("TestDispatchers", "test_scripted_smoke"), + pytest.mark.xfail(reason=reason), + condition=condition, + ) + + +def xfail_jit_python_scalar_arg(name, *, reason=None): + return xfail_jit( + reason or f"Python scalar int or float for `{name}` is not supported when scripting", + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), (int, float)), + ) + + +skip_dispatch_tv_tensor = TestMark( + ("TestDispatchers", "test_dispatch_tv_tensor"), + pytest.mark.skip(reason="Dispatcher doesn't support arbitrary tv_tensor dispatch."), +) + +multi_crop_skips = [ + TestMark( + ("TestDispatchers", test_name), + pytest.mark.skip(reason="Multi-crop dispatchers return a sequence of items rather than a single one."), + ) + for test_name in ["test_pure_tensor_output_type", "test_pil_output_type", "test_tv_tensor_output_type"] +] +multi_crop_skips.append(skip_dispatch_tv_tensor) + + +DISPATCHER_INFOS = [ + DispatcherInfo( + F.resized_crop, + kernels={ + tv_tensors.Image: F.resized_crop_image, + tv_tensors.Video: F.resized_crop_video, + tv_tensors.BoundingBoxes: F.resized_crop_bounding_boxes, + tv_tensors.Mask: F.resized_crop_mask, + }, + pil_kernel_info=PILKernelInfo(F._resized_crop_image_pil), + ), + DispatcherInfo( + F.pad, + kernels={ + tv_tensors.Image: F.pad_image, + tv_tensors.Video: F.pad_video, + tv_tensors.BoundingBoxes: F.pad_bounding_boxes, + tv_tensors.Mask: F.pad_mask, + }, + pil_kernel_info=PILKernelInfo(F._pad_image_pil, kernel_name="pad_image_pil"), + test_marks=[ + xfail_jit("F.pad only supports vector fills for list of floats", condition=pad_xfail_jit_fill_condition), + xfail_jit_python_scalar_arg("padding"), + ], + ), + DispatcherInfo( + F.perspective, + kernels={ + tv_tensors.Image: F.perspective_image, + tv_tensors.Video: F.perspective_video, + tv_tensors.BoundingBoxes: F.perspective_bounding_boxes, + tv_tensors.Mask: F.perspective_mask, + }, + pil_kernel_info=PILKernelInfo(F._perspective_image_pil), + test_marks=[ + xfail_jit_python_scalar_arg("fill"), + ], + ), + DispatcherInfo( + F.elastic, + kernels={ + tv_tensors.Image: F.elastic_image, + tv_tensors.Video: F.elastic_video, + tv_tensors.BoundingBoxes: F.elastic_bounding_boxes, + tv_tensors.Mask: F.elastic_mask, + }, + pil_kernel_info=PILKernelInfo(F._elastic_image_pil), + test_marks=[xfail_jit_python_scalar_arg("fill")], + ), + DispatcherInfo( + F.center_crop, + kernels={ + tv_tensors.Image: F.center_crop_image, + tv_tensors.Video: F.center_crop_video, + tv_tensors.BoundingBoxes: F.center_crop_bounding_boxes, + tv_tensors.Mask: F.center_crop_mask, + }, + pil_kernel_info=PILKernelInfo(F._center_crop_image_pil), + test_marks=[ + xfail_jit_python_scalar_arg("output_size"), + ], + ), + DispatcherInfo( + F.gaussian_blur, + kernels={ + tv_tensors.Image: F.gaussian_blur_image, + tv_tensors.Video: F.gaussian_blur_video, + }, + pil_kernel_info=PILKernelInfo(F._gaussian_blur_image_pil), + test_marks=[ + xfail_jit_python_scalar_arg("kernel_size"), + xfail_jit_python_scalar_arg("sigma"), + ], + ), + DispatcherInfo( + F.equalize, + kernels={ + tv_tensors.Image: F.equalize_image, + tv_tensors.Video: F.equalize_video, + }, + pil_kernel_info=PILKernelInfo(F._equalize_image_pil, kernel_name="equalize_image_pil"), + ), + DispatcherInfo( + F.invert, + kernels={ + tv_tensors.Image: F.invert_image, + 
tv_tensors.Video: F.invert_video, + }, + pil_kernel_info=PILKernelInfo(F._invert_image_pil, kernel_name="invert_image_pil"), + ), + DispatcherInfo( + F.posterize, + kernels={ + tv_tensors.Image: F.posterize_image, + tv_tensors.Video: F.posterize_video, + }, + pil_kernel_info=PILKernelInfo(F._posterize_image_pil, kernel_name="posterize_image_pil"), + ), + DispatcherInfo( + F.solarize, + kernels={ + tv_tensors.Image: F.solarize_image, + tv_tensors.Video: F.solarize_video, + }, + pil_kernel_info=PILKernelInfo(F._solarize_image_pil, kernel_name="solarize_image_pil"), + ), + DispatcherInfo( + F.autocontrast, + kernels={ + tv_tensors.Image: F.autocontrast_image, + tv_tensors.Video: F.autocontrast_video, + }, + pil_kernel_info=PILKernelInfo(F._autocontrast_image_pil, kernel_name="autocontrast_image_pil"), + ), + DispatcherInfo( + F.adjust_sharpness, + kernels={ + tv_tensors.Image: F.adjust_sharpness_image, + tv_tensors.Video: F.adjust_sharpness_video, + }, + pil_kernel_info=PILKernelInfo(F._adjust_sharpness_image_pil, kernel_name="adjust_sharpness_image_pil"), + ), + DispatcherInfo( + F.adjust_contrast, + kernels={ + tv_tensors.Image: F.adjust_contrast_image, + tv_tensors.Video: F.adjust_contrast_video, + }, + pil_kernel_info=PILKernelInfo(F._adjust_contrast_image_pil, kernel_name="adjust_contrast_image_pil"), + ), + DispatcherInfo( + F.adjust_gamma, + kernels={ + tv_tensors.Image: F.adjust_gamma_image, + tv_tensors.Video: F.adjust_gamma_video, + }, + pil_kernel_info=PILKernelInfo(F._adjust_gamma_image_pil, kernel_name="adjust_gamma_image_pil"), + ), + DispatcherInfo( + F.adjust_hue, + kernels={ + tv_tensors.Image: F.adjust_hue_image, + tv_tensors.Video: F.adjust_hue_video, + }, + pil_kernel_info=PILKernelInfo(F._adjust_hue_image_pil, kernel_name="adjust_hue_image_pil"), + ), + DispatcherInfo( + F.adjust_saturation, + kernels={ + tv_tensors.Image: F.adjust_saturation_image, + tv_tensors.Video: F.adjust_saturation_video, + }, + pil_kernel_info=PILKernelInfo(F._adjust_saturation_image_pil, kernel_name="adjust_saturation_image_pil"), + ), + DispatcherInfo( + F.five_crop, + kernels={ + tv_tensors.Image: F.five_crop_image, + tv_tensors.Video: F.five_crop_video, + }, + pil_kernel_info=PILKernelInfo(F._five_crop_image_pil), + test_marks=[ + xfail_jit_python_scalar_arg("size"), + *multi_crop_skips, + ], + ), + DispatcherInfo( + F.ten_crop, + kernels={ + tv_tensors.Image: F.ten_crop_image, + tv_tensors.Video: F.ten_crop_video, + }, + test_marks=[ + xfail_jit_python_scalar_arg("size"), + *multi_crop_skips, + ], + pil_kernel_info=PILKernelInfo(F._ten_crop_image_pil), + ), + DispatcherInfo( + F.normalize, + kernels={ + tv_tensors.Image: F.normalize_image, + tv_tensors.Video: F.normalize_video, + }, + test_marks=[ + xfail_jit_python_scalar_arg("mean"), + xfail_jit_python_scalar_arg("std"), + ], + ), + DispatcherInfo( + F.uniform_temporal_subsample, + kernels={ + tv_tensors.Video: F.uniform_temporal_subsample_video, + }, + test_marks=[ + skip_dispatch_tv_tensor, + ], + ), + DispatcherInfo( + F.clamp_bounding_boxes, + kernels={tv_tensors.BoundingBoxes: F.clamp_bounding_boxes}, + test_marks=[ + skip_dispatch_tv_tensor, + ], + ), + DispatcherInfo( + F.convert_bounding_box_format, + kernels={tv_tensors.BoundingBoxes: F.convert_bounding_box_format}, + test_marks=[ + skip_dispatch_tv_tensor, + ], + ), +] diff --git a/test/transforms_v2_kernel_infos.py b/test/transforms_v2_kernel_infos.py new file mode 100644 index 0000000000000000000000000000000000000000..a549bfe72ddfc950d4e3faf4827dbdeb78e128d0 --- /dev/null +++ 
b/test/transforms_v2_kernel_infos.py @@ -0,0 +1,1522 @@ +import functools +import itertools + +import numpy as np +import PIL.Image +import pytest +import torch.testing +import torchvision.ops +import torchvision.transforms.v2.functional as F +from torchvision import tv_tensors +from torchvision.transforms._functional_tensor import _max_value as get_max_value, _parse_pad_padding +from transforms_v2_legacy_utils import ( + ArgsKwargs, + combinations_grid, + DEFAULT_PORTRAIT_SPATIAL_SIZE, + get_num_channels, + ImageLoader, + InfoBase, + make_bounding_box_loader, + make_bounding_box_loaders, + make_detection_mask_loader, + make_image_loader, + make_image_loaders, + make_image_loaders_for_interpolation, + make_mask_loaders, + make_video_loader, + make_video_loaders, + mark_framework_limitation, + TestMark, +) + +__all__ = ["KernelInfo", "KERNEL_INFOS"] + + +class KernelInfo(InfoBase): + def __init__( + self, + kernel, + *, + # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name + # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then + kernel_name=None, + # Most common tests use these inputs to check the kernel. As such it should cover all valid code paths, but + # should not include extensive parameter combinations to keep to overall test count moderate. + sample_inputs_fn, + # This function should mirror the kernel. It should have the same signature as the `kernel` and as such also + # take tensors as inputs. Any conversion into another object type, e.g. PIL images or numpy arrays, should + # happen inside the function. It should return a tensor or to be more precise an object that can be compared to + # a tensor by `assert_close`. If omitted, no reference test will be performed. + reference_fn=None, + # These inputs are only used for the reference tests and thus can be comprehensive with regard to the parameter + # values to be tested. If not specified, `sample_inputs_fn` will be used. + reference_inputs_fn=None, + # If true-ish, triggers a test that checks the kernel for consistency between uint8 and float32 inputs with the + # reference inputs. This is usually used whenever we use a PIL kernel as reference. + # Can be a callable in which case it will be called with `other_args, kwargs`. It should return the same + # structure, but with adapted parameters. This is useful in case a parameter value is closely tied to the input + # dtype. + float32_vs_uint8=False, + # Some kernels don't have dispatchers that would handle logging the usage. Thus, the kernel has to do it + # manually. If set, triggers a test that makes sure this happens. 
+ logs_usage=False, + # See InfoBase + test_marks=None, + # See InfoBase + closeness_kwargs=None, + ): + super().__init__(id=kernel_name or kernel.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) + self.kernel = kernel + self.sample_inputs_fn = sample_inputs_fn + self.reference_fn = reference_fn + self.reference_inputs_fn = reference_inputs_fn + + if float32_vs_uint8 and not callable(float32_vs_uint8): + float32_vs_uint8 = lambda other_args, kwargs: (other_args, kwargs) # noqa: E731 + self.float32_vs_uint8 = float32_vs_uint8 + self.logs_usage = logs_usage + + +def pixel_difference_closeness_kwargs(uint8_atol, *, dtype=torch.uint8, mae=False): + return dict(atol=uint8_atol / 255 * get_max_value(dtype), rtol=0, mae=mae) + + +def cuda_vs_cpu_pixel_difference(atol=1): + return { + (("TestKernels", "test_cuda_vs_cpu"), dtype, "cuda"): pixel_difference_closeness_kwargs(atol, dtype=dtype) + for dtype in [torch.uint8, torch.float32] + } + + +def pil_reference_pixel_difference(atol=1, mae=False): + return { + (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): pixel_difference_closeness_kwargs( + atol, mae=mae + ) + } + + +def float32_vs_uint8_pixel_difference(atol=1, mae=False): + return { + ( + ("TestKernels", "test_float32_vs_uint8"), + torch.float32, + "cpu", + ): pixel_difference_closeness_kwargs(atol, dtype=torch.float32, mae=mae) + } + + +def scripted_vs_eager_float64_tolerances(device, atol=1e-6, rtol=1e-6): + return { + (("TestKernels", "test_scripted_vs_eager"), torch.float64, device): {"atol": atol, "rtol": rtol, "mae": False}, + } + + +def pil_reference_wrapper(pil_kernel): + @functools.wraps(pil_kernel) + def wrapper(input_tensor, *other_args, **kwargs): + if input_tensor.dtype != torch.uint8: + raise pytest.UsageError(f"Can only test uint8 tensor images against PIL, but input is {input_tensor.dtype}") + if input_tensor.ndim > 3: + raise pytest.UsageError( + f"Can only test single tensor images against PIL, but input has shape {input_tensor.shape}" + ) + + input_pil = F.to_pil_image(input_tensor) + output_pil = pil_kernel(input_pil, *other_args, **kwargs) + if not isinstance(output_pil, PIL.Image.Image): + return output_pil + + output_tensor = F.to_image(output_pil) + + # 2D mask shenanigans + if output_tensor.ndim == 2 and input_tensor.ndim == 3: + output_tensor = output_tensor.unsqueeze(0) + elif output_tensor.ndim == 3 and input_tensor.ndim == 2: + output_tensor = output_tensor.squeeze(0) + + return output_tensor + + return wrapper + + +def xfail_jit(reason, *, condition=None): + return TestMark(("TestKernels", "test_scripted_vs_eager"), pytest.mark.xfail(reason=reason), condition=condition) + + +def xfail_jit_python_scalar_arg(name, *, reason=None): + return xfail_jit( + reason or f"Python scalar int or float for `{name}` is not supported when scripting", + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), (int, float)), + ) + + +KERNEL_INFOS = [] + + +def get_fills(*, num_channels, dtype): + yield None + + int_value = get_max_value(dtype) + float_value = int_value / 2 + yield int_value + yield float_value + + for vector_type in [list, tuple]: + yield vector_type([int_value]) + yield vector_type([float_value]) + + if num_channels > 1: + yield vector_type(float_value * c / 10 for c in range(num_channels)) + yield vector_type(int_value if c % 2 == 0 else 0 for c in range(num_channels)) + + +def float32_vs_uint8_fill_adapter(other_args, kwargs): + fill = kwargs.get("fill") + if fill is None: + return other_args, kwargs + + if 
isinstance(fill, (int, float)): + fill /= 255 + else: + fill = type(fill)(fill_ / 255 for fill_ in fill) + + return other_args, dict(kwargs, fill=fill) + + +def reference_affine_bounding_boxes_helper(bounding_boxes, *, format, canvas_size, affine_matrix): + def transform(bbox, affine_matrix_, format_, canvas_size_): + # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 + in_dtype = bbox.dtype + if not torch.is_floating_point(bbox): + bbox = bbox.float() + bbox_xyxy = F.convert_bounding_box_format( + bbox.as_subclass(torch.Tensor), + old_format=format_, + new_format=tv_tensors.BoundingBoxFormat.XYXY, + inplace=True, + ) + points = np.array( + [ + [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], + [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], + [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], + [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], + ] + ) + transformed_points = np.matmul(points, affine_matrix_.T) + out_bbox = torch.tensor( + [ + np.min(transformed_points[:, 0]).item(), + np.min(transformed_points[:, 1]).item(), + np.max(transformed_points[:, 0]).item(), + np.max(transformed_points[:, 1]).item(), + ], + dtype=bbox_xyxy.dtype, + ) + out_bbox = F.convert_bounding_box_format( + out_bbox, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format_, inplace=True + ) + # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 + out_bbox = F.clamp_bounding_boxes(out_bbox, format=format_, canvas_size=canvas_size_) + out_bbox = out_bbox.to(dtype=in_dtype) + return out_bbox + + return torch.stack( + [transform(b, affine_matrix, format, canvas_size) for b in bounding_boxes.reshape(-1, 4).unbind()] + ).reshape(bounding_boxes.shape) + + +def sample_inputs_convert_bounding_box_format(): + formats = list(tv_tensors.BoundingBoxFormat) + for bounding_boxes_loader, new_format in itertools.product(make_bounding_box_loaders(formats=formats), formats): + yield ArgsKwargs(bounding_boxes_loader, old_format=bounding_boxes_loader.format, new_format=new_format) + + +def reference_convert_bounding_box_format(bounding_boxes, old_format, new_format): + return torchvision.ops.box_convert( + bounding_boxes, in_fmt=old_format.name.lower(), out_fmt=new_format.name.lower() + ).to(bounding_boxes.dtype) + + +def reference_inputs_convert_bounding_box_format(): + for args_kwargs in sample_inputs_convert_bounding_box_format(): + if len(args_kwargs.args[0].shape) == 2: + yield args_kwargs + + +KERNEL_INFOS.append( + KernelInfo( + F.convert_bounding_box_format, + sample_inputs_fn=sample_inputs_convert_bounding_box_format, + reference_fn=reference_convert_bounding_box_format, + reference_inputs_fn=reference_inputs_convert_bounding_box_format, + logs_usage=True, + closeness_kwargs={ + (("TestKernels", "test_against_reference"), torch.int64, "cpu"): dict(atol=1, rtol=0), + }, + ), +) + + +_RESIZED_CROP_PARAMS = combinations_grid(top=[-8, 9], left=[-8, 9], height=[12], width=[12], size=[(16, 18)]) + + +def sample_inputs_resized_crop_image_tensor(): + for image_loader in make_image_loaders(): + yield ArgsKwargs(image_loader, **_RESIZED_CROP_PARAMS[0]) + + +@pil_reference_wrapper +def reference_resized_crop_image_tensor(*args, **kwargs): + if not kwargs.pop("antialias", False) and kwargs.get("interpolation", F.InterpolationMode.BILINEAR) in { + F.InterpolationMode.BILINEAR, + F.InterpolationMode.BICUBIC, + }: + raise pytest.UsageError("Anti-aliasing is always active in PIL") + return F._resized_crop_image_pil(*args, **kwargs) + + +def 
reference_inputs_resized_crop_image_tensor(): + for image_loader, interpolation, params in itertools.product( + make_image_loaders_for_interpolation(), + [ + F.InterpolationMode.NEAREST, + F.InterpolationMode.NEAREST_EXACT, + F.InterpolationMode.BILINEAR, + F.InterpolationMode.BICUBIC, + ], + _RESIZED_CROP_PARAMS, + ): + yield ArgsKwargs( + image_loader, + interpolation=interpolation, + antialias=interpolation + in { + F.InterpolationMode.BILINEAR, + F.InterpolationMode.BICUBIC, + }, + **params, + ) + + +def sample_inputs_resized_crop_bounding_boxes(): + for bounding_boxes_loader in make_bounding_box_loaders(): + yield ArgsKwargs(bounding_boxes_loader, format=bounding_boxes_loader.format, **_RESIZED_CROP_PARAMS[0]) + + +def sample_inputs_resized_crop_mask(): + for mask_loader in make_mask_loaders(): + yield ArgsKwargs(mask_loader, **_RESIZED_CROP_PARAMS[0]) + + +def sample_inputs_resized_crop_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, **_RESIZED_CROP_PARAMS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.resized_crop_image, + sample_inputs_fn=sample_inputs_resized_crop_image_tensor, + reference_fn=reference_resized_crop_image_tensor, + reference_inputs_fn=reference_inputs_resized_crop_image_tensor, + float32_vs_uint8=True, + closeness_kwargs={ + **cuda_vs_cpu_pixel_difference(), + **pil_reference_pixel_difference(3, mae=True), + **float32_vs_uint8_pixel_difference(3, mae=True), + }, + ), + KernelInfo( + F.resized_crop_bounding_boxes, + sample_inputs_fn=sample_inputs_resized_crop_bounding_boxes, + ), + KernelInfo( + F.resized_crop_mask, + sample_inputs_fn=sample_inputs_resized_crop_mask, + ), + KernelInfo( + F.resized_crop_video, + sample_inputs_fn=sample_inputs_resized_crop_video, + closeness_kwargs=cuda_vs_cpu_pixel_difference(), + ), + ] +) + +_PAD_PARAMS = combinations_grid( + padding=[[1], [1, 1], [1, 1, 2, 2]], + padding_mode=["constant", "symmetric", "edge", "reflect"], +) + + +def sample_inputs_pad_image_tensor(): + make_pad_image_loaders = functools.partial( + make_image_loaders, sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], dtypes=[torch.float32] + ) + + for image_loader, padding in itertools.product( + make_pad_image_loaders(), + [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]], + ): + yield ArgsKwargs(image_loader, padding=padding) + + for image_loader in make_pad_image_loaders(): + for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): + yield ArgsKwargs(image_loader, padding=[1], fill=fill) + + for image_loader, padding_mode in itertools.product( + # We branch for non-constant padding and integer inputs + make_pad_image_loaders(dtypes=[torch.uint8]), + ["constant", "symmetric", "edge", "reflect"], + ): + yield ArgsKwargs(image_loader, padding=[1], padding_mode=padding_mode) + + # `torch.nn.functional.pad` does not support symmetric padding, and thus we have a custom implementation. Besides + # negative padding, this is already handled by the inputs above. 
+ for image_loader in make_pad_image_loaders(): + yield ArgsKwargs(image_loader, padding=[-1], padding_mode="symmetric") + + +def reference_inputs_pad_image_tensor(): + for image_loader, params in itertools.product( + make_image_loaders(extra_dims=[()], dtypes=[torch.uint8]), _PAD_PARAMS + ): + for fill in get_fills( + num_channels=image_loader.num_channels, + dtype=image_loader.dtype, + ): + # FIXME: PIL kernel doesn't support sequences of length 1 if the number of channels is larger. Shouldn't it? + if isinstance(fill, (list, tuple)): + continue + + yield ArgsKwargs(image_loader, fill=fill, **params) + + +def sample_inputs_pad_bounding_boxes(): + for bounding_boxes_loader, padding in itertools.product( + make_bounding_box_loaders(), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] + ): + yield ArgsKwargs( + bounding_boxes_loader, + format=bounding_boxes_loader.format, + canvas_size=bounding_boxes_loader.canvas_size, + padding=padding, + padding_mode="constant", + ) + + +def sample_inputs_pad_mask(): + for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_categories=[10], num_objects=[5]): + yield ArgsKwargs(mask_loader, padding=[1]) + + +def reference_inputs_pad_mask(): + for mask_loader, fill, params in itertools.product( + make_mask_loaders(num_objects=[1], extra_dims=[()]), [None, 127], _PAD_PARAMS + ): + yield ArgsKwargs(mask_loader, fill=fill, **params) + + +def sample_inputs_pad_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, padding=[1]) + + +def reference_pad_bounding_boxes(bounding_boxes, *, format, canvas_size, padding, padding_mode): + + left, right, top, bottom = _parse_pad_padding(padding) + + affine_matrix = np.array( + [ + [1, 0, left], + [0, 1, top], + ], + dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32", + ) + + height = canvas_size[0] + top + bottom + width = canvas_size[1] + left + right + + expected_bboxes = reference_affine_bounding_boxes_helper( + bounding_boxes, format=format, canvas_size=(height, width), affine_matrix=affine_matrix + ) + return expected_bboxes, (height, width) + + +def reference_inputs_pad_bounding_boxes(): + for bounding_boxes_loader, padding in itertools.product( + make_bounding_box_loaders(extra_dims=((), (4,))), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] + ): + yield ArgsKwargs( + bounding_boxes_loader, + format=bounding_boxes_loader.format, + canvas_size=bounding_boxes_loader.canvas_size, + padding=padding, + padding_mode="constant", + ) + + +def pad_xfail_jit_fill_condition(args_kwargs): + fill = args_kwargs.kwargs.get("fill") + if not isinstance(fill, (list, tuple)): + return False + elif isinstance(fill, tuple): + return True + else: # isinstance(fill, list): + return all(isinstance(f, int) for f in fill) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.pad_image, + sample_inputs_fn=sample_inputs_pad_image_tensor, + reference_fn=pil_reference_wrapper(F._pad_image_pil), + reference_inputs_fn=reference_inputs_pad_image_tensor, + float32_vs_uint8=float32_vs_uint8_fill_adapter, + closeness_kwargs=float32_vs_uint8_pixel_difference(), + test_marks=[ + xfail_jit_python_scalar_arg("padding"), + xfail_jit( + "F.pad only supports vector fills for list of floats", condition=pad_xfail_jit_fill_condition + ), + ], + ), + KernelInfo( + F.pad_bounding_boxes, + sample_inputs_fn=sample_inputs_pad_bounding_boxes, + reference_fn=reference_pad_bounding_boxes, + 
reference_inputs_fn=reference_inputs_pad_bounding_boxes, + test_marks=[ + xfail_jit_python_scalar_arg("padding"), + ], + ), + KernelInfo( + F.pad_mask, + sample_inputs_fn=sample_inputs_pad_mask, + reference_fn=pil_reference_wrapper(F._pad_image_pil), + reference_inputs_fn=reference_inputs_pad_mask, + float32_vs_uint8=float32_vs_uint8_fill_adapter, + ), + KernelInfo( + F.pad_video, + sample_inputs_fn=sample_inputs_pad_video, + ), + ] +) + +_PERSPECTIVE_COEFFS = [ + [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], + [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], +] +_STARTPOINTS = [[0, 1], [2, 3], [4, 5], [6, 7]] +_ENDPOINTS = [[9, 8], [7, 6], [5, 4], [3, 2]] + + +def sample_inputs_perspective_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): + for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): + yield ArgsKwargs( + image_loader, startpoints=None, endpoints=None, fill=fill, coefficients=_PERSPECTIVE_COEFFS[0] + ) + + yield ArgsKwargs(make_image_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS) + + +def reference_inputs_perspective_image_tensor(): + for image_loader, coefficients, interpolation in itertools.product( + make_image_loaders_for_interpolation(), + _PERSPECTIVE_COEFFS, + [ + F.InterpolationMode.NEAREST, + F.InterpolationMode.BILINEAR, + ], + ): + for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): + # FIXME: PIL kernel doesn't support sequences of length 1 if the number of channels is larger. Shouldn't it? + if isinstance(fill, (list, tuple)): + continue + + yield ArgsKwargs( + image_loader, + startpoints=None, + endpoints=None, + interpolation=interpolation, + fill=fill, + coefficients=coefficients, + ) + + +def sample_inputs_perspective_bounding_boxes(): + for bounding_boxes_loader in make_bounding_box_loaders(): + yield ArgsKwargs( + bounding_boxes_loader, + format=bounding_boxes_loader.format, + canvas_size=bounding_boxes_loader.canvas_size, + startpoints=None, + endpoints=None, + coefficients=_PERSPECTIVE_COEFFS[0], + ) + + format = tv_tensors.BoundingBoxFormat.XYXY + loader = make_bounding_box_loader(format=format) + yield ArgsKwargs( + loader, format=format, canvas_size=loader.canvas_size, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS + ) + + +def sample_inputs_perspective_mask(): + for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): + yield ArgsKwargs(mask_loader, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0]) + + yield ArgsKwargs(make_detection_mask_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS) + + +def reference_inputs_perspective_mask(): + for mask_loader, perspective_coeffs in itertools.product( + make_mask_loaders(extra_dims=[()], num_objects=[1]), _PERSPECTIVE_COEFFS + ): + yield ArgsKwargs(mask_loader, startpoints=None, endpoints=None, coefficients=perspective_coeffs) + + +def sample_inputs_perspective_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0]) + + yield ArgsKwargs(make_video_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.perspective_image, + sample_inputs_fn=sample_inputs_perspective_image_tensor, + reference_fn=pil_reference_wrapper(F._perspective_image_pil), + 
reference_inputs_fn=reference_inputs_perspective_image_tensor, + float32_vs_uint8=float32_vs_uint8_fill_adapter, + closeness_kwargs={ + **pil_reference_pixel_difference(2, mae=True), + **cuda_vs_cpu_pixel_difference(), + **float32_vs_uint8_pixel_difference(), + **scripted_vs_eager_float64_tolerances("cpu", atol=1e-5, rtol=1e-5), + **scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5), + }, + test_marks=[xfail_jit_python_scalar_arg("fill")], + ), + KernelInfo( + F.perspective_bounding_boxes, + sample_inputs_fn=sample_inputs_perspective_bounding_boxes, + closeness_kwargs={ + **scripted_vs_eager_float64_tolerances("cpu", atol=1e-6, rtol=1e-6), + **scripted_vs_eager_float64_tolerances("cuda", atol=1e-6, rtol=1e-6), + }, + ), + KernelInfo( + F.perspective_mask, + sample_inputs_fn=sample_inputs_perspective_mask, + reference_fn=pil_reference_wrapper(F._perspective_image_pil), + reference_inputs_fn=reference_inputs_perspective_mask, + float32_vs_uint8=True, + closeness_kwargs={ + (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): dict(atol=10, rtol=0), + }, + ), + KernelInfo( + F.perspective_video, + sample_inputs_fn=sample_inputs_perspective_video, + closeness_kwargs={ + **cuda_vs_cpu_pixel_difference(), + **scripted_vs_eager_float64_tolerances("cpu", atol=1e-5, rtol=1e-5), + **scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5), + }, + ), + ] +) + + +def _get_elastic_displacement(canvas_size): + return torch.rand(1, *canvas_size, 2) + + +def sample_inputs_elastic_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): + displacement = _get_elastic_displacement(image_loader.canvas_size) + for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): + yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) + + +def reference_inputs_elastic_image_tensor(): + for image_loader, interpolation in itertools.product( + make_image_loaders_for_interpolation(), + [ + F.InterpolationMode.NEAREST, + F.InterpolationMode.BILINEAR, + F.InterpolationMode.BICUBIC, + ], + ): + displacement = _get_elastic_displacement(image_loader.canvas_size) + for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): + yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill) + + +def sample_inputs_elastic_bounding_boxes(): + for bounding_boxes_loader in make_bounding_box_loaders(): + displacement = _get_elastic_displacement(bounding_boxes_loader.canvas_size) + yield ArgsKwargs( + bounding_boxes_loader, + format=bounding_boxes_loader.format, + canvas_size=bounding_boxes_loader.canvas_size, + displacement=displacement, + ) + + +def sample_inputs_elastic_mask(): + for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): + displacement = _get_elastic_displacement(mask_loader.shape[-2:]) + yield ArgsKwargs(mask_loader, displacement=displacement) + + +def sample_inputs_elastic_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + displacement = _get_elastic_displacement(video_loader.shape[-2:]) + yield ArgsKwargs(video_loader, displacement=displacement) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.elastic_image, + sample_inputs_fn=sample_inputs_elastic_image_tensor, + reference_inputs_fn=reference_inputs_elastic_image_tensor, + float32_vs_uint8=float32_vs_uint8_fill_adapter, + closeness_kwargs={ + **float32_vs_uint8_pixel_difference(6, mae=True), + 
**cuda_vs_cpu_pixel_difference(), + }, + test_marks=[xfail_jit_python_scalar_arg("fill")], + ), + KernelInfo( + F.elastic_bounding_boxes, + sample_inputs_fn=sample_inputs_elastic_bounding_boxes, + ), + KernelInfo( + F.elastic_mask, + sample_inputs_fn=sample_inputs_elastic_mask, + ), + KernelInfo( + F.elastic_video, + sample_inputs_fn=sample_inputs_elastic_video, + closeness_kwargs=cuda_vs_cpu_pixel_difference(), + ), + ] +) + + +_CENTER_CROP_SPATIAL_SIZES = [(16, 16), (7, 33), (31, 9)] +_CENTER_CROP_OUTPUT_SIZES = [[4, 3], [42, 70], [4], 3, (5, 2), (6,)] + + +def sample_inputs_center_crop_image_tensor(): + for image_loader, output_size in itertools.product( + make_image_loaders(sizes=[(16, 17)], color_spaces=["RGB"], dtypes=[torch.float32]), + [ + # valid `output_size` types for which cropping is applied to both dimensions + *[5, (4,), (2, 3), [6], [3, 2]], + # `output_size`'s for which at least one dimension needs to be padded + *[[4, 18], [17, 5], [17, 18]], + ], + ): + yield ArgsKwargs(image_loader, output_size=output_size) + + +def reference_inputs_center_crop_image_tensor(): + for image_loader, output_size in itertools.product( + make_image_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()], dtypes=[torch.uint8]), + _CENTER_CROP_OUTPUT_SIZES, + ): + yield ArgsKwargs(image_loader, output_size=output_size) + + +def sample_inputs_center_crop_bounding_boxes(): + for bounding_boxes_loader, output_size in itertools.product(make_bounding_box_loaders(), _CENTER_CROP_OUTPUT_SIZES): + yield ArgsKwargs( + bounding_boxes_loader, + format=bounding_boxes_loader.format, + canvas_size=bounding_boxes_loader.canvas_size, + output_size=output_size, + ) + + +def sample_inputs_center_crop_mask(): + for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_categories=[10], num_objects=[5]): + height, width = mask_loader.shape[-2:] + yield ArgsKwargs(mask_loader, output_size=(height // 2, width // 2)) + + +def reference_inputs_center_crop_mask(): + for mask_loader, output_size in itertools.product( + make_mask_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES + ): + yield ArgsKwargs(mask_loader, output_size=output_size) + + +def sample_inputs_center_crop_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + height, width = video_loader.shape[-2:] + yield ArgsKwargs(video_loader, output_size=(height // 2, width // 2)) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.center_crop_image, + sample_inputs_fn=sample_inputs_center_crop_image_tensor, + reference_fn=pil_reference_wrapper(F._center_crop_image_pil), + reference_inputs_fn=reference_inputs_center_crop_image_tensor, + float32_vs_uint8=True, + test_marks=[ + xfail_jit_python_scalar_arg("output_size"), + ], + ), + KernelInfo( + F.center_crop_bounding_boxes, + sample_inputs_fn=sample_inputs_center_crop_bounding_boxes, + test_marks=[ + xfail_jit_python_scalar_arg("output_size"), + ], + ), + KernelInfo( + F.center_crop_mask, + sample_inputs_fn=sample_inputs_center_crop_mask, + reference_fn=pil_reference_wrapper(F._center_crop_image_pil), + reference_inputs_fn=reference_inputs_center_crop_mask, + float32_vs_uint8=True, + test_marks=[ + xfail_jit_python_scalar_arg("output_size"), + ], + ), + KernelInfo( + F.center_crop_video, + sample_inputs_fn=sample_inputs_center_crop_video, + ), + ] +) + + +def sample_inputs_gaussian_blur_image_tensor(): + make_gaussian_blur_image_loaders = functools.partial(make_image_loaders, sizes=[(7, 
33)], color_spaces=["RGB"]) + + for image_loader, kernel_size in itertools.product(make_gaussian_blur_image_loaders(), [5, (3, 3), [3, 3]]): + yield ArgsKwargs(image_loader, kernel_size=kernel_size) + + for image_loader, sigma in itertools.product( + make_gaussian_blur_image_loaders(), [None, (3.0, 3.0), [2.0, 2.0], 4.0, [1.5], (3.14,)] + ): + yield ArgsKwargs(image_loader, kernel_size=5, sigma=sigma) + + +def sample_inputs_gaussian_blur_video(): + for video_loader in make_video_loaders(sizes=[(7, 33)], num_frames=[5]): + yield ArgsKwargs(video_loader, kernel_size=[3, 3]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.gaussian_blur_image, + sample_inputs_fn=sample_inputs_gaussian_blur_image_tensor, + closeness_kwargs=cuda_vs_cpu_pixel_difference(), + test_marks=[ + xfail_jit_python_scalar_arg("kernel_size"), + xfail_jit_python_scalar_arg("sigma"), + ], + ), + KernelInfo( + F.gaussian_blur_video, + sample_inputs_fn=sample_inputs_gaussian_blur_video, + closeness_kwargs=cuda_vs_cpu_pixel_difference(), + ), + ] +) + + +def sample_inputs_equalize_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader) + + +def reference_inputs_equalize_image_tensor(): + # We are not using `make_image_loaders` here since that uniformly samples the values over the whole value range. + # Since the whole point of this kernel is to transform an arbitrary distribution of values into a uniform one, + # the information gain is low if we already provide something really close to the expected value. + def make_uniform_band_image(shape, dtype, device, *, low_factor, high_factor, memory_format): + if dtype.is_floating_point: + low = low_factor + high = high_factor + else: + max_value = torch.iinfo(dtype).max + low = int(low_factor * max_value) + high = int(high_factor * max_value) + return torch.testing.make_tensor(shape, dtype=dtype, device=device, low=low, high=high).to( + memory_format=memory_format, copy=True + ) + + def make_beta_distributed_image(shape, dtype, device, *, alpha, beta, memory_format): + image = torch.distributions.Beta(alpha, beta).sample(shape) + if not dtype.is_floating_point: + image.mul_(torch.iinfo(dtype).max).round_() + return image.to(dtype=dtype, device=device, memory_format=memory_format, copy=True) + + canvas_size = (256, 256) + for dtype, color_space, fn in itertools.product( + [torch.uint8], + ["GRAY", "RGB"], + [ + lambda shape, dtype, device, memory_format: torch.zeros(shape, dtype=dtype, device=device).to( + memory_format=memory_format, copy=True + ), + lambda shape, dtype, device, memory_format: torch.full( + shape, 1.0 if dtype.is_floating_point else torch.iinfo(dtype).max, dtype=dtype, device=device + ).to(memory_format=memory_format, copy=True), + *[ + functools.partial(make_uniform_band_image, low_factor=low_factor, high_factor=high_factor) + for low_factor, high_factor in [ + (0.0, 0.25), + (0.25, 0.75), + (0.75, 1.0), + ] + ], + *[ + functools.partial(make_beta_distributed_image, alpha=alpha, beta=beta) + for alpha, beta in [ + (0.5, 0.5), + (2, 2), + (2, 5), + (5, 2), + ] + ], + ], + ): + image_loader = ImageLoader(fn, shape=(get_num_channels(color_space), *canvas_size), dtype=dtype) + yield ArgsKwargs(image_loader) + + +def sample_inputs_equalize_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.equalize_image, + 
kernel_name="equalize_image_tensor", + sample_inputs_fn=sample_inputs_equalize_image_tensor, + reference_fn=pil_reference_wrapper(F._equalize_image_pil), + float32_vs_uint8=True, + reference_inputs_fn=reference_inputs_equalize_image_tensor, + ), + KernelInfo( + F.equalize_video, + sample_inputs_fn=sample_inputs_equalize_video, + ), + ] +) + + +def sample_inputs_invert_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader) + + +def reference_inputs_invert_image_tensor(): + for image_loader in make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]): + yield ArgsKwargs(image_loader) + + +def sample_inputs_invert_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.invert_image, + kernel_name="invert_image_tensor", + sample_inputs_fn=sample_inputs_invert_image_tensor, + reference_fn=pil_reference_wrapper(F._invert_image_pil), + reference_inputs_fn=reference_inputs_invert_image_tensor, + float32_vs_uint8=True, + ), + KernelInfo( + F.invert_video, + sample_inputs_fn=sample_inputs_invert_video, + ), + ] +) + + +_POSTERIZE_BITS = [1, 4, 8] + + +def sample_inputs_posterize_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader, bits=_POSTERIZE_BITS[0]) + + +def reference_inputs_posterize_image_tensor(): + for image_loader, bits in itertools.product( + make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), + _POSTERIZE_BITS, + ): + yield ArgsKwargs(image_loader, bits=bits) + + +def sample_inputs_posterize_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, bits=_POSTERIZE_BITS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.posterize_image, + kernel_name="posterize_image_tensor", + sample_inputs_fn=sample_inputs_posterize_image_tensor, + reference_fn=pil_reference_wrapper(F._posterize_image_pil), + reference_inputs_fn=reference_inputs_posterize_image_tensor, + float32_vs_uint8=True, + closeness_kwargs=float32_vs_uint8_pixel_difference(), + ), + KernelInfo( + F.posterize_video, + sample_inputs_fn=sample_inputs_posterize_video, + ), + ] +) + + +def _get_solarize_thresholds(dtype): + for factor in [0.1, 0.5]: + max_value = get_max_value(dtype) + yield (float if dtype.is_floating_point else int)(max_value * factor) + + +def sample_inputs_solarize_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader, threshold=next(_get_solarize_thresholds(image_loader.dtype))) + + +def reference_inputs_solarize_image_tensor(): + for image_loader in make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]): + for threshold in _get_solarize_thresholds(image_loader.dtype): + yield ArgsKwargs(image_loader, threshold=threshold) + + +def uint8_to_float32_threshold_adapter(other_args, kwargs): + return other_args, dict(threshold=kwargs["threshold"] / 255) + + +def sample_inputs_solarize_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, threshold=next(_get_solarize_thresholds(video_loader.dtype))) + + 
+KERNEL_INFOS.extend( + [ + KernelInfo( + F.solarize_image, + kernel_name="solarize_image_tensor", + sample_inputs_fn=sample_inputs_solarize_image_tensor, + reference_fn=pil_reference_wrapper(F._solarize_image_pil), + reference_inputs_fn=reference_inputs_solarize_image_tensor, + float32_vs_uint8=uint8_to_float32_threshold_adapter, + closeness_kwargs=float32_vs_uint8_pixel_difference(), + ), + KernelInfo( + F.solarize_video, + sample_inputs_fn=sample_inputs_solarize_video, + ), + ] +) + + +def sample_inputs_autocontrast_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader) + + +def reference_inputs_autocontrast_image_tensor(): + for image_loader in make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]): + yield ArgsKwargs(image_loader) + + +def sample_inputs_autocontrast_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.autocontrast_image, + kernel_name="autocontrast_image_tensor", + sample_inputs_fn=sample_inputs_autocontrast_image_tensor, + reference_fn=pil_reference_wrapper(F._autocontrast_image_pil), + reference_inputs_fn=reference_inputs_autocontrast_image_tensor, + float32_vs_uint8=True, + closeness_kwargs={ + **pil_reference_pixel_difference(), + **float32_vs_uint8_pixel_difference(), + }, + ), + KernelInfo( + F.autocontrast_video, + sample_inputs_fn=sample_inputs_autocontrast_video, + ), + ] +) + +_ADJUST_SHARPNESS_FACTORS = [0.1, 0.5] + + +def sample_inputs_adjust_sharpness_image_tensor(): + for image_loader in make_image_loaders( + sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE, (2, 2)], + color_spaces=("GRAY", "RGB"), + ): + yield ArgsKwargs(image_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) + + +def reference_inputs_adjust_sharpness_image_tensor(): + for image_loader, sharpness_factor in itertools.product( + make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), + _ADJUST_SHARPNESS_FACTORS, + ): + yield ArgsKwargs(image_loader, sharpness_factor=sharpness_factor) + + +def sample_inputs_adjust_sharpness_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_sharpness_image, + kernel_name="adjust_sharpness_image_tensor", + sample_inputs_fn=sample_inputs_adjust_sharpness_image_tensor, + reference_fn=pil_reference_wrapper(F._adjust_sharpness_image_pil), + reference_inputs_fn=reference_inputs_adjust_sharpness_image_tensor, + float32_vs_uint8=True, + closeness_kwargs=float32_vs_uint8_pixel_difference(2), + ), + KernelInfo( + F.adjust_sharpness_video, + sample_inputs_fn=sample_inputs_adjust_sharpness_video, + ), + ] +) + + +_ADJUST_CONTRAST_FACTORS = [0.1, 0.5] + + +def sample_inputs_adjust_contrast_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0]) + + +def reference_inputs_adjust_contrast_image_tensor(): + for image_loader, contrast_factor in itertools.product( + make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), + _ADJUST_CONTRAST_FACTORS, + ): + yield ArgsKwargs(image_loader, contrast_factor=contrast_factor) 
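A short sketch of what the tolerance helpers used throughout this file produce (the helper name `_illustrate_pixel_difference_tolerances` is hypothetical and not part of this patch): `pixel_difference_closeness_kwargs` expresses an "off by N uint8 steps" budget and rescales it to the target dtype's value range, and helpers such as `cuda_vs_cpu_pixel_difference` key that budget by (test id, dtype, device) so the `closeness_kwargs` of a `KernelInfo`, like the adjust_contrast entries that follow, can simply be merged with `**`.

def _illustrate_pixel_difference_tolerances():
    # Illustration only -- shows the dictionaries produced by the tolerance helpers near the top of this file.
    # A budget of 2 uint8 steps stays 2.0 for uint8 images ...
    uint8_kwargs = pixel_difference_closeness_kwargs(2)
    assert uint8_kwargs["rtol"] == 0 and uint8_kwargs["mae"] is False
    assert abs(uint8_kwargs["atol"] - 2.0) < 1e-12
    # ... and becomes 2 / 255 for float images in the [0, 1] range.
    float_kwargs = pixel_difference_closeness_kwargs(2, dtype=torch.float32)
    assert abs(float_kwargs["atol"] - 2 / 255) < 1e-12
    # The per-test helpers key the tolerance by (test id, dtype, device), e.g. for the CUDA-vs-CPU comparison:
    assert set(cuda_vs_cpu_pixel_difference()) == {
        (("TestKernels", "test_cuda_vs_cpu"), torch.uint8, "cuda"),
        (("TestKernels", "test_cuda_vs_cpu"), torch.float32, "cuda"),
    }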
+ + +def sample_inputs_adjust_contrast_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_contrast_image, + kernel_name="adjust_contrast_image_tensor", + sample_inputs_fn=sample_inputs_adjust_contrast_image_tensor, + reference_fn=pil_reference_wrapper(F._adjust_contrast_image_pil), + reference_inputs_fn=reference_inputs_adjust_contrast_image_tensor, + float32_vs_uint8=True, + closeness_kwargs={ + **pil_reference_pixel_difference(), + **float32_vs_uint8_pixel_difference(2), + **cuda_vs_cpu_pixel_difference(), + (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): pixel_difference_closeness_kwargs(1), + }, + ), + KernelInfo( + F.adjust_contrast_video, + sample_inputs_fn=sample_inputs_adjust_contrast_video, + closeness_kwargs={ + **cuda_vs_cpu_pixel_difference(), + (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): pixel_difference_closeness_kwargs(1), + }, + ), + ] +) + +_ADJUST_GAMMA_GAMMAS_GAINS = [ + (0.5, 2.0), + (0.0, 1.0), +] + + +def sample_inputs_adjust_gamma_image_tensor(): + gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0] + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader, gamma=gamma, gain=gain) + + +def reference_inputs_adjust_gamma_image_tensor(): + for image_loader, (gamma, gain) in itertools.product( + make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), + _ADJUST_GAMMA_GAMMAS_GAINS, + ): + yield ArgsKwargs(image_loader, gamma=gamma, gain=gain) + + +def sample_inputs_adjust_gamma_video(): + gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0] + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, gamma=gamma, gain=gain) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_gamma_image, + kernel_name="adjust_gamma_image_tensor", + sample_inputs_fn=sample_inputs_adjust_gamma_image_tensor, + reference_fn=pil_reference_wrapper(F._adjust_gamma_image_pil), + reference_inputs_fn=reference_inputs_adjust_gamma_image_tensor, + float32_vs_uint8=True, + closeness_kwargs={ + **pil_reference_pixel_difference(), + **float32_vs_uint8_pixel_difference(), + }, + ), + KernelInfo( + F.adjust_gamma_video, + sample_inputs_fn=sample_inputs_adjust_gamma_video, + ), + ] +) + + +_ADJUST_HUE_FACTORS = [-0.1, 0.5] + + +def sample_inputs_adjust_hue_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader, hue_factor=_ADJUST_HUE_FACTORS[0]) + + +def reference_inputs_adjust_hue_image_tensor(): + for image_loader, hue_factor in itertools.product( + make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), + _ADJUST_HUE_FACTORS, + ): + yield ArgsKwargs(image_loader, hue_factor=hue_factor) + + +def sample_inputs_adjust_hue_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, hue_factor=_ADJUST_HUE_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_hue_image, + kernel_name="adjust_hue_image_tensor", + sample_inputs_fn=sample_inputs_adjust_hue_image_tensor, + reference_fn=pil_reference_wrapper(F._adjust_hue_image_pil), + reference_inputs_fn=reference_inputs_adjust_hue_image_tensor, + 
float32_vs_uint8=True, + closeness_kwargs={ + **pil_reference_pixel_difference(2, mae=True), + **float32_vs_uint8_pixel_difference(), + }, + ), + KernelInfo( + F.adjust_hue_video, + sample_inputs_fn=sample_inputs_adjust_hue_video, + ), + ] +) + +_ADJUST_SATURATION_FACTORS = [0.1, 0.5] + + +def sample_inputs_adjust_saturation_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0]) + + +def reference_inputs_adjust_saturation_image_tensor(): + for image_loader, saturation_factor in itertools.product( + make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), + _ADJUST_SATURATION_FACTORS, + ): + yield ArgsKwargs(image_loader, saturation_factor=saturation_factor) + + +def sample_inputs_adjust_saturation_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_saturation_image, + kernel_name="adjust_saturation_image_tensor", + sample_inputs_fn=sample_inputs_adjust_saturation_image_tensor, + reference_fn=pil_reference_wrapper(F._adjust_saturation_image_pil), + reference_inputs_fn=reference_inputs_adjust_saturation_image_tensor, + float32_vs_uint8=True, + closeness_kwargs={ + **pil_reference_pixel_difference(), + **float32_vs_uint8_pixel_difference(2), + **cuda_vs_cpu_pixel_difference(), + }, + ), + KernelInfo( + F.adjust_saturation_video, + sample_inputs_fn=sample_inputs_adjust_saturation_video, + closeness_kwargs=cuda_vs_cpu_pixel_difference(), + ), + ] +) + + +def sample_inputs_clamp_bounding_boxes(): + for bounding_boxes_loader in make_bounding_box_loaders(): + yield ArgsKwargs( + bounding_boxes_loader, + format=bounding_boxes_loader.format, + canvas_size=bounding_boxes_loader.canvas_size, + ) + + +KERNEL_INFOS.append( + KernelInfo( + F.clamp_bounding_boxes, + sample_inputs_fn=sample_inputs_clamp_bounding_boxes, + logs_usage=True, + ) +) + +_FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]] + + +def _get_five_ten_crop_canvas_size(size): + if isinstance(size, int): + crop_height = crop_width = size + elif len(size) == 1: + crop_height = crop_width = size[0] + else: + crop_height, crop_width = size + return 2 * crop_height, 2 * crop_width + + +def sample_inputs_five_crop_image_tensor(): + for size in _FIVE_TEN_CROP_SIZES: + for image_loader in make_image_loaders( + sizes=[_get_five_ten_crop_canvas_size(size)], + color_spaces=["RGB"], + dtypes=[torch.float32], + ): + yield ArgsKwargs(image_loader, size=size) + + +def reference_inputs_five_crop_image_tensor(): + for size in _FIVE_TEN_CROP_SIZES: + for image_loader in make_image_loaders( + sizes=[_get_five_ten_crop_canvas_size(size)], extra_dims=[()], dtypes=[torch.uint8] + ): + yield ArgsKwargs(image_loader, size=size) + + +def sample_inputs_five_crop_video(): + size = _FIVE_TEN_CROP_SIZES[0] + for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_canvas_size(size)]): + yield ArgsKwargs(video_loader, size=size) + + +def sample_inputs_ten_crop_image_tensor(): + for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): + for image_loader in make_image_loaders( + sizes=[_get_five_ten_crop_canvas_size(size)], + color_spaces=["RGB"], + dtypes=[torch.float32], + ): + yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) + + +def 
reference_inputs_ten_crop_image_tensor(): + for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): + for image_loader in make_image_loaders( + sizes=[_get_five_ten_crop_canvas_size(size)], extra_dims=[()], dtypes=[torch.uint8] + ): + yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) + + +def sample_inputs_ten_crop_video(): + size = _FIVE_TEN_CROP_SIZES[0] + for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_canvas_size(size)]): + yield ArgsKwargs(video_loader, size=size) + + +def multi_crop_pil_reference_wrapper(pil_kernel): + def wrapper(input_tensor, *other_args, **kwargs): + output = pil_reference_wrapper(pil_kernel)(input_tensor, *other_args, **kwargs) + return type(output)( + F.to_dtype_image(F.to_image(output_pil), dtype=input_tensor.dtype, scale=True) for output_pil in output + ) + + return wrapper + + +_common_five_ten_crop_marks = [ + xfail_jit_python_scalar_arg("size"), + mark_framework_limitation(("TestKernels", "test_batched_vs_single"), "Custom batching needed."), +] + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.five_crop_image, + sample_inputs_fn=sample_inputs_five_crop_image_tensor, + reference_fn=multi_crop_pil_reference_wrapper(F._five_crop_image_pil), + reference_inputs_fn=reference_inputs_five_crop_image_tensor, + test_marks=_common_five_ten_crop_marks, + ), + KernelInfo( + F.five_crop_video, + sample_inputs_fn=sample_inputs_five_crop_video, + test_marks=_common_five_ten_crop_marks, + ), + KernelInfo( + F.ten_crop_image, + sample_inputs_fn=sample_inputs_ten_crop_image_tensor, + reference_fn=multi_crop_pil_reference_wrapper(F._ten_crop_image_pil), + reference_inputs_fn=reference_inputs_ten_crop_image_tensor, + test_marks=_common_five_ten_crop_marks, + ), + KernelInfo( + F.ten_crop_video, + sample_inputs_fn=sample_inputs_ten_crop_video, + test_marks=_common_five_ten_crop_marks, + ), + ] +) + +_NORMALIZE_MEANS_STDS = [ + ((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ([0.0, 0.0, 0.0], [1.0, 1.0, 1.0]), + (0.5, 2.0), +] + + +def sample_inputs_normalize_image_tensor(): + for image_loader, (mean, std) in itertools.product( + make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], dtypes=[torch.float32]), + _NORMALIZE_MEANS_STDS, + ): + yield ArgsKwargs(image_loader, mean=mean, std=std) + + +def reference_normalize_image_tensor(image, mean, std, inplace=False): + mean = torch.tensor(mean).view(-1, 1, 1) + std = torch.tensor(std).view(-1, 1, 1) + + sub = torch.Tensor.sub_ if inplace else torch.Tensor.sub + return sub(image, mean).div_(std) + + +def reference_inputs_normalize_image_tensor(): + yield ArgsKwargs( + make_image_loader(size=(32, 32), color_space="RGB", extra_dims=[1]), + mean=[0.5, 0.5, 0.5], + std=[1.0, 1.0, 1.0], + ) + + +def sample_inputs_normalize_video(): + mean, std = _NORMALIZE_MEANS_STDS[0] + for video_loader in make_video_loaders( + sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], num_frames=[3], dtypes=[torch.float32] + ): + yield ArgsKwargs(video_loader, mean=mean, std=std) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.normalize_image, + kernel_name="normalize_image_tensor", + sample_inputs_fn=sample_inputs_normalize_image_tensor, + reference_fn=reference_normalize_image_tensor, + reference_inputs_fn=reference_inputs_normalize_image_tensor, + test_marks=[ + xfail_jit_python_scalar_arg("mean"), + xfail_jit_python_scalar_arg("std"), + ], + ), + KernelInfo( + F.normalize_video, + sample_inputs_fn=sample_inputs_normalize_video, + ), + ] +) + + +def 
sample_inputs_uniform_temporal_subsample_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[4]): + yield ArgsKwargs(video_loader, num_samples=2) + + +def reference_uniform_temporal_subsample_video(x, num_samples): + # Copy-pasted from + # https://github.com/facebookresearch/pytorchvideo/blob/c8d23d8b7e597586a9e2d18f6ed31ad8aa379a7a/pytorchvideo/transforms/functional.py#L19 + t = x.shape[-4] + assert num_samples > 0 and t > 0 + # Sample by nearest neighbor interpolation if num_samples > t. + indices = torch.linspace(0, t - 1, num_samples) + indices = torch.clamp(indices, 0, t - 1).long() + return torch.index_select(x, -4, indices) + + +def reference_inputs_uniform_temporal_subsample_video(): + for video_loader in make_video_loaders( + sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], num_frames=[10] + ): + for num_samples in range(1, video_loader.shape[-4] + 1): + yield ArgsKwargs(video_loader, num_samples) + + +KERNEL_INFOS.append( + KernelInfo( + F.uniform_temporal_subsample_video, + sample_inputs_fn=sample_inputs_uniform_temporal_subsample_video, + reference_fn=reference_uniform_temporal_subsample_video, + reference_inputs_fn=reference_inputs_uniform_temporal_subsample_video, + ) +) diff --git a/test/transforms_v2_legacy_utils.py b/test/transforms_v2_legacy_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9dead79342246c354211b59da5aa6233c9fd4191 --- /dev/null +++ b/test/transforms_v2_legacy_utils.py @@ -0,0 +1,633 @@ +""" +As the name implies, these are legacy utilities that will hopefully be removed soon. The future of +transforms v2 testing is in test/test_transforms_v2_refactored.py. All new tests should be +implemented there and must not use any of the utilities here. + +The following legacy modules depend on this module: + +- transforms_v2_kernel_infos.py +- transforms_v2_dispatcher_infos.py +- test_transforms_v2_functional.py +- test_transforms_v2_consistency.py +- test_transforms.py + +When all the logic is ported from the files above to test_transforms_v2_refactored.py, delete +all the legacy modules, including this one, and drop the _refactored prefix from the name. +""" + +import collections.abc +import dataclasses +import enum +import itertools +import pathlib +from collections import defaultdict +from typing import Callable, Sequence, Tuple, Union + +import PIL.Image +import pytest +import torch + +from torchvision import tv_tensors +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.v2.functional import to_dtype_image, to_image, to_pil_image + + +def combinations_grid(**kwargs): + """Creates a grid of input combinations. + + Each element in the returned sequence is a dictionary containing one possible combination as values.
+ + Example: + >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) + [ + {'foo': 'bar', 'spam': 'eggs'}, + {'foo': 'bar', 'spam': 'ham'}, + {'foo': 'baz', 'spam': 'eggs'}, + {'foo': 'baz', 'spam': 'ham'} + ] + """ + return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] + + +DEFAULT_SIZE = (17, 11) + +NUM_CHANNELS_MAP = { + "GRAY": 1, + "GRAY_ALPHA": 2, + "RGB": 3, + "RGBA": 4, +} + + +def make_image( + size=DEFAULT_SIZE, + *, + color_space="RGB", + batch_dims=(), + dtype=None, + device="cpu", + memory_format=torch.contiguous_format, +): + num_channels = NUM_CHANNELS_MAP[color_space] + dtype = dtype or torch.uint8 + max_value = get_max_value(dtype) + data = torch.testing.make_tensor( + (*batch_dims, num_channels, *size), + low=0, + high=max_value, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + if color_space in {"GRAY_ALPHA", "RGBA"}: + data[..., -1, :, :] = max_value + + return tv_tensors.Image(data) + + +def make_image_tensor(*args, **kwargs): + return make_image(*args, **kwargs).as_subclass(torch.Tensor) + + +def make_image_pil(*args, **kwargs): + return to_pil_image(make_image(*args, **kwargs)) + + +def make_bounding_boxes( + canvas_size=DEFAULT_SIZE, + *, + format=tv_tensors.BoundingBoxFormat.XYXY, + batch_dims=(), + dtype=None, + device="cpu", +): + def sample_position(values, max_value): + # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. + # However, if we have batch_dims, we need tensors as limits. + return torch.stack([torch.randint(max_value - v, ()) for v in values.flatten().tolist()]).reshape(values.shape) + + if isinstance(format, str): + format = tv_tensors.BoundingBoxFormat[format] + + dtype = dtype or torch.float32 + + if any(dim == 0 for dim in batch_dims): + return tv_tensors.BoundingBoxes( + torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, canvas_size=canvas_size + ) + + h, w = [torch.randint(1, c, batch_dims) for c in canvas_size] + y = sample_position(h, canvas_size[0]) + x = sample_position(w, canvas_size[1]) + + if format is tv_tensors.BoundingBoxFormat.XYWH: + parts = (x, y, w, h) + elif format is tv_tensors.BoundingBoxFormat.XYXY: + x1, y1 = x, y + x2 = x1 + w + y2 = y1 + h + parts = (x1, y1, x2, y2) + elif format is tv_tensors.BoundingBoxFormat.CXCYWH: + cx = x + w / 2 + cy = y + h / 2 + parts = (cx, cy, w, h) + else: + raise ValueError(f"Format {format} is not supported") + + return tv_tensors.BoundingBoxes( + torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size + ) + + +def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtype=None, device="cpu"): + """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" + return tv_tensors.Mask( + torch.testing.make_tensor( + (*batch_dims, num_objects, *size), + low=0, + high=2, + dtype=dtype or torch.bool, + device=device, + ) + ) + + +def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): + """Make a "segmentation" mask, i.e. 
(*, H, W), where the category is encoded as pixel value""" + return tv_tensors.Mask( + torch.testing.make_tensor( + (*batch_dims, *size), + low=0, + high=num_categories, + dtype=dtype or torch.uint8, + device=device, + ) + ) + + +def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): + return tv_tensors.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) + + +def make_video_tensor(*args, **kwargs): + return make_video(*args, **kwargs).as_subclass(torch.Tensor) + + +DEFAULT_SQUARE_SPATIAL_SIZE = 15 +DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33) +DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9) +DEFAULT_SPATIAL_SIZES = ( + DEFAULT_LANDSCAPE_SPATIAL_SIZE, + DEFAULT_PORTRAIT_SPATIAL_SIZE, + DEFAULT_SQUARE_SPATIAL_SIZE, +) + + +def _parse_size(size, *, name="size"): + if size == "random": + raise ValueError("This should never happen") + elif isinstance(size, int) and size > 0: + return (size, size) + elif ( + isinstance(size, collections.abc.Sequence) + and len(size) == 2 + and all(isinstance(length, int) and length > 0 for length in size) + ): + return tuple(size) + else: + raise pytest.UsageError( + f"'{name}' can either be `'random'`, a positive integer, or a sequence of two positive integers," + f"but got {size} instead." + ) + + +def get_num_channels(color_space): + num_channels = NUM_CHANNELS_MAP.get(color_space) + if not num_channels: + raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}") + return num_channels + + +VALID_EXTRA_DIMS = ((), (4,), (2, 3)) +DEGENERATE_BATCH_DIMS = ((0,), (5, 0), (0, 5)) + +DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS) + + +def from_loader(loader_fn): + def wrapper(*args, **kwargs): + device = kwargs.pop("device", "cpu") + loader = loader_fn(*args, **kwargs) + return loader.load(device) + + return wrapper + + +def from_loaders(loaders_fn): + def wrapper(*args, **kwargs): + device = kwargs.pop("device", "cpu") + loaders = loaders_fn(*args, **kwargs) + for loader in loaders: + yield loader.load(device) + + return wrapper + + +@dataclasses.dataclass +class TensorLoader: + fn: Callable[[Sequence[int], torch.dtype, Union[str, torch.device]], torch.Tensor] + shape: Sequence[int] + dtype: torch.dtype + + def load(self, device): + return self.fn(self.shape, self.dtype, device) + + +@dataclasses.dataclass +class ImageLoader(TensorLoader): + spatial_size: Tuple[int, int] = dataclasses.field(init=False) + num_channels: int = dataclasses.field(init=False) + memory_format: torch.memory_format = torch.contiguous_format + canvas_size: Tuple[int, int] = dataclasses.field(init=False) + + def __post_init__(self): + self.spatial_size = self.canvas_size = self.shape[-2:] + self.num_channels = self.shape[-3] + + def load(self, device): + return self.fn(self.shape, self.dtype, device, memory_format=self.memory_format) + + +def make_image_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + *, + color_space="RGB", + extra_dims=(), + dtype=torch.float32, + constant_alpha=True, + memory_format=torch.contiguous_format, +): + if not constant_alpha: + raise ValueError("This should never happen") + size = _parse_size(size) + num_channels = get_num_channels(color_space) + + def fn(shape, dtype, device, memory_format): + *batch_dims, _, height, width = shape + return make_image( + (height, width), + color_space=color_space, + batch_dims=batch_dims, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + + return ImageLoader(fn, shape=(*extra_dims, num_channels, *size), dtype=dtype, 
memory_format=memory_format) + + +def make_image_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + color_spaces=( + "GRAY", + "GRAY_ALPHA", + "RGB", + "RGBA", + ), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.float32, torch.float64, torch.uint8), + constant_alpha=True, +): + for params in combinations_grid(size=sizes, color_space=color_spaces, extra_dims=extra_dims, dtype=dtypes): + yield make_image_loader(**params, constant_alpha=constant_alpha) + + +make_images = from_loaders(make_image_loaders) + + +def make_image_loader_for_interpolation( + size=(233, 147), *, color_space="RGB", dtype=torch.uint8, memory_format=torch.contiguous_format +): + size = _parse_size(size) + num_channels = get_num_channels(color_space) + + def fn(shape, dtype, device, memory_format): + height, width = shape[-2:] + + image_pil = ( + PIL.Image.open(pathlib.Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") + .resize((width, height)) + .convert( + { + "GRAY": "L", + "GRAY_ALPHA": "LA", + "RGB": "RGB", + "RGBA": "RGBA", + }[color_space] + ) + ) + + image_tensor = to_image(image_pil) + if memory_format == torch.contiguous_format: + image_tensor = image_tensor.to(device=device, memory_format=memory_format, copy=True) + else: + image_tensor = image_tensor.to(device=device) + image_tensor = to_dtype_image(image_tensor, dtype=dtype, scale=True) + + return tv_tensors.Image(image_tensor) + + return ImageLoader(fn, shape=(num_channels, *size), dtype=dtype, memory_format=memory_format) + + +def make_image_loaders_for_interpolation( + sizes=((233, 147),), + color_spaces=("RGB",), + dtypes=(torch.uint8,), + memory_formats=(torch.contiguous_format, torch.channels_last), +): + for params in combinations_grid(size=sizes, color_space=color_spaces, dtype=dtypes, memory_format=memory_formats): + yield make_image_loader_for_interpolation(**params) + + +@dataclasses.dataclass +class BoundingBoxesLoader(TensorLoader): + format: tv_tensors.BoundingBoxFormat + spatial_size: Tuple[int, int] + canvas_size: Tuple[int, int] = dataclasses.field(init=False) + + def __post_init__(self): + self.canvas_size = self.spatial_size + + +def make_bounding_box_loader(*, extra_dims=(), format, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): + if isinstance(format, str): + format = tv_tensors.BoundingBoxFormat[format] + + spatial_size = _parse_size(spatial_size, name="spatial_size") + + def fn(shape, dtype, device): + *batch_dims, num_coordinates = shape + if num_coordinates != 4: + raise pytest.UsageError() + + return make_bounding_boxes( + format=format, canvas_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device + ) + + return BoundingBoxesLoader(fn, shape=(*extra_dims[-1:], 4), dtype=dtype, format=format, spatial_size=spatial_size) + + +def make_bounding_box_loaders( + *, + extra_dims=tuple(d for d in DEFAULT_EXTRA_DIMS if len(d) < 2), + formats=tuple(tv_tensors.BoundingBoxFormat), + spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + dtypes=(torch.float32, torch.float64, torch.int64), +): + for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): + yield make_bounding_box_loader(**params, spatial_size=spatial_size) + + +make_multiple_bounding_boxes = from_loaders(make_bounding_box_loaders) + + +class MaskLoader(TensorLoader): + pass + + +def make_detection_mask_loader(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_objects=5, extra_dims=(), dtype=torch.uint8): + # This produces "detection" masks, i.e. 
`(*, N, H, W)`, where `N` denotes the number of objects + size = _parse_size(size) + + def fn(shape, dtype, device): + *batch_dims, num_objects, height, width = shape + return make_detection_mask( + (height, width), num_objects=num_objects, batch_dims=batch_dims, dtype=dtype, device=device + ) + + return MaskLoader(fn, shape=(*extra_dims, num_objects, *size), dtype=dtype) + + +def make_detection_mask_loaders( + sizes=DEFAULT_SPATIAL_SIZES, + num_objects=(1, 0, 5), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + for params in combinations_grid(size=sizes, num_objects=num_objects, extra_dims=extra_dims, dtype=dtypes): + yield make_detection_mask_loader(**params) + + +make_detection_masks = from_loaders(make_detection_mask_loaders) + + +def make_segmentation_mask_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_categories=10, extra_dims=(), dtype=torch.uint8 +): + # This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values + size = _parse_size(size) + + def fn(shape, dtype, device): + *batch_dims, height, width = shape + return make_segmentation_mask( + (height, width), num_categories=num_categories, batch_dims=batch_dims, dtype=dtype, device=device + ) + + return MaskLoader(fn, shape=(*extra_dims, *size), dtype=dtype) + + +def make_segmentation_mask_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + num_categories=(1, 2, 10), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + for params in combinations_grid(size=sizes, num_categories=num_categories, extra_dims=extra_dims, dtype=dtypes): + yield make_segmentation_mask_loader(**params) + + +make_segmentation_masks = from_loaders(make_segmentation_mask_loaders) + + +def make_mask_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + num_objects=(1, 0, 5), + num_categories=(1, 2, 10), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + yield from make_detection_mask_loaders(sizes=sizes, num_objects=num_objects, extra_dims=extra_dims, dtypes=dtypes) + yield from make_segmentation_mask_loaders( + sizes=sizes, num_categories=num_categories, extra_dims=extra_dims, dtypes=dtypes + ) + + +make_masks = from_loaders(make_mask_loaders) + + +class VideoLoader(ImageLoader): + pass + + +def make_video_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + *, + color_space="RGB", + num_frames=3, + extra_dims=(), + dtype=torch.uint8, +): + size = _parse_size(size) + + def fn(shape, dtype, device, memory_format): + *batch_dims, num_frames, _, height, width = shape + return make_video( + (height, width), + num_frames=num_frames, + batch_dims=batch_dims, + color_space=color_space, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + + return VideoLoader(fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype) + + +def make_video_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + color_spaces=( + "GRAY", + "RGB", + ), + num_frames=(1, 0, 3), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8, torch.float32, torch.float64), +): + for params in combinations_grid( + size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes + ): + yield make_video_loader(**params) + + +make_videos = from_loaders(make_video_loaders) + + +class TestMark: + def __init__( + self, + # Tuple of test class name and test function name that identifies the test the mark is applied to. If there is + # no test class, i.e. a standalone test function, use `None`. + test_id, + # `pytest.mark.*` to apply, e.g. 
`pytest.mark.skip` or `pytest.mark.xfail` + mark, + *, + # Callable that will be passed an `ArgsKwargs` and should return a boolean to indicate if the mark will be + # applied. If omitted, defaults to always apply. + condition=None, + ): + self.test_id = test_id + self.mark = mark + self.condition = condition or (lambda args_kwargs: True) + + +def mark_framework_limitation(test_id, reason, condition=None): + # The purpose of this function is to have a single entry point for skip marks that are only there because the test + # framework cannot handle the kernel in general or a specific parameter combination. + # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is + # still justified. + # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus, + # we would be wasting CI resources for no reason most of the time. + return TestMark(test_id, pytest.mark.skip(reason=reason), condition=condition) + + +class InfoBase: + def __init__( + self, + *, + # Identifier of the info that shows up in the parametrization. + id, + # Test markers that will be (conditionally) applied to an `ArgsKwargs` parametrization. + # See the `TestMark` class for details. + test_marks=None, + # Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. Keys are a 3-tuple of `test_id` (see + # `TestMark`), the dtype, and the device. + closeness_kwargs=None, + ): + self.id = id + + self.test_marks = test_marks or [] + test_marks_map = defaultdict(list) + for test_mark in self.test_marks: + test_marks_map[test_mark.test_id].append(test_mark) + self._test_marks_map = dict(test_marks_map) + + self.closeness_kwargs = closeness_kwargs or dict() + + def get_marks(self, test_id, args_kwargs): + return [ + test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) + ] + + def get_closeness_kwargs(self, test_id, *, dtype, device): + if not (isinstance(test_id, tuple) and len(test_id) == 2): + msg = "`test_id` should be a `Tuple[Optional[str], str]` denoting the test class and function name" + if callable(test_id): + msg += ". Did you forget to add the `test_id` fixture to the parameters of the test?" + else: + msg += f", but got {test_id} instead."
+ raise pytest.UsageError(msg) + if isinstance(device, torch.device): + device = device.type + return self.closeness_kwargs.get((test_id, dtype, device), dict()) + + +class ArgsKwargs: + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + def __iter__(self): + yield self.args + yield self.kwargs + + def load(self, device="cpu"): + return ArgsKwargs( + *(arg.load(device) if isinstance(arg, TensorLoader) else arg for arg in self.args), + **{ + keyword: arg.load(device) if isinstance(arg, TensorLoader) else arg + for keyword, arg in self.kwargs.items() + }, + ) + + +def parametrized_error_message(*args, **kwargs): + def to_str(obj): + if isinstance(obj, torch.Tensor) and obj.numel() > 30: + return f"tensor(shape={list(obj.shape)}, dtype={obj.dtype}, device={obj.device})" + elif isinstance(obj, enum.Enum): + return f"{type(obj).__name__}.{obj.name}" + else: + return repr(obj) + + if args or kwargs: + postfix = "\n".join( + [ + "", + "Failure happened for the following parameters:", + "", + *[to_str(arg) for arg in args], + *[f"{name}={to_str(kwarg)}" for name, kwarg in kwargs.items()], + ] + ) + else: + postfix = "" + + def wrapper(msg): + return msg + postfix + + return wrapper diff --git a/torchvision/__init__.py b/torchvision/__init__.py index 06e9b42301f4403404ca392f24490636163995eb..5e8f06e3d0f98a63426f070d9b9b28d910b2c096 100644 --- a/torchvision/__init__.py +++ b/torchvision/__init__.py @@ -1,8 +1,9 @@ import os import warnings +from modulefinder import Module import torch -from torchvision import datasets, io, models, ops, transforms, utils +from torchvision import _meta_registrations, datasets, io, models, ops, transforms, utils from .extension import _HAS_OPS @@ -71,11 +72,16 @@ def set_video_backend(backend): backend, please compile torchvision from source. """ global _video_backend - if backend not in ["pyav", "video_reader"]: - raise ValueError("Invalid video backend '%s'. Options are 'pyav' and 'video_reader'" % backend) + if backend not in ["pyav", "video_reader", "cuda"]: + raise ValueError("Invalid video backend '%s'. Options are 'pyav', 'video_reader' and 'cuda'" % backend) if backend == "video_reader" and not io._HAS_VIDEO_OPT: + # TODO: better messages message = "video_reader video backend is not available. Please compile torchvision from source and try again" - warnings.warn(message) + raise RuntimeError(message) + elif backend == "cuda" and not io._HAS_GPU_VIDEO_DECODER: + # TODO: better messages + message = "cuda video backend is not available." + raise RuntimeError(message) else: _video_backend = backend @@ -93,3 +99,9 @@ def get_video_backend(): def _is_tracing(): return torch._C._get_tracing_state() + + +def disable_beta_transforms_warning(): + # Noop, only exists to avoid breaking existing code. 
+ # See https://github.com/pytorch/vision/issues/7896 + pass diff --git a/torchvision/_internally_replaced_utils.py b/torchvision/_internally_replaced_utils.py index 18afc3ed93a8272600d73cc240047a0a49f23991..d9a6e261ea277989f4362037352cb24da6564460 100644 --- a/torchvision/_internally_replaced_utils.py +++ b/torchvision/_internally_replaced_utils.py @@ -28,7 +28,6 @@ def _get_extension_path(lib_name): if os.name == "nt": # Register the main torchvision library location on the default DLL path import ctypes - import sys kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) with_load_library_flags = hasattr(kernel32, "AddDllDirectory") @@ -37,14 +36,7 @@ def _get_extension_path(lib_name): if with_load_library_flags: kernel32.AddDllDirectory.restype = ctypes.c_void_p - if sys.version_info >= (3, 8): - os.add_dll_directory(lib_dir) - elif with_load_library_flags: - res = kernel32.AddDllDirectory(lib_dir) - if res is None: - err = ctypes.WinError(ctypes.get_last_error()) - err.strerror += f' Error adding "{lib_dir}" to the DLL directories.' - raise err + os.add_dll_directory(lib_dir) kernel32.SetErrorMode(prev_error_mode) diff --git a/torchvision/_meta_registrations.py b/torchvision/_meta_registrations.py new file mode 100644 index 0000000000000000000000000000000000000000..9831cfdcb456ef5d569f2a76b3de366f9a59f8f9 --- /dev/null +++ b/torchvision/_meta_registrations.py @@ -0,0 +1,50 @@ +import functools + +import torch +import torch.library + +# Ensure that torch.ops.torchvision is visible +import torchvision.extension # noqa: F401 + + +@functools.lru_cache(None) +def get_meta_lib(): + return torch.library.Library("torchvision", "IMPL", "Meta") + + +def register_meta(op_name, overload_name="default"): + def wrapper(fn): + if torchvision.extension._has_ops(): + get_meta_lib().impl(getattr(getattr(torch.ops.torchvision, op_name), overload_name), fn) + return fn + + return wrapper + + +@register_meta("roi_align") +def meta_roi_align(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned): + torch._check(rois.size(1) == 5, lambda: "rois must have shape as Tensor[K, 5]") + torch._check( + input.dtype == rois.dtype, + lambda: ( + "Expected tensor for input to have the same type as tensor for rois; " + f"but type {input.dtype} does not equal {rois.dtype}" + ), + ) + num_rois = rois.size(0) + _, channels, height, width = input.size() + return input.new_empty((num_rois, channels, pooled_height, pooled_width)) + + +@register_meta("_roi_align_backward") +def meta_roi_align_backward( + grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio, aligned +): + torch._check( + grad.dtype == rois.dtype, + lambda: ( + "Expected tensor for grad to have the same type as tensor for rois; " + f"but type {grad.dtype} does not equal {rois.dtype}" + ), + ) + return grad.new_empty((batch_size, channels, height, width)) diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp index f13e2c3ffcfdfdae59be744c5fabc211fb7d0fd3..a895eed2d397488d65641c73d4227a53ef8ac59d 100644 --- a/torchvision/csrc/io/decoder/decoder.cpp +++ b/torchvision/csrc/io/decoder/decoder.cpp @@ -312,6 +312,8 @@ bool Decoder::init( } } + av_dict_set_int(&options, "probesize", params_.probeSize, 0); + interrupted_ = false; // ffmpeg avformat_open_input call can hang if media source doesn't respond diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h index 
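The new torchvision/_meta_registrations.py above registers a shape-only ("meta") implementation for roi_align and its backward, so tracing and torch.compile can infer output shapes without running the real kernel. A minimal sketch of what that enables, assuming torchvision was built with its C++ ops so the registration actually runs (the shapes below are made up for illustration):

import torch
import torchvision  # importing torchvision runs _meta_registrations

feats = torch.empty(2, 256, 32, 32, device="meta")  # [N, C, H, W] feature map
rois = torch.empty(10, 5, device="meta")            # [K, 5] boxes, first column is the batch index

# Dispatches to meta_roi_align: only the output shape [K, C, pooled_h, pooled_w] is computed.
out = torch.ops.torchvision.roi_align(feats, rois, 0.25, 7, 7, 2, False)
print(out.shape)  # torch.Size([10, 256, 7, 7])
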
dac6293d366ff7580b64050d48119b30f71b2fd3..6be50f8abc2dc1bd7e10d279f267c67542a585e7 100644 --- a/torchvision/csrc/io/decoder/defs.h +++ b/torchvision/csrc/io/decoder/defs.h @@ -165,7 +165,7 @@ struct MediaFormat { struct DecoderParameters { // local file, remote file, http url, rtmp stream uri, etc. anything that // ffmpeg can recognize - std::string uri; + std::string uri{std::string()}; // timeout on getting bytes for decoding size_t timeoutMs{1000}; // logging level, default AV_LOG_PANIC @@ -213,6 +213,12 @@ struct DecoderParameters { // Skip packets that fail with EPERM errors and continue decoding. bool skipOperationNotPermittedPackets{false}; + + // probing size in bytes, i.e. the size of the data to analyze to get stream + // information. A higher value will enable detecting more information in case + // it is dispersed into the stream, but will increase latency. Must be an + // integer not lesser than 32. It is 5000000 by default. + int64_t probeSize{5000000}; }; struct DecoderHeader { @@ -295,7 +301,7 @@ struct DecoderMetadata { }; /** * Abstract class for decoding media bytes - * It has two diffrent modes. Internal media bytes retrieval for given uri and + * It has two different modes. Internal media bytes retrieval for given uri and * external media bytes provider in case of memory streams */ class MediaDecoder { diff --git a/torchvision/csrc/io/decoder/memory_buffer.cpp b/torchvision/csrc/io/decoder/memory_buffer.cpp index a7b0128e3edecfa3b18bc3730a8e75e384d2738a..4e420c3b3cd685e8fbda2fd84f6f9256dbfc2229 100644 --- a/torchvision/csrc/io/decoder/memory_buffer.cpp +++ b/torchvision/csrc/io/decoder/memory_buffer.cpp @@ -61,7 +61,7 @@ DecoderInCallback MemoryBuffer::getCallback( } // seek mode if (!timeoutMs) { - // seek capabilty, yes - supported + // seek capability, yes - supported return 0; } return object.seek(size, whence); diff --git a/torchvision/csrc/io/decoder/sync_decoder_test.cpp b/torchvision/csrc/io/decoder/sync_decoder_test.cpp index 936d1e94f46b535387142c39d9f7da51584ba658..980725c2fcb43fdd6dee2b61ee230c93673f3de8 100644 --- a/torchvision/csrc/io/decoder/sync_decoder_test.cpp +++ b/torchvision/csrc/io/decoder/sync_decoder_test.cpp @@ -368,7 +368,7 @@ TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) { } // seek mode if (!timeoutMs) { - // seek capabilty, yes - no + // seek capability, yes - no return -1; } return object.seek(size, whence); @@ -408,7 +408,7 @@ TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) { } // seek mode if (!timeoutMs) { - // seek capabilty, yes - no + // seek capability, yes - no return -1; } return object.seek(size, whence); diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp index 62ec0709be1d90bf6ff47c33919410fc15601414..8b712609e3439cd6478968e7a5410a276cb9758b 100644 --- a/torchvision/csrc/io/decoder/video_sampler.cpp +++ b/torchvision/csrc/io/decoder/video_sampler.cpp @@ -181,6 +181,23 @@ bool VideoSampler::init(const SamplerParameters& params) { // set output format params_ = params; + if (params.in.video.format == AV_PIX_FMT_YUV420P) { + /* When the video width and height are not multiples of 8, + * and there is no size change in the conversion, + * a blurry screen will appear on the right side + * This problem was discovered in 2012 and + * continues to exist in version 4.1.3 in 2019 + * This problem can be avoided by increasing SWS_ACCURATE_RND + * details https://trac.ffmpeg.org/ticket/1582 + */ + if ((params.in.video.width & 0x7) || (params.in.video.height & 
0x7)) { + VLOG(1) << "The width " << params.in.video.width << " and height " + << params.in.video.height << " the image is not a multiple of 8, " + << "the decoding speed may be reduced"; + swsFlags_ |= SWS_ACCURATE_RND; + } + } + scaleContext_ = sws_getContext( params.in.video.width, params.in.video.height, diff --git a/torchvision/csrc/io/image/cpu/decode_image.cpp b/torchvision/csrc/io/image/cpu/decode_image.cpp index 1cc05dc76cadcb563a777c444139799186d1977e..da4dc5833dea5dc4c8ce78772412e2881c84aadd 100644 --- a/torchvision/csrc/io/image/cpu/decode_image.cpp +++ b/torchvision/csrc/io/image/cpu/decode_image.cpp @@ -7,6 +7,8 @@ namespace vision { namespace image { torch::Tensor decode_image(const torch::Tensor& data, ImageReadMode mode) { + // Check that tensor is a CPU tensor + TORCH_CHECK(data.device() == torch::kCPU, "Expected a CPU tensor"); // Check that the input tensor dtype is uint8 TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); // Check that the input tensor is 1-dimensional diff --git a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp index 6ec644d003ee9d4211c5cb27e81e124b03a7bbe7..63a4e5b42ec8f137356fe4f5be848a3233522882 100644 --- a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp +++ b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp @@ -67,6 +67,58 @@ static void torch_jpeg_set_source_mgr( src->pub.next_input_byte = src->data; } +inline unsigned char clamped_cmyk_rgb_convert( + unsigned char k, + unsigned char cmy) { + // Inspired from Pillow: + // https://github.com/python-pillow/Pillow/blob/07623d1a7cc65206a5355fba2ae256550bfcaba6/src/libImaging/Convert.c#L568-L569 + int v = k * cmy + 128; + v = ((v >> 8) + v) >> 8; + return std::clamp(k - v, 0, 255); +} + +void convert_line_cmyk_to_rgb( + j_decompress_ptr cinfo, + const unsigned char* cmyk_line, + unsigned char* rgb_line) { + int width = cinfo->output_width; + for (int i = 0; i < width; ++i) { + int c = cmyk_line[i * 4 + 0]; + int m = cmyk_line[i * 4 + 1]; + int y = cmyk_line[i * 4 + 2]; + int k = cmyk_line[i * 4 + 3]; + + rgb_line[i * 3 + 0] = clamped_cmyk_rgb_convert(k, 255 - c); + rgb_line[i * 3 + 1] = clamped_cmyk_rgb_convert(k, 255 - m); + rgb_line[i * 3 + 2] = clamped_cmyk_rgb_convert(k, 255 - y); + } +} + +inline unsigned char rgb_to_gray(int r, int g, int b) { + // Inspired from Pillow: + // https://github.com/python-pillow/Pillow/blob/07623d1a7cc65206a5355fba2ae256550bfcaba6/src/libImaging/Convert.c#L226 + return (r * 19595 + g * 38470 + b * 7471 + 0x8000) >> 16; +} + +void convert_line_cmyk_to_gray( + j_decompress_ptr cinfo, + const unsigned char* cmyk_line, + unsigned char* gray_line) { + int width = cinfo->output_width; + for (int i = 0; i < width; ++i) { + int c = cmyk_line[i * 4 + 0]; + int m = cmyk_line[i * 4 + 1]; + int y = cmyk_line[i * 4 + 2]; + int k = cmyk_line[i * 4 + 3]; + + int r = clamped_cmyk_rgb_convert(k, 255 - c); + int g = clamped_cmyk_rgb_convert(k, 255 - m); + int b = clamped_cmyk_rgb_convert(k, 255 - y); + + gray_line[i] = rgb_to_gray(r, g, b); + } +} + } // namespace torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { @@ -102,20 +154,29 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { jpeg_read_header(&cinfo, TRUE); int channels = cinfo.num_components; + bool cmyk_to_rgb_or_gray = false; if (mode != IMAGE_READ_MODE_UNCHANGED) { switch (mode) { case IMAGE_READ_MODE_GRAY: - if (cinfo.jpeg_color_space != JCS_GRAYSCALE) { + if (cinfo.jpeg_color_space == JCS_CMYK || + 
cinfo.jpeg_color_space == JCS_YCCK) { + cinfo.out_color_space = JCS_CMYK; + cmyk_to_rgb_or_gray = true; + } else { cinfo.out_color_space = JCS_GRAYSCALE; - channels = 1; } + channels = 1; break; case IMAGE_READ_MODE_RGB: - if (cinfo.jpeg_color_space != JCS_RGB) { + if (cinfo.jpeg_color_space == JCS_CMYK || + cinfo.jpeg_color_space == JCS_YCCK) { + cinfo.out_color_space = JCS_CMYK; + cmyk_to_rgb_or_gray = true; + } else { cinfo.out_color_space = JCS_RGB; - channels = 3; } + channels = 3; break; /* * Libjpeg does not support converting from CMYK to grayscale etc. There @@ -139,12 +200,28 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { auto tensor = torch::empty({int64_t(height), int64_t(width), channels}, torch::kU8); auto ptr = tensor.data_ptr(); + torch::Tensor cmyk_line_tensor; + if (cmyk_to_rgb_or_gray) { + cmyk_line_tensor = torch::empty({int64_t(width), 4}, torch::kU8); + } + while (cinfo.output_scanline < cinfo.output_height) { /* jpeg_read_scanlines expects an array of pointers to scanlines. * Here the array is only one element long, but you could ask for * more than one scanline at a time if that's more convenient. */ - jpeg_read_scanlines(&cinfo, &ptr, 1); + if (cmyk_to_rgb_or_gray) { + auto cmyk_line_ptr = cmyk_line_tensor.data_ptr(); + jpeg_read_scanlines(&cinfo, &cmyk_line_ptr, 1); + + if (channels == 3) { + convert_line_cmyk_to_rgb(&cinfo, cmyk_line_ptr, ptr); + } else if (channels == 1) { + convert_line_cmyk_to_gray(&cinfo, cmyk_line_ptr, ptr); + } + } else { + jpeg_read_scanlines(&cinfo, &ptr, 1); + } ptr += stride; } @@ -152,8 +229,23 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { jpeg_destroy_decompress(&cinfo); return tensor.permute({2, 0, 1}); } +#endif // #if !JPEG_FOUND + +int64_t _jpeg_version() { +#if JPEG_FOUND + return JPEG_LIB_VERSION; +#else + return -1; +#endif +} +bool _is_compiled_against_turbo() { +#ifdef LIBJPEG_TURBO_VERSION + return true; +#else + return false; #endif +} } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_jpeg.h b/torchvision/csrc/io/image/cpu/decode_jpeg.h index 97ed3d51a54e625989d695c42ccf78f1e2e79d9f..254e94680b6726c499f22d247d33d0e01fb524c0 100644 --- a/torchvision/csrc/io/image/cpu/decode_jpeg.h +++ b/torchvision/csrc/io/image/cpu/decode_jpeg.h @@ -10,5 +10,8 @@ C10_EXPORT torch::Tensor decode_jpeg( const torch::Tensor& data, ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED); +C10_EXPORT int64_t _jpeg_version(); +C10_EXPORT bool _is_compiled_against_turbo(); + } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_png.cpp b/torchvision/csrc/io/image/cpu/decode_png.cpp index b1ceaf1badd8dad866fc51d8570de5a78bdfe7ef..d27eafe45a754caa643cbd4309eb18776865cb55 100644 --- a/torchvision/csrc/io/image/cpu/decode_png.cpp +++ b/torchvision/csrc/io/image/cpu/decode_png.cpp @@ -49,6 +49,7 @@ torch::Tensor decode_png( png_destroy_read_struct(&png_ptr, &info_ptr, nullptr); TORCH_CHECK(false, "Internal error."); } + TORCH_CHECK(datap_len >= 8, "Content is too small for png!") auto is_png = !png_sig_cmp(datap, 0, 8); TORCH_CHECK(is_png, "Content is not png!") diff --git a/torchvision/csrc/io/image/image.cpp b/torchvision/csrc/io/image/image.cpp index 3c9d632f03057df9d3f15535a1e63af030163c8a..b5952739a7a041e5b07ac40c8a15851bb4efbb5d 100644 --- a/torchvision/csrc/io/image/image.cpp +++ b/torchvision/csrc/io/image/image.cpp @@ -19,15 +19,18 @@ PyMODINIT_FUNC PyInit_image(void) { namespace vision { namespace image { 
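One detail worth spelling out in the CMYK support added to decode_jpeg.cpp above: clamped_cmyk_rgb_convert avoids an integer division by 255 with the fixed-point trick ((v >> 8) + v) >> 8, which approximates v / 255 once the +128 rounding bias is added. A small Python sketch of the same arithmetic (illustrative only, mirroring the C++ helper):

def clamped_cmyk_rgb_convert(k: int, cmy: int) -> int:
    v = k * cmy + 128               # +128 biases the truncation towards rounding
    v = ((v >> 8) + v) >> 8         # ~= v * 257 / 65536 ~= v / 255
    return max(0, min(255, k - v))  # ~= k * (255 - cmy) / 255

# e.g. k=200, cmy=128: 200 - round(200 * 128 / 255) = 200 - 100 = 100
assert clamped_cmyk_rgb_convert(200, 128) == 100

Since the callers pass cmy = 255 - c (and likewise for m and y), each RGB channel comes out as roughly k * c / 255.
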
-static auto registry = torch::RegisterOperators() - .op("image::decode_png", &decode_png) - .op("image::encode_png", &encode_png) - .op("image::decode_jpeg", &decode_jpeg) - .op("image::encode_jpeg", &encode_jpeg) - .op("image::read_file", &read_file) - .op("image::write_file", &write_file) - .op("image::decode_image", &decode_image) - .op("image::decode_jpeg_cuda", &decode_jpeg_cuda); +static auto registry = + torch::RegisterOperators() + .op("image::decode_png", &decode_png) + .op("image::encode_png", &encode_png) + .op("image::decode_jpeg", &decode_jpeg) + .op("image::encode_jpeg", &encode_jpeg) + .op("image::read_file", &read_file) + .op("image::write_file", &write_file) + .op("image::decode_image", &decode_image) + .op("image::decode_jpeg_cuda", &decode_jpeg_cuda) + .op("image::_jpeg_version", &_jpeg_version) + .op("image::_is_compiled_against_turbo", &_is_compiled_against_turbo); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp index 38b350145957faf6bc783b2c3500baf96385f981..2167ea695ec8948c590b1873cb8cd3b98e40e4cc 100644 --- a/torchvision/csrc/io/video/video.cpp +++ b/torchvision/csrc/io/video/video.cpp @@ -156,14 +156,34 @@ void Video::_getDecoderParams( } // _get decoder params -Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { - C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); +void Video::initFromFile( + std::string videoPath, + std::string stream, + int64_t numThreads) { + TORCH_CHECK(!initialized, "Video object can only be initialized once"); + initialized = true; + params.uri = videoPath; + _init(stream, numThreads); +} + +void Video::initFromMemory( + torch::Tensor videoTensor, + std::string stream, + int64_t numThreads) { + TORCH_CHECK(!initialized, "Video object can only be initialized once"); + initialized = true; + callback = MemoryBuffer::getCallback( + videoTensor.data_ptr(), videoTensor.size(0)); + _init(stream, numThreads); +} + +void Video::_init(std::string stream, int64_t numThreads) { // set number of threads global numThreads_ = numThreads; // parse stream information current_stream = _parseStream(stream); // note that in the initial call we want to get all streams - Video::_getDecoderParams( + _getDecoderParams( 0, // video start 0, // headerOnly std::get<0>(current_stream), // stream info - remove that @@ -175,11 +195,6 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { std::string logMessage, logType; - // TODO: add read from memory option - params.uri = videoPath; - logType = "file"; - logMessage = videoPath; - // locals std::vector audioFPS, videoFPS; std::vector audioDuration, videoDuration, ccDuration, subsDuration; @@ -190,7 +205,8 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { c10::Dict> subsMetadata; // callback and metadata defined in struct - succeeded = decoder.init(params, std::move(callback), &metadata); + DecoderInCallback tmp_callback = callback; + succeeded = decoder.init(params, std::move(tmp_callback), &metadata); if (succeeded) { for (const auto& header : metadata) { double fps = double(header.fps); @@ -225,16 +241,24 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { streamsMetadata.insert("subtitles", subsMetadata); streamsMetadata.insert("cc", ccMetadata); - succeeded = Video::setCurrentStream(stream); + succeeded = setCurrentStream(stream); LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n"; if 
(std::get<1>(current_stream) != -1) { LOG(INFO) << "Stream index set to " << std::get<1>(current_stream) << ". If you encounter trouble, consider switching it to automatic stream discovery. \n"; } +} + +Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { + C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); + if (!videoPath.empty()) { + initFromFile(videoPath, stream, numThreads); + } } // video bool Video::setCurrentStream(std::string stream = "video") { + TORCH_CHECK(initialized, "Video object has to be initialized first"); if ((!stream.empty()) && (_parseStream(stream) != current_stream)) { current_stream = _parseStream(stream); } @@ -256,19 +280,23 @@ bool Video::setCurrentStream(std::string stream = "video") { ); // callback and metadata defined in Video.h - return (decoder.init(params, std::move(callback), &metadata)); + DecoderInCallback tmp_callback = callback; + return (decoder.init(params, std::move(tmp_callback), &metadata)); } std::tuple Video::getCurrentStream() const { + TORCH_CHECK(initialized, "Video object has to be initialized first"); return current_stream; } c10::Dict>> Video:: getStreamMetadata() const { + TORCH_CHECK(initialized, "Video object has to be initialized first"); return streamsMetadata; } void Video::Seek(double ts, bool fastSeek = false) { + TORCH_CHECK(initialized, "Video object has to be initialized first"); // initialize the class variables used for seeking and retrurn _getDecoderParams( ts, // video start @@ -282,20 +310,23 @@ void Video::Seek(double ts, bool fastSeek = false) { ); // callback and metadata defined in Video.h - succeeded = decoder.init(params, std::move(callback), &metadata); + DecoderInCallback tmp_callback = callback; + succeeded = decoder.init(params, std::move(tmp_callback), &metadata); + LOG(INFO) << "Decoder init at seek " << succeeded << "\n"; } std::tuple Video::Next() { + TORCH_CHECK(initialized, "Video object has to be initialized first"); // if failing to decode simply return a null tensor (note, should we - // raise an exeption?) + // raise an exception?) double frame_pts_s; torch::Tensor outFrame = torch::zeros({0}, torch::kByte); // decode single frame DecoderOutputMessage out; int64_t res = decoder.decode(&out, decoderTimeoutMs); - // if successfull + // if successful if (res == 0) { frame_pts_s = double(double(out.header.pts) * 1e-6); @@ -345,6 +376,8 @@ std::tuple Video::Next() { static auto registerVideo = torch::class_
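The refactor above splits Video construction into initFromFile and initFromMemory, both funnelling into the shared _init helper, with the decoder callback copied before each decoder.init call so it can be reused; this is what allows decoding from an in-memory buffer instead of a file path. A rough sketch of how that might look from Python, with the caveat that the torch::class_ registration is cut off above, so the binding names used here (init_from_memory, and a three-argument constructor mirroring the C++ signature) are assumptions rather than confirmed API:

import torch
import torchvision  # assumes torchvision was built with the video decoder extension

with open("clip.mp4", "rb") as f:  # path is illustrative
    data = torch.frombuffer(bytearray(f.read()), dtype=torch.uint8)

video = torch.classes.torchvision.Video("", "video", 0)  # empty path defers initialization
video.init_from_memory(data, "video", 0)                 # assumed binding for Video::initFromMemory
# Calling either init a second time would trip the new TORCH_CHECK(!initialized, ...) guard.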