diff --git a/.circleci/.gitignore b/.circleci/.gitignore deleted file mode 100644 index 485dee64bcfb48793379b200a1afd14e85a8aaf4..0000000000000000000000000000000000000000 --- a/.circleci/.gitignore +++ /dev/null @@ -1 +0,0 @@ -.idea diff --git a/.circleci/build_docs/commit_docs.sh b/.circleci/build_docs/commit_docs.sh deleted file mode 100755 index 04e3538fefc8ca179c7d678f0c88efba433525de..0000000000000000000000000000000000000000 --- a/.circleci/build_docs/commit_docs.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/usr/bin/env bash - -set -ex - - -if [ "$2" == "" ]; then - echo call as "$0" "" "" - echo where src is the root of the built documentation git checkout and - echo branch should be "main" or "1.7" or so - exit 1 -fi - -src=$1 -target=$2 - -echo "committing docs from ${src} to ${target}" - -pushd "${src}" -git checkout gh-pages -mkdir -p ./"${target}" -rm -rf ./"${target}"/* -cp -r "${src}/docs/build/html/"* ./"$target" -if [ "${target}" == "main" ]; then - mkdir -p ./_static - rm -rf ./_static/* - cp -r "${src}/docs/build/html/_static/"* ./_static - git add --all ./_static || true -fi -git add --all ./"${target}" || true -git config user.email "soumith+bot@pytorch.org" -git config user.name "pytorchbot" -# If there aren't changes, don't make a commit; push is no-op -git commit -m "auto-generating sphinx docs" || true -git remote add https https://github.com/pytorch/vision.git -git push -u https gh-pages diff --git a/.circleci/config.yml b/.circleci/config.yml deleted file mode 100644 index 7e8c399bcf99398fcee028123214338f60569fa9..0000000000000000000000000000000000000000 --- a/.circleci/config.yml +++ /dev/null @@ -1,3078 +0,0 @@ -version: 2.1 - -# How to test the Linux jobs: -# - Install CircleCI local CLI: https://circleci.com/docs/2.0/local-cli/ -# - circleci config process .circleci/config.yml > gen.yml && circleci local execute -c gen.yml --job binary_linux_wheel_py3.7 -# - Replace binary_linux_wheel_py3.7 with the name of the job you want to test. -# Job names are 'name:' key. - -executors: - windows-cpu: - machine: - resource_class: windows.xlarge - image: windows-server-2019-vs2019:stable - shell: bash.exe - - windows-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -commands: - checkout_merge: - description: "checkout merge branch" - steps: - - checkout -# - run: -# name: Checkout merge branch -# command: | -# set -ex -# BRANCH=$(git rev-parse --abbrev-ref HEAD) -# if [[ "$BRANCH" != "main" ]]; then -# git fetch --force origin ${CIRCLE_BRANCH}/merge:merged/${CIRCLE_BRANCH} -# git checkout "merged/$CIRCLE_BRANCH" -# fi - designate_upload_channel: - description: "inserts the correct upload channel into ${BASH_ENV}" - steps: - - run: - name: adding UPLOAD_CHANNEL to BASH_ENV - command: | - our_upload_channel=test - echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV} - - brew_update: - description: "Update Homebrew and install base formulae" - steps: - - run: - name: Update Homebrew - no_output_timeout: "10m" - command: | - set -ex - - # Update repositories manually. - # Running `brew update` produces a comparison between the - # current checkout and the updated checkout, which takes a - # very long time because the existing checkout is 2y old. - for path in $(find /usr/local/Homebrew -type d -name .git) - do - cd $path/.. 
- git fetch --depth=1 origin - git reset --hard origin/master - done - - export HOMEBREW_NO_AUTO_UPDATE=1 - - # Install expect and moreutils so that we can call `unbuffer` and `ts`. - # moreutils installs a `parallel` executable by default, which conflicts - # with the executable from the GNU `parallel`, so we must unlink GNU - # `parallel` first, and relink it afterwards. - brew install coreutils - brew unlink parallel - brew install moreutils - brew link parallel --overwrite - brew install expect - - brew_install: - description: "Install Homebrew formulae" - parameters: - formulae: - type: string - default: "" - steps: - - run: - name: Install << parameters.formulae >> - no_output_timeout: "10m" - command: | - set -ex - export HOMEBREW_NO_AUTO_UPDATE=1 - brew install << parameters.formulae >> - - run_brew_for_ios_build: - steps: - - brew_update - - brew_install: - formulae: libtool - - apt_install: - parameters: - args: - type: string - descr: - type: string - default: "" - update: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> apt install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: | - <<# parameters.update >> sudo apt update -qy <> - sudo apt install << parameters.args >> - - pip_install: - parameters: - args: - type: string - descr: - type: string - default: "" - user: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> pip install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: > - pip install - <<# parameters.user >> --user <> - --progress-bar=off - << parameters.args >> - - install_torchvision: - parameters: - editable: - type: boolean - default: true - steps: - - pip_install: - args: --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_test.html - descr: Install PyTorch from nightly releases - - pip_install: - args: --no-build-isolation <<# parameters.editable >> --editable <> . - descr: Install torchvision <<# parameters.editable >> in editable mode <> - - # Most of the test suite is handled by the `unittest` jobs, with completely different workflow and setup. - # This command can be used if only a selection of tests need to be run, for ad-hoc files. 
- run_tests_selective: - parameters: - file_or_dir: - type: string - steps: - - run: - name: Install test utilities - command: pip install --progress-bar=off pytest pytest-mock - - run: - name: Run tests - command: pytest --junitxml=test-results/junit.xml -v --durations 20 <> - - store_test_results: - path: test-results - - download_model_weights: - parameters: - extract_roots: - type: string - default: "torchvision/models" - background: - type: boolean - default: true - steps: - - apt_install: - args: parallel wget - descr: Install download utilitites - - run: - name: Download model weights - background: << parameters.background >> - command: | - mkdir -p ~/.cache/torch/hub/checkpoints - python scripts/collect_model_urls.py << parameters.extract_roots >> \ - | parallel -j0 'wget --no-verbose -O ~/.cache/torch/hub/checkpoints/`basename {}` {}\?source=ci' - -binary_common: &binary_common - parameters: - # Edit these defaults to do a release - build_version: - description: "version number of release binary; by default, build a nightly" - type: string - default: "0.14.1" - pytorch_version: - description: "PyTorch version to build against; by default, use a nightly" - type: string - default: "1.13.1" - # Don't edit these - python_version: - description: "Python version to build against (e.g., 3.7)" - type: string - cu_version: - description: "CUDA version to build against, in CU format (e.g., cpu or cu100)" - type: string - default: "cpu" - unicode_abi: - description: "Python 2.7 wheel only: whether or not we are cp27mu (default: no)" - type: string - default: "" - wheel_docker_image: - description: "Wheel only: what docker image to use" - type: string - default: "" - conda_docker_image: - description: "Conda only: what docker image to use" - type: string - default: "pytorch/conda-builder:cpu" - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - MACOSX_DEPLOYMENT_TARGET: 10.9 - -torchvision_ios_params: &torchvision_ios_params - parameters: - build_environment: - type: string - default: "" - ios_arch: - type: string - default: "" - ios_platform: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - IOS_ARCH: << parameters.ios_arch >> - IOS_PLATFORM: << parameters.ios_platform >> - -torchvision_android_params: &torchvision_android_params - parameters: - build_environment: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - -smoke_test_common: &smoke_test_common - <<: *binary_common - docker: - - image: torchvision/smoke_test:latest - -jobs: - circleci_consistency: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: jinja2 pyyaml - - run: - name: Check CircleCI config consistency - command: | - python .circleci/regenerate.py - git diff --exit-code || (echo ".circleci/config.yml not in sync with config.yml.in! 
Run .circleci/regenerate.py to update config"; exit 1) - - lint_python_and_config: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: pre-commit - descr: Install lint utilities - - run: - name: Install pre-commit hooks - command: pre-commit install-hooks - - run: - name: Lint Python code and config files - command: pre-commit run --all-files - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - lint_c: - docker: - - image: cimg/python:3.7 - steps: - - apt_install: - args: libtinfo5 - descr: Install additional system libraries - - checkout - - run: - name: Install lint utilities - command: | - curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o clang-format - chmod +x clang-format - sudo mv clang-format /opt/clang-format - - run: - name: Lint C code - command: ./.circleci/unittest/linux/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable /opt/clang-format - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - type_check_python: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision: - editable: true - - pip_install: - args: mypy - descr: Install Python type check utilities - - run: - name: Check Python types statically - command: mypy --install-types --non-interactive --config-file mypy.ini - - unittest_torchhub: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - run_tests_selective: - file_or_dir: test/test_hub.py - - unittest_onnx: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - pip_install: - args: onnx onnxruntime - descr: Install ONNX - - run_tests_selective: - file_or_dir: test/test_onnx.py - - unittest_extended: - docker: - - image: cimg/python:3.7 - resource_class: xlarge - steps: - - checkout - - download_model_weights - - install_torchvision - - run: - name: Enable extended tests - command: echo 'export PYTORCH_TEST_WITH_EXTENDED=1' >> $BASH_ENV - - run_tests_selective: - file_or_dir: test/test_extended_*.py - - binary_linux_wheel: - <<: *binary_common - docker: - - image: << parameters.wheel_docker_image >> - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 30m - command: | - set -ex - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_linux_conda: - <<: *binary_common - docker: - - image: "<< parameters.conda_docker_image >>" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 30m - command: | - set -ex - packaging/build_conda.sh - - store_artifacts: - path: /opt/conda/conda-bld/linux-64 - - persist_to_workspace: - root: /opt/conda/conda-bld/linux-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_conda: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 30m - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate base - conda install -yq conda-build "conda-package-handling!=1.5.0" - packaging/build_conda.sh - rm 
/C/tools/miniconda3/conda-bld/win-64/vs${VC_YEAR}*.tar.bz2 - - store_artifacts: - path: C:/tools/miniconda3/conda-bld/win-64 - - persist_to_workspace: - root: C:/tools/miniconda3/conda-bld/win-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_wheel: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build wheel packages - no_output_timeout: 30m - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_macos_wheel: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - # Cannot easily deduplicate this as source'ing activate - # will set environment variables which we need to propagate - # to build_wheel.sh - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_ios_build: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_build.sh" - cat "$script" - source "$script" - - persist_to_workspace: - root: /Users/distiller/workspace/ - paths: ios - - binary_ios_upload: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_upload.sh" - cat "$script" - source "$script" - - binary_android_build: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_build.sh" - cat "$script" - source "$script" - - store_artifacts: - path: ~/workspace/artifacts - - binary_android_upload: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_upload.sh" - cat "$script" - source "$script" - - binary_macos_conda: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build - packaging/build_conda.sh - - store_artifacts: - path: /Users/distiller/miniconda3/conda-bld/osx-64 - - persist_to_workspace: - root: /Users/distiller/miniconda3/conda-bld/osx-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - # Requires org-member context - binary_conda_upload: - docker: - - image: 
continuumio/miniconda - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - command: | - # Prevent credential from leaking - conda install -yq anaconda-client - set -x - anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload ~/workspace/*.tar.bz2 -u "pytorch-${UPLOAD_CHANNEL}" --label main --no-progress --force - - # Requires org-member context - binary_wheel_upload: - parameters: - subfolder: - description: "What whl subfolder to upload to, e.g., blank or cu100/ (trailing slash is important)" - type: string - docker: - - image: cimg/python:3.7 - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - checkout - - pip_install: - args: awscli - - run: - command: | - export PATH="$HOME/.local/bin:$PATH" - # Prevent credential from leaking - set +x - export AWS_ACCESS_KEY_ID="${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}" - export AWS_SECRET_ACCESS_KEY="${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}" - set -x - for pkg in ~/workspace/*.whl; do - aws s3 cp "$pkg" "s3://pytorch/whl/${UPLOAD_CHANNEL}/<< parameters.subfolder >>" --acl public-read - done - - smoke_test_linux_conda: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_linux_pip: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/test/torch_test.html - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_docker_image_build: - machine: - image: ubuntu-2004:202104-01 - resource_class: large - environment: - image_name: torchvision/smoke_test - steps: - - checkout - - designate_upload_channel - - run: - name: Build and push Docker image - no_output_timeout: "1h" - command: | - set +x - echo "${DOCKER_HUB_TOKEN}" | docker login --username "${DOCKER_HUB_USERNAME}" --password-stdin - set -x - cd .circleci/smoke_test/docker && docker build . 
-t ${image_name}:${CIRCLE_WORKFLOW_ID} - docker tag ${image_name}:${CIRCLE_WORKFLOW_ID} ${image_name}:latest - docker push ${image_name}:${CIRCLE_WORKFLOW_ID} - docker push ${image_name}:latest - - smoke_test_win_conda: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda env remove -n python${PYTHON_VERSION} || true - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_win_pip: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/test/torch_test.html - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - unittest_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cpu" - resource_class: 2xlarge+ - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - - key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.medium - environment: - image_name: "pytorch/manylinux-cuda116" - CU_VERSION: << parameters.cu_version >> - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: docker run -e PYTHON_VERSION -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - - key: env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - # Here we create an envlist file that contains some env variables that we want the docker container to be aware of. - # Normally, the CIRCLECI variable is set and available on all CI workflows: https://circleci.com/docs/2.0/env-vars/#built-in-environment-variables. - # They're avaiable in all the other workflows (OSX and Windows). - # But here, we're running the unittest_linux_gpu workflows in a docker container, where those variables aren't accessible. - # So instead we dump the variables we need in env.list and we pass that file when invoking "docker run". - name: export CIRCLECI env var - command: echo "CIRCLECI=true" >> ./env.list - - run: - name: Install torchvision - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD -e UPLOAD_CHANNEL -e CU_VERSION "${image_name}" .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: docker run --env-file ./env.list -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post Process - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - - key: env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - environment: - CUDA_VERSION: "11.6" - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - - key: env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install CUDA - command: packaging/windows/internal/cuda_install.bat - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - resource_class: large - steps: - - checkout - - designate_upload_channel - - run: - name: Install wget - command: HOMEBREW_NO_AUTO_UPDATE=1 brew install wget - # Disable brew auto update which is very slow - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - - keys: - - env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - - key: env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - cmake_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cpu" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: packaging/build_cmake.sh - - cmake_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.small - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Build torchvision C++ distribution and test - no_output_timeout: 30m - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -e UPLOAD_CHANNEL -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> packaging/build_cmake.sh - - cmake_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" 
- steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build cmake - packaging/build_cmake.sh - - cmake_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/build_cmake.sh - - cmake_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_cmake.sh - - build_docs: - <<: *binary_common - docker: - - image: cimg/python:3.7 - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - checkout - - download_model_weights - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - designate_upload_channel - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Build docs - command: | - set -ex - # turn v1.12.0rc3 into 1.12.0 - tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9.]*\).*/\1/') - VERSION=${tag:-main} - eval "$(./conda/bin/conda shell.bash hook)" - conda activate ./env - pushd docs - pip install --progress-bar=off -r requirements.txt - make html - popd - - persist_to_workspace: - root: ./ - paths: - - "*" - - store_artifacts: - path: ./docs/build/html - destination: docs - - upload_docs: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda100" - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - run: - name: Generate netrc - command: | - # set credentials for https pushing - # requires the org-member context - cat > ~/.netrc \< gen.yml && circleci local execute -c gen.yml --job binary_linux_wheel_py3.7 -# - Replace binary_linux_wheel_py3.7 with the name of the job you want to test. -# Job names are 'name:' key. - -executors: - windows-cpu: - machine: - resource_class: windows.xlarge - image: windows-server-2019-vs2019:stable - shell: bash.exe - - windows-gpu: - machine: - resource_class: windows.gpu.nvidia.medium - image: windows-server-2019-nvidia:stable - shell: bash.exe - -commands: - checkout_merge: - description: "checkout merge branch" - steps: - - checkout -# - run: -# name: Checkout merge branch -# command: | -# set -ex -# BRANCH=$(git rev-parse --abbrev-ref HEAD) -# if [[ "$BRANCH" != "main" ]]; then -# git fetch --force origin ${CIRCLE_BRANCH}/merge:merged/${CIRCLE_BRANCH} -# git checkout "merged/$CIRCLE_BRANCH" -# fi - designate_upload_channel: - description: "inserts the correct upload channel into ${BASH_ENV}" - steps: - - run: - name: adding UPLOAD_CHANNEL to BASH_ENV - command: | - our_upload_channel=test - echo "export UPLOAD_CHANNEL=${our_upload_channel}" >> ${BASH_ENV} - - brew_update: - description: "Update Homebrew and install base formulae" - steps: - - run: - name: Update Homebrew - no_output_timeout: "10m" - command: | - set -ex - - # Update repositories manually. - # Running `brew update` produces a comparison between the - # current checkout and the updated checkout, which takes a - # very long time because the existing checkout is 2y old. 
- for path in $(find /usr/local/Homebrew -type d -name .git) - do - cd $path/.. - git fetch --depth=1 origin - git reset --hard origin/master - done - - export HOMEBREW_NO_AUTO_UPDATE=1 - - # Install expect and moreutils so that we can call `unbuffer` and `ts`. - # moreutils installs a `parallel` executable by default, which conflicts - # with the executable from the GNU `parallel`, so we must unlink GNU - # `parallel` first, and relink it afterwards. - brew install coreutils - brew unlink parallel - brew install moreutils - brew link parallel --overwrite - brew install expect - - brew_install: - description: "Install Homebrew formulae" - parameters: - formulae: - type: string - default: "" - steps: - - run: - name: Install << parameters.formulae >> - no_output_timeout: "10m" - command: | - set -ex - export HOMEBREW_NO_AUTO_UPDATE=1 - brew install << parameters.formulae >> - - run_brew_for_ios_build: - steps: - - brew_update - - brew_install: - formulae: libtool - - apt_install: - parameters: - args: - type: string - descr: - type: string - default: "" - update: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> apt install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: | - <<# parameters.update >> sudo apt update -qy <> - sudo apt install << parameters.args >> - - pip_install: - parameters: - args: - type: string - descr: - type: string - default: "" - user: - type: boolean - default: true - steps: - - run: - name: > - <<^ parameters.descr >> pip install << parameters.args >> <> - <<# parameters.descr >> << parameters.descr >> <> - command: > - pip install - <<# parameters.user >> --user <> - --progress-bar=off - << parameters.args >> - - install_torchvision: - parameters: - editable: - type: boolean - default: true - steps: - - pip_install: - args: --pre torch -f https://download.pytorch.org/whl/test/cpu/torch_test.html - descr: Install PyTorch from nightly releases - - pip_install: - args: --no-build-isolation <<# parameters.editable >> --editable <> . - descr: Install torchvision <<# parameters.editable >> in editable mode <> - - # Most of the test suite is handled by the `unittest` jobs, with completely different workflow and setup. - # This command can be used if only a selection of tests need to be run, for ad-hoc files. 
- run_tests_selective: - parameters: - file_or_dir: - type: string - steps: - - run: - name: Install test utilities - command: pip install --progress-bar=off pytest pytest-mock - - run: - name: Run tests - command: pytest --junitxml=test-results/junit.xml -v --durations 20 <> - - store_test_results: - path: test-results - - download_model_weights: - parameters: - extract_roots: - type: string - default: "torchvision/models" - background: - type: boolean - default: true - steps: - - apt_install: - args: parallel wget - descr: Install download utilitites - - run: - name: Download model weights - background: << parameters.background >> - command: | - mkdir -p ~/.cache/torch/hub/checkpoints - python scripts/collect_model_urls.py << parameters.extract_roots >> \ - | parallel -j0 'wget --no-verbose -O ~/.cache/torch/hub/checkpoints/`basename {}` {}\?source=ci' - -binary_common: &binary_common - parameters: - # Edit these defaults to do a release - build_version: - description: "version number of release binary; by default, build a nightly" - type: string - default: "0.14.1" - pytorch_version: - description: "PyTorch version to build against; by default, use a nightly" - type: string - default: "1.13.1" - # Don't edit these - python_version: - description: "Python version to build against (e.g., 3.7)" - type: string - cu_version: - description: "CUDA version to build against, in CU format (e.g., cpu or cu100)" - type: string - default: "cpu" - unicode_abi: - description: "Python 2.7 wheel only: whether or not we are cp27mu (default: no)" - type: string - default: "" - wheel_docker_image: - description: "Wheel only: what docker image to use" - type: string - default: "" - conda_docker_image: - description: "Conda only: what docker image to use" - type: string - default: "pytorch/conda-builder:cpu" - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - MACOSX_DEPLOYMENT_TARGET: 10.9 - -torchvision_ios_params: &torchvision_ios_params - parameters: - build_environment: - type: string - default: "" - ios_arch: - type: string - default: "" - ios_platform: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - IOS_ARCH: << parameters.ios_arch >> - IOS_PLATFORM: << parameters.ios_platform >> - -torchvision_android_params: &torchvision_android_params - parameters: - build_environment: - type: string - default: "" - environment: - BUILD_ENVIRONMENT: << parameters.build_environment >> - -smoke_test_common: &smoke_test_common - <<: *binary_common - docker: - - image: torchvision/smoke_test:latest - -jobs: - circleci_consistency: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: jinja2 pyyaml - - run: - name: Check CircleCI config consistency - command: | - python .circleci/regenerate.py - git diff --exit-code || (echo ".circleci/config.yml not in sync with config.yml.in! 
Run .circleci/regenerate.py to update config"; exit 1) - - lint_python_and_config: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - pip_install: - args: pre-commit - descr: Install lint utilities - - run: - name: Install pre-commit hooks - command: pre-commit install-hooks - - run: - name: Lint Python code and config files - command: pre-commit run --all-files - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - lint_c: - docker: - - image: cimg/python:3.7 - steps: - - apt_install: - args: libtinfo5 - descr: Install additional system libraries - - checkout - - run: - name: Install lint utilities - command: | - curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o clang-format - chmod +x clang-format - sudo mv clang-format /opt/clang-format - - run: - name: Lint C code - command: ./.circleci/unittest/linux/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable /opt/clang-format - - run: - name: Required lint modifications - when: on_fail - command: git --no-pager diff - - type_check_python: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision: - editable: true - - pip_install: - args: mypy - descr: Install Python type check utilities - - run: - name: Check Python types statically - command: mypy --install-types --non-interactive --config-file mypy.ini - - unittest_torchhub: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - run_tests_selective: - file_or_dir: test/test_hub.py - - unittest_onnx: - docker: - - image: cimg/python:3.7 - steps: - - checkout - - install_torchvision - - pip_install: - args: onnx onnxruntime - descr: Install ONNX - - run_tests_selective: - file_or_dir: test/test_onnx.py - - unittest_extended: - docker: - - image: cimg/python:3.7 - resource_class: xlarge - steps: - - checkout - - download_model_weights - - install_torchvision - - run: - name: Enable extended tests - command: echo 'export PYTORCH_TEST_WITH_EXTENDED=1' >> $BASH_ENV - - run_tests_selective: - file_or_dir: test/test_extended_*.py - - binary_linux_wheel: - <<: *binary_common - docker: - - image: << parameters.wheel_docker_image >> - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 30m - command: | - set -ex - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_linux_conda: - <<: *binary_common - docker: - - image: "<< parameters.conda_docker_image >>" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 30m - command: | - set -ex - packaging/build_conda.sh - - store_artifacts: - path: /opt/conda/conda-bld/linux-64 - - persist_to_workspace: - root: /opt/conda/conda-bld/linux-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_conda: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build conda packages - no_output_timeout: 30m - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate base - conda install -yq conda-build "conda-package-handling!=1.5.0" - packaging/build_conda.sh - rm 
/C/tools/miniconda3/conda-bld/win-64/vs${VC_YEAR}*.tar.bz2 - - store_artifacts: - path: C:/tools/miniconda3/conda-bld/win-64 - - persist_to_workspace: - root: C:/tools/miniconda3/conda-bld/win-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_win_wheel: - <<: *binary_common - executor: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Build wheel packages - no_output_timeout: 30m - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - store_test_results: - path: build_results/ - - binary_macos_wheel: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - # Cannot easily deduplicate this as source'ing activate - # will set environment variables which we need to propagate - # to build_wheel.sh - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - packaging/build_wheel.sh - - store_artifacts: - path: dist - - persist_to_workspace: - root: dist - paths: - - "*" - - binary_ios_build: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_build.sh" - cat "$script" - source "$script" - - persist_to_workspace: - root: /Users/distiller/workspace/ - paths: ios - - binary_ios_upload: - <<: *torchvision_ios_params - macos: - xcode: "14.0" - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run_brew_for_ios_build - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/Users/distiller/project/.circleci/unittest/ios/scripts/binary_ios_upload.sh" - cat "$script" - source "$script" - - binary_android_build: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Build - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_build.sh" - cat "$script" - source "$script" - - store_artifacts: - path: ~/workspace/artifacts - - binary_android_upload: - <<: *torchvision_android_params - docker: - - image: cimg/android:2021.08-ndk - resource_class: xlarge - steps: - - attach_workspace: - at: ~/workspace - - checkout - - run: - name: Upload - no_output_timeout: "1h" - command: | - script="/home/circleci/project/.circleci/unittest/android/scripts/binary_android_upload.sh" - cat "$script" - source "$script" - - binary_macos_conda: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build - packaging/build_conda.sh - - store_artifacts: - path: /Users/distiller/miniconda3/conda-bld/osx-64 - - persist_to_workspace: - root: /Users/distiller/miniconda3/conda-bld/osx-64 - paths: - - "*" - - store_test_results: - path: build_results/ - - # Requires org-member context - binary_conda_upload: - docker: - - image: 
continuumio/miniconda - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - command: | - # Prevent credential from leaking - conda install -yq anaconda-client - set -x - anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload ~/workspace/*.tar.bz2 -u "pytorch-${UPLOAD_CHANNEL}" --label main --no-progress --force - - # Requires org-member context - binary_wheel_upload: - parameters: - subfolder: - description: "What whl subfolder to upload to, e.g., blank or cu100/ (trailing slash is important)" - type: string - docker: - - image: cimg/python:3.7 - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - checkout - - pip_install: - args: awscli - - run: - command: | - export PATH="$HOME/.local/bin:$PATH" - # Prevent credential from leaking - set +x - export AWS_ACCESS_KEY_ID="${PYTORCH_BINARY_AWS_ACCESS_KEY_ID}" - export AWS_SECRET_ACCESS_KEY="${PYTORCH_BINARY_AWS_SECRET_ACCESS_KEY}" - set -x - for pkg in ~/workspace/*.whl; do - aws s3 cp "$pkg" "s3://pytorch/whl/${UPLOAD_CHANNEL}/<< parameters.subfolder >>" --acl public-read - done - - smoke_test_linux_conda: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_linux_pip: - <<: *smoke_test_common - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/test/torch_test.html - - run: - name: smoke test - command: | - source /usr/local/etc/profile.d/conda.sh && conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_docker_image_build: - machine: - image: ubuntu-2004:202104-01 - resource_class: large - environment: - image_name: torchvision/smoke_test - steps: - - checkout - - designate_upload_channel - - run: - name: Build and push Docker image - no_output_timeout: "1h" - command: | - set +x - echo "${DOCKER_HUB_TOKEN}" | docker login --username "${DOCKER_HUB_USERNAME}" --password-stdin - set -x - cd .circleci/smoke_test/docker && docker build . 
-t ${image_name}:${CIRCLE_WORKFLOW_ID} - docker tag ${image_name}:${CIRCLE_WORKFLOW_ID} ${image_name}:latest - docker push ${image_name}:${CIRCLE_WORKFLOW_ID} - docker push ${image_name}:latest - - smoke_test_win_conda: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda env remove -n python${PYTHON_VERSION} || true - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - conda install -v -y -c pytorch-nightly pytorch - conda install -v -y $(ls ~/workspace/torchvision*.tar.bz2) - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - smoke_test_win_pip: - <<: *binary_common - executor: - name: windows-cpu - steps: - - attach_workspace: - at: ~/workspace - - designate_upload_channel - - run: - name: install binaries - command: | - set -x - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda create -yn python${PYTHON_VERSION} python=${PYTHON_VERSION} - conda activate python${PYTHON_VERSION} - - pip_install: - args: $(ls ~/workspace/torchvision*.whl) --pre -f https://download.pytorch.org/whl/test/torch_test.html - - run: - name: smoke test - command: | - eval "$('/C/tools/miniconda3/Scripts/conda.exe' 'shell.bash' 'hook')" - conda activate python${PYTHON_VERSION} - python -c "import torchvision" - - unittest_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cpu" - resource_class: 2xlarge+ - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v2-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.medium - environment: - image_name: "pytorch/manylinux-cuda116" - CU_VERSION: << parameters.cu_version >> - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: docker run -e PYTHON_VERSION -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v3-linux-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - # Here we create an envlist file that contains some env variables that we want the docker container to be aware of. - # Normally, the CIRCLECI variable is set and available on all CI workflows: https://circleci.com/docs/2.0/env-vars/#built-in-environment-variables. - # They're avaiable in all the other workflows (OSX and Windows). - # But here, we're running the unittest_linux_gpu workflows in a docker container, where those variables aren't accessible. - # So instead we dump the variables we need in env.list and we pass that file when invoking "docker run". - name: export CIRCLECI env var - command: echo "CIRCLECI=true" >> ./env.list - - run: - name: Install torchvision - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD -e UPLOAD_CHANNEL -e CU_VERSION "${image_name}" .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: docker run --env-file ./env.list -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post Process - command: docker run -t --gpus all -v $PWD:$PWD -w $PWD "${image_name}" .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v2-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - environment: - CUDA_VERSION: "11.6" - PYTHON_VERSION: << parameters.python_version >> - steps: - - checkout - - designate_upload_channel - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. 
- command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/windows/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v1-windows-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/windows/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install CUDA - command: packaging/windows/internal/cuda_install.bat - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - name: Install torchvision - command: .circleci/unittest/windows/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/windows/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/windows/scripts/post_process.sh - - store_test_results: - path: test-results - - unittest_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - resource_class: large - steps: - - checkout - - designate_upload_channel - - run: - name: Install wget - command: HOMEBREW_NO_AUTO_UPDATE=1 brew install wget - # Disable brew auto update which is very slow - - run: - name: Generate cache key - # This will refresh cache on Sundays, nightly build should generate new cache. - command: echo "$(date +"%Y-%U")" > .circleci-weekly - - restore_cache: - {% raw %} - keys: - - env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - save_cache: - {% raw %} - key: env-v3-macos-{{ arch }}-py<< parameters.python_version >>-{{ checksum ".circleci/unittest/linux/scripts/environment.yml" }}-{{ checksum ".circleci-weekly" }} - {% endraw %} - paths: - - conda - - env - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Run tests - command: .circleci/unittest/linux/scripts/run_test.sh - - run: - name: Post process - command: .circleci/unittest/linux/scripts/post_process.sh - - store_test_results: - path: test-results - - cmake_linux_cpu: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cpu" - resource_class: 2xlarge+ - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: .circleci/unittest/linux/scripts/setup_env.sh - - run: packaging/build_cmake.sh - - cmake_linux_gpu: - <<: *binary_common - machine: - image: ubuntu-2004-cuda-11.4:202110-01 - resource_class: gpu.nvidia.small - environment: - PYTHON_VERSION: << parameters.python_version >> - PYTORCH_VERSION: << parameters.pytorch_version >> - UNICODE_ABI: << parameters.unicode_abi >> - CU_VERSION: << parameters.cu_version >> - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Setup conda - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> .circleci/unittest/linux/scripts/setup_env.sh - - run: - name: Build torchvision C++ distribution and test - no_output_timeout: 30m - command: docker run -e CU_VERSION -e PYTHON_VERSION -e UNICODE_ABI -e PYTORCH_VERSION -e UPLOAD_CHANNEL -t --gpus all -v $PWD:$PWD -w $PWD << parameters.wheel_docker_image >> 
packaging/build_cmake.sh - - cmake_macos_cpu: - <<: *binary_common - macos: - xcode: "14.0" - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - curl -o conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh - sh conda.sh -b - source $HOME/miniconda3/bin/activate - conda install -yq conda-build cmake - packaging/build_cmake.sh - - cmake_windows_cpu: - <<: *binary_common - executor: - name: windows-cpu - steps: - - checkout_merge - - designate_upload_channel - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/build_cmake.sh - - cmake_windows_gpu: - <<: *binary_common - executor: - name: windows-gpu - steps: - - checkout_merge - - designate_upload_channel - - run: - name: Update CUDA driver - command: packaging/windows/internal/driver_update.bat - - run: - command: | - set -ex - source packaging/windows/internal/vc_install_helper.sh - packaging/windows/internal/cuda_install.bat - packaging/build_cmake.sh - - build_docs: - <<: *binary_common - docker: - - image: cimg/python:3.7 - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - checkout - - download_model_weights - - run: - name: Setup - command: .circleci/unittest/linux/scripts/setup_env.sh - - designate_upload_channel - - run: - name: Install torchvision - command: .circleci/unittest/linux/scripts/install.sh - - run: - name: Build docs - command: | - set -ex - # turn v1.12.0rc3 into 1.12.0 - tag=$(echo $CIRCLE_TAG | sed -e 's/v*\([0-9.]*\).*/\1/') - VERSION=${tag:-main} - eval "$(./conda/bin/conda shell.bash hook)" - conda activate ./env - pushd docs - pip install --progress-bar=off -r requirements.txt - make html - popd - - persist_to_workspace: - root: ./ - paths: - - "*" - - store_artifacts: - path: ./docs/build/html - destination: docs - - upload_docs: - <<: *binary_common - docker: - - image: "pytorch/manylinux-cuda100" - resource_class: 2xlarge+ - steps: - - attach_workspace: - at: ~/workspace - - run: - name: Generate netrc - command: | - # set credentials for https pushing - # requires the org-member context - cat > ~/.netrc \<> ~/.bashrc -CMD [ "/bin/bash"] diff --git a/.circleci/unittest/android/scripts/binary_android_build.sh b/.circleci/unittest/android/scripts/binary_android_build.sh deleted file mode 100644 index 0d8c0d47d8a624bcf4cf4c43492f2a92d97b771f..0000000000000000000000000000000000000000 --- a/.circleci/unittest/android/scripts/binary_android_build.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "DIR: $(pwd)" -echo "ANDROID_HOME=${ANDROID_HOME}" -echo "ANDROID_NDK_HOME=${ANDROID_NDK_HOME}" -echo "JAVA_HOME=${JAVA_HOME}" - -WORKSPACE=/home/circleci/workspace -VISION_ANDROID=/home/circleci/project/android - -. /home/circleci/project/.circleci/unittest/android/scripts/install_gradle.sh - -GRADLE_LOCAL_PROPERTIES=${VISION_ANDROID}/local.properties -rm -f $GRADLE_LOCAL_PROPERTIES - -echo "sdk.dir=${ANDROID_HOME}" >> $GRADLE_LOCAL_PROPERTIES -echo "ndk.dir=${ANDROID_NDK_HOME}" >> $GRADLE_LOCAL_PROPERTIES - -echo "GRADLE_PATH $GRADLE_PATH" -echo "GRADLE_HOME $GRADLE_HOME" - -${GRADLE_PATH} --scan --stacktrace --debug --no-daemon -p ${VISION_ANDROID} assemble || true - -mkdir -p ~/workspace/artifacts -find . -type f -name *aar -print | xargs tar cfvz ~/workspace/artifacts/artifacts-aars.tgz -find . 
-type f -name *apk -print | xargs tar cfvz ~/workspace/artifacts/artifacts-apks.tgz diff --git a/.circleci/unittest/android/scripts/binary_android_upload.sh b/.circleci/unittest/android/scripts/binary_android_upload.sh deleted file mode 100644 index 1472a877d9001c6f24d1a26da26284dcc73bc27c..0000000000000000000000000000000000000000 --- a/.circleci/unittest/android/scripts/binary_android_upload.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "DIR: $(pwd)" -echo "ANDROID_HOME=${ANDROID_HOME}" -echo "ANDROID_NDK_HOME=${ANDROID_NDK_HOME}" -echo "JAVA_HOME=${JAVA_HOME}" - -WORKSPACE=/home/circleci/workspace -VISION_ANDROID=/home/circleci/project/android - -. /home/circleci/project/.circleci/unittest/android/scripts/install_gradle.sh - -GRADLE_LOCAL_PROPERTIES=${VISION_ANDROID}/local.properties -rm -f $GRADLE_LOCAL_PROPERTIES -GRADLE_PROPERTIES=/home/circleci/project/android/gradle.properties - -echo "sdk.dir=${ANDROID_HOME}" >> $GRADLE_LOCAL_PROPERTIES -echo "ndk.dir=${ANDROID_NDK_HOME}" >> $GRADLE_LOCAL_PROPERTIES - -echo "SONATYPE_NEXUS_USERNAME=${SONATYPE_NEXUS_USERNAME}" >> $GRADLE_PROPERTIES -echo "mavenCentralRepositoryUsername=${SONATYPE_NEXUS_USERNAME}" >> $GRADLE_PROPERTIES -echo "SONATYPE_NEXUS_PASSWORD=${SONATYPE_NEXUS_PASSWORD}" >> $GRADLE_PROPERTIES -echo "mavenCentralRepositoryPassword=${SONATYPE_NEXUS_PASSWORD}" >> $GRADLE_PROPERTIES - -echo "signing.keyId=${ANDROID_SIGN_KEY}" >> $GRADLE_PROPERTIES -echo "signing.password=${ANDROID_SIGN_PASS}" >> $GRADLE_PROPERTIES - -cat /home/circleci/project/android/gradle.properties | grep VERSION - -${GRADLE_PATH} --scan --stacktrace --debug --no-daemon -p ${VISION_ANDROID} ops:uploadArchives - -mkdir -p ~/workspace/artifacts -find . -type f -name *aar -print | xargs tar cfvz ~/workspace/artifacts/artifacts-aars.tgz diff --git a/.circleci/unittest/android/scripts/install_gradle.sh b/.circleci/unittest/android/scripts/install_gradle.sh deleted file mode 100755 index 5f803abfa949d95ec3d742f678ad4471b77c9854..0000000000000000000000000000000000000000 --- a/.circleci/unittest/android/scripts/install_gradle.sh +++ /dev/null @@ -1,19 +0,0 @@ -#!/bin/bash -set -ex - -_https_amazon_aws=https://ossci-android.s3.amazonaws.com -GRADLE_VERSION=6.8.3 - -_gradle_home=/opt/gradle -sudo rm -rf $gradle_home -sudo mkdir -p $_gradle_home - -curl --silent --output /tmp/gradle.zip --retry 3 $_https_amazon_aws/gradle-${GRADLE_VERSION}-bin.zip - -sudo unzip -q /tmp/gradle.zip -d $_gradle_home -rm /tmp/gradle.zip - -sudo chmod -R 777 $_gradle_home - -export GRADLE_HOME=$_gradle_home/gradle-$GRADLE_VERSION -export GRADLE_PATH=${GRADLE_HOME}/bin/gradle diff --git a/.circleci/unittest/ios/scripts/binary_ios_build.sh b/.circleci/unittest/ios/scripts/binary_ios_build.sh deleted file mode 100755 index e2ad7b0c55faa836d9cadfceca964490833d5391..0000000000000000000000000000000000000000 --- a/.circleci/unittest/ios/scripts/binary_ios_build.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "" -echo "DIR: $(pwd)" -WORKSPACE=/Users/distiller/workspace -PROJ_ROOT_IOS=/Users/distiller/project/ios -PYTORCH_IOS_NIGHTLY_NAME=libtorch_ios_nightly_build.zip -export TCLLIBPATH="/usr/local/lib" - -# install conda -curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -chmod +x ~/conda.sh -/bin/bash ~/conda.sh -b -p ~/anaconda -export PATH="~/anaconda/bin:${PATH}" -source ~/anaconda/bin/activate - -# install dependencies -conda install numpy ninja pyyaml mkl mkl-include setuptools cmake cffi 
requests typing_extensions wget --yes -conda install -c conda-forge valgrind --yes -export CMAKE_PREFIX_PATH=${CONDA_PREFIX:-"$(dirname $(which conda))/../"} - -# sync submodules -cd ${PROJ_ROOT_IOS} -git submodule sync -git submodule update --init --recursive - -# download pytorch-iOS nightly build and unzip it -mkdir -p ${PROJ_ROOT_IOS}/lib -mkdir -p ${PROJ_ROOT_IOS}/build -mkdir -p ${PROJ_ROOT_IOS}/pytorch -TORCH_ROOT="${PROJ_ROOT_IOS}/pytorch" - -cd ${TORCH_ROOT} -wget https://ossci-ios-build.s3.amazonaws.com/${PYTORCH_IOS_NIGHTLY_NAME} -mkdir -p ./build_ios -unzip -d ./build_ios ./${PYTORCH_IOS_NIGHTLY_NAME} - -LIBTORCH_HEADER_ROOT="${TORCH_ROOT}/build_ios/install/include" -cd ${PROJ_ROOT_IOS} -IOS_ARCH=${IOS_ARCH} LIBTORCH_HEADER_ROOT=${LIBTORCH_HEADER_ROOT} ./build_ios.sh -rm -rf ${TORCH_ROOT} - -# store the binary -DEST_DIR=${WORKSPACE}/ios/${IOS_ARCH} -mkdir -p ${DEST_DIR} -cp ${PROJ_ROOT_IOS}/lib/*.a ${DEST_DIR} diff --git a/.circleci/unittest/ios/scripts/binary_ios_upload.sh b/.circleci/unittest/ios/scripts/binary_ios_upload.sh deleted file mode 100644 index ce56388e5da417a4b240b5c0389fef8439cb2510..0000000000000000000000000000000000000000 --- a/.circleci/unittest/ios/scripts/binary_ios_upload.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -set -ex -o pipefail - -echo "" -echo "DIR: $(pwd)" - -WORKSPACE=/Users/distiller/workspace -PROJ_ROOT=/Users/distiller/project -ARTIFACTS_DIR=${WORKSPACE}/ios -ls ${ARTIFACTS_DIR} -ZIP_DIR=${WORKSPACE}/zip -mkdir -p ${ZIP_DIR}/install/lib - -# build a FAT bianry -cd ${ZIP_DIR}/install/lib -libs=("${ARTIFACTS_DIR}/x86_64/libtorchvision_ops.a" "${ARTIFACTS_DIR}/arm64/libtorchvision_ops.a") -lipo -create "${libs[@]}" -o ${ZIP_DIR}/install/lib/libtorchvision_ops.a -lipo -i ${ZIP_DIR}/install/lib/*.a - -# copy the license -cp ${PROJ_ROOT}/LICENSE ${ZIP_DIR}/ -# zip the library -ZIPFILE=libtorchvision_ops_ios_nightly_build.zip -cd ${ZIP_DIR} -#for testing -touch version.txt -echo $(date +%s) > version.txt -zip -r ${ZIPFILE} install version.txt LICENSE - -# upload to aws -# Install conda then 'conda install' awscli -curl --retry 3 -o ~/conda.sh https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -chmod +x ~/conda.sh -/bin/bash ~/conda.sh -b -p ~/anaconda -export PATH="~/anaconda/bin:${PATH}" -source ~/anaconda/bin/activate -conda install -c conda-forge awscli --yes -set +x -export AWS_ACCESS_KEY_ID=${AWS_S3_ACCESS_KEY_FOR_PYTORCH_BINARY_UPLOAD} -export AWS_SECRET_ACCESS_KEY=${AWS_S3_ACCESS_SECRET_FOR_PYTORCH_BINARY_UPLOAD} -set -x -aws s3 cp ${ZIPFILE} s3://ossci-ios-build/ --acl public-read diff --git a/.circleci/unittest/linux/scripts/environment.yml b/.circleci/unittest/linux/scripts/environment.yml deleted file mode 100644 index 77ee99295195657edf884acbf6049f36b1f1f709..0000000000000000000000000000000000000000 --- a/.circleci/unittest/linux/scripts/environment.yml +++ /dev/null @@ -1,16 +0,0 @@ -channels: - - pytorch - - defaults -dependencies: - - pytest - - pytest-cov - - pytest-mock - - pip - - libpng - - jpeg - - ca-certificates - - h5py - - pip: - - future - - scipy - - av diff --git a/.circleci/unittest/linux/scripts/install.sh b/.circleci/unittest/linux/scripts/install.sh deleted file mode 100755 index 54722842a746a4691710c85298e269f654fd505c..0000000000000000000000000000000000000000 --- a/.circleci/unittest/linux/scripts/install.sh +++ /dev/null @@ -1,40 +0,0 @@ -#!/usr/bin/env bash - -unset PYTORCH_VERSION -# For unittest, nightly PyTorch is used as the following section, -# so no need to set PYTORCH_VERSION. 
-# In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config. - -set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env - -if [ "${CU_VERSION:-}" == cpu ] ; then - cudatoolkit="cpuonly" - version="cpu" -else - if [[ ${#CU_VERSION} -eq 4 ]]; then - CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}" - elif [[ ${#CU_VERSION} -eq 5 ]]; then - CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi - echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION: ${CU_VERSION} " - version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")" - cudatoolkit="pytorch-cuda=${version}" -fi - -case "$(uname -s)" in - Darwin*) os=MacOSX;; - *) os=Linux -esac - -printf "Installing PyTorch with %s\n" "${cudatoolkit}" -if [ "${os}" == "MacOSX" ]; then - conda install -y -c "pytorch-${UPLOAD_CHANNEL}" "pytorch-${UPLOAD_CHANNEL}"::pytorch "${cudatoolkit}" -else - conda install -y -c "pytorch-${UPLOAD_CHANNEL}" -c nvidia "pytorch-${UPLOAD_CHANNEL}"::pytorch[build="*${version}*"] "${cudatoolkit}" -fi - -printf "* Installing torchvision\n" -python setup.py develop diff --git a/.circleci/unittest/linux/scripts/post_process.sh b/.circleci/unittest/linux/scripts/post_process.sh deleted file mode 100755 index e97bf2a7b1b19fe99eaf0889a157f46c38cc0060..0000000000000000000000000000000000000000 --- a/.circleci/unittest/linux/scripts/post_process.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env diff --git a/.circleci/unittest/linux/scripts/run_test.sh b/.circleci/unittest/linux/scripts/run_test.sh deleted file mode 100755 index 8f6b8cb84850822c5476ebb87c3dc7bec0d57b9b..0000000000000000000000000000000000000000 --- a/.circleci/unittest/linux/scripts/run_test.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/bin/conda shell.bash hook)" -conda activate ./env - -python -m torch.utils.collect_env -pytest --junitxml=test-results/junit.xml -v --durations 20 diff --git a/.circleci/unittest/linux/scripts/setup_env.sh b/.circleci/unittest/linux/scripts/setup_env.sh deleted file mode 100755 index 0574cdff1cf6fd2ec91cf01e4a34d37eb95a4717..0000000000000000000000000000000000000000 --- a/.circleci/unittest/linux/scripts/setup_env.sh +++ /dev/null @@ -1,47 +0,0 @@ -#!/usr/bin/env bash - -# This script is for setting up environment in which unit test is ran. -# To speed up the CI time, the resulting environment is cached. -# -# Do not install PyTorch and torchvision here, otherwise they also get cached. - -set -e - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -# Avoid error: "fatal: unsafe repository" -git config --global --add safe.directory '*' -root_dir="$(git rev-parse --show-toplevel)" -conda_dir="${root_dir}/conda" -env_dir="${root_dir}/env" - -cd "${root_dir}" - -case "$(uname -s)" in - Darwin*) os=MacOSX;; - *) os=Linux -esac - -# 1. Install conda at ./conda -if [ ! -d "${conda_dir}" ]; then - printf "* Installing conda\n" - wget -O miniconda.sh "http://repo.continuum.io/miniconda/Miniconda3-latest-${os}-x86_64.sh" - bash ./miniconda.sh -b -f -p "${conda_dir}" -fi -eval "$(${conda_dir}/bin/conda shell.bash hook)" - -# 2. Create test environment at ./env -if [ ! -d "${env_dir}" ]; then - printf "* Creating a test environment\n" - conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" -fi -conda activate "${env_dir}" - -# 3. 
Install Conda dependencies -printf "* Installing dependencies (except PyTorch)\n" -FFMPEG_PIN="=4.2" -if [[ "${PYTHON_VERSION}" = "3.9" ]]; then - FFMPEG_PIN=">=4.2" -fi - -conda install -y -c pytorch "ffmpeg${FFMPEG_PIN}" -conda env update --file "${this_dir}/environment.yml" --prune diff --git a/.circleci/unittest/windows/scripts/environment.yml b/.circleci/unittest/windows/scripts/environment.yml deleted file mode 100644 index 0e07ae80d0d42a639d887425db8b042e030c2cd2..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/environment.yml +++ /dev/null @@ -1,19 +0,0 @@ -channels: - - pytorch - - defaults -dependencies: - - pytest - - pytest-cov - - pytest-mock - - pip - - libpng - - jpeg - - ca-certificates - - hdf5 - - setuptools - - pip: - - future - - scipy - - av != 9.1.1 - - dataclasses - - h5py diff --git a/.circleci/unittest/windows/scripts/install.sh b/.circleci/unittest/windows/scripts/install.sh deleted file mode 100644 index 85920abb8da88d882b6603d28ddabffec62cd300..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/install.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env bash - -unset PYTORCH_VERSION -# For unittest, nightly PyTorch is used as the following section, -# so no need to set PYTORCH_VERSION. -# In fact, keeping PYTORCH_VERSION forces us to hardcode PyTorch version in config. - -set -ex - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - -eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" -conda activate ./env - -# TODO, refactor the below logic to make it easy to understand how to get correct cuda_version. -if [ "${CU_VERSION:-}" == cpu ] ; then - cudatoolkit="cpuonly" - version="cpu" -else - if [[ ${#CU_VERSION} -eq 4 ]]; then - CUDA_VERSION="${CU_VERSION:2:1}.${CU_VERSION:3:1}" - elif [[ ${#CU_VERSION} -eq 5 ]]; then - CUDA_VERSION="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi - - cuda_toolkit_pckg="cudatoolkit" - if [[ $CUDA_VERSION == 11.6 || $CUDA_VERSION == 11.7 ]]; then - cuda_toolkit_pckg="pytorch-cuda" - fi - - echo "Using CUDA $CUDA_VERSION as determined by CU_VERSION" - version="$(python -c "print('.'.join(\"${CUDA_VERSION}\".split('.')[:2]))")" - cudatoolkit="${cuda_toolkit_pckg}=${version}" -fi - -printf "Installing PyTorch with %s\n" "${cudatoolkit}" -conda install -y -c "pytorch-${UPLOAD_CHANNEL}" -c nvidia "pytorch-${UPLOAD_CHANNEL}"::pytorch[build="*${version}*"] "${cudatoolkit}" - -torch_cuda=$(python -c "import torch; print(torch.cuda.is_available())") -echo torch.cuda.is_available is $torch_cuda - -if [ ! 
-z "${CUDA_VERSION:-}" ] ; then - if [ "$torch_cuda" == "False" ]; then - echo "torch with cuda installed but torch.cuda.is_available() is False" - exit 1 - fi -fi - -source "$this_dir/set_cuda_envs.sh" - -printf "* Installing torchvision\n" -"$this_dir/vc_env_helper.bat" python setup.py develop diff --git a/.circleci/unittest/windows/scripts/install_conda.bat b/.circleci/unittest/windows/scripts/install_conda.bat deleted file mode 100644 index 6052ad08b106accec140ef3f0e27cb4fe893377a..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/install_conda.bat +++ /dev/null @@ -1 +0,0 @@ -start /wait "" "%miniconda_exe%" /S /InstallationType=JustMe /RegisterPython=0 /AddToPath=0 /D=%tmp_conda% diff --git a/.circleci/unittest/windows/scripts/post_process.sh b/.circleci/unittest/windows/scripts/post_process.sh deleted file mode 100644 index 5c5cbb758a9ef2b235e6e5af308bef77fc26a253..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/post_process.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" -conda activate ./env diff --git a/.circleci/unittest/windows/scripts/run_test.sh b/.circleci/unittest/windows/scripts/run_test.sh deleted file mode 100644 index 802ad37f511adc7ab38adc992738d570a40432c4..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/run_test.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/usr/bin/env bash - -set -e - -eval "$(./conda/Scripts/conda.exe 'shell.bash' 'hook')" -conda activate ./env - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -source "$this_dir/set_cuda_envs.sh" - -python -m torch.utils.collect_env -pytest --junitxml=test-results/junit.xml -v --durations 20 diff --git a/.circleci/unittest/windows/scripts/set_cuda_envs.sh b/.circleci/unittest/windows/scripts/set_cuda_envs.sh deleted file mode 100644 index 7db3137b5944034cb556a341658fa0db95c75761..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/set_cuda_envs.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash -set -ex - -echo CU_VERSION is "${CU_VERSION}" -echo CUDA_VERSION is "${CUDA_VERSION}" - -# Currenly, CU_VERSION and CUDA_VERSION are not consistent. -# to understand this code, see https://github.com/pytorch/vision/issues/4443 -version="cpu" -if [[ ! -z "${CUDA_VERSION}" ]] ; then - version="$CUDA_VERSION" -else - if [[ ${#CU_VERSION} -eq 5 ]]; then - version="${CU_VERSION:2:2}.${CU_VERSION:4:1}" - fi -fi - -# Don't use if [[ "$version" == "cpu" ]]; then exit 0 fi. -# It would exit the shell. One result is cpu tests would not run if the shell exit. -# Unless there's an error, Don't exit. -if [[ "$version" != "cpu" ]]; then - # set cuda envs - export PATH="/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/bin:/c/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v${version}/libnvvp:$PATH" - export CUDA_PATH_V${version/./_}="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}" - export CUDA_PATH="C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v${version}" - - if [ ! -d "$CUDA_PATH" ]; then - echo "$CUDA_PATH" does not exist - exit 1 - fi - - if [ ! 
-f "${CUDA_PATH}\include\nvjpeg.h" ]; then - echo "nvjpeg does not exist" - exit 1 - fi - - # check cuda driver version - for path in '/c/Program Files/NVIDIA Corporation/NVSMI/nvidia-smi.exe' /c/Windows/System32/nvidia-smi.exe; do - if [[ -x "$path" ]]; then - "$path" || echo "true"; - break - fi - done - - which nvcc - nvcc --version - env | grep CUDA -fi diff --git a/.circleci/unittest/windows/scripts/setup_env.sh b/.circleci/unittest/windows/scripts/setup_env.sh deleted file mode 100644 index 5eeb2e17b48976243d6736c7fe5c4b3edd1e582e..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/setup_env.sh +++ /dev/null @@ -1,45 +0,0 @@ -#!/usr/bin/env bash - -# This script is for setting up environment in which unit test is ran. -# To speed up the CI time, the resulting environment is cached. -# -# Do not install PyTorch and torchvision here, otherwise they also get cached. - -set -e - -this_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -root_dir="$(git rev-parse --show-toplevel)" -conda_dir="${root_dir}/conda" -env_dir="${root_dir}/env" - -cd "${root_dir}" - -# 1. Install conda at ./conda -if [ ! -d "${conda_dir}" ]; then - printf "* Installing conda\n" - export tmp_conda="$(echo $conda_dir | tr '/' '\\')" - export miniconda_exe="$(echo $root_dir | tr '/' '\\')\\miniconda.exe" - curl --output miniconda.exe https://repo.anaconda.com/miniconda/Miniconda3-latest-Windows-x86_64.exe -O - "$this_dir/install_conda.bat" - unset tmp_conda - unset miniconda_exe -fi - -eval "$(${conda_dir}/Scripts/conda.exe 'shell.bash' 'hook')" - -# 2. Create test environment at ./env -if [ ! -d "${env_dir}" ]; then - printf "* Creating a test environment\n" - conda create --prefix "${env_dir}" -y python="$PYTHON_VERSION" -fi -conda activate "${env_dir}" - -# 3. Install Conda dependencies -printf "* Installing dependencies (except PyTorch)\n" -conda env update --file "${this_dir}/environment.yml" --prune - -# 4. Downgrade setuptools on Python 3.7. -# See https://github.com/pytorch/vision/pull/5868 -if [[ "${PYTHON_VERSION}" == '3.7' ]]; then - pip install --upgrade setuptools==58.0.4 -fi diff --git a/.circleci/unittest/windows/scripts/vc_env_helper.bat b/.circleci/unittest/windows/scripts/vc_env_helper.bat deleted file mode 100644 index 9410135677a4fdc1113d96c5a422583992c688c3..0000000000000000000000000000000000000000 --- a/.circleci/unittest/windows/scripts/vc_env_helper.bat +++ /dev/null @@ -1,39 +0,0 @@ -@echo on - -set VC_VERSION_LOWER=16 -set VC_VERSION_UPPER=17 - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [%VC_VERSION_LOWER%^,%VC_VERSION_UPPER%^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VS15INSTALLDIR=%%i" - set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" - goto vswhere - ) -) - -:vswhere -if "%VSDEVCMD_ARGS%" == "" ( - call "%VS15VCVARSALL%" x64 || exit /b 1 -) else ( - call "%VS15VCVARSALL%" x64 %VSDEVCMD_ARGS% || exit /b 1 -) - -@echo on - -set DISTUTILS_USE_SDK=1 - -set args=%1 -shift -:start -if [%1] == [] goto done -set args=%args% %1 -shift -goto start - -:done -if "%args%" == "" ( - echo Usage: vc_env_helper.bat [command] [args] - echo e.g. 
vc_env_helper.bat cl /c test.cpp -) - -%args% || exit /b 1 diff --git a/.clang-format b/.clang-format index 6d0ab740db4bd2ce6debe0008785a7d7c7468461..95d60445f4a51826e8a26e4b47c8233222261dda 100644 --- a/.clang-format +++ b/.clang-format @@ -60,9 +60,6 @@ MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None -ObjCBlockIndentWidth: 2 -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: false PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 @@ -85,4 +82,11 @@ SpacesInSquareBrackets: false Standard: Cpp11 TabWidth: 8 UseTab: Never +--- +Language: ObjC +ColumnLimit: 120 +AlignAfterOpenBracket: Align +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false ... diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs index eec9385478805eb123b2f88e30e428342632a1ae..5e88f5b9bb7b678fe14530eaec363d982f94686c 100644 --- a/.git-blame-ignore-revs +++ b/.git-blame-ignore-revs @@ -7,3 +7,7 @@ d367a01a18a3ae6bee13d8be3b63fd6a581ea46f # Upgrade usort to 1.0.2 and black to 22.3.0 (#5106) 6ca9c76adb6daf2695d603ad623a9cf1c4f4806f +# Fix unnecessary exploded black formatting (#7709) +a335d916db0694770e8152f41e19195de3134523 +# Renaming: `BoundingBox` -> `BoundingBoxes` (#7778) +332bff937c6711666191880fab57fa2f23ae772e diff --git a/.gitattributes b/.gitattributes index f9d672d7fb5b2db73cfff9cea21f7afb344f663c..22d0452f8d7e02ba33fa717d8a1792a76b050182 100644 --- a/.gitattributes +++ b/.gitattributes @@ -6,6 +6,3 @@ # To ignore it use below *.ipynb linguist-documentation - -# To exclude autogenerated files from code reviews -.circleci/config.yml linguist-generated=true diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index a073146ebedf6ae14ba5bf62fadb65ade8e0318d..ba811554c439216ec72175977938a6f2196bc0d8 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -48,7 +48,7 @@ body: description: | Please run the following and paste the output below. ```sh - wget https://raw.githubusercontent.com/pytorch/pytorch/master/torch/utils/collect_env.py + wget https://raw.githubusercontent.com/pytorch/pytorch/main/torch/utils/collect_env.py # For security purposes, please check the contents of collect_env.py before running it. python collect_env.py ``` diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 27d0f2a1f0b239bd5108a9ce77a81f69bb11edfe..1a3402466f4e03fa36c69260c1cf17cca893646d 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1 +1,10 @@ tracking_issue: 2447 + +# List of workflows that will be re-run in case of failures +# https://github.com/pytorch/test-infra/blob/main/torchci/lib/bot/retryBot.ts +retryable_workflows: +- Build Linux +- Build Macos +- Build M1 +- Build Windows +- Tests diff --git a/.github/scripts/cmake.sh b/.github/scripts/cmake.sh new file mode 100755 index 0000000000000000000000000000000000000000..ef3e5f61dad934d6060b41d49979e1478a0cc38c --- /dev/null +++ b/.github/scripts/cmake.sh @@ -0,0 +1,124 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +./.github/scripts/setup-env.sh + +# Activate conda environment +set +x && eval "$($(which conda) shell.bash hook)" && conda deactivate && conda activate ci && set -x + +# Setup the OS_TYPE environment variable that should be used for conditions involving the OS below. 
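+# Roughly how the detection below plays out on the CI runners: `uname` prints
+# "Linux" on the Linux runners, "Darwin" on both the x86_64 and M1 macOS
+# runners, and a kernel name starting with "MSYS" under the bash environment
+# on the Windows runners (hence the MSYS* glob). A minimal, self-contained
+# sketch of the same idea:
+#
+#   kernel="$(uname)"        # e.g. "Linux", "Darwin", "MSYS_NT-10.0-..."
+#   case "${kernel}" in
+#     Linux)  echo linux ;;
+#     Darwin) echo macos ;;
+#     MSYS*)  echo windows ;;
+#     *)      echo "unknown: ${kernel}" >&2 ;;
+#   esac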
+case $(uname) in + Linux) + OS_TYPE=linux + ;; + Darwin) + OS_TYPE=macos + ;; + MSYS*) + OS_TYPE=windows + ;; + *) + echo "Unknown OS type:" $(uname) + exit 1 + ;; +esac + +if [[ $OS_TYPE == macos ]]; then + JOBS=$(sysctl -n hw.logicalcpu) +else + JOBS=$(nproc) +fi + +TORCH_PATH=$(python -c "import pathlib, torch; print(pathlib.Path(torch.__path__[0]))") +if [[ $OS_TYPE == windows ]]; then + PACKAGING_DIR="${PWD}/packaging" + export PATH="${TORCH_PATH}/lib:${PATH}" +fi + +Torch_DIR="${TORCH_PATH}/share/cmake/Torch" +if [[ "${GPU_ARCH_TYPE}" == "cuda" ]]; then + WITH_CUDA=1 +else + WITH_CUDA=0 +fi + +echo '::group::Prepare CMake builds' +mkdir -p cpp_build + +pushd test/tracing/frcnn +python trace_model.py +mkdir -p build +mv fasterrcnn_resnet50_fpn.pt build +popd + +pushd examples/cpp/hello_world +python trace_model.py +mkdir -p build +mv resnet18.pt build +popd + +# This was only needed for the tracing above +pip uninstall -y torchvision +echo '::endgroup::' + +echo '::group::Build and install libtorchvision' +pushd cpp_build + +# On macOS, CMake is looking for the library (*.dylib) and the header (*.h) separately. By default, it prefers to load +# the header from other packages that install the library. This easily leads to a mismatch if the library installed +# from conda doesn't have the exact same version. Thus, we need to explicitly set CMAKE_FIND_FRAMEWORK=NEVER to force +# it to not load anything from other installed frameworks. Resources: +# https://stackoverflow.com/questions/36523911/osx-homebrew-cmake-libpng-version-mismatch-issue +# https://cmake.org/cmake/help/latest/variable/CMAKE_FIND_FRAMEWORK.html +cmake .. -DTorch_DIR="${Torch_DIR}" -DWITH_CUDA="${WITH_CUDA}" \ + -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ + -DCMAKE_FIND_FRAMEWORK=NEVER \ + -DCMAKE_INSTALL_PREFIX="${CONDA_PREFIX}" +if [[ $OS_TYPE == windows ]]; then + "${PACKAGING_DIR}/windows/internal/vc_env_helper.bat" "${PACKAGING_DIR}/windows/internal/build_cmake.bat" $JOBS +else + make -j$JOBS + make install +fi + +popd +echo '::endgroup::' + +echo '::group::Build and run project that uses Faster-RCNN' +pushd test/tracing/frcnn/build + +cmake .. -DTorch_DIR="${Torch_DIR}" -DWITH_CUDA="${WITH_CUDA}" \ + -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ + -DCMAKE_FIND_FRAMEWORK=NEVER +if [[ $OS_TYPE == windows ]]; then + "${PACKAGING_DIR}/windows/internal/vc_env_helper.bat" "${PACKAGING_DIR}/windows/internal/build_frcnn.bat" $JOBS + cd Release + cp ../fasterrcnn_resnet50_fpn.pt . +else + make -j$JOBS +fi + +./test_frcnn_tracing + +popd +echo '::endgroup::' + +echo '::group::Build and run C++ example' +pushd examples/cpp/hello_world/build + +cmake .. -DTorch_DIR="${Torch_DIR}" \ + -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ + -DCMAKE_FIND_FRAMEWORK=NEVER +if [[ $OS_TYPE == windows ]]; then + "${PACKAGING_DIR}/windows/internal/vc_env_helper.bat" "${PACKAGING_DIR}/windows/internal/build_cpp_example.bat" $JOBS + cd Release + cp ../resnet18.pt . 
+else + make -j$JOBS +fi + +./hello-world + +popd +echo '::endgroup::' diff --git a/.circleci/unittest/linux/scripts/run-clang-format.py b/.github/scripts/run-clang-format.py similarity index 99% rename from .circleci/unittest/linux/scripts/run-clang-format.py rename to .github/scripts/run-clang-format.py index 5c61b2519e04617b3f7aedc8600c350579e27d39..670fd97833a7c7395c71771ca3c2060b9930cc9e 100755 --- a/.circleci/unittest/linux/scripts/run-clang-format.py +++ b/.github/scripts/run-clang-format.py @@ -48,7 +48,7 @@ except ImportError: DEVNULL = open(os.devnull, "wb") -DEFAULT_EXTENSIONS = "c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx,cu" +DEFAULT_EXTENSIONS = "c,h,C,H,cpp,hpp,cc,hh,c++,h++,cxx,hxx,cu,mm" class ExitStatus: diff --git a/.github/scripts/setup-env.sh b/.github/scripts/setup-env.sh new file mode 100755 index 0000000000000000000000000000000000000000..a4f113c367fa801bd0e95bded875eac4e9f8f15c --- /dev/null +++ b/.github/scripts/setup-env.sh @@ -0,0 +1,118 @@ +#!/usr/bin/env bash + +set -euxo pipefail + +# Prepare conda +set +x && eval "$($(which conda) shell.bash hook)" && set -x + +# Setup the OS_TYPE environment variable that should be used for conditions involving the OS below. +case $(uname) in + Linux) + OS_TYPE=linux + ;; + Darwin) + OS_TYPE=macos + ;; + MSYS*) + OS_TYPE=windows + ;; + *) + echo "Unknown OS type:" $(uname) + exit 1 + ;; +esac + +if [[ "${OS_TYPE}" == "macos" && $(uname -m) == x86_64 ]]; then + echo '::group::Uninstall system JPEG libraries on macOS' + # The x86 macOS runners, e.g. the GitHub Actions native "macos-12" runner, has some JPEG and PNG libraries + # installed by default that interfere with our build. We uninstall them here and use the one from conda below. + IMAGE_LIBS=$(brew list | grep -E "jpeg|png") + for lib in $IMAGE_LIBS; do + brew uninstall --ignore-dependencies --force "${lib}" + done + echo '::endgroup::' +fi + +echo '::group::Create build environment' +# See https://github.com/pytorch/vision/issues/7296 for ffmpeg +conda create \ + --name ci \ + --quiet --yes \ + python="${PYTHON_VERSION}" pip \ + ninja cmake \ + libpng \ + 'ffmpeg<4.3' +conda activate ci +conda install --quiet --yes libjpeg-turbo -c pytorch +pip install --progress-bar=off --upgrade setuptools + +# See https://github.com/pytorch/vision/issues/6790 +if [[ "${PYTHON_VERSION}" != "3.11" ]]; then + pip install --progress-bar=off av!=10.0.0 +fi + +echo '::endgroup::' + +if [[ "${OS_TYPE}" == windows && "${GPU_ARCH_TYPE}" == cuda ]]; then + echo '::group::Install VisualStudio CUDA extensions on Windows' + if [[ "${VC_YEAR:-}" == "2022" ]]; then + TARGET_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/MSBuild/Microsoft/VC/v170/BuildCustomizations" + else + TARGET_DIR="/c/Program Files (x86)/Microsoft Visual Studio/2019/BuildTools/MSBuild/Microsoft/VC/v160/BuildCustomizations" + fi + mkdir -p "${TARGET_DIR}" + cp -r "${CUDA_HOME}/MSBuildExtensions/"* "${TARGET_DIR}" + echo '::endgroup::' +fi + +echo '::group::Install PyTorch' +# TODO: Can we maybe have this as environment variable in the job template? For example, `IS_RELEASE`. 
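+# Roughly: the block below picks CHANNEL=test when the workflow runs against a
+# release branch (either as the PR base branch or as the pushed ref) and
+# CHANNEL=nightly otherwise, then maps GPU_ARCH_TYPE/GPU_ARCH_VERSION onto the
+# suffix of the PyTorch wheel index ("cpu", or "cu" plus the CUDA version with
+# the dot removed). A hedged worked example, using the CUDA 11.8 configuration
+# from the CMake workflow:
+#
+#   CHANNEL=nightly GPU_ARCH_TYPE=cuda GPU_ARCH_VERSION=11.8
+#   GPU_ARCH_ID="cu$(echo "${GPU_ARCH_VERSION}" | sed 's/\.//')"   # -> cu118
+#   # -> PYTORCH_WHEEL_INDEX=https://download.pytorch.org/whl/nightly/cu118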
+if [[ (${GITHUB_EVENT_NAME} = 'pull_request' && (${GITHUB_BASE_REF} = 'release'*)) || (${GITHUB_REF} = 'refs/heads/release'*) ]]; then + CHANNEL=test +else + CHANNEL=nightly +fi + +case $GPU_ARCH_TYPE in + cpu) + GPU_ARCH_ID="cpu" + ;; + cuda) + VERSION_WITHOUT_DOT=$(echo "${GPU_ARCH_VERSION}" | sed 's/\.//') + GPU_ARCH_ID="cu${VERSION_WITHOUT_DOT}" + ;; + *) + echo "Unknown GPU_ARCH_TYPE=${GPU_ARCH_TYPE}" + exit 1 + ;; +esac +PYTORCH_WHEEL_INDEX="https://download.pytorch.org/whl/${CHANNEL}/${GPU_ARCH_ID}" +pip install --progress-bar=off --pre torch --index-url="${PYTORCH_WHEEL_INDEX}" + +if [[ $GPU_ARCH_TYPE == 'cuda' ]]; then + python -c "import torch; exit(not torch.cuda.is_available())" +fi +echo '::endgroup::' + +echo '::group::Install third party dependencies prior to TorchVision install' +# Installing with `easy_install`, e.g. `python setup.py install` or `python setup.py develop`, has some quirks when +# when pulling in third-party dependencies. For example: +# - On Windows, we often hit an SSL error although `pip` can install just fine. +# - It happily pulls in pre-releases, which can lead to more problems down the line. +# `pip` does not unless explicitly told to do so. +# Thus, we use `easy_install` to extract the third-party dependencies here and install them upfront with `pip`. +python setup.py egg_info +# The requires.txt cannot be used with `pip install -r` directly. The requirements are listed at the top and the +# optional dependencies come in non-standard syntax after a blank line. Thus, we just extract the header. +sed -e '/^$/,$d' *.egg-info/requires.txt | tee requirements.txt +pip install --progress-bar=off -r requirements.txt +echo '::endgroup::' + +echo '::group::Install TorchVision' +python setup.py develop +echo '::endgroup::' + +echo '::group::Collect environment information' +conda list +python -m torch.utils.collect_env +echo '::endgroup::' diff --git a/.github/scripts/unittest.sh b/.github/scripts/unittest.sh new file mode 100755 index 0000000000000000000000000000000000000000..bb2ad73715abf228a365922b11c20a6549d5479c --- /dev/null +++ b/.github/scripts/unittest.sh @@ -0,0 +1,15 @@ +#!/usr/bin/env bash + +set -euo pipefail + +./.github/scripts/setup-env.sh + +# Activate conda environment +eval "$($(which conda) shell.bash hook)" && conda deactivate && conda activate ci + +echo '::group::Install testing utilities' +pip install --progress-bar=off pytest pytest-mock pytest-cov +echo '::endgroup::' + +python test/smoke_test.py +pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 diff --git a/.github/workflows/build-cmake.yml b/.github/workflows/build-cmake.yml new file mode 100644 index 0000000000000000000000000000000000000000..23f2b4b06ec832f85c5f9689ccf4b5624839570b --- /dev/null +++ b/.github/workflows/build-cmake.yml @@ -0,0 +1,83 @@ +name: CMake + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + linux: + strategy: + matrix: + include: + - runner: linux.12xlarge + gpu-arch-type: cpu + - runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ 
matrix.gpu-arch-version }} + + ./.github/scripts/cmake.sh + + macos: + strategy: + matrix: + include: + - runner: macos-12 + - runner: macos-m1-12 + fail-fast: false + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.1 + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/cmake.sh + + windows: + strategy: + matrix: + include: + - runner: windows.4xlarge + gpu-arch-type: cpu + - runner: windows.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/windows_job.yml@release/2.1 + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export VC_YEAR=2022 + export VSDEVCMD_ARGS="" + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/cmake.sh diff --git a/.github/workflows/build-conda-linux.yml b/.github/workflows/build-conda-linux.yml new file mode 100644 index 0000000000000000000000000000000000000000..8da9d488f7e9332caa2c314a9e76123429f7fc56 --- /dev/null +++ b/.github/workflows/build-conda-linux.yml @@ -0,0 +1,52 @@ +name: Build Linux Conda + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: conda + os: linux + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: "" + post-script: "" + conda-package-directory: packaging/torchvision + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_conda_linux.yml@release/2.1 + with: + conda-package-directory: ${{ matrix.conda-package-directory }} + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + secrets: + CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} diff --git a/.github/workflows/build-conda-m1.yml b/.github/workflows/build-conda-m1.yml new file mode 100644 index 0000000000000000000000000000000000000000..4a347e1baf5a074717b5fab92371a8408fcaff94 --- /dev/null +++ b/.github/workflows/build-conda-m1.yml @@ -0,0 +1,53 @@ +name: Build M1 Conda + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + 
generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: conda + os: macos-arm64 + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: "" + post-script: "" + conda-package-directory: packaging/torchvision + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_conda_macos.yml@release/2.1 + with: + conda-package-directory: ${{ matrix.conda-package-directory }} + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + runner-type: macos-m1-12 + trigger-event: ${{ github.event_name }} + secrets: + CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} diff --git a/.github/workflows/build-conda-macos.yml b/.github/workflows/build-conda-macos.yml new file mode 100644 index 0000000000000000000000000000000000000000..aca1b12754cdfeb8a3b2817a0876875a01d35828 --- /dev/null +++ b/.github/workflows/build-conda-macos.yml @@ -0,0 +1,53 @@ +name: Build Macos Conda + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: conda + os: macos + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: "" + post-script: "" + conda-package-directory: packaging/torchvision + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_conda_macos.yml@release/2.1 + with: + conda-package-directory: ${{ matrix.conda-package-directory }} + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + runner-type: macos-12 + trigger-event: ${{ github.event_name }} + secrets: + CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} diff --git a/.github/workflows/build-conda-windows.yml b/.github/workflows/build-conda-windows.yml new file mode 100644 index 0000000000000000000000000000000000000000..f03e4c57fc1c54e5627ec271da4614bdacad778e --- /dev/null +++ b/.github/workflows/build-conda-windows.yml @@ -0,0 +1,52 @@ +name: Build Windows Conda + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only 
get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: conda + os: windows + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + env-script: packaging/windows/internal/vc_env_helper.bat + post-script: "" + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_conda_windows.yml@release/2.1 + with: + conda-package-directory: ${{ matrix.conda-package-directory }} + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + secrets: + CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} diff --git a/.github/workflows/build-m1-binaries.yml b/.github/workflows/build-m1-binaries.yml deleted file mode 100644 index 81b15172a7e279bd4dadecf04ffb94e89e006e98..0000000000000000000000000000000000000000 --- a/.github/workflows/build-m1-binaries.yml +++ /dev/null @@ -1,160 +0,0 @@ -name: Build on M1 -on: - pull_request: - paths: - - .github/workflows/build-m1-binaries.yml - push: - branches: - - nightly - - main - - release/* - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - workflow_dispatch: -env: - CHANNEL: "nightly" -jobs: - build_wheels: - name: "Build TorchVision M1 wheels" - runs-on: macos-m1-12 - strategy: - matrix: - py_vers: [ "3.8", "3.9", "3.10" ] - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Set Release CHANNEL (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Setup miniconda - uses: pytorch/test-infra/.github/actions/setup-miniconda@main - - name: Build TorchVision M1 wheel - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - # Needed for JPEG library detection as setup.py detects conda presence by running `shutil.which('conda')` - set -ex - . 
packaging/pkg_helpers.bash - # if we are uploading to test channell, our version consist only of the base: 0.x.x - no date string or suffix added - if [[ $CHANNEL == "test" ]]; then - setup_base_build_version - else - setup_build_version - fi - - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg wheel pkg-config - conda run -p ${ENV_NAME} python3 -mpip install torch --pre --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} - conda run -p ${ENV_NAME} python3 -mpip install delocate - conda run -p ${ENV_NAME} python3 setup.py bdist_wheel - export PYTORCH_VERSION="$(conda run -p ${ENV_NAME} python3 -mpip show torch | grep ^Version: | sed 's/Version: *//')" - conda run -p ${ENV_NAME} DYLD_FALLBACK_LIBRARY_PATH="${ENV_NAME}/lib" delocate-wheel -v --ignore-missing-dependencies dist/*.whl - conda env remove -p ${ENV_NAME} - - name: Test wheel - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-test-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - set -ex - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy - conda run -p ${ENV_NAME} python3 -mpip install torch --pre --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} - conda run -p ${ENV_NAME} python3 -mpip install dist/*.whl - # Test torch is importable, by changing cwd and running import commands - conda run --cwd /tmp -p ${ENV_NAME} python3 -c "import torchvision;print('torchvision version is ', torchvision.__version__)" - conda run --cwd /tmp -p ${ENV_NAME} python3 -c "import torch;import torchvision;print('Is torchvision useable?', all(x is not None for x in [torch.ops.image.decode_png, torch.ops.torchvision.roi_align]))" - conda run --cwd /tmp -p ${ENV_NAME} python3 -c "import torchvision;print(torchvision.io.read_image('${PWD}/gallery/assets/dog1.jpg').shape)" - conda env remove -p ${ENV_NAME} - - name: Upload wheel to GitHub - uses: actions/upload-artifact@v3 - with: - name: torchvision-py${{ matrix.py_vers }}-macos11-m1 - path: dist/ - - name: Upload wheel to S3 - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/')) }} - shell: arch -arch arm64 bash {0} - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - run: | - for pkg in dist/*; do - aws s3 cp "$pkg" "s3://pytorch/whl/${CHANNEL}/cpu/" --acl public-read - done - build_conda: - name: "Build TorchVision M1 conda packages" - runs-on: macos-m1-12 - strategy: - matrix: - py_vers: [ "3.8", "3.9", "3.10" ] - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set CHANNEL (only for tagged pushes) - if: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags/') }} - run: | - # reference ends with an RC suffix - if [[ ${GITHUB_REF_NAME} = *-rc[0-9]* ]]; then - echo "CHANNEL=test" >> "$GITHUB_ENV" - fi - - name: Set CHANNEL Release (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Setup miniconda - uses: pytorch/test-infra/.github/actions/setup-miniconda@main - - name: Install conda-build and purge previous artifacts - shell: arch -arch arm64 bash {0} - run: | - conda install -yq conda-build - conda build purge-all - - name: Build TorchVision M1 conda package - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - 
PYTHON_VERSION: ${{ matrix.py_vers }} - CU_VERSION: cpu - run: | - set -ex - . packaging/pkg_helpers.bash - - if [[ $CHANNEL == "test" ]]; then - setup_base_build_version - else - setup_build_version - fi - - setup_conda_pytorch_constraint - export SOURCE_ROOT_DIR=$(pwd) - conda build \ - -c defaults \ - $CONDA_CHANNEL_FLAGS \ - --no-anaconda-upload \ - --python "$PYTHON_VERSION" \ - --output-folder=dist/ \ - packaging/torchvision - - name: Upload package to GitHub - uses: actions/upload-artifact@v3 - with: - name: torchvision-py${{ matrix.py_vers }}-macos11-m1-conda - path: dist/ - - name: Upload package to conda - if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/')) }} - shell: arch -arch arm64 bash {0} - env: - CONDA_PYTORCHBOT_TOKEN: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - run: | - conda install -yq anaconda-client - set -x - export ANACONDA_PATH=$(conda info --base)/bin - $ANACONDA_PATH/anaconda -t "${CONDA_PYTORCHBOT_TOKEN}" upload dist/osx-arm64/*.tar.bz2 -u "pytorch-${CHANNEL}" --label main --no-progress --force diff --git a/.github/workflows/build-wheels-aarch64-linux.yml b/.github/workflows/build-wheels-aarch64-linux.yml new file mode 100644 index 0000000000000000000000000000000000000000..30bcd3955012607624f1cb5e838104c992325acf --- /dev/null +++ b/.github/workflows/build-wheels-aarch64-linux.yml @@ -0,0 +1,53 @@ +name: Build Aarch64 Linux Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: wheel + os: linux-aarch64 + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + with-cuda: disable + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@release/2.1 + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + architecture: aarch64 + setup-miniconda: false + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml new file mode 100644 index 0000000000000000000000000000000000000000..e04c7383eaa1ca7a226905bb147a41bcc61c01aa --- /dev/null +++ b/.github/workflows/build-wheels-linux.yml @@ -0,0 +1,50 @@ +name: Build Linux Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look 
like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: wheel + os: linux + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_linux.yml@release/2.1 + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/build-wheels-m1.yml b/.github/workflows/build-wheels-m1.yml new file mode 100644 index 0000000000000000000000000000000000000000..b4c4becc7106e9cf66e6774ab2e046541343277b --- /dev/null +++ b/.github/workflows/build-wheels-m1.yml @@ -0,0 +1,51 @@ +name: Build M1 Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: wheel + os: macos-arm64 + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@release/2.1 + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + runner-type: macos-m1-12 + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/build-wheels-macos.yml b/.github/workflows/build-wheels-macos.yml new file mode 100644 index 0000000000000000000000000000000000000000..f79ace74583cbbe514c8640e6f2f6f514bfc57f2 --- /dev/null +++ b/.github/workflows/build-wheels-macos.yml @@ -0,0 +1,51 @@ +name: Build Macos Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary 
build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: wheel + os: macos + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + post-script: packaging/post_build_script.sh + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_macos.yml@release/2.1 + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + runner-type: macos-12 + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/build-wheels-windows.yml b/.github/workflows/build-wheels-windows.yml new file mode 100644 index 0000000000000000000000000000000000000000..c9b3ebd72e75d516922ab517dca8b3d7e216c492 --- /dev/null +++ b/.github/workflows/build-wheels-windows.yml @@ -0,0 +1,52 @@ +name: Build Windows Wheels + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + # NOTE: Binary build pipelines should only get triggered on release candidate builds + # Release candidate tags look like: v1.11.0-rc1 + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + generate-matrix: + uses: pytorch/test-infra/.github/workflows/generate_binary_build_matrix.yml@release/2.1 + with: + package-type: wheel + os: windows + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build: + needs: generate-matrix + strategy: + fail-fast: false + matrix: + include: + - repository: pytorch/vision + pre-script: packaging/pre_build_script.sh + env-script: packaging/windows/internal/vc_env_helper.bat + post-script: "python packaging/wheel/relocate.py" + smoke-test-script: test/smoke_test.py + package-name: torchvision + name: ${{ matrix.repository }} + uses: pytorch/test-infra/.github/workflows/build_wheels_windows.yml@release/2.1 + with: + repository: ${{ matrix.repository }} + ref: "" + test-infra-repository: pytorch/test-infra + test-infra-ref: release/2.1 + build-matrix: ${{ needs.generate-matrix.outputs.matrix }} + pre-script: ${{ matrix.pre-script }} + env-script: ${{ matrix.env-script }} + post-script: ${{ matrix.post-script }} + package-name: ${{ matrix.package-name }} + smoke-test-script: ${{ matrix.smoke-test-script }} + trigger-event: ${{ github.event_name }} + secrets: + AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} + AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml new file mode 100644 index 0000000000000000000000000000000000000000..724ee09a472a7e7011be412f0b395804986ccade --- /dev/null +++ 
b/.github/workflows/docs.yml @@ -0,0 +1,127 @@ +name: Docs + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + tags: + - v[0-9]+.[0-9]+.[0-9] + - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + workflow_dispatch: + +jobs: + build: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + upload-artifact: docs + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + # FIXME: not sure why we need this. `ldd torchvision/video_reader.so` shows that it + # already links against the one pulled from conda. However, at runtime it pulls from + # /lib64 + # Should we maybe always do this in `./.github/scripts/setup-env.sh` so that we don't + # have to pay attention in all other workflows? + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" + + cd docs + + echo '::group::Install doc requirements' + pip install --progress-bar=off -r requirements.txt + echo '::endgroup::' + + if [[ ${{ github.event_name }} == push && (${{ github.ref_type }} == tag || (${{ github.ref_type }} == branch && ${{ github.ref_name }} == release/*)) ]]; then + echo '::group::Enable version string sanitization' + # This environment variable just has to exist and must not be empty. The actual value is arbitrary. + # See docs/source/conf.py for details + export TORCHVISION_SANITIZE_VERSION_STR_IN_DOCS=1 + echo '::endgroup::' + fi + + # The runner does not have sufficient memory to run with as many processes as there are + # cores (`-j auto`). Thus, we limit to a single process (`-j 1`) here. + sed -i -e 's/-j auto/-j 1/' Makefile + make html + + # Below is an imperfect way for us to add "try on collab" links to all of our gallery examples. + # sphinx-gallery will convert all gallery examples to .ipynb notebooks and stores them in + # build/html/_downloads//.ipynb + # We copy all those ipynb files in a more convenient folder so that we can more easily link to them. + mkdir build/html/_generated_ipynb_notebooks + for file in `find build/html/_downloads`; do + if [[ $file == *.ipynb ]]; then + cp $file build/html/_generated_ipynb_notebooks/ + fi + done + + cp -r build/html "${RUNNER_ARTIFACT_DIR}" + + # On PRs we also want to upload the docs into our S3 bucket for preview. + if [[ ${{ github.event_name == 'pull_request' }} ]]; then + cp -r build/html/* "${RUNNER_DOCS_DIR}" + fi + + upload: + needs: build + if: github.repository == 'pytorch/vision' && github.event_name == 'push' && + ((github.ref_type == 'branch' && github.ref_name == 'main') || github.ref_type == 'tag') + permissions: + contents: write + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + download-artifact: docs + ref: gh-pages + script: | + set -euo pipefail + + REF_TYPE=${{ github.ref_type }} + REF_NAME=${{ github.ref_name }} + + if [[ "${REF_TYPE}" == branch ]]; then + TARGET_FOLDER="${REF_NAME}" + elif [[ "${REF_TYPE}" == tag ]]; then + case "${REF_NAME}" in + *-rc*) + echo "Aborting upload since this is an RC tag: ${REF_NAME}" + exit 0 + ;; + *) + # Strip the leading "v" as well as the trailing patch version. 
For example: + # 'v0.15.2' -> '0.15' + TARGET_FOLDER=$(echo "${REF_NAME}" | sed 's/v\([0-9]\+\)\.\([0-9]\+\)\.[0-9]\+/\1.\2/') + ;; + esac + fi + echo "Target Folder: ${TARGET_FOLDER}" + + mkdir -p "${TARGET_FOLDER}" + rm -rf "${TARGET_FOLDER}"/* + mv "${RUNNER_ARTIFACT_DIR}"/html/* "${TARGET_FOLDER}" + git add "${TARGET_FOLDER}" || true + + if [[ "${TARGET_FOLDER}" == main ]]; then + mkdir -p _static + rm -rf _static/* + cp -r "${TARGET_FOLDER}"/_static/* _static + git add _static || true + fi + + git config user.name 'pytorchbot' + git config user.email 'soumith+bot@pytorch.org' + git config http.postBuffer 524288000 + git commit -m "auto-generating sphinx docs" || true + git push diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml new file mode 100644 index 0000000000000000000000000000000000000000..917bc54c8327136a91ae6312e8f938d6d5e918fe --- /dev/null +++ b/.github/workflows/lint.yml @@ -0,0 +1,106 @@ +name: Lint + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + python-source-and-configs: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + script: | + set -euo pipefail + + echo '::group::Setup environment' + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda create --name ci --quiet --yes python=3.8 pip + conda activate ci + echo '::endgroup::' + + echo '::group::Install lint tools' + pip install --progress-bar=off pre-commit + echo '::endgroup::' + + set +e + pre-commit run --all-files + + if [ $? -ne 0 ]; then + git --no-pager diff + exit 1 + fi + + c-source: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + script: | + set -euo pipefail + + echo '::group::Setup environment' + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + # clang-format needs some shared libraries that conflict with the system ones. Thus, we install them from conda + # and prepend the libraries to linker path to prioritize them. `ncurses=5` is only available on the conda-forge + # channel. Since we are not building or testing here, this is fine. + conda create --name ci --quiet --yes -c conda-forge python=3.8 ncurses=5 libgcc + conda activate ci + export LD_LIBRARY_PATH="${CONDA_PREFIX}/lib:${LD_LIBRARY_PATH}" + echo '::endgroup::' + + echo '::group::Install lint tools' + curl https://oss-clang-format.s3.us-east-2.amazonaws.com/linux64/clang-format-linux64 -o ./clang-format + chmod +x ./clang-format + echo '::endgroup::' + + echo '::group::Lint C source' + set +e + ./.github/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable ./clang-format + + if [ $? 
-ne 0 ]; then + git --no-pager diff + exit 1 + fi + echo '::endgroup::' + + python-types: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Install lint tools' + pip install --progress-bar=off mypy + echo '::endgroup::' + + echo '::group::Lint Python types' + mypy --install-types --non-interactive --config-file mypy.ini + echo '::endgroup::' + + bc: + if: github.event.pull_request + runs-on: ubuntu-latest + steps: + - name: Run BC Lint Action + uses: pytorch/test-infra/.github/actions/bc-lint@release/2.1 + with: + repo: ${{ github.event.pull_request.head.repo.full_name }} + base_sha: ${{ github.event.pull_request.base.sha }} + head_sha: ${{ github.event.pull_request.head.sha }} diff --git a/.github/workflows/test-m1.yml b/.github/workflows/test-m1.yml deleted file mode 100644 index 1e5f79f82fd764eb8fe725b927144fe51a961a0d..0000000000000000000000000000000000000000 --- a/.github/workflows/test-m1.yml +++ /dev/null @@ -1,50 +0,0 @@ -name: Unit-tests on M1 -on: - pull_request: - push: - branches: - - nightly - - main - - release/* - workflow_dispatch: -env: - CHANNEL: "nightly" -jobs: - tests: - name: "Unit-tests on M1" - runs-on: macos-m1-12 - strategy: - matrix: - py_vers: [ "3.8"] - - steps: - - name: Checkout repository - uses: actions/checkout@v2 - - name: Set Release CHANNEL (for release) - if: ${{ (github.event_name == 'pull_request' && startsWith(github.base_ref, 'release')) || startsWith(github.ref, 'refs/heads/release') }} - run: | - echo "CHANNEL=test" >> "$GITHUB_ENV" - - name: Install TorchVision - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - . ~/miniconda3/etc/profile.d/conda.sh - # Needed for JPEG library detection as setup.py detects conda presence by running `shutil.which('conda')` - export PATH=~/miniconda3/bin:$PATH - set -ex - conda create -yp ${ENV_NAME} python=${PY_VERS} numpy libpng jpeg scipy - conda run -p ${ENV_NAME} python3 -mpip install --pre torch --extra-index-url=https://download.pytorch.org/whl/${CHANNEL} - conda run -p ${ENV_NAME} python3 setup.py develop - conda run -p ${ENV_NAME} python3 -mpip install pytest pytest-mock av - - name: Run tests - shell: arch -arch arm64 bash {0} - env: - ENV_NAME: conda-env-${{ github.run_id }} - PY_VERS: ${{ matrix.py_vers }} - run: | - . 
~/miniconda3/etc/profile.d/conda.sh - set -ex - conda run -p ${ENV_NAME} --no-capture-output python3 -u -mpytest -v --tb=long --durations 20 - conda env remove -p ${ENV_NAME} diff --git a/.github/workflows/tests-schedule.yml b/.github/workflows/tests-schedule.yml index ecc283cac27e968ff70c0c2f405775acf15d2fd4..5426fdc997a58ad5d5afb6d48969595ca879bf4d 100644 --- a/.github/workflows/tests-schedule.yml +++ b/.github/workflows/tests-schedule.yml @@ -18,11 +18,14 @@ jobs: - name: Set up python uses: actions/setup-python@v2 with: - python-version: 3.7 + python-version: 3.8 - name: Upgrade system packages run: python -m pip install --upgrade pip setuptools wheel + - name: SSL + run: python -c 'import ssl; print(ssl.OPENSSL_VERSION)' + - name: Checkout repository uses: actions/checkout@v2 diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..7c1a334e1085c003a25e178083d1fc377f4d3741 --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,168 @@ +name: Tests + +on: + pull_request: + push: + branches: + - nightly + - main + - release/* + workflow_dispatch: + +jobs: + unittests-linux: + strategy: + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + runner: ["linux.12xlarge"] + gpu-arch-type: ["cpu"] + include: + - python-version: 3.8 + runner: linux.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + timeout: 120 + script: | + set -euo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/unittest.sh + + unittests-macos: + strategy: + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + runner: ["macos-12"] + include: + - python-version: "3.8" + runner: macos-m1-12 + fail-fast: false + uses: pytorch/test-infra/.github/workflows/macos_job.yml@release/2.1 + with: + repository: pytorch/vision + # We need an increased timeout here, since the macos-12 runner is the free one from GH + # and needs roughly 2 hours to just run the test suite + timeout: 240 + runner: ${{ matrix.runner }} + script: | + set -euo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/unittest.sh + + unittests-windows: + strategy: + matrix: + python-version: + - "3.8" + - "3.9" + - "3.10" + - "3.11" + runner: ["windows.4xlarge"] + gpu-arch-type: ["cpu"] + include: + - python-version: "3.8" + runner: windows.g5.4xlarge.nvidia.gpu + gpu-arch-type: cuda + gpu-arch-version: "11.8" + fail-fast: false + uses: pytorch/test-infra/.github/workflows/windows_job.yml@release/2.1 + with: + repository: pytorch/vision + runner: ${{ matrix.runner }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + timeout: 120 + script: | + set -euxo pipefail + + export PYTHON_VERSION=${{ matrix.python-version }} + export VC_YEAR=2019 + export VSDEVCMD_ARGS="" + export GPU_ARCH_TYPE=${{ matrix.gpu-arch-type }} + export GPU_ARCH_VERSION=${{ matrix.gpu-arch-version }} + + ./.github/scripts/unittest.sh + + onnx: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: 
pytorch/vision + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Install ONNX' + pip install --progress-bar=off onnx onnxruntime + echo '::endgroup::' + + echo '::group::Install testing utilities' + pip install --progress-bar=off pytest + echo '::endgroup::' + + echo '::group::Run ONNX tests' + pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_onnx.py + echo '::endgroup::' + + unittests-extended: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@release/2.1 + with: + repository: pytorch/vision + script: | + set -euo pipefail + + export PYTHON_VERSION=3.8 + export GPU_ARCH_TYPE=cpu + export GPU_ARCH_VERSION='' + + ./.github/scripts/setup-env.sh + + # Prepare conda + CONDA_PATH=$(which conda) + eval "$(${CONDA_PATH} shell.bash hook)" + conda activate ci + + echo '::group::Pre-download model weights' + pip install --progress-bar=off aiohttp aiofiles tqdm + python scripts/download_model_urls.py + echo '::endgroup::' + + echo '::group::Install testing utilities' + pip install --progress-bar=off pytest + echo '::endgroup::' + + echo '::group::Run extended unittests' + export PYTORCH_TEST_WITH_EXTENDED=1 + pytest --junit-xml="${RUNNER_TEST_RESULTS_DIR}/test-results.xml" -v --durations=25 test/test_extended_*.py + echo '::endgroup::' diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml new file mode 100644 index 0000000000000000000000000000000000000000..ac81f3ff155c35d152b64f8ec55e0af5cb23370e --- /dev/null +++ b/.github/workflows/update-viablestrict.yml @@ -0,0 +1,23 @@ +name: Update viable/strict + +on: + pull_request: + paths: + - .github/workflows/update-viablestrict.yml + schedule: + - cron: 10,40 * * * * + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }} + cancel-in-progress: false + +jobs: + do_update_viablestrict: + uses: pytorch/test-infra/.github/workflows/update-viablestrict.yml@release/2.1 + with: + repository: pytorch/vision + required_checks: "Build Linux,Build M1,Build Macos,Build Windows,Tests,CMake,Lint,Docs" + secrets: + ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }} + GITHUB_DEPLOY_KEY : ${{ secrets.VISION_GITHUB_DEPLOY_KEY }} diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4c51a531267f44279aa2ce2226569b213732642d..762ebf6fce0b59e20d113a8b77ae684edddfbaff 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,19 +6,21 @@ repos: - id: check-toml - id: check-yaml exclude: packaging/.* + args: + - --allow-multiple-documents - id: mixed-line-ending args: [--fix=lf] - id: end-of-file-fixer - repo: https://github.com/omnilib/ufmt - rev: v1.3.2 + rev: v1.3.3 hooks: - id: ufmt additional_dependencies: - black == 22.3.0 - usort == 1.0.2 - - repo: https://gitlab.com/pycqa/flake8 + - repo: https://github.com/PyCQA/flake8 rev: 5.0.4 hooks: - id: flake8 diff --git a/CITATION.cff b/CITATION.cff new file mode 100644 index 0000000000000000000000000000000000000000..37db28b2badfdc4fd42ceaeb8aa301780d3b16f9 --- /dev/null +++ b/CITATION.cff @@ -0,0 +1,14 @@ +cff-version: 1.2.0 +title: "TorchVision: PyTorch's Computer Vision library" +message: >- + If you find TorchVision useful in your work, please + consider citing the following BibTeX entry. 
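Not part of the patch — a minimal sketch of reproducing the `unittests-extended` job above locally, assuming `pytest` is installed and the command is run from a torchvision checkout (the CI-only junit report path is dropped here):

```python
# Local stand-in for the "unittests-extended" workflow step above.
# Assumes: run from the repository root, pytest installed, model weights reachable.
import glob
import os
import sys

import pytest

# Same switch the workflow exports before running the extended tests.
os.environ["PYTORCH_TEST_WITH_EXTENDED"] = "1"

# Expand the same test-file pattern the workflow passes to pytest.
extended_tests = glob.glob("test/test_extended_*.py")
sys.exit(pytest.main(["-v", "--durations=25", *extended_tests]))
```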
+type: software +authors: + - given-names: TorchVision maintainers and contributors +url: "https://github.com/pytorch/vision" +license: "BSD-3-Clause" +date-released: "2016-11-06" +journal: "GitHub repository" +publisher: "GitHub" +key: "torchvision2016" diff --git a/CMakeLists.txt b/CMakeLists.txt index 85b878307cf4e89425d258ce6c69675b66cdefcb..0cd485d7a24860e058e6f6024b6e47531759279b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,10 @@ -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.18) project(torchvision) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) file(STRINGS version.txt TORCHVISION_VERSION) option(WITH_CUDA "Enable CUDA support" OFF) +option(WITH_MPS "Enable MPS support" OFF) option(WITH_PNG "Enable features requiring LibPNG." ON) option(WITH_JPEG "Enable features requiring LibJPEG." ON) option(USE_PYTHON "Link to Python when building" OFF) @@ -13,11 +14,11 @@ if(WITH_CUDA) add_definitions(-D__CUDA_NO_HALF_OPERATORS__) add_definitions(-DWITH_CUDA) set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") - # CUDA-11.x can not be compiled using C++14 standard on Windows - string(REGEX MATCH "^[0-9]+" CUDA_MAJOR ${CMAKE_CUDA_COMPILER_VERSION}) - if(${CUDA_MAJOR} GREATER 10 AND MSVC) - set(CMAKE_CXX_STANDARD 17) - endif() +endif() + +if(WITH_MPS) + enable_language(OBJC OBJCXX) + add_definitions(-DWITH_MPS) endif() find_package(Torch REQUIRED) @@ -84,6 +85,9 @@ list(APPEND ALLOW_LISTED ${TVCPP} ${TVCPP}/io/image ${TVCPP}/io/image/cpu ${TVCP if(WITH_CUDA) list(APPEND ALLOW_LISTED ${TVCPP}/ops/cuda ${TVCPP}/ops/autocast) endif() +if(WITH_MPS) + list(APPEND ALLOW_LISTED ${TVCPP}/ops/mps) +endif() FOREACH(DIR ${ALLOW_LISTED}) file(GLOB ALL_SOURCES ${ALL_SOURCES} ${DIR}/*.*) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 3eedb6261a481109f06d567dc86ccc03b7dc133d..b41c0fe8939a81a19c50b5514073d85557753a23 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -4,22 +4,22 @@ We want to make contributing to this project as easy and transparent as possible ## TL;DR -We appreciate all contributions. If you are interested in contributing to Torchvision, there are many ways to help out. +We appreciate all contributions. If you are interested in contributing to Torchvision, there are many ways to help out. Your contributions may fall into the following categories: -- It helps the project if you could +- It helps the project if you could - Report issues you're facing - - Give a :+1: on issues that others reported and that are relevant to you + - Give a :+1: on issues that others reported and that are relevant to you - Answering queries on the issue tracker, investigating bugs are very valuable contributions to the project. -- You would like to improve the documentation. This is no less important than improving the library itself! +- You would like to improve the documentation. This is no less important than improving the library itself! If you find a typo in the documentation, do not hesitate to submit a GitHub pull request. - If you would like to fix a bug - please pick one from the [list of open issues labelled as "help wanted"](https://github.com/pytorch/vision/issues?q=is%3Aopen+is%3Aissue+label%3A%22help+wanted%22) - comment on the issue that you want to work on this issue - - send a PR with your fix, see below. + - send a PR with your fix, see below. - If you plan to contribute new features, utility functions or extensions, please first open an issue and discuss the feature with us. 
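An aside on the `WITH_MPS` option introduced in the CMakeLists.txt hunk above: a small sketch, assuming PyTorch >= 1.12 (where `torch.backends.mps` exists), for checking whether the local PyTorch build can actually use MPS before configuring torchvision with `-DWITH_MPS=ON`:

```python
# Runtime check before building torchvision with -DWITH_MPS=ON.
# Assumes PyTorch >= 1.12, which ships torch.backends.mps.
import torch

mps_built = torch.backends.mps.is_built()          # PyTorch compiled with MPS support?
mps_available = torch.backends.mps.is_available()  # usable MPS device on this machine?
print(f"MPS built: {mps_built}, available: {mps_available}")
```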
@@ -30,30 +30,49 @@ clear and has sufficient instructions to be able to reproduce the issue. ## Development installation -### Install PyTorch Nightly + +### Dependencies + +Start by installing the **nightly** build of PyTorch following the [official +instructions](https://pytorch.org/get-started/locally/). + +**Optionally**, install `libpng` and `libjpeg-turbo` if you want to enable +support for +native encoding / decoding of PNG and JPEG formats in +[torchvision.io](https://pytorch.org/vision/stable/io.html#image): ```bash -conda install pytorch -c pytorch-nightly -# or with pip (see https://pytorch.org/get-started/locally/) -# pip install numpy -# pip install --pre torch -f https://download.pytorch.org/whl/nightly/cu102/torch_nightly.html +conda install libpng libjpeg-turbo -c pytorch ``` -### Install Torchvision +Note: you can use the `TORCHVISION_INCLUDE` and `TORCHVISION_LIBRARY` +environment variables to tell the build system where to find those libraries if +they are in specific locations. Take a look at +[setup.py](https://github.com/pytorch/vision/blob/main/setup.py) for more +details. + +### Clone and install torchvision ```bash git clone https://github.com/pytorch/vision.git cd vision -python setup.py develop +python setup.py develop # use install instead of develop if you don't care about development. # or, for OSX # MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py develop -# for C++ debugging, please use DEBUG=1 +# for C++ debugging, use DEBUG=1 # DEBUG=1 python setup.py develop -pip install flake8 typing mypy pytest pytest-mock scipy ``` -You may also have to install `libpng-dev` and `libjpeg-turbo8-dev` libraries: -```bash -conda install libpng jpeg + +By default, GPU support is built if CUDA is found and `torch.cuda.is_available()` is true. It's possible to force +building GPU support by setting `FORCE_CUDA=1` environment variable, which is useful when building a docker image. + +We don't officially support building from source using `pip`, but _if_ you do, you'll need to use the +`--no-build-isolation` flag. + +Other development dependencies include: + +``` +pip install flake8 typing mypy pytest pytest-mock scipy ``` ## Development Process @@ -66,12 +85,12 @@ If you plan to modify the code or documentation, please follow the steps below: 4. Ensure the test suite passes. 5. Make sure your code passes the formatting checks (see below). -For more details about pull requests, -please read [GitHub's guides](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). +For more details about pull requests, +please read [GitHub's guides](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). -If you would like to contribute a new model, please see [here](#New-model). +If you would like to contribute a new model, please see [here](#New-architecture-or-improved-model-weights). -If you would like to contribute a new dataset, please see [here](#New-dataset). +If you would like to contribute a new dataset, please see [here](#New-dataset). ### Code formatting and typing @@ -83,7 +102,7 @@ Instead of relying directly on `black` however, we rely on [ufmt](https://github.com/omnilib/ufmt), for compatibility reasons with Facebook internal infrastructure. 
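Not part of the diff — a sketch of how the `FORCE_CUDA`, `TORCHVISION_INCLUDE` and `TORCHVISION_LIBRARY` variables mentioned in the development-installation section above can be combined when driving the editable build; the library prefixes are hypothetical placeholders:

```python
# Hypothetical driver for the editable build described above.
# The /opt/libjpeg-turbo paths are placeholders, not real defaults.
import os
import subprocess

env = dict(os.environ)
env["FORCE_CUDA"] = "1"                                    # build CUDA ops even without a visible GPU
env["TORCHVISION_INCLUDE"] = "/opt/libjpeg-turbo/include"  # where libjpeg/libpng headers live
env["TORCHVISION_LIBRARY"] = "/opt/libjpeg-turbo/lib"      # where the shared libraries live

subprocess.run(["python", "setup.py", "develop"], check=True, env=env)
```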
-To format your code, install `ufmt` with `pip install ufmt==1.3.2 black==22.3.0 usort==1.0.2` and use e.g.: +To format your code, install `ufmt` with `pip install ufmt==1.3.3 black==22.3.0 usort==1.0.2` and use e.g.: ```bash ufmt format torchvision @@ -126,8 +145,8 @@ mypy --config-file mypy.ini ### Unit tests -If you have modified the code by adding a new feature or a bug-fix, please add unit tests for that. To run a specific -test: +If you have modified the code by adding a new feature or a bug-fix, please add unit tests for that. To run a specific +test: ```bash pytest test/ -vvv -k # e.g. pytest test/test_transforms.py -vvv -k test_center_crop @@ -136,7 +155,7 @@ pytest test/ -vvv -k If you would like to run all tests: ```bash pytest test -vvv -``` +``` Tests that require internet access should be in `test/test_internet.py`. @@ -189,18 +208,18 @@ with "transforms" in their name. ### New architecture or improved model weights Please refer to the guidelines in [Contributing to Torchvision - Models](https://github.com/pytorch/vision/blob/main/CONTRIBUTING_MODELS.md). - + ### New dataset -More details on how to add a new dataset will be provided later. Please, do not send any PR with a new dataset without discussing +Please, do not send any PR with a new dataset without discussing it in an issue as, most likely, it will not be accepted. ### Pull Request -If all previous checks (flake8, mypy, unit tests) are passing, please send a PR. Submitted PR will pass other tests on -different operation systems, python versions and hardwares. +If all previous checks (flake8, mypy, unit tests) are passing, please send a PR. Submitted PR will pass other tests on +different operating systems, python versions and hardware. -For more details about pull requests workflow, +For more details about pull requests workflow, please read [GitHub's guides](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/creating-a-pull-request). ## License diff --git a/CONTRIBUTING_MODELS.md b/CONTRIBUTING_MODELS.md index 82845e6579aaddc4aca10f36bf95a3862811be64..390a25a0f8985767e8a9e39c43f6ad372befd1ca 100644 --- a/CONTRIBUTING_MODELS.md +++ b/CONTRIBUTING_MODELS.md @@ -20,13 +20,13 @@ So, before starting any work and submitting a PR there are a few critical things ### 1. Preparation work -- Start by looking into this [issue](https://github.com/pytorch/vision/issues/2707) in order to have an idea of the models that are being considered, express your willingness to add a new model and discuss with the community whether or not this model should be included in TorchVision. It is very important at this stage to make sure that there is an agreement on the value of having this model in TorchVision and there is no one else already working on it. +- Start by looking into this [issue](https://github.com/pytorch/vision/issues/2707) in order to have an idea of the models that are being considered, express your willingness to add a new model and discuss with the community whether this model should be included in TorchVision. It is very important at this stage to make sure that there is an agreement on the value of having this model in TorchVision and there is no one else already working on it. - If the decision is to include the new model, then please create a new ticket which will be used for all design and implementation discussions prior to the PR. 
One of the TorchVision maintainers will reach out at this stage and this will be your POC from this point onwards in order to provide support, guidance and regular feedback. ### 2. Implement the model -Please take a look at existing models in TorchVision to get familiar with the idioms. Also please look at recent contributions for new models. If in doubt about any design decisions you can ask for feedback on the issue created in step 1. Example of things to take into account: +Please take a look at existing models in TorchVision to get familiar with the idioms. Also, please look at recent contributions for new models. If in doubt about any design decisions you can ask for feedback on the issue created in step 1. Example of things to take into account: - The implementation should be as close as possible to the canonical implementation/paper - The PR must include the code implementation, documentation and tests @@ -34,7 +34,7 @@ Please take a look at existing models in TorchVision to get familiar with the id - The weights need to reproduce closely the results of the paper in terms of accuracy, even though the final weights to be deployed will be those trained by the TorchVision maintainers - The PR description should include commands/configuration used to train the model, so that the TorchVision maintainers can easily run them to verify the implementation and generate the final model to be released - Make sure we re-use existing components as much as possible (inheritance) -- New primitives (transforms, losses, etc) can be added if necessary, but the final location will be determined after discussion with the dedicated maintainer +- New primitives (transforms, losses, etc.) can be added if necessary, but the final location will be determined after discussion with the dedicated maintainer - Please take a look at the detailed [implementation and documentation guidelines](https://github.com/pytorch/vision/issues/5319) for a fine grain list of things not to be missed ### 3. Train the model with reference scripts diff --git a/MANIFEST.in b/MANIFEST.in index 75f238c0a2c97812ebe5fdf3e2b43667c7c7f6af..9e45188df355dac6e7e8e3657cd48959f8a2d968 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,4 @@ -include README.rst +include README.md include LICENSE recursive-exclude * __pycache__ diff --git a/README.md b/README.md index 981a96322b901cf9ab213a120d9e456924af18a2..b7df756a3578435295f6f3a039da7ba2403d090d 100644 --- a/README.md +++ b/README.md @@ -47,7 +47,7 @@ pip3 install torchvision* ##### 编译安装 ```bash -git clone -b dtk-23.10-v0.14.1 http://developer.hpccube.com/codes/aicomponent/vision.git +git clone -b dtk-23.10-v0.16.0 http://developer.hpccube.com/codes/aicomponent/vision.git cd vision PYTORCH_ROCM_ARCH="gfx906;gfx926" python3 setup.py bdist_wheel pip3 install dist/vision* diff --git a/README_ORIGIN.md b/README_ORIGIN.md new file mode 100644 index 0000000000000000000000000000000000000000..373b6b79548f524171985fcdd0cdea906dc1f78a --- /dev/null +++ b/README_ORIGIN.md @@ -0,0 +1,150 @@ +# torchvision + +[![total torchvision downloads](https://pepy.tech/badge/torchvision)](https://pepy.tech/project/torchvision) +[![documentation](https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Ftorchvision%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v)](https://pytorch.org/vision/stable/index.html) + +The torchvision package consists of popular datasets, model architectures, and common image transformations for computer +vision. 
+ +## Installation + +Please refer to the [official +instructions](https://pytorch.org/get-started/locally/) to install the stable +versions of `torch` and `torchvision` on your system. + +To build source, refer to our [contributing +page](https://github.com/pytorch/vision/blob/main/CONTRIBUTING.md#development-installation). + +The following is the corresponding `torchvision` versions and supported Python +versions. + +| `torch` | `torchvision` | Python | +| ------------------ | ------------------ | ------------------- | +| `main` / `nightly` | `main` / `nightly` | `>=3.8`, `<=3.11` | +| `2.1` | `0.16` | `>=3.8`, `<=3.11` | +| `2.0` | `0.15` | `>=3.8`, `<=3.11` | +| `1.13` | `0.14` | `>=3.7.2`, `<=3.10` | + +
+ older versions + +| `torch` | `torchvision` | Python | +|---------|-------------------|---------------------------| +| `1.12` | `0.13` | `>=3.7`, `<=3.10` | +| `1.11` | `0.12` | `>=3.7`, `<=3.10` | +| `1.10` | `0.11` | `>=3.6`, `<=3.9` | +| `1.9` | `0.10` | `>=3.6`, `<=3.9` | +| `1.8` | `0.9` | `>=3.6`, `<=3.9` | +| `1.7` | `0.8` | `>=3.6`, `<=3.9` | +| `1.6` | `0.7` | `>=3.6`, `<=3.8` | +| `1.5` | `0.6` | `>=3.5`, `<=3.8` | +| `1.4` | `0.5` | `==2.7`, `>=3.5`, `<=3.8` | +| `1.3` | `0.4.2` / `0.4.3` | `==2.7`, `>=3.5`, `<=3.7` | +| `1.2` | `0.4.1` | `==2.7`, `>=3.5`, `<=3.7` | +| `1.1` | `0.3` | `==2.7`, `>=3.5`, `<=3.7` | +| `<=1.0` | `0.2` | `==2.7`, `>=3.5`, `<=3.7` | + +
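As a quick companion to the compatibility tables above, a minimal sketch for printing the locally installed `torch` / `torchvision` pair in the same major.minor form the table uses:

```python
# Print the installed torch / torchvision pair to compare against the table above.
import torch
import torchvision


def major_minor(version: str) -> str:
    # "2.1.0+cu118" -> "2.1"
    return ".".join(version.split("+")[0].split(".")[:2])


print(f"torch {major_minor(torch.__version__)} / torchvision {major_minor(torchvision.__version__)}")
```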
+ +## Image Backends + +Torchvision currently supports the following image backends: + +- torch tensors +- PIL images: + - [Pillow](https://python-pillow.org/) + - [Pillow-SIMD](https://github.com/uploadcare/pillow-simd) - a **much faster** drop-in replacement for Pillow with SIMD. + +Read more in our [docs](https://pytorch.org/vision/stable/transforms.html). + +## [UNSTABLE] Video Backend + +Torchvision currently supports the following video backends: + +- [pyav](https://github.com/PyAV-Org/PyAV) (default) - Pythonic binding for ffmpeg libraries. +- video_reader - This needs ffmpeg to be installed and torchvision to be built from source. There shouldn't be any + conflicting version of ffmpeg installed. Currently, this is only supported on Linux. + +``` +conda install -c conda-forge ffmpeg +python setup.py install +``` + +# Using the models on C++ + +TorchVision provides an example project for how to use the models on C++ using JIT Script. + +Installation From source: + +``` +mkdir build +cd build +# Add -DWITH_CUDA=on support for the CUDA if needed +cmake .. +make +make install +``` + +Once installed, the library can be accessed in cmake (after properly configuring `CMAKE_PREFIX_PATH`) via the +`TorchVision::TorchVision` target: + +``` +find_package(TorchVision REQUIRED) +target_link_libraries(my-target PUBLIC TorchVision::TorchVision) +``` + +The `TorchVision` package will also automatically look for the `Torch` package and add it as a dependency to +`my-target`, so make sure that it is also available to cmake via the `CMAKE_PREFIX_PATH`. + +For an example setup, take a look at `examples/cpp/hello_world`. + +Python linking is disabled by default when compiling TorchVision with CMake, this allows you to run models without any +Python dependency. In some special cases where TorchVision's operators are used from Python code, you may need to link +to Python. This can be done by passing `-DUSE_PYTHON=on` to CMake. + +### TorchVision Operators + +In order to get the torchvision operators registered with torch (eg. for the JIT), all you need to do is to ensure that +you `#include <torchvision/vision.h>` in your project. + +## Documentation + +You can find the API documentation on the pytorch website: https://pytorch.org/vision/stable/index.html + +## Contributing + +See the [CONTRIBUTING](CONTRIBUTING.md) file for how to help out. + +## Disclaimer on Datasets + +This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, +vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to +determine whether you have permission to use the dataset under the dataset's license. + +If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset +to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML +community! + +## Pre-trained Model License + +The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the +dataset used for training. It is your responsibility to determine whether you have permission to use the models for your +use case. + +More specifically, SWAG models are released under the CC-BY-NC 4.0 license. See +[SWAG LICENSE](https://github.com/facebookresearch/SWAG/blob/main/LICENSE) for additional details.
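To complement the image/video backend lists above, a short sketch of the existing backend query/switch helpers; the non-default choices only work when the corresponding backend is installed (accimage) or built in (video_reader):

```python
# Query and switch torchvision's image / video backends at runtime.
import torchvision

print(torchvision.get_image_backend())  # "PIL" unless another backend was selected
print(torchvision.get_video_backend())  # "pyav" by default

# Only valid if the optional backends are actually available:
# torchvision.set_image_backend("accimage")      # needs the accimage package
# torchvision.set_video_backend("video_reader")  # needs a source build with ffmpeg
```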
+ +## Citing TorchVision + +If you find TorchVision useful in your work, please consider citing the following BibTeX entry: + +```bibtex +@software{torchvision2016, + title = {TorchVision: PyTorch's Computer Vision library}, + author = {TorchVision maintainers and contributors}, + year = 2016, + journal = {GitHub repository}, + publisher = {GitHub}, + howpublished = {\url{https://github.com/pytorch/vision}} +} +``` diff --git a/README_ORIGIN.rst b/README_ORIGIN.rst deleted file mode 100644 index c3605cc3c9507373466ed73081061e2276320dd0..0000000000000000000000000000000000000000 --- a/README_ORIGIN.rst +++ /dev/null @@ -1,198 +0,0 @@ -torchvision -=========== - -.. image:: https://pepy.tech/badge/torchvision - :target: https://pepy.tech/project/torchvision - -.. image:: https://img.shields.io/badge/dynamic/json.svg?label=docs&url=https%3A%2F%2Fpypi.org%2Fpypi%2Ftorchvision%2Fjson&query=%24.info.version&colorB=brightgreen&prefix=v - :target: https://pytorch.org/vision/stable/index.html - - -The torchvision package consists of popular datasets, model architectures, and common image transformations for computer vision. - - -Installation -============ - -We recommend Anaconda as Python package management system. Please refer to `pytorch.org `_ -for the detail of PyTorch (``torch``) installation. The following is the corresponding ``torchvision`` versions and -supported Python versions. - -+--------------------------+--------------------------+---------------------------------+ -| ``torch`` | ``torchvision`` | ``python`` | -+==========================+==========================+=================================+ -| ``main`` / ``nightly`` | ``main`` / ``nightly`` | ``>=3.7``, ``<=3.10`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.12.0`` | ``0.13.0`` | ``>=3.7``, ``<=3.10`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.11.0`` | ``0.12.0`` | ``>=3.7``, ``<=3.10`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.10.2`` | ``0.11.3`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.10.1`` | ``0.11.2`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.10.0`` | ``0.11.1`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.9.1`` | ``0.10.1`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.9.0`` | ``0.10.0`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.8.2`` | ``0.9.2`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.8.1`` | ``0.9.1`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.8.0`` | ``0.9.0`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.7.1`` | ``0.8.2`` | ``>=3.6``, ``<=3.9`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.7.0`` | ``0.8.1`` | ``>=3.6``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.7.0`` | ``0.8.0`` | ``>=3.6``, ``<=3.8`` 
| -+--------------------------+--------------------------+---------------------------------+ -| ``1.6.0`` | ``0.7.0`` | ``>=3.6``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.5.1`` | ``0.6.1`` | ``>=3.5``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.5.0`` | ``0.6.0`` | ``>=3.5``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.4.0`` | ``0.5.0`` | ``==2.7``, ``>=3.5``, ``<=3.8`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.3.1`` | ``0.4.2`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.3.0`` | ``0.4.1`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.2.0`` | ``0.4.0`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``1.1.0`` | ``0.3.0`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ -| ``<=1.0.1`` | ``0.2.2`` | ``==2.7``, ``>=3.5``, ``<=3.7`` | -+--------------------------+--------------------------+---------------------------------+ - -Anaconda: - -.. code:: bash - - conda install torchvision -c pytorch - -pip: - -.. code:: bash - - pip install torchvision - -From source: - -.. code:: bash - - python setup.py install - # or, for OSX - # MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ python setup.py install - - -We don't officially support building from source using ``pip``, but *if* you do, -you'll need to use the ``--no-build-isolation`` flag. -In case building TorchVision from source fails, install the nightly version of PyTorch following -the linked guide on the `contributing page `_ and retry the install. - -By default, GPU support is built if CUDA is found and ``torch.cuda.is_available()`` is true. -It's possible to force building GPU support by setting ``FORCE_CUDA=1`` environment variable, -which is useful when building a docker image. - -Image Backend -============= -Torchvision currently supports the following image backends: - -* `Pillow`_ (default) - -* `Pillow-SIMD`_ - a **much faster** drop-in replacement for Pillow with SIMD. If installed will be used as the default. - -* `accimage`_ - if installed can be activated by calling :code:`torchvision.set_image_backend('accimage')` - -* `libpng`_ - can be installed via conda :code:`conda install libpng` or any of the package managers for debian-based and RHEL-based Linux distributions. - -* `libjpeg`_ - can be installed via conda :code:`conda install jpeg` or any of the package managers for debian-based and RHEL-based Linux distributions. `libjpeg-turbo`_ can be used as well. - -**Notes:** ``libpng`` and ``libjpeg`` must be available at compilation time in order to be available. Make sure that it is available on the standard library locations, -otherwise, add the include and library paths in the environment variables ``TORCHVISION_INCLUDE`` and ``TORCHVISION_LIBRARY``, respectively. - -.. _libpng : http://www.libpng.org/pub/png/libpng.html -.. _Pillow : https://python-pillow.org/ -.. _Pillow-SIMD : https://github.com/uploadcare/pillow-simd -.. _accimage: https://github.com/pytorch/accimage -.. _libjpeg: http://ijg.org/ -.. 
_libjpeg-turbo: https://libjpeg-turbo.org/ - -Video Backend -============= -Torchvision currently supports the following video backends: - -* `pyav`_ (default) - Pythonic binding for ffmpeg libraries. - -.. _pyav : https://github.com/PyAV-Org/PyAV - -* video_reader - This needs ffmpeg to be installed and torchvision to be built from source. There shouldn't be any conflicting version of ffmpeg installed. Currently, this is only supported on Linux. - -.. code:: bash - - conda install -c conda-forge ffmpeg - python setup.py install - - -Using the models on C++ -======================= -TorchVision provides an example project for how to use the models on C++ using JIT Script. - -Installation From source: - -.. code:: bash - - mkdir build - cd build - # Add -DWITH_CUDA=on support for the CUDA if needed - cmake .. - make - make install - -Once installed, the library can be accessed in cmake (after properly configuring ``CMAKE_PREFIX_PATH``) via the :code:`TorchVision::TorchVision` target: - -.. code:: rest - - find_package(TorchVision REQUIRED) - target_link_libraries(my-target PUBLIC TorchVision::TorchVision) - -The ``TorchVision`` package will also automatically look for the ``Torch`` package and add it as a dependency to ``my-target``, -so make sure that it is also available to cmake via the ``CMAKE_PREFIX_PATH``. - -For an example setup, take a look at ``examples/cpp/hello_world``. - -Python linking is disabled by default when compiling TorchVision with CMake, this allows you to run models without any Python -dependency. In some special cases where TorchVision's operators are used from Python code, you may need to link to Python. This -can be done by passing ``-DUSE_PYTHON=on`` to CMake. - -TorchVision Operators ---------------------- -In order to get the torchvision operators registered with torch (eg. for the JIT), all you need to do is to ensure that you -:code:`#include ` in your project. - -Documentation -============= -You can find the API documentation on the pytorch website: https://pytorch.org/vision/stable/index.html - -Contributing -============ - -See the `CONTRIBUTING `_ file for how to help out. - -Disclaimer on Datasets -====================== - -This is a utility library that downloads and prepares public datasets. We do not host or distribute these datasets, vouch for their quality or fairness, or claim that you have license to use the dataset. It is your responsibility to determine whether you have permission to use the dataset under the dataset's license. - -If you're a dataset owner and wish to update any part of it (description, citation, etc.), or do not want your dataset to be included in this library, please get in touch through a GitHub issue. Thanks for your contribution to the ML community! - -Pre-trained Model License -========================= - -The pre-trained models provided in this library may have their own licenses or terms and conditions derived from the dataset used for training. It is your responsibility to determine whether you have permission to use the models for your use case. - -More specifically, SWAG models are released under the CC-BY-NC 4.0 license. See `SWAG LICENSE `_ for additional details. diff --git a/android/README.md b/android/README.md new file mode 100644 index 0000000000000000000000000000000000000000..788c83f26de72593717e97af749ccadb77daab5f --- /dev/null +++ b/android/README.md @@ -0,0 +1,3 @@ +## Status + +The Android demo of TorchVision is currently unmaintained, untested and likely out-of-date. 
diff --git a/android/build.gradle b/android/build.gradle index f28ba9112ff5709472fdbe511a0c53ecd1cd6c2d..f7995a07f5b619fd777ee5c52ce757115f2bf069 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -14,7 +14,7 @@ allprojects { androidSupportAppCompatV7Version = "28.0.0" fbjniJavaOnlyVersion = "0.0.3" - soLoaderNativeLoaderVersion = "0.10.4" + soLoaderNativeLoaderVersion = "0.10.5" pytorchAndroidVersion = "1.12" } diff --git a/android/gradle.properties b/android/gradle.properties index 1b6b275f63f64e360ea4dd1c688340e35e90345c..8204b73b05197d56e927a3bbdd7051e70db10fda 100644 --- a/android/gradle.properties +++ b/android/gradle.properties @@ -1,6 +1,6 @@ ABI_FILTERS=armeabi-v7a,arm64-v8a,x86,x86_64 -VERSION_NAME=0.14.0-SNAPSHOT +VERSION_NAME=0.15.0-SNAPSHOT GROUP=org.pytorch MAVEN_GROUP=org.pytorch SONATYPE_STAGING_PROFILE=orgpytorch diff --git a/android/ops/CMakeLists.txt b/android/ops/CMakeLists.txt index ad42adbfa71e63fc73d2de0e6dcae4d333cbca37..fb8d4348e8ea77948a8e8acc54ac5ede0ba53760 100644 --- a/android/ops/CMakeLists.txt +++ b/android/ops/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.4.1) set(TARGET torchvision_ops) project(${TARGET} CXX) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) string(APPEND CMAKE_CXX_FLAGS " -DMOBILE") diff --git a/cmake/iOS.cmake b/cmake/iOS.cmake index d42ea4c9232c171312fdff20d42733d9ef379de1..935c57f11b9268504f2769d56eeffdba02a44b5f 100644 --- a/cmake/iOS.cmake +++ b/cmake/iOS.cmake @@ -10,11 +10,11 @@ # SIMULATOR - used to build for the Simulator platforms, which have an x86 arch. # # CMAKE_IOS_DEVELOPER_ROOT = automatic(default) or /path/to/platform/Developer folder -# By default this location is automatcially chosen based on the IOS_PLATFORM value above. +# By default this location is automatically chosen based on the IOS_PLATFORM value above. # If set manually, it will override the default location and force the user of a particular Developer Platform # # CMAKE_IOS_SDK_ROOT = automatic(default) or /path/to/platform/Developer/SDKs/SDK folder -# By default this location is automatcially chosen based on the CMAKE_IOS_DEVELOPER_ROOT value. +# By default this location is automatically chosen based on the CMAKE_IOS_DEVELOPER_ROOT value. # In this case it will always be the most up-to-date SDK found in the CMAKE_IOS_DEVELOPER_ROOT path. 
# If set manually, this will force the use of a specific SDK version @@ -100,7 +100,7 @@ if(IOS_DEPLOYMENT_TARGET) set(XCODE_IOS_PLATFORM_VERSION_FLAGS "-m${XCODE_IOS_PLATFORM}-version-min=${IOS_DEPLOYMENT_TARGET}") endif() -# Hidden visibilty is required for cxx on iOS +# Hidden visibility is required for cxx on iOS set(CMAKE_C_FLAGS_INIT "${XCODE_IOS_PLATFORM_VERSION_FLAGS}") set(CMAKE_CXX_FLAGS_INIT "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -fvisibility-inlines-hidden") diff --git a/docs/Makefile b/docs/Makefile index 389a07a604e29769030bbd2e3df0d9252686487f..f462ff223032e8b44ff3f6a1429f164777596dd5 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -33,6 +33,7 @@ clean: rm -rf $(SOURCEDIR)/auto_examples/ # sphinx-gallery rm -rf $(SOURCEDIR)/gen_modules/ # sphinx-gallery rm -rf $(SOURCEDIR)/generated/ # autosummary + rm -rf $(SOURCEDIR)/models/generated # autosummary .PHONY: help Makefile docset diff --git a/docs/requirements.txt b/docs/requirements.txt index 09a11359ae758854b303403738188b8e3e934336..2a50d9b8f45c672a59ebd81a430d8674682eb498 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -5,3 +5,4 @@ sphinx-gallery>=0.11.1 sphinx==5.0.0 tabulate -e git+https://github.com/pytorch/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +pycocotools diff --git a/docs/source/beta_status.py b/docs/source/beta_status.py index 925894df5c5787ef453e86e2eb7ce326735a3b8e..cc79ca8972f3019f615b47560a5c02d25ea0e160 100644 --- a/docs/source/beta_status.py +++ b/docs/source/beta_status.py @@ -4,15 +4,26 @@ from docutils.parsers.rst import Directive class BetaStatus(Directive): has_content = True + text = "The {api_name} is in Beta stage, and backward compatibility is not guaranteed." + node = nodes.warning def run(self): - api_name = " ".join(self.content) - text = f"The {api_name} is in Beta stage, and backward compatibility is not guaranteed." - return [nodes.warning("", nodes.paragraph("", "", nodes.Text(text)))] + text = self.text.format(api_name=" ".join(self.content)) + return [self.node("", nodes.paragraph("", "", nodes.Text(text)))] + + +class V2BetaStatus(BetaStatus): + text = ( + "The {api_name} is in Beta stage, and while we do not expect disruptive breaking changes, " + "some APIs may slightly change according to user feedback. Please submit any feedback you may have " + "in this issue: https://github.com/pytorch/vision/issues/6753." + ) + node = nodes.note def setup(app): app.add_directive("betastatus", BetaStatus) + app.add_directive("v2betastatus", V2BetaStatus) return { "version": "0.1", "parallel_read_safe": True, diff --git a/docs/source/conf.py b/docs/source/conf.py index 231d3cad416dcd8189121ee94fc931cb9f135816..cd3a28658cbd721d3135aa3971dd4654fd278891 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -29,6 +29,7 @@ from pathlib import Path import pytorch_sphinx_theme import torchvision import torchvision.models as M +from sphinx_gallery.sorting import ExplicitOrder from tabulate import tabulate sys.path.append(os.path.abspath(".")) @@ -55,11 +56,65 @@ extensions = [ "beta_status", ] +# We override sphinx-gallery's example header to prevent sphinx-gallery from +# creating a note at the top of the renderred notebook. +# https://github.com/sphinx-gallery/sphinx-gallery/blob/451ccba1007cc523f39cbcc960ebc21ca39f7b75/sphinx_gallery/gen_rst.py#L1267-L1271 +# This is because we also want to add a link to google collab, so we write our own note in each example. +from sphinx_gallery import gen_rst + +gen_rst.EXAMPLE_HEADER = """ +.. DO NOT EDIT. +.. 
THIS FILE WAS AUTOMATICALLY GENERATED BY SPHINX-GALLERY. +.. TO MAKE CHANGES, EDIT THE SOURCE PYTHON FILE: +.. "{0}" +.. LINE NUMBERS ARE GIVEN BELOW. + +.. rst-class:: sphx-glr-example-title + +.. _sphx_glr_{1}: + +""" + + +class CustomGalleryExampleSortKey: + # See https://sphinx-gallery.github.io/stable/configuration.html#sorting-gallery-examples + # and https://github.com/sphinx-gallery/sphinx-gallery/blob/master/sphinx_gallery/sorting.py + def __init__(self, src_dir): + self.src_dir = src_dir + + transforms_subsection_order = [ + "plot_transforms_getting_started.py", + "plot_transforms_illustrations.py", + "plot_transforms_e2e.py", + "plot_cutmix_mixup.py", + "plot_custom_transforms.py", + "plot_tv_tensors.py", + "plot_custom_tv_tensors.py", + ] + + def __call__(self, filename): + if "gallery/transforms" in self.src_dir: + try: + return self.transforms_subsection_order.index(filename) + except ValueError as e: + raise ValueError( + "Looks like you added an example in gallery/transforms? " + "You need to specify its order in docs/source/conf.py. Look for CustomGalleryExampleSortKey." + ) from e + else: + # For other subsections we just sort alphabetically by filename + return filename + + sphinx_gallery_conf = { "examples_dirs": "../../gallery/", # path to your example scripts "gallery_dirs": "auto_examples", # path to where to save gallery generated output + "subsection_order": ExplicitOrder(["../../gallery/transforms", "../../gallery/others"]), "backreferences_dir": "gen_modules/backreferences", "doc_module": ("torchvision",), + "remove_config_comments": True, + "ignore_pattern": "helpers.py", + "within_subsection_order": CustomGalleryExampleSortKey, } napoleon_use_ivar = True @@ -88,17 +143,15 @@ author = "Torch Contributors" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. -# -# The short X.Y version. -version = "main (" + torchvision.__version__ + " )" -# The full version, including alpha/beta/rc tags. -release = "main" -VERSION = os.environ.get("VERSION", None) -if VERSION: +# version: The short X.Y version. +# release: The full version, including alpha/beta/rc tags. +if os.environ.get("TORCHVISION_SANITIZE_VERSION_STR_IN_DOCS", None): # Turn 1.11.0aHASH into 1.11 (major.minor only) - version = ".".join(version.split(".")[:2]) + version = release = ".".join(torchvision.__version__.split(".")[:2]) html_title = " ".join((project, version, "documentation")) - release = version +else: + version = f"main ({torchvision.__version__})" + release = "main" # The language for content autogenerated by Sphinx. Refer to documentation @@ -138,7 +191,7 @@ html_theme_options = { "logo_only": True, "pytorch_project": "docs", "navigation_with_keys": True, - "analytics_id": "UA-117752657-2", + "analytics_id": "GTM-T8XT4PS", } html_logo = "_static/img/pytorch-logo-dark.svg" @@ -318,7 +371,7 @@ def inject_weight_metadata(app, what, name, obj, options, lines): used within the autoclass directive. """ - if obj.__name__.endswith(("_Weights", "_QuantizedWeights")): + if getattr(obj, ".__name__", "").endswith(("_Weights", "_QuantizedWeights")): if len(obj) == 0: lines[:] = ["There are no available pre-trained weights."] @@ -331,7 +384,7 @@ def inject_weight_metadata(app, what, name, obj, options, lines): ] if obj.__doc__ != "An enumeration.": - # We only show the custom enum doc if it was overriden. 
The default one from Python is "An enumeration" + # We only show the custom enum doc if it was overridden. The default one from Python is "An enumeration" lines.append("") lines.append(obj.__doc__) @@ -362,6 +415,13 @@ def inject_weight_metadata(app, what, name, obj, options, lines): max_visible = 3 v_sample = ", ".join(v[:max_visible]) v = f"{v_sample}, ... ({len(v)-max_visible} omitted)" if len(v) > max_visible else v_sample + elif k == "_ops": + v = f"{v:.2f}" + k = "GIPS" if obj.__name__.endswith("_QuantizedWeights") else "GFLOPS" + elif k == "_file_size": + k = "File size" + v = f"{v:.1f} MB" + table.append((str(k), str(v))) table = tabulate(table, tablefmt="rst") lines += [".. rst-class:: table-weights"] # Custom CSS class, see custom_torchvision.css @@ -385,19 +445,27 @@ def generate_weights_table(module, table_name, metrics, dataset, include_pattern if exclude_patterns is not None: weights = [w for w in weights if all(p not in str(w) for p in exclude_patterns)] + ops_name = "GIPS" if "QuantizedWeights" in weights_endswith else "GFLOPS" + metrics_keys, metrics_names = zip(*metrics) - column_names = ["Weight"] + list(metrics_names) + ["Params", "Recipe"] + column_names = ["Weight"] + list(metrics_names) + ["Params"] + [ops_name, "Recipe"] # Final column order column_names = [f"**{name}**" for name in column_names] # Add bold - content = [ - ( + content = [] + for w in weights: + row = [ f":class:`{w} <{type(w).__name__}>`", *(w.meta["_metrics"][dataset][metric] for metric in metrics_keys), f"{w.meta['num_params']/1e6:.1f}M", + f"{w.meta['_ops']:.2f}", f"`link <{w.meta['recipe']}>`__", - ) - for w in weights - ] + ] + + content.append(row) + + column_widths = ["110"] + ["18"] * len(metrics_names) + ["18"] * 2 + ["10"] + widths_table = " ".join(column_widths) + table = tabulate(content, headers=column_names, tablefmt="rst") generated_dir = Path("generated") @@ -405,7 +473,7 @@ def generate_weights_table(module, table_name, metrics, dataset, include_pattern with open(generated_dir / f"{table_name}_table.rst", "w+") as table_file: table_file.write(".. rst-class:: table-weights\n") # Custom CSS class, see custom_torchvision.css table_file.write(".. table::\n") - table_file.write(f" :widths: 100 {'20 ' * len(metrics_names)} 20 10\n\n") + table_file.write(f" :widths: {widths_table} \n\n") table_file.write(f"{textwrap.indent(table, ' ' * 4)}\n\n") diff --git a/docs/source/datasets.rst b/docs/source/datasets.rst index f3f0b466d622be175cb9efc1688ca621bfc01d1b..588c1f781edbe8c5ef63d75bd20178e27266b9c3 100644 --- a/docs/source/datasets.rst +++ b/docs/source/datasets.rst @@ -1,3 +1,5 @@ +.. _datasets: + Datasets ======== @@ -80,7 +82,6 @@ Image detection or segmentation CocoDetection CelebA Cityscapes - GTSRB Kitti OxfordIIITPet SBDataset @@ -149,6 +150,14 @@ Video classification Kinetics UCF101 +Video prediction +~~~~~~~~~~~~~~~~~~~~ + +.. autosummary:: + :toctree: generated/ + :template: class_dataset.rst + + MovingMNIST .. _base_classes_datasets: @@ -162,3 +171,12 @@ Base classes for custom datasets DatasetFolder ImageFolder VisionDataset + +Transforms v2 +------------- + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + wrap_dataset_for_transforms_v2 diff --git a/docs/source/index.rst b/docs/source/index.rst index 79dbebdd047d626eb8ec10347b6a226cbf564a08..dc5fdefaefb032ea7db7eb0e478d23bb7a7e37d8 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -32,6 +32,7 @@ architectures, and common image transformations for computer vision. 
:caption: Package Reference transforms + tv_tensors models datasets utils diff --git a/docs/source/io.rst b/docs/source/io.rst index 258a1ee16dcaccd582543bd1fe2ae70aeed55fbf..1da9bb6882a9fbc4b91b9ce787de42164c825b7d 100644 --- a/docs/source/io.rst +++ b/docs/source/io.rst @@ -1,11 +1,37 @@ -Reading/Writing images and videos -================================= +Decoding / Encoding images and videos +===================================== .. currentmodule:: torchvision.io The :mod:`torchvision.io` package provides functions for performing IO -operations. They are currently specific to reading and writing video and -images. +operations. They are currently specific to reading and writing images and +videos. + +Images +------ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + read_image + decode_image + encode_jpeg + decode_jpeg + write_jpeg + encode_png + decode_png + write_png + read_file + write_file + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + ImageReadMode + + Video ----- @@ -20,7 +46,7 @@ Video Fine-grained video API ----------------------- +^^^^^^^^^^^^^^^^^^^^^^ In addition to the :mod:`read_video` function, we provide a high-performance lower-level API for more fine-grained control compared to the :mod:`read_video` function. @@ -61,28 +87,3 @@ Example of inspecting a video: # the constructor we select a default video stream, but # in practice, we can set whichever stream we would like video.set_current_stream("video:0") - - -Image ------ - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - ImageReadMode - -.. autosummary:: - :toctree: generated/ - :template: function.rst - - read_image - decode_image - encode_jpeg - decode_jpeg - write_jpeg - encode_png - decode_png - write_png - read_file - write_file diff --git a/docs/source/models.rst b/docs/source/models.rst index 10618434f9bf190255a89a3902a55c80272b9f00..155407786025401414e87c1f58e096e470c6807d 100644 --- a/docs/source/models.rst +++ b/docs/source/models.rst @@ -120,13 +120,12 @@ behavior, such as batch normalization. To switch between these modes, use # Set model to eval mode model.eval() -Model Registration Mechanism ----------------------------- - -.. betastatus:: registration mechanism +Listing and retrieving available models +--------------------------------------- -As of v0.14, TorchVision offers a new model registration mechanism which allows retreaving models -and weights by their names. Here are a few examples on how to use them: +As of v0.14, TorchVision offers a new mechanism which allows listing and +retrieving models and weights by their names. Here are a few examples on how to +use them: .. code:: python @@ -148,7 +147,7 @@ and weights by their names. Here are a few examples on how to use them: weights_enum2 = get_model_weights(torchvision.models.quantization.mobilenet_v3_large) assert weights_enum == weights_enum2 -Here are the available public methods of the model registration mechanism: +Here are the available public functions to retrieve models and their corresponding weights: .. currentmodule:: torchvision.models .. 
autosummary:: @@ -518,6 +517,7 @@ pre-trained weights: models/video_mvit models/video_resnet models/video_s3d + models/video_swin_transformer | diff --git a/docs/source/models/alexnet.rst b/docs/source/models/alexnet.rst index 080c241983bedf78f6b64fc19a8f5b7cecaf7e06..8e94b4eeed905983648cdefe50b29b95b4a4c41b 100644 --- a/docs/source/models/alexnet.rst +++ b/docs/source/models/alexnet.rst @@ -14,7 +14,7 @@ and is based on `One weird trick for parallelizing convolutional neural networks Model builders -------------- -The following model builders can be used to instanciate an AlexNet model, with or +The following model builders can be used to instantiate an AlexNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.alexnet.AlexNet`` base class. Please refer to the `source code diff --git a/docs/source/models/efficientnet.rst b/docs/source/models/efficientnet.rst index 4df547b3cbd3bac54e61a0270f5e60b010d227f5..cbc9718959af40e414a1a00a3cb5454305a3e16d 100644 --- a/docs/source/models/efficientnet.rst +++ b/docs/source/models/efficientnet.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate an EfficientNet model, with or +The following model builders can be used to instantiate an EfficientNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.efficientnet.EfficientNet`` base class. Please refer to the `source code diff --git a/docs/source/models/efficientnetv2.rst b/docs/source/models/efficientnetv2.rst index 05c953b13277ac3bc9a82bd98310c2b042eeff63..3066c28ebd482a128f12656c966c246bfb8f0de9 100644 --- a/docs/source/models/efficientnetv2.rst +++ b/docs/source/models/efficientnetv2.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate an EfficientNetV2 model, with or +The following model builders can be used to instantiate an EfficientNetV2 model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.efficientnet.EfficientNet`` base class. Please refer to the `source code diff --git a/docs/source/models/fcos.rst b/docs/source/models/fcos.rst index 1bcc42676784fd72c775ac1485f117602bd213c4..085f26549b8dd40899fe2d08d55064406f676c13 100644 --- a/docs/source/models/fcos.rst +++ b/docs/source/models/fcos.rst @@ -3,7 +3,7 @@ FCOS .. currentmodule:: torchvision.models.detection -The RetinaNet model is based on the `FCOS: Fully Convolutional One-Stage Object Detection +The FCOS model is based on the `FCOS: Fully Convolutional One-Stage Object Detection `__ paper. .. betastatus:: detection module @@ -12,7 +12,7 @@ Model builders -------------- The following model builders can be used to instantiate a FCOS model, with or -without pre-trained weights. All the model buidlers internally rely on the +without pre-trained weights. All the model builders internally rely on the ``torchvision.models.detection.fcos.FCOS`` base class. Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/googlenet.rst b/docs/source/models/googlenet.rst index ed4f1345e232ff36f4e960838c8c7ed7baa765d6..91ea03ddf3d48e1342f6a4be77e3344d8f635f0c 100644 --- a/docs/source/models/googlenet.rst +++ b/docs/source/models/googlenet.rst @@ -10,7 +10,7 @@ paper. 
Model builders -------------- -The following model builders can be used to instanciate a GoogLeNet model, with or +The following model builders can be used to instantiate a GoogLeNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.googlenet.GoogLeNet`` base class. Please refer to the `source code diff --git a/docs/source/models/googlenet_quant.rst b/docs/source/models/googlenet_quant.rst index acb2737b52b8d3752e692fc768899f4ece7a5c38..4358389b3e50c2c7b025a3c097fecd80af5f6306 100644 --- a/docs/source/models/googlenet_quant.rst +++ b/docs/source/models/googlenet_quant.rst @@ -10,7 +10,7 @@ paper. Model builders -------------- -The following model builders can be used to instanciate a quantized GoogLeNet +The following model builders can be used to instantiate a quantized GoogLeNet model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.quantization.googlenet.QuantizableGoogLeNet`` base class. Please refer to the `source code diff --git a/docs/source/models/inception.rst b/docs/source/models/inception.rst index 72aa9724d4199686bca029329a7aab86cb91b7ad..e162eef5d30531bb357717186ca84a8b3cf8402b 100644 --- a/docs/source/models/inception.rst +++ b/docs/source/models/inception.rst @@ -10,7 +10,7 @@ Computer Vision `__ paper. Model builders -------------- -The following model builders can be used to instanciate an InceptionV3 model, with or +The following model builders can be used to instantiate an InceptionV3 model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.inception.Inception3`` base class. Please refer to the `source code `_ for diff --git a/docs/source/models/inception_quant.rst b/docs/source/models/inception_quant.rst index 397fd10df3c173c6a33bc0232433316fe8d46d6a..d26f1ab09da533b9c3496a5f835a540cf16f29df 100644 --- a/docs/source/models/inception_quant.rst +++ b/docs/source/models/inception_quant.rst @@ -10,7 +10,7 @@ Computer Vision `__ paper. Model builders -------------- -The following model builders can be used to instanciate a quantized Inception +The following model builders can be used to instantiate a quantized Inception model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.quantization.inception.QuantizableInception3`` base class. Please refer to the `source code diff --git a/docs/source/models/mnasnet.rst b/docs/source/models/mnasnet.rst index e31b4aca1b695073b8762e8d5a276cc3f70b7427..fd9ea5115857b0c85a2b7b949a24a99015f9374a 100644 --- a/docs/source/models/mnasnet.rst +++ b/docs/source/models/mnasnet.rst @@ -11,7 +11,7 @@ Search for Mobile `__ paper. Model builders -------------- -The following model builders can be used to instanciate an MNASNet model. +The following model builders can be used to instantiate an MNASNet model. All the model builders internally rely on the ``torchvision.models.mnasnet.MNASNet`` base class. Please refer to the `source code diff --git a/docs/source/models/retinanet.rst b/docs/source/models/retinanet.rst index 8613ae9aaab61fd931ce910bc95f7cedf7797887..910692ef3a5a91df23a4389af527c110f703bc88 100644 --- a/docs/source/models/retinanet.rst +++ b/docs/source/models/retinanet.rst @@ -12,7 +12,7 @@ Model builders -------------- The following model builders can be used to instantiate a RetinaNet model, with or -without pre-trained weights. All the model buidlers internally rely on the +without pre-trained weights. 
All the model builders internally rely on the ``torchvision.models.detection.retinanet.RetinaNet`` base class. Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/ssd.rst b/docs/source/models/ssd.rst index 7d73b234a283088f74df46ceffb540f4c5b8a169..68b0bb224df3a22466bd1cd42bbcd06183769950 100644 --- a/docs/source/models/ssd.rst +++ b/docs/source/models/ssd.rst @@ -12,7 +12,7 @@ The SSD model is based on the `SSD: Single Shot MultiBox Detector Model builders -------------- -The following model builders can be used to instanciate a SSD model, with or +The following model builders can be used to instantiate a SSD model, with or without pre-trained weights. All the model builders internally rely on the ``torchvision.models.detection.SSD`` base class. Please refer to the `source code diff --git a/docs/source/models/swin_transformer.rst b/docs/source/models/swin_transformer.rst index 35b529879541ede53bb16776e0de957038b109a6..b302f5bd79d390e658d7614de8d15471f8d1bb6e 100644 --- a/docs/source/models/swin_transformer.rst +++ b/docs/source/models/swin_transformer.rst @@ -15,7 +15,7 @@ Model builders -------------- The following model builders can be used to instantiate an SwinTransformer model (original and V2) with and without pre-trained weights. -All the model builders internally rely on the ``torchvision.models.swin_transformer.SwinTransformer`` +All the model builders internally rely on the ``torchvision.models.swin_transformer.SwinTransformer`` base class. Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/vgg.rst b/docs/source/models/vgg.rst index a9fa9aabfb10bc131ba680c639dcab89b6420524..77b5686927c99c39075fab6d0a9f9c24de491134 100644 --- a/docs/source/models/vgg.rst +++ b/docs/source/models/vgg.rst @@ -11,7 +11,7 @@ Model builders -------------- The following model builders can be used to instantiate a VGG model, with or -without pre-trained weights. All the model buidlers internally rely on the +without pre-trained weights. All the model builders internally rely on the ``torchvision.models.vgg.VGG`` base class. Please refer to the `source code `_ for more details about this class. diff --git a/docs/source/models/video_swin_transformer.rst b/docs/source/models/video_swin_transformer.rst new file mode 100644 index 0000000000000000000000000000000000000000..e31e69759b45681e5619ab1befe9a5bc4bc2cdf6 --- /dev/null +++ b/docs/source/models/video_swin_transformer.rst @@ -0,0 +1,27 @@ +Video SwinTransformer +===================== + +.. currentmodule:: torchvision.models.video + +The Video SwinTransformer model is based on the `Video Swin Transformer `__ paper. + +.. betastatus:: video module + + +Model builders +-------------- + +The following model builders can be used to instantiate a VideoResNet model, with or +without pre-trained weights. All the model builders internally rely on the +``torchvision.models.video.swin_transformer.SwinTransformer3d`` base class. Please refer to the `source +code +`_ for +more details about this class. + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + swin3d_t + swin3d_s + swin3d_b diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst index 5909b68966ba8c7fec477c4d84c3ced4f9c8d08a..2aa1fc5ba1ebdc8e8c199098686c1d3661d95669 100644 --- a/docs/source/transforms.rst +++ b/docs/source/transforms.rst @@ -5,123 +5,549 @@ Transforming and augmenting images .. 
currentmodule:: torchvision.transforms -Transforms are common image transformations available in the -``torchvision.transforms`` module. They can be chained together using -:class:`Compose`. -Most transform classes have a function equivalent: :ref:`functional -transforms ` give fine-grained control over the -transformations. -This is useful if you have to build a more complex transformation pipeline -(e.g. in the case of segmentation tasks). - -Most transformations accept both `PIL `_ -images and tensor images, although some transformations are :ref:`PIL-only -` and some are :ref:`tensor-only -`. The :ref:`conversion_transforms` may be used to -convert to and from PIL images. - -The transformations that accept tensor images also accept batches of tensor -images. A Tensor Image is a tensor with ``(C, H, W)`` shape, where ``C`` is a -number of channels, ``H`` and ``W`` are image height and width. A batch of -Tensor Images is a tensor of ``(B, C, H, W)`` shape, where ``B`` is a number -of images in the batch. +Torchvision supports common computer vision transformations in the +``torchvision.transforms`` and ``torchvision.transforms.v2`` modules. Transforms +can be used to transform or augment data for training or inference of different +tasks (image classification, detection, segmentation, video classification). + +.. code:: python + + # Image Classification + import torch + from torchvision.transforms import v2 + + H, W = 32, 32 + img = torch.randint(0, 256, size=(3, H, W), dtype=torch.uint8) + + transforms = v2.Compose([ + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomHorizontalFlip(p=0.5), + v2.ToDtype(torch.float32, scale=True), + v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + img = transforms(img) + +.. code:: python + + # Detection (re-using imports and transforms from above) + from torchvision import tv_tensors + + img = torch.randint(0, 256, size=(3, H, W), dtype=torch.uint8) + boxes = torch.randint(0, H // 2, size=(3, 4)) + boxes[:, 2:] += boxes[:, :2] + boxes = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=(H, W)) + + # The same transforms can be used! + img, boxes = transforms(img, boxes) + # And you can pass arbitrary input structures + output_dict = transforms({"image": img, "boxes": boxes}) + +Transforms are typically passed as the ``transform`` or ``transforms`` argument +to the :ref:`Datasets `. + +Start here +---------- + +Whether you're new to Torchvision transforms, or you're already experienced with +them, we encourage you to start with +:ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py` in +order to learn more about what can be done with the new v2 transforms. + +Then, browse the sections in below this page for general information and +performance tips. The available transforms and functionals are listed in the +:ref:`API reference `. + +More information and tutorials can also be found in our :ref:`example gallery +`, e.g. :ref:`sphx_glr_auto_examples_transforms_plot_transforms_e2e.py` +or :ref:`sphx_glr_auto_examples_transforms_plot_custom_transforms.py`. + +.. _conventions: + +Supported input types and conventions +------------------------------------- + +Most transformations accept both `PIL `_ images +and tensor inputs. Both CPU and CUDA tensors are supported. +The result of both backends (PIL or Tensors) should be very +close. In general, we recommend relying on the tensor backend :ref:`for +performance `. 
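To illustrate, the same transform instance can be applied to either backend. A
minimal sketch (the 128×128 ``uint8`` image and the variable names below are
only illustrative):

.. code:: python

    import torch
    from torchvision.transforms import v2
    from torchvision.transforms.v2 import functional as F

    resize = v2.Resize(size=(64, 64), antialias=True)

    # Arbitrary example image: a random 3-channel uint8 tensor.
    tensor_img = torch.randint(0, 256, size=(3, 128, 128), dtype=torch.uint8)
    pil_img = F.to_pil_image(tensor_img)  # same content, PIL backend

    out_tensor = resize(tensor_img)  # uint8 tensor of shape (3, 64, 64)
    out_pil = resize(pil_img)        # PIL.Image.Image of size 64x64

Both outputs contain (nearly) the same pixels; the tensor path is generally the
faster one.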
The :ref:`conversion transforms +` may be used to convert to and from PIL images, or for +converting dtypes and ranges. + +Tensor image are expected to be of shape ``(C, H, W)``, where ``C`` is the +number of channels, and ``H`` and ``W`` refer to height and width. Most +transforms support batched tensor input. A batch of Tensor images is a tensor of +shape ``(N, C, H, W)``, where ``N`` is a number of images in the batch. The +:ref:`v2 ` transforms generally accept an arbitrary number of leading +dimensions ``(..., C, H, W)`` and can handle batched images or batched videos. + +.. _range_and_dtype: + +Dtype and expected value range +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The expected range of the values of a tensor image is implicitly defined by the tensor dtype. Tensor images with a float dtype are expected to have -values in ``[0, 1)``. Tensor images with an integer dtype are expected to +values in ``[0, 1]``. Tensor images with an integer dtype are expected to have values in ``[0, MAX_DTYPE]`` where ``MAX_DTYPE`` is the largest value -that can be represented in that dtype. +that can be represented in that dtype. Typically, images of dtype +``torch.uint8`` are expected to have values in ``[0, 255]``. -Randomized transformations will apply the same transformation to all the -images of a given batch, but they will produce different transformations -across calls. For reproducible transformations across calls, you may use -:ref:`functional transforms `. +Use :class:`~torchvision.transforms.v2.ToDtype` to convert both the dtype and +range of the inputs. -The following examples illustrate the use of the available transforms: +.. _v1_or_v2: - * :ref:`sphx_glr_auto_examples_plot_transforms.py` +V1 or V2? Which one should I use? +--------------------------------- - .. figure:: ../source/auto_examples/images/sphx_glr_plot_transforms_001.png - :align: center - :scale: 65% +**TL;DR** We recommending using the ``torchvision.transforms.v2`` transforms +instead of those in ``torchvision.transforms``. They're faster and they can do +more things. Just change the import and you should be good to go. + +In Torchvision 0.15 (March 2023), we released a new set of transforms available +in the ``torchvision.transforms.v2`` namespace. These transforms have a lot of +advantages compared to the v1 ones (in ``torchvision.transforms``): + +- They can transform images **but also** bounding boxes, masks, or videos. This + provides support for tasks beyond image classification: detection, segmentation, + video classification, etc. See + :ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py` + and :ref:`sphx_glr_auto_examples_transforms_plot_transforms_e2e.py`. +- They support more transforms like :class:`~torchvision.transforms.v2.CutMix` + and :class:`~torchvision.transforms.v2.MixUp`. See + :ref:`sphx_glr_auto_examples_transforms_plot_cutmix_mixup.py`. +- They're :ref:`faster `. +- They support arbitrary input structures (dicts, lists, tuples, etc.). +- Future improvements and features will be added to the v2 transforms only. + +These transforms are **fully backward compatible** with the v1 ones, so if +you're already using tranforms from ``torchvision.transforms``, all you need to +do to is to update the import to ``torchvision.transforms.v2``. In terms of +output, there might be negligible differences due to implementation differences. + +.. note:: + + The v2 transforms are still BETA, but at this point we do not expect + disruptive changes to be made to their public APIs. 
We're planning to make + them fully stable in version 0.17. Please submit any feedback you may have + `here `_. + +.. _transforms_perf: + +Performance considerations +-------------------------- - * :ref:`sphx_glr_auto_examples_plot_scripted_tensor_transforms.py` +We recommend the following guidelines to get the best performance out of the +transforms: - .. figure:: ../source/auto_examples/images/sphx_glr_plot_scripted_tensor_transforms_001.png - :align: center - :scale: 30% +- Rely on the v2 transforms from ``torchvision.transforms.v2`` +- Use tensors instead of PIL images +- Use ``torch.uint8`` dtype, especially for resizing +- Resize with bilinear or bicubic mode -.. warning:: +This is what a typical transform pipeline could look like: - Since v0.8.0 all random transformations are using torch default random generator to sample random parameters. - It is a backward compatibility breaking change and user should set the random state as following: +.. code:: python + + from torchvision.transforms import v2 + transforms = v2.Compose([ + v2.ToImage(), # Convert to tensor, only needed if you had a PIL image + v2.ToDtype(torch.uint8, scale=True), # optional, most input are already uint8 at this point + # ... + v2.RandomResizedCrop(size=(224, 224), antialias=True), # Or Resize(antialias=True) + # ... + v2.ToDtype(torch.float32, scale=True), # Normalize expects float input + v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + +The above should give you the best performance in a typical training environment +that relies on the :class:`torch.utils.data.DataLoader` with ``num_workers > +0``. + +Transforms tend to be sensitive to the input strides / memory format. Some +transforms will be faster with channels-first images while others prefer +channels-last. Like ``torch`` operators, most transforms will preserve the +memory format of the input, but this may not always be respected due to +implementation details. You may want to experiment a bit if you're chasing the +very best performance. Using :func:`torch.compile` on individual transforms may +also help factoring out the memory format variable (e.g. on +:class:`~torchvision.transforms.v2.Normalize`). Note that we're talking about +**memory format**, not :ref:`tensor shape `. + +Note that resize transforms like :class:`~torchvision.transforms.v2.Resize` +and :class:`~torchvision.transforms.v2.RandomResizedCrop` typically prefer +channels-last input and tend **not** to benefit from :func:`torch.compile` at +this time. - .. code:: python +.. _functional_transforms: - # Previous versions - # import random - # random.seed(12) +Transform classes, functionals, and kernels +------------------------------------------- - # Now - import torch - torch.manual_seed(17) +Transforms are available as classes like +:class:`~torchvision.transforms.v2.Resize`, but also as functionals like +:func:`~torchvision.transforms.v2.functional.resize` in the +``torchvision.transforms.v2.functional`` namespace. +This is very much like the :mod:`torch.nn` package which defines both classes +and functional equivalents in :mod:`torch.nn.functional`. - Please, keep in mind that the same seed for torch random generator and Python random generator will not - produce the same results. +The functionals support PIL images, pure tensors, or :ref:`TVTensors +`, e.g. both ``resize(image_tensor)`` and ``resize(boxes)`` are +valid. +.. 
note:: -Scriptable transforms ---------------------- + Random transforms like :class:`~torchvision.transforms.v2.RandomCrop` will + randomly sample some parameter each time they're called. Their functional + counterpart (:func:`~torchvision.transforms.v2.functional.crop`) does not do + any kind of random sampling and thus have a slighlty different + parametrization. The ``get_params()`` class method of the transforms class + can be used to perform parameter sampling when using the functional APIs. -In order to script the transformations, please use ``torch.nn.Sequential`` instead of :class:`Compose`. + +The ``torchvision.transforms.v2.functional`` namespace also contains what we +call the "kernels". These are the low-level functions that implement the +core functionalities for specific types, e.g. ``resize_bounding_boxes`` or +```resized_crop_mask``. They are public, although not documented. Check the +`code +`_ +to see which ones are available (note that those starting with a leading +underscore are **not** public!). Kernels are only really useful if you want +:ref:`torchscript support ` for types like bounding +boxes or masks. + +.. _transforms_torchscript: + +Torchscript support +------------------- + +Most transform classes and functionals support torchscript. For composing +transforms, use :class:`torch.nn.Sequential` instead of +:class:`~torchvision.transforms.v2.Compose`: .. code:: python transforms = torch.nn.Sequential( - transforms.CenterCrop(10), - transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + CenterCrop(10), + Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), ) scripted_transforms = torch.jit.script(transforms) -Make sure to use only scriptable transformations, i.e. that work with ``torch.Tensor`` and does not require -`lambda` functions or ``PIL.Image``. +.. warning:: -For any custom transformations to be used with ``torch.jit.script``, they should be derived from ``torch.nn.Module``. + v2 transforms support torchscript, but if you call ``torch.jit.script()`` on + a v2 **class** transform, you'll actually end up with its (scripted) v1 + equivalent. This may lead to slightly different results between the + scripted and eager executions due to implementation differences between v1 + and v2. + If you really need torchscript support for the v2 transforms, we recommend + scripting the **functionals** from the + ``torchvision.transforms.v2.functional`` namespace to avoid surprises. -Compositions of transforms --------------------------- + +Also note that the functionals only support torchscript for pure tensors, which +are always treated as images. If you need torchscript support for other types +like bounding boxes or masks, you can rely on the :ref:`low-level kernels +`. + +For any custom transformations to be used with ``torch.jit.script``, they should +be derived from ``torch.nn.Module``. + +See also: :ref:`sphx_glr_auto_examples_others_plot_scripted_tensor_transforms.py`. + +.. _v2_api_ref: + +V2 API reference - Recommended +------------------------------ + +Geometry +^^^^^^^^ + +Resizing +"""""""" .. autosummary:: :toctree: generated/ :template: class.rst - Compose + v2.Resize + v2.ScaleJitter + v2.RandomShortestSize + v2.RandomResize + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.resize + +Cropping +"""""""" + +.. 
autosummary:: + :toctree: generated/ + :template: class.rst + + v2.RandomCrop + v2.RandomResizedCrop + v2.RandomIoUCrop + v2.CenterCrop + v2.FiveCrop + v2.TenCrop + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.crop + v2.functional.resized_crop + v2.functional.ten_crop + v2.functional.center_crop + v2.functional.five_crop + +Others +"""""" + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.RandomHorizontalFlip + v2.RandomVerticalFlip + v2.Pad + v2.RandomZoomOut + v2.RandomRotation + v2.RandomAffine + v2.RandomPerspective + v2.ElasticTransform + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.horizontal_flip + v2.functional.vertical_flip + v2.functional.pad + v2.functional.rotate + v2.functional.affine + v2.functional.perspective + v2.functional.elastic + +Color +^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.ColorJitter + v2.RandomChannelPermutation + v2.RandomPhotometricDistort + v2.Grayscale + v2.RandomGrayscale + v2.GaussianBlur + v2.RandomInvert + v2.RandomPosterize + v2.RandomSolarize + v2.RandomAdjustSharpness + v2.RandomAutocontrast + v2.RandomEqualize + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.permute_channels + v2.functional.rgb_to_grayscale + v2.functional.to_grayscale + v2.functional.gaussian_blur + v2.functional.invert + v2.functional.posterize + v2.functional.solarize + v2.functional.adjust_sharpness + v2.functional.autocontrast + v2.functional.adjust_contrast + v2.functional.equalize + v2.functional.adjust_brightness + v2.functional.adjust_saturation + v2.functional.adjust_hue + v2.functional.adjust_gamma + + +Composition +^^^^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.Compose + v2.RandomApply + v2.RandomChoice + v2.RandomOrder + +Miscellaneous +^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.LinearTransformation + v2.Normalize + v2.RandomErasing + v2.Lambda + v2.SanitizeBoundingBoxes + v2.ClampBoundingBoxes + v2.UniformTemporalSubsample + +Functionals + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.normalize + v2.functional.erase + v2.functional.clamp_bounding_boxes + v2.functional.uniform_temporal_subsample + +.. _conversion_transforms: + +Conversion +^^^^^^^^^^ + +.. note:: + Beware, some of these conversion transforms below will scale the values + while performing the conversion, while some may not do any scaling. By + scaling, we mean e.g. that a ``uint8`` -> ``float32`` would map the [0, + 255] range into [0, 1] (and vice-versa). See :ref:`range_and_dtype`. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.ToImage + v2.ToPureTensor + v2.PILToTensor + v2.ToPILImage + v2.ToDtype + v2.ConvertBoundingBoxFormat + +functionals + +.. autosummary:: + :toctree: generated/ + :template: functional.rst + + v2.functional.to_image + v2.functional.pil_to_tensor + v2.functional.to_pil_image + v2.functional.to_dtype + v2.functional.convert_bounding_box_format + + +Deprecated + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.ToTensor + v2.functional.to_tensor + v2.ConvertImageDtype + v2.functional.convert_image_dtype + +Auto-Augmentation +^^^^^^^^^^^^^^^^^ + +`AutoAugment `_ is a common Data Augmentation technique that can improve the accuracy of Image Classification models. 
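A policy can be dropped into a pipeline like any other v2 transform. A minimal
sketch (the 224×224 ``uint8`` image is arbitrary; ``AutoAugment()`` defaults to
the ImageNet policy):

.. code:: python

    import torch
    from torchvision.transforms import v2

    augment = v2.Compose([
        v2.AutoAugment(),  # ImageNet policy by default
        v2.ToDtype(torch.float32, scale=True),
    ])
    img = torch.randint(0, 256, size=(3, 224, 224), dtype=torch.uint8)
    augmented = augment(img)  # a float32 tensor of the same shape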
+Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that +ImageNet policies provide significant improvements when applied to other datasets. +In TorchVision we implemented 3 policies learned on the following datasets: ImageNet, CIFAR10 and SVHN. +The new transform can be used standalone or mixed-and-matched with existing transforms: + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + v2.AutoAugment + v2.RandAugment + v2.TrivialAugmentWide + v2.AugMix + +CutMix - MixUp +^^^^^^^^^^^^^^ -Transforms on PIL Image and torch.\*Tensor ------------------------------------------- +CutMix and MixUp are special transforms that +are meant to be used on batches rather than on individual images, because they +are combining pairs of images together. These can be used after the dataloader +(once the samples are batched), or part of a collation function. See +:ref:`sphx_glr_auto_examples_transforms_plot_cutmix_mixup.py` for detailed usage examples. .. autosummary:: :toctree: generated/ :template: class.rst + v2.CutMix + v2.MixUp + +Developer tools +^^^^^^^^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: function.rst + + v2.functional.register_kernel + + +V1 API Reference +---------------- + +Geometry +^^^^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Resize + RandomCrop + RandomResizedCrop CenterCrop - ColorJitter FiveCrop - Grayscale + TenCrop Pad + RandomRotation RandomAffine - RandomApply - RandomCrop - RandomGrayscale - RandomHorizontalFlip RandomPerspective - RandomResizedCrop - RandomRotation + ElasticTransform + RandomHorizontalFlip RandomVerticalFlip - Resize - TenCrop + + +Color +^^^^^ + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + ColorJitter + Grayscale + RandomGrayscale GaussianBlur RandomInvert RandomPosterize @@ -130,23 +556,20 @@ Transforms on PIL Image and torch.\*Tensor RandomAutocontrast RandomEqualize - -.. _transforms_pil_only: - -Transforms on PIL Image only ----------------------------- +Composition +^^^^^^^^^^^ .. autosummary:: :toctree: generated/ :template: class.rst + Compose + RandomApply RandomChoice RandomOrder -.. _transforms_tensor_only: - -Transforms on torch.\*Tensor only ---------------------------------- +Miscellaneous +^^^^^^^^^^^^^ .. autosummary:: :toctree: generated/ @@ -155,13 +578,17 @@ Transforms on torch.\*Tensor only LinearTransformation Normalize RandomErasing - ConvertImageDtype - -.. _conversion_transforms: + Lambda -Conversion Transforms ---------------------- +Conversion +^^^^^^^^^^ +.. note:: + Beware, some of these conversion transforms below will scale the values + while performing the conversion, while some may not do any scaling. By + scaling, we mean e.g. that a ``uint8`` -> ``float32`` would map the [0, + 255] range into [0, 1] (and vice-versa). See :ref:`range_and_dtype`. + .. autosummary:: :toctree: generated/ :template: class.rst @@ -169,20 +596,10 @@ Conversion Transforms ToPILImage ToTensor PILToTensor + ConvertImageDtype - -Generic Transforms ------------------- - -.. autosummary:: - :toctree: generated/ - :template: class.rst - - Lambda - - -Automatic Augmentation Transforms ---------------------------------- +Auto-Augmentation +^^^^^^^^^^^^^^^^^ `AutoAugment `_ is a common Data Augmentation technique that can improve the accuracy of Image Classification models. 
Though the data augmentation policies are directly linked to their trained dataset, empirical studies show that @@ -200,57 +617,13 @@ The new transform can be used standalone or mixed-and-matched with existing tran TrivialAugmentWide AugMix -.. _functional_transforms: + Functional Transforms ---------------------- +^^^^^^^^^^^^^^^^^^^^^ .. currentmodule:: torchvision.transforms.functional -Functional transforms give you fine-grained control of the transformation pipeline. -As opposed to the transformations above, functional transforms don't contain a random number -generator for their parameters. -That means you have to specify/generate all parameters, but the functional transform will give you -reproducible results across calls. - -Example: -you can apply a functional transform with the same parameters to multiple images like this: - -.. code:: python - - import torchvision.transforms.functional as TF - import random - - def my_segmentation_transforms(image, segmentation): - if random.random() > 0.5: - angle = random.randint(-30, 30) - image = TF.rotate(image, angle) - segmentation = TF.rotate(segmentation, angle) - # more transforms ... - return image, segmentation - - -Example: -you can use a functional transform to build transform classes with custom behavior: - -.. code:: python - - import torchvision.transforms.functional as TF - import random - - class MyRotationTransform: - """Rotate by one of the given angles.""" - - def __init__(self, angles): - self.angles = angles - - def __call__(self, x): - angle = random.choice(self.angles) - return TF.rotate(x, angle) - - rotation_transform = MyRotationTransform(angles=[-30, -15, 0, 15, 30]) - - .. autosummary:: :toctree: generated/ :template: function.rst diff --git a/docs/source/tv_tensors.rst b/docs/source/tv_tensors.rst new file mode 100644 index 0000000000000000000000000000000000000000..cb8a3c45fa9ca2c53754a110570a9bd0dab4d7ca --- /dev/null +++ b/docs/source/tv_tensors.rst @@ -0,0 +1,29 @@ +.. _tv_tensors: + +TVTensors +========== + +.. currentmodule:: torchvision.tv_tensors + +TVTensors are :class:`torch.Tensor` subclasses which the v2 :ref:`transforms +` use under the hood to dispatch their inputs to the appropriate +lower-level kernels. Most users do not need to manipulate TVTensors directly. + +Refer to +:ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py` for +an introduction to TVTensors, or +:ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py` for more advanced +info. + +.. autosummary:: + :toctree: generated/ + :template: class.rst + + Image + Video + BoundingBoxFormat + BoundingBoxes + Mask + TVTensor + set_return_type + wrap diff --git a/docs/source/utils.rst b/docs/source/utils.rst index 276f730c2940fa778d2064c666d24f38e01e0699..cda04de900ad8f43a9ac855631d3c67f0a149c69 100644 --- a/docs/source/utils.rst +++ b/docs/source/utils.rst @@ -4,7 +4,7 @@ Utils ===== The ``torchvision.utils`` module contains various utilities, mostly :ref:`for -vizualization `. +visualization `. .. currentmodule:: torchvision.utils diff --git a/examples/cpp/hello_world/CMakeLists.txt b/examples/cpp/hello_world/CMakeLists.txt index 3ca59e4c199e4e557ae3c3dfc414686507ded01d..7d49178b8b339ed02739adec1fbb3c8ec64882b2 100644 --- a/examples/cpp/hello_world/CMakeLists.txt +++ b/examples/cpp/hello_world/CMakeLists.txt @@ -17,4 +17,4 @@ add_executable(hello-world main.cpp) # which also adds all the necessary torch dependencies. 
target_compile_features(hello-world PUBLIC cxx_range_for) target_link_libraries(hello-world TorchVision::TorchVision) -set_property(TARGET hello-world PROPERTY CXX_STANDARD 14) +set_property(TARGET hello-world PROPERTY CXX_STANDARD 17) diff --git a/gallery/README.rst b/gallery/README.rst index 868afe743518523fe73b029e6250291518304376..8dfea35527640aea39b4659d3bd2b3873d1ad708 100644 --- a/gallery/README.rst +++ b/gallery/README.rst @@ -1,4 +1,4 @@ -Example gallery -=============== +.. _gallery: -Below is a gallery of examples +Examples and tutorials +====================== diff --git a/gallery/assets/coco/images/000000000001.jpg b/gallery/assets/coco/images/000000000001.jpg new file mode 120000 index 0000000000000000000000000000000000000000..9be80c7c27300ce0b8fe589a9e41b13fef33c2b8 --- /dev/null +++ b/gallery/assets/coco/images/000000000001.jpg @@ -0,0 +1 @@ +../../astronaut.jpg \ No newline at end of file diff --git a/gallery/assets/coco/images/000000000002.jpg b/gallery/assets/coco/images/000000000002.jpg new file mode 120000 index 0000000000000000000000000000000000000000..9f8efef9928aec7e07a66a3581f1d09c2184393e --- /dev/null +++ b/gallery/assets/coco/images/000000000002.jpg @@ -0,0 +1 @@ +../../dog2.jpg \ No newline at end of file diff --git a/gallery/assets/coco/instances.json b/gallery/assets/coco/instances.json new file mode 100644 index 0000000000000000000000000000000000000000..fe0e09270bfba4390db27fd796fbc943c2c76362 --- /dev/null +++ b/gallery/assets/coco/instances.json @@ -0,0 +1 @@ +{"images": [{"file_name": "000000000001.jpg", "height": 512, "width": 512, "id": 1}, {"file_name": "000000000002.jpg", "height": 500, "width": 500, "id": 2}], "annotations": [{"segmentation": [[40.0, 511.0, 26.0, 487.0, 28.0, 438.0, 17.0, 397.0, 24.0, 346.0, 38.0, 306.0, 61.0, 250.0, 111.0, 206.0, 111.0, 187.0, 120.0, 183.0, 136.0, 159.0, 159.0, 150.0, 181.0, 148.0, 182.0, 132.0, 175.0, 132.0, 168.0, 120.0, 154.0, 102.0, 153.0, 62.0, 188.0, 35.0, 191.0, 29.0, 208.0, 20.0, 210.0, 22.0, 227.0, 16.0, 240.0, 16.0, 276.0, 31.0, 285.0, 39.0, 301.0, 88.0, 297.0, 108.0, 281.0, 128.0, 273.0, 138.0, 266.0, 138.0, 264.0, 153.0, 257.0, 162.0, 256.0, 174.0, 284.0, 197.0, 300.0, 221.0, 303.0, 236.0, 337.0, 258.0, 357.0, 306.0, 361.0, 351.0, 358.0, 511.0]], "iscrowd": 0, "image_id": 1, "bbox": [17.0, 16.0, 344.0, 495.0], "category_id": 1, "id": 1}, {"segmentation": [[0.0, 411.0, 43.0, 401.0, 99.0, 395.0, 105.0, 351.0, 124.0, 326.0, 181.0, 294.0, 227.0, 280.0, 245.0, 262.0, 259.0, 234.0, 262.0, 207.0, 271.0, 140.0, 283.0, 139.0, 301.0, 162.0, 309.0, 181.0, 341.0, 175.0, 362.0, 139.0, 369.0, 139.0, 377.0, 163.0, 378.0, 203.0, 381.0, 212.0, 380.0, 220.0, 382.0, 242.0, 404.0, 264.0, 392.0, 293.0, 384.0, 295.0, 385.0, 316.0, 399.0, 343.0, 391.0, 448.0, 452.0, 475.0, 457.0, 494.0, 436.0, 498.0, 402.0, 491.0, 369.0, 488.0, 366.0, 496.0, 319.0, 496.0, 302.0, 485.0, 226.0, 469.0, 128.0, 456.0, 74.0, 458.0, 29.0, 439.0, 0.0, 445.0]], "iscrowd": 0, "image_id": 2, "bbox": [0.0, 139.0, 457.0, 359.0], "category_id": 18, "id": 2}]} diff --git a/gallery/others/README.rst b/gallery/others/README.rst new file mode 100644 index 0000000000000000000000000000000000000000..fafb007d98522d5888a26868d0ecc420df434ca7 --- /dev/null +++ b/gallery/others/README.rst @@ -0,0 +1,2 @@ +Others +------ diff --git a/gallery/plot_optical_flow.py b/gallery/others/plot_optical_flow.py similarity index 90% rename from gallery/plot_optical_flow.py rename to gallery/others/plot_optical_flow.py index 
b0a93209877e9afda6a52914a29c727b3f5ed4c3..3ab1449341729cced1c571c9eefefcadc586c08d 100644 --- a/gallery/plot_optical_flow.py +++ b/gallery/others/plot_optical_flow.py @@ -3,6 +3,10 @@ Optical Flow: Predicting movement with the RAFT model ===================================================== +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + Optical flow is the task of predicting movement between two images, usually two consecutive frames of a video. Optical flow models take two images as input, and predict a flow: the flow indicates the displacement of every single pixel in the @@ -42,7 +46,7 @@ def plot(imgs, **imshow_kwargs): plt.tight_layout() -################################### +# %% # Reading Videos Using Torchvision # -------------------------------- # We will first read a video using :func:`~torchvision.io.read_video`. @@ -62,7 +66,7 @@ video_url = "https://download.pytorch.org/tutorial/pexelscom_pavel_danilyuk_bask video_path = Path(tempfile.mkdtemp()) / "basketball.mp4" _ = urlretrieve(video_url, video_path) -######################### +# %% # :func:`~torchvision.io.read_video` returns the video frames, audio frames and # the metadata associated with the video. In our case, we only need the video # frames. @@ -79,11 +83,12 @@ img2_batch = torch.stack([frames[101], frames[151]]) plot(img1_batch) -######################### +# %% # The RAFT model accepts RGB images. We first get the frames from -# :func:`~torchvision.io.read_video` and resize them to ensure their -# dimensions are divisible by 8. Then we use the transforms bundled into the -# weights in order to preprocess the input and rescale its values to the +# :func:`~torchvision.io.read_video` and resize them to ensure their dimensions +# are divisible by 8. Note that we explicitly use ``antialias=False``, because +# this is how those models were trained. Then we use the transforms bundled into +# the weights in order to preprocess the input and rescale its values to the # required ``[-1, 1]`` interval. from torchvision.models.optical_flow import Raft_Large_Weights @@ -93,8 +98,8 @@ transforms = weights.transforms() def preprocess(img1_batch, img2_batch): - img1_batch = F.resize(img1_batch, size=[520, 960]) - img2_batch = F.resize(img2_batch, size=[520, 960]) + img1_batch = F.resize(img1_batch, size=[520, 960], antialias=False) + img2_batch = F.resize(img2_batch, size=[520, 960], antialias=False) return transforms(img1_batch, img2_batch) @@ -103,7 +108,7 @@ img1_batch, img2_batch = preprocess(img1_batch, img2_batch) print(f"shape = {img1_batch.shape}, dtype = {img1_batch.dtype}") -#################################### +# %% # Estimating Optical flow using RAFT # ---------------------------------- # We will use our RAFT implementation from @@ -124,12 +129,12 @@ list_of_flows = model(img1_batch.to(device), img2_batch.to(device)) print(f"type = {type(list_of_flows)}") print(f"length = {len(list_of_flows)} = number of iterations of the model") -#################################### +# %% # The RAFT model outputs lists of predicted flows where each entry is a # (N, 2, H, W) batch of predicted flows that corresponds to a given "iteration" # in the model. For more details on the iterative nature of the model, please # refer to the `original paper `_. Here, we -# are only interested in the final predicted flows (they are the most acccurate +# are only interested in the final predicted flows (they are the most accurate # ones), so we will just retrieve the last item in the list. 
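# (Concretely, this is just an indexing operation, e.g.
#  ``predicted_flows = list_of_flows[-1]``; the snippet is only illustrative,
#  and ``predicted_flows`` is the variable used in the cells below.)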
# # As described above, a flow is a tensor with dimensions (2, H, W) (or (N, 2, H, @@ -143,10 +148,10 @@ print(f"shape = {predicted_flows.shape} = (N, 2, H, W)") print(f"min = {predicted_flows.min()}, max = {predicted_flows.max()}") -#################################### +# %% # Visualizing predicted flows # --------------------------- -# Torchvision provides the :func:`~torchvision.utils.flow_to_image` utlity to +# Torchvision provides the :func:`~torchvision.utils.flow_to_image` utility to # convert a flow into an RGB image. It also supports batches of flows. # each "direction" in the flow will be mapped to a given RGB color. In the # images below, pixels with similar colors are assumed by the model to be moving @@ -165,7 +170,7 @@ img1_batch = [(img1 + 1) / 2 for img1 in img1_batch] grid = [[img1, flow_img] for (img1, flow_img) in zip(img1_batch, flow_imgs)] plot(grid) -#################################### +# %% # Bonus: Creating GIFs of predicted flows # --------------------------------------- # In the example above we have only shown the predicted flows of 2 pairs of @@ -186,7 +191,7 @@ plot(grid) # output_folder = "/tmp/" # Update this to the folder of your choice # write_jpeg(flow_img, output_folder + f"predicted_flow_{i}.jpg") -#################################### +# %% # Once the .jpg flow images are saved, you can convert them into a video or a # GIF using ffmpeg with e.g.: # diff --git a/gallery/plot_repurposing_annotations.py b/gallery/others/plot_repurposing_annotations.py similarity index 94% rename from gallery/plot_repurposing_annotations.py rename to gallery/others/plot_repurposing_annotations.py index 7bb68617a17b0b99160d371b0f77204ea14035eb..b1617cacd99170497ebda357b4bc975b580354ff 100644 --- a/gallery/plot_repurposing_annotations.py +++ b/gallery/others/plot_repurposing_annotations.py @@ -3,6 +3,10 @@ Repurposing masks into bounding boxes ===================================== +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + The following example illustrates the operations available the :ref:`torchvision.ops ` module for repurposing segmentation masks into object localization annotations for different tasks @@ -20,7 +24,7 @@ import matplotlib.pyplot as plt import torchvision.transforms.functional as F -ASSETS_DIRECTORY = "assets" +ASSETS_DIRECTORY = "../assets" plt.rcParams["savefig.bbox"] = "tight" @@ -36,7 +40,7 @@ def show(imgs): axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) -#################################### +# %% # Masks # ----- # In tasks like instance and panoptic segmentation, masks are commonly defined, and are defined by this package, @@ -53,7 +57,7 @@ def show(imgs): # A nice property of masks is that they can be easily repurposed to be used in methods to solve a variety of object # localization tasks. -#################################### +# %% # Converting Masks to Bounding Boxes # ----------------------------------------------- # For example, the :func:`~torchvision.ops.masks_to_boxes` operation can be used to @@ -70,7 +74,7 @@ img = read_image(img_path) mask = read_image(mask_path) -######################### +# %% # Here the masks are represented as a PNG Image, with floating point values. # Each pixel is encoded as different colors, with 0 being background. # Notice that the spatial dimensions of image and mask match. @@ -79,7 +83,7 @@ print(mask.size()) print(img.size()) print(mask) -############################ +# %% # We get the unique colors, as these would be the object ids. 
obj_ids = torch.unique(mask) @@ -91,7 +95,7 @@ obj_ids = obj_ids[1:] # Note that this snippet would work as well if the masks were float values instead of ints. masks = mask == obj_ids[:, None, None] -######################## +# %% # Now the masks are a boolean tensor. # The first dimension in this case 3 and denotes the number of instances: there are 3 people in the image. # The other two dimensions are height and width, which are equal to the dimensions of the image. @@ -101,7 +105,7 @@ masks = mask == obj_ids[:, None, None] print(masks.size()) print(masks) -#################################### +# %% # Let us visualize an image and plot its corresponding segmentation masks. # We will use the :func:`~torchvision.utils.draw_segmentation_masks` to draw the segmentation masks. @@ -113,7 +117,7 @@ for mask in masks: show(drawn_masks) -#################################### +# %% # To convert the boolean masks into bounding boxes. # We will use the :func:`~torchvision.ops.masks_to_boxes` from the torchvision.ops module # It returns the boxes in ``(xmin, ymin, xmax, ymax)`` format. @@ -124,7 +128,7 @@ boxes = masks_to_boxes(masks) print(boxes.size()) print(boxes) -#################################### +# %% # As the shape denotes, there are 3 boxes and in ``(xmin, ymin, xmax, ymax)`` format. # These can be visualized very easily with :func:`~torchvision.utils.draw_bounding_boxes` utility # provided in :ref:`torchvision.utils `. @@ -134,7 +138,7 @@ from torchvision.utils import draw_bounding_boxes drawn_boxes = draw_bounding_boxes(img, boxes, colors="red") show(drawn_boxes) -################################### +# %% # These boxes can now directly be used by detection models in torchvision. # Here is demo with a Faster R-CNN model loaded from # :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` @@ -153,7 +157,7 @@ target["labels"] = labels = torch.ones((masks.size(0),), dtype=torch.int64) detection_outputs = model(img.unsqueeze(0), [target]) -#################################### +# %% # Converting Segmentation Dataset to Detection Dataset # ---------------------------------------------------- # diff --git a/gallery/others/plot_scripted_tensor_transforms.py b/gallery/others/plot_scripted_tensor_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..128ce7778f3c19811fd124566ade6e99049159db --- /dev/null +++ b/gallery/others/plot_scripted_tensor_transforms.py @@ -0,0 +1,136 @@ +""" +=================== +Torchscript support +=================== + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +This example illustrates `torchscript +`_ support of the torchvision +:ref:`transforms ` on Tensor images. +""" + +# %% +from pathlib import Path + +import matplotlib.pyplot as plt + +import torch +import torch.nn as nn + +import torchvision.transforms as v1 +from torchvision.io import read_image + +plt.rcParams["savefig.bbox"] = 'tight' +torch.manual_seed(1) + +# If you're trying to run that on collab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +import sys +sys.path += ["../transforms"] +from helpers import plot +ASSETS_PATH = Path('../assets') + + +# %% +# Most transforms support torchscript. 
For composing transforms, we use +# :class:`torch.nn.Sequential` instead of +# :class:`~torchvision.transforms.v2.Compose`: + +dog1 = read_image(str(ASSETS_PATH / 'dog1.jpg')) +dog2 = read_image(str(ASSETS_PATH / 'dog2.jpg')) + +transforms = torch.nn.Sequential( + v1.RandomCrop(224), + v1.RandomHorizontalFlip(p=0.3), +) + +scripted_transforms = torch.jit.script(transforms) + +plot([dog1, scripted_transforms(dog1), dog2, scripted_transforms(dog2)]) + + +# %% +# .. warning:: +# +# Above we have used transforms from the ``torchvision.transforms`` +# namespace, i.e. the "v1" transforms. The v2 transforms from the +# ``torchvision.transforms.v2`` namespace are the :ref:`recommended +# ` way to use transforms in your code. +# +# The v2 transforms also support torchscript, but if you call +# ``torch.jit.script()`` on a v2 **class** transform, you'll actually end up +# with its (scripted) v1 equivalent. This may lead to slightly different +# results between the scripted and eager executions due to implementation +# differences between v1 and v2. +# +# If you really need torchscript support for the v2 transforms, **we +# recommend scripting the functionals** from the +# ``torchvision.transforms.v2.functional`` namespace to avoid surprises. +# +# Below we now show how to combine image transformations and a model forward +# pass, while using ``torch.jit.script`` to obtain a single scripted module. +# +# Let's define a ``Predictor`` module that transforms the input tensor and then +# applies an ImageNet model on it. + +from torchvision.models import resnet18, ResNet18_Weights + + +class Predictor(nn.Module): + + def __init__(self): + super().__init__() + weights = ResNet18_Weights.DEFAULT + self.resnet18 = resnet18(weights=weights, progress=False).eval() + self.transforms = weights.transforms(antialias=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + with torch.no_grad(): + x = self.transforms(x) + y_pred = self.resnet18(x) + return y_pred.argmax(dim=1) + + +# %% +# Now, let's define scripted and non-scripted instances of ``Predictor`` and +# apply it on multiple tensor images of the same size + +device = "cuda" if torch.cuda.is_available() else "cpu" + +predictor = Predictor().to(device) +scripted_predictor = torch.jit.script(predictor).to(device) + +batch = torch.stack([dog1, dog2]).to(device) + +res = predictor(batch) +res_scripted = scripted_predictor(batch) + +# %% +# We can verify that the prediction of the scripted and non-scripted models are +# the same: + +import json + +with open(Path('../assets') / 'imagenet_class_index.json') as labels_file: + labels = json.load(labels_file) + +for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)): + assert pred == pred_scripted + print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}") + +# %% +# Since the model is scripted, it can be easily dumped on disk and re-used + +import tempfile + +with tempfile.NamedTemporaryFile() as f: + scripted_predictor.save(f.name) + + dumped_scripted_predictor = torch.jit.load(f.name) + res_scripted_dumped = dumped_scripted_predictor(batch) +assert (res_scripted_dumped == res_scripted).all() + +# %% diff --git a/gallery/plot_video_api.py b/gallery/others/plot_video_api.py similarity index 94% rename from gallery/plot_video_api.py rename to gallery/others/plot_video_api.py index d83a508eabe09e892d6f9a3408cb6954e86d8356..ac9eb0ba27d8948719e6beeac9545af6f725f362 100644 --- a/gallery/plot_video_api.py +++ b/gallery/others/plot_video_api.py @@ -1,20 +1,24 @@ """ -======================= 
+========= Video API -======================= +========= + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. This example illustrates some of the APIs that torchvision offers for videos, together with the examples on how to build datasets and more. """ -#################################### +# %% # 1. Introduction: building a new video object and examining the properties # ------------------------------------------------------------------------- # First we select a video to test the object out. For the sake of argument # we're using one from kinetics400 dataset. # To create it, we need to define the path and the stream we want to use. -###################################### +# %% # Chosen video statistics: # # - WUzgd7C1pWA.mp4 @@ -32,6 +36,7 @@ videos, together with the examples on how to build datasets and more. import torch import torchvision from torchvision.datasets.utils import download_url +torchvision.set_video_backend("video_reader") # Download the sample video download_url( @@ -41,7 +46,7 @@ download_url( ) video_path = "./WUzgd7C1pWA.mp4" -###################################### +# %% # Streams are defined in a similar fashion as torch devices. We encode them as strings in a form # of ``stream_type:stream_id`` where ``stream_type`` is a string and ``stream_id`` a long int. # The constructor accepts passing a ``stream_type`` only, in which case the stream is auto-discovered. @@ -51,7 +56,7 @@ stream = "video" video = torchvision.io.VideoReader(video_path, stream) video.get_metadata() -###################################### +# %% # Here we can see that video has two streams - a video and an audio stream. # Currently available stream types include ['video', 'audio']. # Each descriptor consists of two parts: stream type (e.g. 'video') and a unique stream id @@ -60,7 +65,7 @@ video.get_metadata() # users can access the one they want. # If only stream type is passed, the decoder auto-detects first stream of that type and returns it. -###################################### +# %% # Let's read all the frames from the video stream. By default, the return value of # ``next(video_reader)`` is a dict containing the following fields. # @@ -84,7 +89,7 @@ approx_nf = metadata['audio']['duration'][0] * metadata['audio']['framerate'][0] print("Approx total number of datapoints we can expect: ", approx_nf) print("Read data size: ", frames[0].size(0) * len(frames)) -###################################### +# %% # But what if we only want to read certain time segment of the video? # That can be done easily using the combination of our ``seek`` function, and the fact that each call # to next returns the presentation timestamp of the returned frame in seconds. @@ -106,7 +111,7 @@ for frame, pts in itertools.islice(video.seek(2), 10): print("Total number of frames: ", len(frames)) -###################################### +# %% # Or if we wanted to read from 2nd to 5th second, # We seek into a second second of the video, # then we utilize the itertools takewhile to get the @@ -124,7 +129,7 @@ approx_nf = (5 - 2) * video.get_metadata()['video']['fps'][0] print("We can expect approx: ", approx_nf) print("Tensor size: ", frames[0].size()) -#################################### +# %% # 2. 
Building a sample read_video function # ---------------------------------------------------------------------------------------- # We can utilize the methods above to build the read video function that follows @@ -169,21 +174,21 @@ def example_read_video(video_object, start=0, end=None, read_video=True, read_au vf, af, info, meta = example_read_video(video) print(vf.size(), af.size()) -#################################### +# %% # 3. Building an example randomly sampled dataset (can be applied to training dataset of kinetics400) # ------------------------------------------------------------------------------------------------------- # Cool, so now we can use the same principle to make the sample dataset. # We suggest trying out iterable dataset for this purpose. # Here, we are going to build an example dataset that reads randomly selected 10 frames of video. -#################################### +# %% # Make sample dataset import os os.makedirs("./dataset", exist_ok=True) os.makedirs("./dataset/1", exist_ok=True) os.makedirs("./dataset/2", exist_ok=True) -#################################### +# %% # Download the videos from torchvision.datasets.utils import download_url download_url( @@ -211,7 +216,7 @@ download_url( "v_SoccerJuggling_g24_c01.avi" ) -#################################### +# %% # Housekeeping and utilities import os import random @@ -231,7 +236,7 @@ def get_samples(root, extensions=(".mp4", ".avi")): _, class_to_idx = _find_classes(root) return make_dataset(root, class_to_idx, extensions=extensions) -#################################### +# %% # We are going to define the dataset and some basic arguments. # We assume the structure of the FolderDataset, and add the following parameters: # @@ -286,7 +291,7 @@ class RandomDataset(torch.utils.data.IterableDataset): 'end': current_pts} yield output -#################################### +# %% # Given a path of videos in a folder structure, i.e: # # - dataset @@ -308,7 +313,7 @@ frame_transform = t.Compose(transforms) dataset = RandomDataset("./dataset", epoch_size=None, frame_transform=frame_transform) -#################################### +# %% from torch.utils.data import DataLoader loader = DataLoader(dataset, batch_size=12) data = {"video": [], 'start': [], 'end': [], 'tensorsize': []} @@ -320,7 +325,7 @@ for batch in loader: data['tensorsize'].append(batch['video'][i].size()) print(data) -#################################### +# %% # 4. Data Visualization # ---------------------------------- # Example of visualized video @@ -333,7 +338,7 @@ for i in range(16): plt.imshow(batch["video"][0, i, ...].permute(1, 2, 0)) plt.axis("off") -#################################### +# %% # Cleanup the video and dataset: import os import shutil diff --git a/gallery/plot_visualization_utils.py b/gallery/others/plot_visualization_utils.py similarity index 93% rename from gallery/plot_visualization_utils.py rename to gallery/others/plot_visualization_utils.py index b04e0b6cffa3e5756694c0375fddea9cd58e14c7..98089c54dbb4bb6ed8b46eb54669f1b762d8f131 100644 --- a/gallery/plot_visualization_utils.py +++ b/gallery/others/plot_visualization_utils.py @@ -3,6 +3,10 @@ Visualization utilities ======================= +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + This example illustrates some of the utilities that torchvision offers for visualizing images, bounding boxes, segmentation masks and keypoints. 
""" @@ -30,7 +34,7 @@ def show(imgs): axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) -#################################### +# %% # Visualizing a grid of images # ---------------------------- # The :func:`~torchvision.utils.make_grid` function can be used to create a @@ -41,14 +45,14 @@ from torchvision.utils import make_grid from torchvision.io import read_image from pathlib import Path -dog1_int = read_image(str(Path('assets') / 'dog1.jpg')) -dog2_int = read_image(str(Path('assets') / 'dog2.jpg')) +dog1_int = read_image(str(Path('../assets') / 'dog1.jpg')) +dog2_int = read_image(str(Path('../assets') / 'dog2.jpg')) dog_list = [dog1_int, dog2_int] grid = make_grid(dog_list) show(grid) -#################################### +# %% # Visualizing bounding boxes # -------------------------- # We can use :func:`~torchvision.utils.draw_bounding_boxes` to draw boxes on an @@ -64,7 +68,7 @@ result = draw_bounding_boxes(dog1_int, boxes, colors=colors, width=5) show(result) -##################################### +# %% # Naturally, we can also plot bounding boxes produced by torchvision detection # models. Here is a demo with a Faster R-CNN model loaded from # :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` @@ -85,7 +89,7 @@ model = model.eval() outputs = model(images) print(outputs) -##################################### +# %% # Let's plot the boxes detected by our model. We will only plot the boxes with a # score greater than a given threshold. @@ -96,7 +100,7 @@ dogs_with_boxes = [ ] show(dogs_with_boxes) -##################################### +# %% # Visualizing segmentation masks # ------------------------------ # The :func:`~torchvision.utils.draw_segmentation_masks` function can be used to @@ -125,7 +129,7 @@ batch = torch.stack([transforms(d) for d in dog_list]) output = model(batch)['out'] print(output.shape, output.min().item(), output.max().item()) -##################################### +# %% # As we can see above, the output of the segmentation model is a tensor of shape # ``(batch_size, num_classes, H, W)``. Each value is a non-normalized score, and # we can normalize them into ``[0, 1]`` by using a softmax. After the softmax, @@ -147,7 +151,7 @@ dog_and_boat_masks = [ show(dog_and_boat_masks) -##################################### +# %% # As expected, the model is confident about the dog class, but not so much for # the boat class. # @@ -162,7 +166,7 @@ print(f"shape = {boolean_dog_masks.shape}, dtype = {boolean_dog_masks.dtype}") show([m.float() for m in boolean_dog_masks]) -##################################### +# %% # The line above where we define ``boolean_dog_masks`` is a bit cryptic, but you # can read it as the following query: "For which pixels is 'dog' the most likely # class?" @@ -184,11 +188,11 @@ dogs_with_masks = [ ] show(dogs_with_masks) -##################################### +# %% # We can plot more than one mask per image! Remember that the model returned as # many masks as there are classes. Let's ask the same query as above, but this # time for *all* classes, not just the dog class: "For each pixel and each class -# C, is class C the most most likely class?" +# C, is class C the most likely class?" 
# # This one is a bit more involved, so we'll first show how to do it with a # single image, and then we'll generalize to the batch @@ -204,7 +208,7 @@ print(f"dog1_all_classes_masks = {dog1_all_classes_masks.shape}, dtype = {dog1_a dog_with_all_masks = draw_segmentation_masks(dog1_int, masks=dog1_all_classes_masks, alpha=.6) show(dog_with_all_masks) -##################################### +# %% # We can see in the image above that only 2 masks were drawn: the mask for the # background and the mask for the dog. This is because the model thinks that # only these 2 classes are the most likely ones across all the pixels. If the @@ -231,7 +235,7 @@ dogs_with_masks = [ show(dogs_with_masks) -##################################### +# %% # .. _instance_seg_output: # # Instance segmentation models @@ -265,7 +269,7 @@ model = model.eval() output = model(images) print(output) -##################################### +# %% # Let's break this down. For each image in the batch, the model outputs some # detections (or instances). The number of detections varies for each input # image. Each instance is described by its bounding box, its label, its score @@ -288,7 +292,7 @@ dog1_masks = dog1_output['masks'] print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, " f"min = {dog1_masks.min()}, max = {dog1_masks.max()}") -##################################### +# %% # Here the masks correspond to probabilities indicating, for each pixel, how # likely it is to belong to the predicted label of that instance. Those # predicted labels correspond to the 'labels' element in the same output dict. @@ -297,7 +301,7 @@ print(f"shape = {dog1_masks.shape}, dtype = {dog1_masks.dtype}, " print("For the first dog, the following instances were detected:") print([weights.meta["categories"][label] for label in dog1_output['labels']]) -##################################### +# %% # Interestingly, the model detects two persons in the image. Let's go ahead and # plot those masks. Since :func:`~torchvision.utils.draw_segmentation_masks` # expects boolean masks, we need to convert those probabilities into boolean @@ -315,14 +319,14 @@ dog1_bool_masks = dog1_bool_masks.squeeze(1) show(draw_segmentation_masks(dog1_int, dog1_bool_masks, alpha=0.9)) -##################################### +# %% # The model seems to have properly detected the dog, but it also confused trees -# with people. Looking more closely at the scores will help us plotting more +# with people. Looking more closely at the scores will help us plot more # relevant masks: print(dog1_output['scores']) -##################################### +# %% # Clearly the model is more confident about the dog detection than it is about # the people detections. That's good news. When plotting the masks, we can ask # for only those that have a good score. Let's use a score threshold of .75 @@ -341,12 +345,12 @@ dogs_with_masks = [ ] show(dogs_with_masks) -##################################### +# %% # The two 'people' masks in the first image where not selected because they have -# a lower score than the score threshold. Similarly in the second image, the +# a lower score than the score threshold. Similarly, in the second image, the # instance with class 15 (which corresponds to 'bench') was not selected. -##################################### +# %% # .. 
_keypoint_output: # # Visualizing keypoints @@ -360,7 +364,7 @@ show(dogs_with_masks) from torchvision.models.detection import keypointrcnn_resnet50_fpn, KeypointRCNN_ResNet50_FPN_Weights from torchvision.io import read_image -person_int = read_image(str(Path("assets") / "person1.jpg")) +person_int = read_image(str(Path("../assets") / "person1.jpg")) weights = KeypointRCNN_ResNet50_FPN_Weights.DEFAULT transforms = weights.transforms() @@ -373,7 +377,7 @@ model = model.eval() outputs = model([person_float]) print(outputs) -##################################### +# %% # As we see the output contains a list of dictionaries. # The output list is of length batch_size. # We currently have just a single image so length of list is 1. @@ -388,7 +392,7 @@ scores = outputs[0]['scores'] print(kpts) print(scores) -##################################### +# %% # The KeypointRCNN model detects there are two instances in the image. # If you plot the boxes by using :func:`~draw_bounding_boxes` # you would recognize they are the person and the surfboard. @@ -402,7 +406,7 @@ keypoints = kpts[idx] print(keypoints) -##################################### +# %% # Great, now we have the keypoints corresponding to the person. # Each keypoint is represented by x, y coordinates and the visibility. # We can now use the :func:`~torchvision.utils.draw_keypoints` function to draw keypoints. @@ -413,7 +417,7 @@ from torchvision.utils import draw_keypoints res = draw_keypoints(person_int, keypoints, colors="blue", radius=3) show(res) -##################################### +# %% # As we see the keypoints appear as colored circles over the image. # The coco keypoints for a person are ordered and represent the following list.\ @@ -424,7 +428,7 @@ coco_keypoints = [ "left_knee", "right_knee", "left_ankle", "right_ankle", ] -##################################### +# %% # What if we are interested in joining the keypoints? # This is especially useful in creating pose detection or action recognition. # We can join the keypoints easily using the `connectivity` parameter. @@ -450,7 +454,7 @@ connect_skeleton = [ (7, 9), (8, 10), (5, 11), (6, 12), (11, 13), (12, 14), (13, 15), (14, 16) ] -##################################### +# %% # We pass the above list to the connectivity parameter to connect the keypoints. # diff --git a/gallery/plot_scripted_tensor_transforms.py b/gallery/plot_scripted_tensor_transforms.py deleted file mode 100644 index 995383d460321af7928d69f249d23a92df981ea8..0000000000000000000000000000000000000000 --- a/gallery/plot_scripted_tensor_transforms.py +++ /dev/null @@ -1,141 +0,0 @@ -""" -========================= -Tensor transforms and JIT -========================= - -This example illustrates various features that are now supported by the -:ref:`image transformations ` on Tensor images. In particular, we -show how image transforms can be performed on GPU, and how one can also script -them using JIT compilation. - -Prior to v0.8.0, transforms in torchvision have traditionally been PIL-centric -and presented multiple limitations due to that. Now, since v0.8.0, transforms -implementations are Tensor and PIL compatible and we can achieve the following -new features: - -- transform multi-band torch tensor images (with more than 3-4 channels) -- torchscript transforms together with your model for deployment -- support for GPU acceleration -- batched transformation such as for videos -- read and decode data directly as torch tensor with torchscript support (for PNG and JPEG image formats) - -.. 
note:: - These features are only possible with **Tensor** images. -""" - -from pathlib import Path - -import matplotlib.pyplot as plt -import numpy as np - -import torch -import torchvision.transforms as T -from torchvision.io import read_image - - -plt.rcParams["savefig.bbox"] = 'tight' -torch.manual_seed(1) - - -def show(imgs): - fix, axs = plt.subplots(ncols=len(imgs), squeeze=False) - for i, img in enumerate(imgs): - img = T.ToPILImage()(img.to('cpu')) - axs[0, i].imshow(np.asarray(img)) - axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) - - -#################################### -# The :func:`~torchvision.io.read_image` function allows to read an image and -# directly load it as a tensor - -dog1 = read_image(str(Path('assets') / 'dog1.jpg')) -dog2 = read_image(str(Path('assets') / 'dog2.jpg')) -show([dog1, dog2]) - -#################################### -# Transforming images on GPU -# -------------------------- -# Most transforms natively support tensors on top of PIL images (to visualize -# the effect of the transforms, you may refer to see -# :ref:`sphx_glr_auto_examples_plot_transforms.py`). -# Using tensor images, we can run the transforms on GPUs if cuda is available! - -import torch.nn as nn - -transforms = torch.nn.Sequential( - T.RandomCrop(224), - T.RandomHorizontalFlip(p=0.3), -) - -device = 'cuda' if torch.cuda.is_available() else 'cpu' -dog1 = dog1.to(device) -dog2 = dog2.to(device) - -transformed_dog1 = transforms(dog1) -transformed_dog2 = transforms(dog2) -show([transformed_dog1, transformed_dog2]) - -#################################### -# Scriptable transforms for easier deployment via torchscript -# ----------------------------------------------------------- -# We now show how to combine image transformations and a model forward pass, -# while using ``torch.jit.script`` to obtain a single scripted module. -# -# Let's define a ``Predictor`` module that transforms the input tensor and then -# applies an ImageNet model on it. 
- -from torchvision.models import resnet18, ResNet18_Weights - - -class Predictor(nn.Module): - - def __init__(self): - super().__init__() - weights = ResNet18_Weights.DEFAULT - self.resnet18 = resnet18(weights=weights, progress=False).eval() - self.transforms = weights.transforms() - - def forward(self, x: torch.Tensor) -> torch.Tensor: - with torch.no_grad(): - x = self.transforms(x) - y_pred = self.resnet18(x) - return y_pred.argmax(dim=1) - - -#################################### -# Now, let's define scripted and non-scripted instances of ``Predictor`` and -# apply it on multiple tensor images of the same size - -predictor = Predictor().to(device) -scripted_predictor = torch.jit.script(predictor).to(device) - -batch = torch.stack([dog1, dog2]).to(device) - -res = predictor(batch) -res_scripted = scripted_predictor(batch) - -#################################### -# We can verify that the prediction of the scripted and non-scripted models are -# the same: - -import json - -with open(Path('assets') / 'imagenet_class_index.json') as labels_file: - labels = json.load(labels_file) - -for i, (pred, pred_scripted) in enumerate(zip(res, res_scripted)): - assert pred == pred_scripted - print(f"Prediction for Dog {i + 1}: {labels[str(pred.item())]}") - -#################################### -# Since the model is scripted, it can be easily dumped on disk and re-used - -import tempfile - -with tempfile.NamedTemporaryFile() as f: - scripted_predictor.save(f.name) - - dumped_scripted_predictor = torch.jit.load(f.name) - res_scripted_dumped = dumped_scripted_predictor(batch) -assert (res_scripted_dumped == res_scripted).all() diff --git a/gallery/transforms/README.rst b/gallery/transforms/README.rst new file mode 100644 index 0000000000000000000000000000000000000000..1b8b1b08155ae339948c20d13f2f55d5a580a6bc --- /dev/null +++ b/gallery/transforms/README.rst @@ -0,0 +1,4 @@ +.. _transforms_gallery: + +Transforms +---------- diff --git a/gallery/transforms/helpers.py b/gallery/transforms/helpers.py new file mode 100644 index 0000000000000000000000000000000000000000..e94d717eb7df9a7585cf1704262368208bf0e786 --- /dev/null +++ b/gallery/transforms/helpers.py @@ -0,0 +1,50 @@ +import matplotlib.pyplot as plt +import torch +from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks +from torchvision import tv_tensors +from torchvision.transforms.v2 import functional as F + + +def plot(imgs, row_title=None, **imshow_kwargs): + if not isinstance(imgs[0], list): + # Make a 2d grid even if there's just 1 row + imgs = [imgs] + + num_rows = len(imgs) + num_cols = len(imgs[0]) + _, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False) + for row_idx, row in enumerate(imgs): + for col_idx, img in enumerate(row): + boxes = None + masks = None + if isinstance(img, tuple): + img, target = img + if isinstance(target, dict): + boxes = target.get("boxes") + masks = target.get("masks") + elif isinstance(target, tv_tensors.BoundingBoxes): + boxes = target + else: + raise ValueError(f"Unexpected target type: {type(target)}") + img = F.to_image(img) + if img.dtype.is_floating_point and img.min() < 0: + # Poor man's re-normalization for the colors to be OK-ish. 
This + # is useful for images coming out of Normalize() + img -= img.min() + img /= img.max() + + img = F.to_dtype(img, torch.uint8, scale=True) + if boxes is not None: + img = draw_bounding_boxes(img, boxes, colors="yellow", width=3) + if masks is not None: + img = draw_segmentation_masks(img, masks.to(torch.bool), colors=["green"] * masks.shape[0], alpha=.65) + + ax = axs[row_idx, col_idx] + ax.imshow(img.permute(1, 2, 0).numpy(), **imshow_kwargs) + ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) + + if row_title is not None: + for row_idx in range(num_rows): + axs[row_idx, 0].set(ylabel=row_title[row_idx]) + + plt.tight_layout() diff --git a/gallery/transforms/plot_custom_transforms.py b/gallery/transforms/plot_custom_transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..898c2cd0beaedbe21c5b112aed556a689518bb49 --- /dev/null +++ b/gallery/transforms/plot_custom_transforms.py @@ -0,0 +1,121 @@ +""" +=================================== +How to write your own v2 transforms +=================================== + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +This guide explains how to write transforms that are compatible with the +torchvision transforms V2 API. +""" + +# %% +import torch +from torchvision import tv_tensors +from torchvision.transforms import v2 + + +# %% +# Just create a ``nn.Module`` and override the ``forward`` method +# =============================================================== +# +# In most cases, this is all you're going to need, as long as you already know +# the structure of the input that your transform will expect. For example if +# you're just doing image classification, your transform will typically accept a +# single image as input, or a ``(img, label)`` input. So you can just hard-code +# your ``forward`` method to accept just that, e.g. +# +# .. code:: python +# +# class MyCustomTransform(torch.nn.Module): +# def forward(self, img, label): +# # Do some transformations +# return new_img, new_label +# +# .. note:: +# +# This means that if you have a custom transform that is already compatible +# with the V1 transforms (those in ``torchvision.transforms``), it will +# still work with the V2 transforms without any change! +# +# We will illustrate this more completely below with a typical detection case, +# where our samples are just images, bounding boxes and labels: + +class MyCustomTransform(torch.nn.Module): + def forward(self, img, bboxes, label): # we assume inputs are always structured like this + print( + f"I'm transforming an image of shape {img.shape} " + f"with bboxes = {bboxes}\n{label = }" + ) + # Do some transformations. Here, we're just passing though the input + return img, bboxes, label + + +transforms = v2.Compose([ + MyCustomTransform(), + v2.RandomResizedCrop((224, 224), antialias=True), + v2.RandomHorizontalFlip(p=1), + v2.Normalize(mean=[0, 0, 0], std=[1, 1, 1]) +]) + +H, W = 256, 256 +img = torch.rand(3, H, W) +bboxes = tv_tensors.BoundingBoxes( + torch.tensor([[0, 10, 10, 20], [50, 50, 70, 70]]), + format="XYXY", + canvas_size=(H, W) +) +label = 3 + +out_img, out_bboxes, out_label = transforms(img, bboxes, label) +# %% +print(f"Output image shape: {out_img.shape}\nout_bboxes = {out_bboxes}\n{out_label = }") +# %% +# .. 
note:: +# While working with TVTensor classes in your code, make sure to +# familiarize yourself with this section: +# :ref:`tv_tensor_unwrapping_behaviour` +# +# Supporting arbitrary input structures +# ===================================== +# +# In the section above, we have assumed that you already know the structure of +# your inputs and that you're OK with hard-coding this expected structure in +# your code. If you want your custom transforms to be as flexible as possible, +# this can be a bit limiting. +# +# A key feature of the builtin Torchvision V2 transforms is that they can accept +# arbitrary input structure and return the same structure as output (with +# transformed entries). For example, transforms can accept a single image, or a +# tuple of ``(img, label)``, or an arbitrary nested dictionary as input: + +structured_input = { + "img": img, + "annotations": (bboxes, label), + "something_that_will_be_ignored": (1, "hello") +} +structured_output = v2.RandomHorizontalFlip(p=1)(structured_input) + +assert isinstance(structured_output, dict) +assert structured_output["something_that_will_be_ignored"] == (1, "hello") +print(f"The transformed bboxes are:\n{structured_output['annotations'][0]}") + +# %% +# If you want to reproduce this behavior in your own transform, we invite you to +# look at our `code +# `_ +# and adapt it to your needs. +# +# In brief, the core logic is to unpack the input into a flat list using `pytree +# `_, and +# then transform only the entries that can be transformed (the decision is made +# based on the **class** of the entries, as all TVTensors are +# tensor-subclasses) plus some custom logic that is out of score here - check the +# code for details. The (potentially transformed) entries are then repacked and +# returned, in the same structure as the input. +# +# We do not provide public dev-facing tools to achieve that at this time, but if +# this is something that would be valuable to you, please let us know by opening +# an issue on our `GitHub repo `_. diff --git a/gallery/transforms/plot_custom_tv_tensors.py b/gallery/transforms/plot_custom_tv_tensors.py new file mode 100644 index 0000000000000000000000000000000000000000..bf5ee198837ca34c40ec279293184b2db19efb9d --- /dev/null +++ b/gallery/transforms/plot_custom_tv_tensors.py @@ -0,0 +1,119 @@ +""" +==================================== +How to write your own TVTensor class +==================================== + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +This guide is intended for advanced users and downstream library maintainers. We explain how to +write your own TVTensor class, and how to make it compatible with the built-in +Torchvision v2 transforms. Before continuing, make sure you have read +:ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py`. +""" + +# %% +import torch +from torchvision import tv_tensors +from torchvision.transforms import v2 + +# %% +# We will create a very simple class that just inherits from the base +# :class:`~torchvision.tv_tensors.TVTensor` class. It will be enough to cover +# what you need to know to implement your more elaborate uses-cases. If you need +# to create a class that carries meta-data, take a look at how the +# :class:`~torchvision.tv_tensors.BoundingBoxes` class is `implemented +# `_. 
+ + +class MyTVTensor(tv_tensors.TVTensor): + pass + + +my_dp = MyTVTensor([1, 2, 3]) +my_dp + +# %% +# Now that we have defined our custom TVTensor class, we want it to be +# compatible with the built-in torchvision transforms, and the functional API. +# For that, we need to implement a kernel which performs the core of the +# transformation, and then "hook" it to the functional that we want to support +# via :func:`~torchvision.transforms.v2.functional.register_kernel`. +# +# We illustrate this process below: we create a kernel for the "horizontal flip" +# operation of our MyTVTensor class, and register it to the functional API. + +from torchvision.transforms.v2 import functional as F + + +@F.register_kernel(functional="hflip", tv_tensor_cls=MyTVTensor) +def hflip_my_tv_tensor(my_dp, *args, **kwargs): + print("Flipping!") + out = my_dp.flip(-1) + return tv_tensors.wrap(out, like=my_dp) + + +# %% +# To understand why :func:`~torchvision.tv_tensors.wrap` is used, see +# :ref:`tv_tensor_unwrapping_behaviour`. Ignore the ``*args, **kwargs`` for now, +# we will explain it below in :ref:`param_forwarding`. +# +# .. note:: +# +# In our call to ``register_kernel`` above we used a string +# ``functional="hflip"`` to refer to the functional we want to hook into. We +# could also have used the functional *itself*, i.e. +# ``@register_kernel(functional=F.hflip, ...)``. +# +# Now that we have registered our kernel, we can call the functional API on a +# ``MyTVTensor`` instance: + +my_dp = MyTVTensor(torch.rand(3, 256, 256)) +_ = F.hflip(my_dp) + +# %% +# And we can also use the +# :class:`~torchvision.transforms.v2.RandomHorizontalFlip` transform, since it relies on :func:`~torchvision.transforms.v2.functional.hflip` internally: +t = v2.RandomHorizontalFlip(p=1) +_ = t(my_dp) + +# %% +# .. note:: +# +# We cannot register a kernel for a transform class, we can only register a +# kernel for a **functional**. The reason we can't register a transform +# class is because one transform may internally rely on more than one +# functional, so in general we can't register a single kernel for a given +# class. +# +# .. _param_forwarding: +# +# Parameter forwarding, and ensuring future compatibility of your kernels +# ----------------------------------------------------------------------- +# +# The functional API that you're hooking into is public and therefore +# **backward** compatible: we guarantee that the parameters of these functionals +# won't be removed or renamed without a proper deprecation cycle. However, we +# don't guarantee **forward** compatibility, and we may add new parameters in +# the future. +# +# Imagine that in a future version, Torchvision adds a new ``inplace`` parameter +# to its :func:`~torchvision.transforms.v2.functional.hflip` functional. If you +# already defined and registered your own kernel as + +def hflip_my_tv_tensor(my_dp): # noqa + print("Flipping!") + out = my_dp.flip(-1) + return tv_tensors.wrap(out, like=my_dp) + + +# %% +# then calling ``F.hflip(my_dp)`` will **fail**, because ``hflip`` will try to +# pass the new ``inplace`` parameter to your kernel, but your kernel doesn't +# accept it. +# +# For this reason, we recommend to always define your kernels with +# ``*args, **kwargs`` in their signature, as done above. This way, your kernel +# will be able to accept any new parameter that we may add in the future. +# (Technically, adding `**kwargs` only should be enough). 
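To make the parameter-forwarding advice concrete, here is a minimal sketch that reuses the hypothetical ``MyTVTensor`` from above and registers a kernel for another functional, assuming ``resize`` accepts registered kernels the same way ``hflip`` does. The kernel name and the nearest-neighbour resizing are made up for illustration; the ``*args, **kwargs`` catch-all is what keeps the kernel working if the functional gains new parameters later.

import torch
from torchvision import tv_tensors
from torchvision.transforms.v2 import functional as F


@F.register_kernel(functional="resize", tv_tensor_cls=MyTVTensor)
def resize_my_tv_tensor(my_dp, size, *args, **kwargs):
    # We only use ``size``; interpolation, antialias and any parameter added in
    # the future are silently absorbed by *args / **kwargs.
    out = torch.nn.functional.interpolate(my_dp.unsqueeze(0), size=list(size), mode="nearest").squeeze(0)
    return tv_tensors.wrap(out, like=my_dp)


_ = F.resize(MyTVTensor(torch.rand(3, 256, 256)), size=[224, 224])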
diff --git a/gallery/transforms/plot_cutmix_mixup.py b/gallery/transforms/plot_cutmix_mixup.py new file mode 100644 index 0000000000000000000000000000000000000000..d26b027b121ad849b6638f3387460aa8d6ec9ed5 --- /dev/null +++ b/gallery/transforms/plot_cutmix_mixup.py @@ -0,0 +1,150 @@ + +""" +=========================== +How to use CutMix and MixUp +=========================== + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +:class:`~torchvision.transforms.v2.CutMix` and +:class:`~torchvision.transforms.v2.MixUp` are popular augmentation strategies +that can improve classification accuracy. + +These transforms are slightly different from the rest of the Torchvision +transforms, because they expect +**batches** of samples as input, not individual images. In this example we'll +explain how to use them: after the ``DataLoader``, or as part of a collation +function. +""" + +# %% +import torch +from torchvision.datasets import FakeData +from torchvision.transforms import v2 + + +NUM_CLASSES = 100 + +# %% +# Pre-processing pipeline +# ----------------------- +# +# We'll use a simple but typical image classification pipeline: + +preproc = v2.Compose([ + v2.PILToTensor(), + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomHorizontalFlip(p=0.5), + v2.ToDtype(torch.float32, scale=True), # to float32 in [0, 1] + v2.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), # typically from ImageNet +]) + +dataset = FakeData(size=1000, num_classes=NUM_CLASSES, transform=preproc) + +img, label = dataset[0] +print(f"{type(img) = }, {img.dtype = }, {img.shape = }, {label = }") + +# %% +# +# One important thing to note is that neither CutMix nor MixUp are part of this +# pre-processing pipeline. We'll add them a bit later once we define the +# DataLoader. Just as a refresher, this is what the DataLoader and training loop +# would look like if we weren't using CutMix or MixUp: + +from torch.utils.data import DataLoader + +dataloader = DataLoader(dataset, batch_size=4, shuffle=True) + +for images, labels in dataloader: + print(f"{images.shape = }, {labels.shape = }") + print(labels.dtype) + # + break +# %% + +# %% +# Where to use MixUp and CutMix +# ----------------------------- +# +# After the DataLoader +# ^^^^^^^^^^^^^^^^^^^^ +# +# Now let's add CutMix and MixUp. The simplest way to do this right after the +# DataLoader: the Dataloader has already batched the images and labels for us, +# and this is exactly what these transforms expect as input: + +dataloader = DataLoader(dataset, batch_size=4, shuffle=True) + +cutmix = v2.CutMix(num_classes=NUM_CLASSES) +mixup = v2.MixUp(num_classes=NUM_CLASSES) +cutmix_or_mixup = v2.RandomChoice([cutmix, mixup]) + +for images, labels in dataloader: + print(f"Before CutMix/MixUp: {images.shape = }, {labels.shape = }") + images, labels = cutmix_or_mixup(images, labels) + print(f"After CutMix/MixUp: {images.shape = }, {labels.shape = }") + + # + break +# %% +# +# Note how the labels were also transformed: we went from a batched label of +# shape (batch_size,) to a tensor of shape (batch_size, num_classes). The +# transformed labels can still be passed as-is to a loss function like +# :func:`torch.nn.functional.cross_entropy`. +# +# As part of the collation function +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# Passing the transforms after the DataLoader is the simplest way to use CutMix +# and MixUp, but one disadvantage is that it does not take advantage of the +# DataLoader multi-processing. 
For that, we can pass those transforms as part of +# the collation function (refer to the `PyTorch docs +# `_ to learn +# more about collation). + +from torch.utils.data import default_collate + + +def collate_fn(batch): + return cutmix_or_mixup(*default_collate(batch)) + + +dataloader = DataLoader(dataset, batch_size=4, shuffle=True, num_workers=2, collate_fn=collate_fn) + +for images, labels in dataloader: + print(f"{images.shape = }, {labels.shape = }") + # No need to call cutmix_or_mixup, it's already been called as part of the DataLoader! + # + break + +# %% +# Non-standard input format +# ------------------------- +# +# So far we've used a typical sample structure where we pass ``(images, +# labels)`` as inputs. MixUp and CutMix will magically work by default with most +# common sample structures: tuples where the second parameter is a tensor label, +# or dict with a "label[s]" key. Look at the documentation of the +# ``labels_getter`` parameter for more details. +# +# If your samples have a different structure, you can still use CutMix and MixUp +# by passing a callable to the ``labels_getter`` parameter. For example: + +batch = { + "imgs": torch.rand(4, 3, 224, 224), + "target": { + "classes": torch.randint(0, NUM_CLASSES, size=(4,)), + "some_other_key": "this is going to be passed-through" + } +} + + +def labels_getter(batch): + return batch["target"]["classes"] + + +out = v2.CutMix(num_classes=NUM_CLASSES, labels_getter=labels_getter)(batch) +print(f"{out['imgs'].shape = }, {out['target']['classes'].shape = }") diff --git a/gallery/transforms/plot_transforms_e2e.py b/gallery/transforms/plot_transforms_e2e.py new file mode 100644 index 0000000000000000000000000000000000000000..6c58b4a5a9ad6ca4161a984dec675a5dcfa2ce55 --- /dev/null +++ b/gallery/transforms/plot_transforms_e2e.py @@ -0,0 +1,181 @@ +""" +=============================================================== +Transforms v2: End-to-end object detection/segmentation example +=============================================================== + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +Object detection and segmentation tasks are natively supported: +``torchvision.transforms.v2`` enables jointly transforming images, videos, +bounding boxes, and masks. + +This example showcases an end-to-end instance segmentation training case using +Torchvision utils from ``torchvision.datasets``, ``torchvision.models`` and +``torchvision.transforms.v2``. Everything covered here can be applied similarly +to object detection or semantic segmentation tasks. +""" + +# %% +import pathlib + +import torch +import torch.utils.data + +from torchvision import models, datasets, tv_tensors +from torchvision.transforms import v2 + +torch.manual_seed(0) + +# This loads fake data for illustration purposes of this example. In practice, you'll have +# to replace this with the proper data. +# If you're trying to run that on collab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +ROOT = pathlib.Path("../assets") / "coco" +IMAGES_PATH = str(ROOT / "images") +ANNOTATIONS_PATH = str(ROOT / "instances.json") +from helpers import plot + + +# %% +# Dataset preparation +# ------------------- +# +# We start off by loading the :class:`~torchvision.datasets.CocoDetection` dataset to have a look at what it currently +# returns. 
+
+dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH)
+
+sample = dataset[0]
+img, target = sample
+print(f"{type(img) = }\n{type(target) = }\n{type(target[0]) = }\n{target[0].keys() = }")
+
+
+# %%
+# Torchvision datasets preserve the data structure and types as intended by the
+# dataset authors. So by default, the output structure may not always be
+# compatible with the models or the transforms.
+#
+# To overcome that, we can use the
+# :func:`~torchvision.datasets.wrap_dataset_for_transforms_v2` function. For
+# :class:`~torchvision.datasets.CocoDetection`, this changes the target
+# structure to a single dictionary of lists:
+
+dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=("boxes", "labels", "masks"))
+
+sample = dataset[0]
+img, target = sample
+print(f"{type(img) = }\n{type(target) = }\n{target.keys() = }")
+print(f"{type(target['boxes']) = }\n{type(target['labels']) = }\n{type(target['masks']) = }")
+
+# %%
+# We used the ``target_keys`` parameter to specify the kind of output we're
+# interested in. Our dataset now returns a target which is a dict where the
+# values are :ref:`TVTensors ` (all are :class:`torch.Tensor`
+# subclasses). We dropped all unnecessary keys from the previous output, but
+# if you need any of the original keys, e.g. "image_id", you can still ask for
+# it.
+#
+# .. note::
+#
+#     If you just want to do detection, you don't need and shouldn't pass
+#     "masks" in ``target_keys``: if masks are present in the sample, they will
+#     be transformed, slowing down your transformations unnecessarily.
+#
+# As a baseline, let's have a look at a sample without transformations:
+
+plot([dataset[0], dataset[1]])
+
+
+# %%
+# Transforms
+# ----------
+#
+# Let's now define our pre-processing transforms. All the transforms know how
+# to handle images, bounding boxes and masks when relevant.
+#
+# Transforms are typically passed as the ``transforms`` parameter of the
+# dataset so that they can leverage multi-processing from the
+# :class:`torch.utils.data.DataLoader`.
+
+transforms = v2.Compose(
+    [
+        v2.ToImage(),
+        v2.RandomPhotometricDistort(p=1),
+        v2.RandomZoomOut(fill={tv_tensors.Image: (123, 117, 104), "others": 0}),
+        v2.RandomIoUCrop(),
+        v2.RandomHorizontalFlip(p=1),
+        v2.SanitizeBoundingBoxes(),
+        v2.ToDtype(torch.float32, scale=True),
+    ]
+)
+
+dataset = datasets.CocoDetection(IMAGES_PATH, ANNOTATIONS_PATH, transforms=transforms)
+dataset = datasets.wrap_dataset_for_transforms_v2(dataset, target_keys=["boxes", "labels", "masks"])
+
+# %%
+# A few things are worth noting here:
+#
+# - We're converting the PIL image into a
+#   :class:`~torchvision.tv_tensors.Image` object. This isn't strictly
+#   necessary, but relying on Tensors (here: a Tensor subclass) will
+#   :ref:`generally be faster `.
+# - We are calling :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` to
+#   make sure we remove degenerate bounding boxes, as well as their
+#   corresponding labels and masks.
+#   :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes` should be placed
+#   at least once at the end of a detection pipeline; it is particularly
+#   critical if :class:`~torchvision.transforms.v2.RandomIoUCrop` was used.
+#
+# Let's look at how a sample looks with our augmentation pipeline in place:
+
+# sphinx_gallery_thumbnail_number = 2
+plot([dataset[0], dataset[1]])
+
+
+# %%
+# We can see that the images were color-distorted, zoomed in or out, and flipped.
+# The bounding boxes and the masks were transformed accordingly, as the quick
+# check below confirms.
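As a quick sanity check, and purely as an illustrative sketch (it reuses the ``dataset`` defined just above and is not required for training), we can pull a single sample and confirm that the image is now a float tensor and that the boxes, labels and masks stayed in sync:

img, target = dataset[0]
print(f"{type(img) = }, {img.dtype = }, {img.shape = }")
print(f"{target['boxes'].shape = }, {target['labels'].shape = }, {target['masks'].shape = }")
# SanitizeBoundingBoxes removes degenerate boxes together with their labels and
# masks, so the first dimension of all three should agree.
assert target["boxes"].shape[0] == target["labels"].shape[0] == target["masks"].shape[0]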
And without any further ado, we can start training. +# +# Data loading and training loop +# ------------------------------ +# +# Below we're using Mask-RCNN which is an instance segmentation model, but +# everything we've covered in this tutorial also applies to object detection and +# semantic segmentation tasks. + +data_loader = torch.utils.data.DataLoader( + dataset, + batch_size=2, + # We need a custom collation function here, since the object detection + # models expect a sequence of images and target dictionaries. The default + # collation function tries to torch.stack() the individual elements, + # which fails in general for object detection, because the number of bouding + # boxes varies between the images of a same batch. + collate_fn=lambda batch: tuple(zip(*batch)), +) + +model = models.get_model("maskrcnn_resnet50_fpn_v2", weights=None, weights_backbone=None).train() + +for imgs, targets in data_loader: + loss_dict = model(imgs, targets) + # Put your training logic here + + print(f"{[img.shape for img in imgs] = }") + print(f"{[type(target) for target in targets] = }") + for name, loss_val in loss_dict.items(): + print(f"{name:<20}{loss_val:.3f}") + +# %% +# Training References +# ------------------- +# +# From there, you can check out the `torchvision references +# `_ where you'll find +# the actual training scripts we use to train our models. +# +# **Disclaimer** The code in our references is more complex than what you'll +# need for your own use-cases: this is because we're supporting different +# backends (PIL, tensors, TVTensors) and different transforms namespaces (v1 and +# v2). So don't be afraid to simplify and only keep what you need. diff --git a/gallery/transforms/plot_transforms_getting_started.py b/gallery/transforms/plot_transforms_getting_started.py new file mode 100644 index 0000000000000000000000000000000000000000..c61d1cc1be0681a708d6bc08a8ac3369c4c54ddc --- /dev/null +++ b/gallery/transforms/plot_transforms_getting_started.py @@ -0,0 +1,266 @@ +""" +================================== +Getting started with transforms v2 +================================== + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +This example illustrates all of what you need to know to get started with the +new :mod:`torchvision.transforms.v2` API. We'll cover simple tasks like +image classification, and more advanced ones like object detection / +segmentation. +""" + +# %% +# First, a bit of setup +from pathlib import Path +import torch +import matplotlib.pyplot as plt +plt.rcParams["savefig.bbox"] = 'tight' + +from torchvision.transforms import v2 +from torchvision.io import read_image + +torch.manual_seed(1) + +# If you're trying to run that on collab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +from helpers import plot +img = read_image(str(Path('../assets') / 'astronaut.jpg')) +print(f"{type(img) = }, {img.dtype = }, {img.shape = }") + +# %% +# The basics +# ---------- +# +# The Torchvision transforms behave like a regular :class:`torch.nn.Module` (in +# fact, most of them are): instantiate a transform, pass an input, get a +# transformed output: + +transform = v2.RandomCrop(size=(224, 224)) +out = transform(img) + +plot([img, out]) + +# %% +# I just want to do image classification +# -------------------------------------- +# +# If you just care about image classification, things are very simple. 
A basic +# classification pipeline may look like this: + +transforms = v2.Compose([ + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomHorizontalFlip(p=0.5), + v2.ToDtype(torch.float32, scale=True), + v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), +]) +out = transforms(img) + +plot([img, out]) + +# %% +# Such transformation pipeline is typically passed as the ``transform`` argument +# to the :ref:`Datasets `, e.g. ``ImageNet(..., +# transform=transforms)``. +# +# That's pretty much all there is. From there, read through our :ref:`main docs +# ` to learn more about recommended practices and conventions, or +# explore more :ref:`examples ` e.g. how to use augmentation +# transforms like :ref:`CutMix and MixUp +# `. +# +# .. note:: +# +# If you're already relying on the ``torchvision.transforms`` v1 API, +# we recommend to :ref:`switch to the new v2 transforms`. It's +# very easy: the v2 transforms are fully compatible with the v1 API, so you +# only need to change the import! +# +# Detection, Segmentation, Videos +# ------------------------------- +# +# The new Torchvision transforms in the ``torchvision.transforms.v2`` namespace +# support tasks beyond image classification: they can also transform bounding +# boxes, segmentation / detection masks, or videos. +# +# Let's briefly look at a detection example with bounding boxes. + +from torchvision import tv_tensors # we'll describe this a bit later, bare with us + +boxes = tv_tensors.BoundingBoxes( + [ + [15, 10, 370, 510], + [275, 340, 510, 510], + [130, 345, 210, 425] + ], + format="XYXY", canvas_size=img.shape[-2:]) + +transforms = v2.Compose([ + v2.RandomResizedCrop(size=(224, 224), antialias=True), + v2.RandomPhotometricDistort(p=1), + v2.RandomHorizontalFlip(p=1), +]) +out_img, out_boxes = transforms(img, boxes) +print(type(boxes), type(out_boxes)) + +plot([(img, boxes), (out_img, out_boxes)]) + +# %% +# +# The example above focuses on object detection. But if we had masks +# (:class:`torchvision.tv_tensors.Mask`) for object segmentation or semantic +# segmentation, or videos (:class:`torchvision.tv_tensors.Video`), we could have +# passed them to the transforms in exactly the same way. +# +# By now you likely have a few questions: what are these TVTensors, how do we +# use them, and what is the expected input/output of those transforms? We'll +# answer these in the next sections. + +# %% +# +# .. _what_are_tv_tensors: +# +# What are TVTensors? +# -------------------- +# +# TVTensors are :class:`torch.Tensor` subclasses. The available TVTensors are +# :class:`~torchvision.tv_tensors.Image`, +# :class:`~torchvision.tv_tensors.BoundingBoxes`, +# :class:`~torchvision.tv_tensors.Mask`, and +# :class:`~torchvision.tv_tensors.Video`. +# +# TVTensors look and feel just like regular tensors - they **are** tensors. +# Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()`` +# or any ``torch.*`` operator will also work on a TVTensor: + +img_dp = tv_tensors.Image(torch.randint(0, 256, (3, 256, 256), dtype=torch.uint8)) + +print(f"{isinstance(img_dp, torch.Tensor) = }") +print(f"{img_dp.dtype = }, {img_dp.shape = }, {img_dp.sum() = }") + +# %% +# These TVTensor classes are at the core of the transforms: in order to +# transform a given input, the transforms first look at the **class** of the +# object, and dispatch to the appropriate implementation accordingly. 
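To make that dispatching a bit more tangible, here is a small sketch reusing the ``img`` and ``boxes`` defined in the detection example above: a single ``Resize`` call treats each input according to its class, resizing the pixels of the image and rescaling the coordinates of the bounding boxes to the new canvas.

resize = v2.Resize(size=(224, 224), antialias=True)
resized_img, resized_boxes = resize(img, boxes)
print(f"{resized_img.shape = }")  # the pixel data was resized
print(f"{boxes[0] = }\n{resized_boxes[0] = }")  # the box coordinates were rescaled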
+# +# You don't need to know much more about TVTensors at this point, but advanced +# users who want to learn more can refer to +# :ref:`sphx_glr_auto_examples_transforms_plot_tv_tensors.py`. +# +# What do I pass as input? +# ------------------------ +# +# Above, we've seen two examples: one where we passed a single image as input +# i.e. ``out = transforms(img)``, and one where we passed both an image and +# bounding boxes, i.e. ``out_img, out_boxes = transforms(img, boxes)``. +# +# In fact, transforms support **arbitrary input structures**. The input can be a +# single image, a tuple, an arbitrarily nested dictionary... pretty much +# anything. The same structure will be returned as output. Below, we use the +# same detection transforms, but pass a tuple (image, target_dict) as input and +# we're getting the same structure as output: + +target = { + "boxes": boxes, + "labels": torch.arange(boxes.shape[0]), + "this_is_ignored": ("arbitrary", {"structure": "!"}) +} + +# Re-using the transforms and definitions from above. +out_img, out_target = transforms(img, target) + +# sphinx_gallery_thumbnail_number = 4 +plot([(img, target["boxes"]), (out_img, out_target["boxes"])]) +print(f"{out_target['this_is_ignored']}") + +# %% +# We passed a tuple so we get a tuple back, and the second element is the +# tranformed target dict. Transforms don't really care about the structure of +# the input; as mentioned above, they only care about the **type** of the +# objects and transforms them accordingly. +# +# *Foreign* objects like strings or ints are simply passed-through. This can be +# useful e.g. if you want to associate a path with every single sample when +# debugging! +# +# .. _passthrough_heuristic: +# +# .. note:: +# +# **Disclaimer** This note is slightly advanced and can be safely skipped on +# a first read. +# +# Pure :class:`torch.Tensor` objects are, in general, treated as images (or +# as videos for video-specific transforms). Indeed, you may have noticed +# that in the code above we haven't used the +# :class:`~torchvision.tv_tensors.Image` class at all, and yet our images +# got transformed properly. Transforms follow the following logic to +# determine whether a pure Tensor should be treated as an image (or video), +# or just ignored: +# +# * If there is an :class:`~torchvision.tv_tensors.Image`, +# :class:`~torchvision.tv_tensors.Video`, +# or :class:`PIL.Image.Image` instance in the input, all other pure +# tensors are passed-through. +# * If there is no :class:`~torchvision.tv_tensors.Image` or +# :class:`~torchvision.tv_tensors.Video` instance, only the first pure +# :class:`torch.Tensor` will be transformed as image or video, while all +# others will be passed-through. Here "first" means "first in a depth-wise +# traversal". +# +# This is what happened in the detection example above: the first pure +# tensor was the image so it got transformed properly, and all other pure +# tensor instances like the ``labels`` were passed-through (although labels +# can still be transformed by some transforms like +# :class:`~torchvision.transforms.v2.SanitizeBoundingBoxes`!). +# +# .. _transforms_datasets_intercompatibility: +# +# Transforms and Datasets intercompatibility +# ------------------------------------------ +# +# Roughly speaking, the output of the datasets must correspond to the input of +# the transforms. How to do that depends on whether you're using the torchvision +# :ref:`built-in datatsets `, or your own custom datasets. 
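Before moving on to datasets, the pass-through heuristic from the note above can be illustrated with a tiny sketch. The tensor names are made up for this example, and the behavior shown is simply the one documented above: with no ``Image``, ``Video`` or PIL image in the input, only the first pure tensor is treated as an image and everything else is returned untouched.

flip = v2.RandomHorizontalFlip(p=1)
plain_image = torch.randint(0, 256, (3, 32, 32), dtype=torch.uint8)
plain_labels = torch.tensor([0, 1, 2])
flipped_image, untouched_labels = flip(plain_image, plain_labels)
print(torch.equal(flipped_image, plain_image.flip(-1)))  # True: the first pure tensor was flipped
print(torch.equal(untouched_labels, plain_labels))       # True: the second one was passed through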
+# +# Using built-in datasets +# ^^^^^^^^^^^^^^^^^^^^^^^ +# +# If you're just doing image classification, you don't need to do anything. Just +# use ``transform`` argument of the dataset e.g. ``ImageNet(..., +# transform=transforms)`` and you're good to go. +# +# Torchvision also supports datasets for object detection or segmentation like +# :class:`torchvision.datasets.CocoDetection`. Those datasets predate +# the existence of the :mod:`torchvision.transforms.v2` module and of the +# TVTensors, so they don't return TVTensors out of the box. +# +# An easy way to force those datasets to return TVTensors and to make them +# compatible with v2 transforms is to use the +# :func:`torchvision.datasets.wrap_dataset_for_transforms_v2` function: +# +# .. code-block:: python +# +# from torchvision.datasets import CocoDetection, wrap_dataset_for_transforms_v2 +# +# dataset = CocoDetection(..., transforms=my_transforms) +# dataset = wrap_dataset_for_transforms_v2(dataset) +# # Now the dataset returns TVTensors! +# +# Using your own datasets +# ^^^^^^^^^^^^^^^^^^^^^^^ +# +# If you have a custom dataset, then you'll need to convert your objects into +# the appropriate TVTensor classes. Creating TVTensor instances is very easy, +# refer to :ref:`tv_tensor_creation` for more details. +# +# There are two main places where you can implement that conversion logic: +# +# - At the end of the datasets's ``__getitem__`` method, before returning the +# sample (or by sub-classing the dataset). +# - As the very first step of your transforms pipeline +# +# Either way, the logic will depend on your specific dataset. diff --git a/gallery/plot_transforms.py b/gallery/transforms/plot_transforms_illustrations.py similarity index 56% rename from gallery/plot_transforms.py rename to gallery/transforms/plot_transforms_illustrations.py index c6e44a14e229915903551b995753f6032c704e40..95ab455d0fd80d1752f9964b18c866172ae866bc 100644 --- a/gallery/plot_transforms.py +++ b/gallery/transforms/plot_transforms_illustrations.py @@ -3,317 +3,318 @@ Illustration of transforms ========================== -This example illustrates the various transforms available in :ref:`the -torchvision.transforms module `. +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. + +This example illustrates some of the various transforms available in :ref:`the +torchvision.transforms.v2 module `. """ +# %% # sphinx_gallery_thumbnail_path = "../../gallery/assets/transforms_thumbnail.png" from PIL import Image from pathlib import Path import matplotlib.pyplot as plt -import numpy as np import torch -import torchvision.transforms as T - +from torchvision.transforms import v2 plt.rcParams["savefig.bbox"] = 'tight' -orig_img = Image.open(Path('assets') / 'astronaut.jpg') + # if you change the seed, make sure that the randomly-applied transforms # properly show that the image can be both transformed and *not* transformed! 
torch.manual_seed(0) - -def plot(imgs, with_orig=True, row_title=None, **imshow_kwargs): - if not isinstance(imgs[0], list): - # Make a 2d grid even if there's just 1 row - imgs = [imgs] - - num_rows = len(imgs) - num_cols = len(imgs[0]) + with_orig - fig, axs = plt.subplots(nrows=num_rows, ncols=num_cols, squeeze=False) - for row_idx, row in enumerate(imgs): - row = [orig_img] + row if with_orig else row - for col_idx, img in enumerate(row): - ax = axs[row_idx, col_idx] - ax.imshow(np.asarray(img), **imshow_kwargs) - ax.set(xticklabels=[], yticklabels=[], xticks=[], yticks=[]) - - if with_orig: - axs[0, 0].set(title='Original image') - axs[0, 0].title.set_size(8) - if row_title is not None: - for row_idx in range(num_rows): - axs[row_idx, 0].set(ylabel=row_title[row_idx]) - - plt.tight_layout() - - -#################################### +# If you're trying to run that on collab, you can download the assets and the +# helpers from https://github.com/pytorch/vision/tree/main/gallery/ +from helpers import plot +orig_img = Image.open(Path('../assets') / 'astronaut.jpg') + +# %% +# Geometric Transforms +# -------------------- +# Geometric image transformation refers to the process of altering the geometric properties of an image, +# such as its shape, size, orientation, or position. +# It involves applying mathematical operations to the image pixels or coordinates to achieve the desired transformation. +# # Pad -# --- +# ~~~ # The :class:`~torchvision.transforms.Pad` transform # (see also :func:`~torchvision.transforms.functional.pad`) -# fills image borders with some pixel values. -padded_imgs = [T.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)] -plot(padded_imgs) +# pads all image borders with some pixel values. +padded_imgs = [v2.Pad(padding=padding)(orig_img) for padding in (3, 10, 30, 50)] +plot([orig_img] + padded_imgs) -#################################### +# %% # Resize -# ------ +# ~~~~~~ # The :class:`~torchvision.transforms.Resize` transform # (see also :func:`~torchvision.transforms.functional.resize`) # resizes an image. -resized_imgs = [T.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] -plot(resized_imgs) +resized_imgs = [v2.Resize(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] +plot([orig_img] + resized_imgs) -#################################### +# %% # CenterCrop -# ---------- +# ~~~~~~~~~~ # The :class:`~torchvision.transforms.CenterCrop` transform # (see also :func:`~torchvision.transforms.functional.center_crop`) # crops the given image at the center. -center_crops = [T.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] -plot(center_crops) +center_crops = [v2.CenterCrop(size=size)(orig_img) for size in (30, 50, 100, orig_img.size)] +plot([orig_img] + center_crops) -#################################### +# %% # FiveCrop -# -------- +# ~~~~~~~~ # The :class:`~torchvision.transforms.FiveCrop` transform # (see also :func:`~torchvision.transforms.functional.five_crop`) # crops the given image into four corners and the central crop. 
-(top_left, top_right, bottom_left, bottom_right, center) = T.FiveCrop(size=(100, 100))(orig_img) -plot([top_left, top_right, bottom_left, bottom_right, center]) +(top_left, top_right, bottom_left, bottom_right, center) = v2.FiveCrop(size=(100, 100))(orig_img) +plot([orig_img] + [top_left, top_right, bottom_left, bottom_right, center]) -#################################### -# Grayscale -# --------- -# The :class:`~torchvision.transforms.Grayscale` transform -# (see also :func:`~torchvision.transforms.functional.to_grayscale`) -# converts an image to grayscale -gray_img = T.Grayscale()(orig_img) -plot([gray_img], cmap='gray') - -#################################### -# Random transforms -# ----------------- -# The following transforms are random, which means that the same transfomer -# instance will produce different result each time it transforms a given image. -# -# ColorJitter -# ~~~~~~~~~~~ -# The :class:`~torchvision.transforms.ColorJitter` transform -# randomly changes the brightness, saturation, and other properties of an image. -jitter = T.ColorJitter(brightness=.5, hue=.3) -jitted_imgs = [jitter(orig_img) for _ in range(4)] -plot(jitted_imgs) - -#################################### -# GaussianBlur -# ~~~~~~~~~~~~ -# The :class:`~torchvision.transforms.GaussianBlur` transform -# (see also :func:`~torchvision.transforms.functional.gaussian_blur`) -# performs gaussian blur transform on an image. -blurrer = T.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5)) -blurred_imgs = [blurrer(orig_img) for _ in range(4)] -plot(blurred_imgs) - -#################################### +# %% # RandomPerspective # ~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomPerspective` transform # (see also :func:`~torchvision.transforms.functional.perspective`) # performs random perspective transform on an image. -perspective_transformer = T.RandomPerspective(distortion_scale=0.6, p=1.0) +perspective_transformer = v2.RandomPerspective(distortion_scale=0.6, p=1.0) perspective_imgs = [perspective_transformer(orig_img) for _ in range(4)] -plot(perspective_imgs) +plot([orig_img] + perspective_imgs) -#################################### +# %% # RandomRotation # ~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomRotation` transform # (see also :func:`~torchvision.transforms.functional.rotate`) # rotates an image with random angle. -rotater = T.RandomRotation(degrees=(0, 180)) +rotater = v2.RandomRotation(degrees=(0, 180)) rotated_imgs = [rotater(orig_img) for _ in range(4)] -plot(rotated_imgs) +plot([orig_img] + rotated_imgs) -#################################### +# %% # RandomAffine # ~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomAffine` transform # (see also :func:`~torchvision.transforms.functional.affine`) # performs random affine transform on an image. -affine_transfomer = T.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75)) +affine_transfomer = v2.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75)) affine_imgs = [affine_transfomer(orig_img) for _ in range(4)] -plot(affine_imgs) +plot([orig_img] + affine_imgs) -#################################### +# %% # ElasticTransform # ~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.ElasticTransform` transform # (see also :func:`~torchvision.transforms.functional.elastic_transform`) # Randomly transforms the morphology of objects in images and produces a # see-through-water-like effect. 
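+# The strength of the effect is mostly driven by ``alpha`` (magnitude of the
+# displacement field) and ``sigma`` (its smoothness). A hedged sketch of that
+# trade-off, with purely illustrative values:
+#
+# .. code-block:: python
+#
+#     subtle = v2.ElasticTransform(alpha=50.0, sigma=5.0)
+#     strong = v2.ElasticTransform(alpha=500.0, sigma=5.0)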
-elastic_transformer = T.ElasticTransform(alpha=250.0) +elastic_transformer = v2.ElasticTransform(alpha=250.0) transformed_imgs = [elastic_transformer(orig_img) for _ in range(2)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) -#################################### +# %% # RandomCrop # ~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomCrop` transform # (see also :func:`~torchvision.transforms.functional.crop`) # crops an image at a random location. -cropper = T.RandomCrop(size=(128, 128)) +cropper = v2.RandomCrop(size=(128, 128)) crops = [cropper(orig_img) for _ in range(4)] -plot(crops) +plot([orig_img] + crops) -#################################### +# %% # RandomResizedCrop # ~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomResizedCrop` transform # (see also :func:`~torchvision.transforms.functional.resized_crop`) # crops an image at a random location, and then resizes the crop to a given # size. -resize_cropper = T.RandomResizedCrop(size=(32, 32)) +resize_cropper = v2.RandomResizedCrop(size=(32, 32)) resized_crops = [resize_cropper(orig_img) for _ in range(4)] -plot(resized_crops) +plot([orig_img] + resized_crops) + +# %% +# Photometric Transforms +# ---------------------- +# Photometric image transformation refers to the process of modifying the photometric properties of an image, +# such as its brightness, contrast, color, or tone. +# These transformations are applied to change the visual appearance of an image +# while preserving its geometric structure. +# +# Except :class:`~torchvision.transforms.Grayscale`, the following transforms are random, +# which means that the same transform +# instance will produce different result each time it transforms a given image. +# +# Grayscale +# ~~~~~~~~~ +# The :class:`~torchvision.transforms.Grayscale` transform +# (see also :func:`~torchvision.transforms.functional.to_grayscale`) +# converts an image to grayscale +gray_img = v2.Grayscale()(orig_img) +plot([orig_img, gray_img], cmap='gray') -#################################### +# %% +# ColorJitter +# ~~~~~~~~~~~ +# The :class:`~torchvision.transforms.ColorJitter` transform +# randomly changes the brightness, contrast, saturation, hue, and other properties of an image. +jitter = v2.ColorJitter(brightness=.5, hue=.3) +jittered_imgs = [jitter(orig_img) for _ in range(4)] +plot([orig_img] + jittered_imgs) + +# %% +# GaussianBlur +# ~~~~~~~~~~~~ +# The :class:`~torchvision.transforms.GaussianBlur` transform +# (see also :func:`~torchvision.transforms.functional.gaussian_blur`) +# performs gaussian blur transform on an image. +blurrer = v2.GaussianBlur(kernel_size=(5, 9), sigma=(0.1, 5.)) +blurred_imgs = [blurrer(orig_img) for _ in range(4)] +plot([orig_img] + blurred_imgs) + +# %% # RandomInvert # ~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomInvert` transform # (see also :func:`~torchvision.transforms.functional.invert`) # randomly inverts the colors of the given image. -inverter = T.RandomInvert() +inverter = v2.RandomInvert() invertered_imgs = [inverter(orig_img) for _ in range(4)] -plot(invertered_imgs) +plot([orig_img] + invertered_imgs) -#################################### +# %% # RandomPosterize # ~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomPosterize` transform # (see also :func:`~torchvision.transforms.functional.posterize`) # randomly posterizes the image by reducing the number of bits # of each color channel. 
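+# The smaller ``bits`` is, the fewer distinct values each channel keeps and the
+# stronger the effect. A hedged sketch with purely illustrative values
+# (``p=1.0`` only makes the transform always fire):
+#
+# .. code-block:: python
+#
+#     mild = v2.RandomPosterize(bits=6, p=1.0)
+#     strong = v2.RandomPosterize(bits=2, p=1.0)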
-posterizer = T.RandomPosterize(bits=2) +posterizer = v2.RandomPosterize(bits=2) posterized_imgs = [posterizer(orig_img) for _ in range(4)] -plot(posterized_imgs) +plot([orig_img] + posterized_imgs) -#################################### +# %% # RandomSolarize # ~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomSolarize` transform # (see also :func:`~torchvision.transforms.functional.solarize`) # randomly solarizes the image by inverting all pixel values above # the threshold. -solarizer = T.RandomSolarize(threshold=192.0) +solarizer = v2.RandomSolarize(threshold=192.0) solarized_imgs = [solarizer(orig_img) for _ in range(4)] -plot(solarized_imgs) +plot([orig_img] + solarized_imgs) -#################################### +# %% # RandomAdjustSharpness # ~~~~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomAdjustSharpness` transform # (see also :func:`~torchvision.transforms.functional.adjust_sharpness`) # randomly adjusts the sharpness of the given image. -sharpness_adjuster = T.RandomAdjustSharpness(sharpness_factor=2) +sharpness_adjuster = v2.RandomAdjustSharpness(sharpness_factor=2) sharpened_imgs = [sharpness_adjuster(orig_img) for _ in range(4)] -plot(sharpened_imgs) +plot([orig_img] + sharpened_imgs) -#################################### +# %% # RandomAutocontrast # ~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomAutocontrast` transform # (see also :func:`~torchvision.transforms.functional.autocontrast`) # randomly applies autocontrast to the given image. -autocontraster = T.RandomAutocontrast() +autocontraster = v2.RandomAutocontrast() autocontrasted_imgs = [autocontraster(orig_img) for _ in range(4)] -plot(autocontrasted_imgs) +plot([orig_img] + autocontrasted_imgs) -#################################### +# %% # RandomEqualize # ~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomEqualize` transform # (see also :func:`~torchvision.transforms.functional.equalize`) # randomly equalizes the histogram of the given image. -equalizer = T.RandomEqualize() +equalizer = v2.RandomEqualize() equalized_imgs = [equalizer(orig_img) for _ in range(4)] -plot(equalized_imgs) +plot([orig_img] + equalized_imgs) -#################################### +# %% +# Augmentation Transforms +# ----------------------- +# The following transforms are combinations of multiple transforms, +# either geometric or photometric, or both. +# # AutoAugment # ~~~~~~~~~~~ # The :class:`~torchvision.transforms.AutoAugment` transform # automatically augments data based on a given auto-augmentation policy. # See :class:`~torchvision.transforms.AutoAugmentPolicy` for the available policies. -policies = [T.AutoAugmentPolicy.CIFAR10, T.AutoAugmentPolicy.IMAGENET, T.AutoAugmentPolicy.SVHN] -augmenters = [T.AutoAugment(policy) for policy in policies] +policies = [v2.AutoAugmentPolicy.CIFAR10, v2.AutoAugmentPolicy.IMAGENET, v2.AutoAugmentPolicy.SVHN] +augmenters = [v2.AutoAugment(policy) for policy in policies] imgs = [ [augmenter(orig_img) for _ in range(4)] for augmenter in augmenters ] row_title = [str(policy).split('.')[-1] for policy in policies] -plot(imgs, row_title=row_title) +plot([[orig_img] + row for row in imgs], row_title=row_title) -#################################### +# %% # RandAugment # ~~~~~~~~~~~ -# The :class:`~torchvision.transforms.RandAugment` transform automatically augments the data. -augmenter = T.RandAugment() +# The :class:`~torchvision.transforms.RandAugment` is an alternate version of AutoAugment. 
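+# Unlike AutoAugment it needs no dataset-specific policy: its strength is
+# controlled by ``num_ops`` (how many operations are chained) and
+# ``magnitude``. A hedged sketch with purely illustrative values:
+#
+# .. code-block:: python
+#
+#     light = v2.RandAugment(num_ops=1, magnitude=5)
+#     heavy = v2.RandAugment(num_ops=3, magnitude=15)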
+augmenter = v2.RandAugment() imgs = [augmenter(orig_img) for _ in range(4)] -plot(imgs) +plot([orig_img] + imgs) -#################################### +# %% # TrivialAugmentWide # ~~~~~~~~~~~~~~~~~~ -# The :class:`~torchvision.transforms.TrivialAugmentWide` transform automatically augments the data. -augmenter = T.TrivialAugmentWide() +# The :class:`~torchvision.transforms.TrivialAugmentWide` is an alternate implementation of AutoAugment. +# However, instead of transforming an image multiple times, it transforms an image only once +# using a random transform from a given list with a random strength number. +augmenter = v2.TrivialAugmentWide() imgs = [augmenter(orig_img) for _ in range(4)] -plot(imgs) +plot([orig_img] + imgs) -#################################### +# %% # AugMix # ~~~~~~ -# The :class:`~torchvision.transforms.AugMix` transform automatically augments the data. -augmenter = T.AugMix() +# The :class:`~torchvision.transforms.AugMix` transform interpolates between augmented versions of an image. +augmenter = v2.AugMix() imgs = [augmenter(orig_img) for _ in range(4)] -plot(imgs) +plot([orig_img] + imgs) -#################################### -# Randomly-applied transforms +# %% +# Randomly-applied Transforms # --------------------------- # -# Some transforms are randomly-applied given a probability ``p``. That is, the -# transformed image may actually be the same as the original one, even when -# called with the same transformer instance! +# The following transforms are randomly-applied given a probability ``p``. That is, given ``p = 0.5``, +# there is a 50% chance to return the original image, and a 50% chance to return the transformed image, +# even when called with the same transform instance! # # RandomHorizontalFlip # ~~~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomHorizontalFlip` transform # (see also :func:`~torchvision.transforms.functional.hflip`) # performs horizontal flip of an image, with a given probability. -hflipper = T.RandomHorizontalFlip(p=0.5) +hflipper = v2.RandomHorizontalFlip(p=0.5) transformed_imgs = [hflipper(orig_img) for _ in range(4)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) -#################################### +# %% # RandomVerticalFlip # ~~~~~~~~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomVerticalFlip` transform # (see also :func:`~torchvision.transforms.functional.vflip`) # performs vertical flip of an image, with a given probability. -vflipper = T.RandomVerticalFlip(p=0.5) +vflipper = v2.RandomVerticalFlip(p=0.5) transformed_imgs = [vflipper(orig_img) for _ in range(4)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) -#################################### +# %% # RandomApply # ~~~~~~~~~~~ # The :class:`~torchvision.transforms.RandomApply` transform # randomly applies a list of transforms, with a given probability. -applier = T.RandomApply(transforms=[T.RandomCrop(size=(64, 64))], p=0.5) +applier = v2.RandomApply(transforms=[v2.RandomCrop(size=(64, 64))], p=0.5) transformed_imgs = [applier(orig_img) for _ in range(4)] -plot(transformed_imgs) +plot([orig_img] + transformed_imgs) diff --git a/gallery/transforms/plot_tv_tensors.py b/gallery/transforms/plot_tv_tensors.py new file mode 100644 index 0000000000000000000000000000000000000000..0cdbe9d083142ead856afe5387da0d4b1dd1ef0a --- /dev/null +++ b/gallery/transforms/plot_tv_tensors.py @@ -0,0 +1,224 @@ +""" +============= +TVTensors FAQ +============= + +.. note:: + Try on `collab `_ + or :ref:`go to the end ` to download the full example code. 
+ + +TVTensors are Tensor subclasses introduced together with +``torchvision.transforms.v2``. This example showcases what these TVTensors are +and how they behave. + +.. warning:: + + **Intended Audience** Unless you're writing your own transforms or your own TVTensors, you + probably do not need to read this guide. This is a fairly low-level topic + that most users will not need to worry about: you do not need to understand + the internals of TVTensors to efficiently rely on + ``torchvision.transforms.v2``. It may however be useful for advanced users + trying to implement their own datasets, transforms, or work directly with + the TVTensors. +""" + +# %% +import PIL.Image + +import torch +from torchvision import tv_tensors + + +# %% +# What are TVTensors? +# ------------------- +# +# TVTensors are zero-copy tensor subclasses: + +tensor = torch.rand(3, 256, 256) +image = tv_tensors.Image(tensor) + +assert isinstance(image, torch.Tensor) +assert image.data_ptr() == tensor.data_ptr() + +# %% +# Under the hood, they are needed in :mod:`torchvision.transforms.v2` to correctly dispatch to the appropriate function +# for the input data. +# +# :mod:`torchvision.tv_tensors` supports four types of TVTensors: +# +# * :class:`~torchvision.tv_tensors.Image` +# * :class:`~torchvision.tv_tensors.Video` +# * :class:`~torchvision.tv_tensors.BoundingBoxes` +# * :class:`~torchvision.tv_tensors.Mask` +# +# What can I do with a TVTensor? +# ------------------------------ +# +# TVTensors look and feel just like regular tensors - they **are** tensors. +# Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()`` or +# any ``torch.*`` operator will also work on TVTensors. See +# :ref:`tv_tensor_unwrapping_behaviour` for a few gotchas. + +# %% +# .. _tv_tensor_creation: +# +# How do I construct a TVTensor? +# ------------------------------ +# +# Using the constructor +# ^^^^^^^^^^^^^^^^^^^^^ +# +# Each TVTensor class takes any tensor-like data that can be turned into a :class:`~torch.Tensor` + +image = tv_tensors.Image([[[[0, 1], [1, 0]]]]) +print(image) + + +# %% +# Similar to other PyTorch creations ops, the constructor also takes the ``dtype``, ``device``, and ``requires_grad`` +# parameters. + +float_image = tv_tensors.Image([[[0, 1], [1, 0]]], dtype=torch.float32, requires_grad=True) +print(float_image) + + +# %% +# In addition, :class:`~torchvision.tv_tensors.Image` and :class:`~torchvision.tv_tensors.Mask` can also take a +# :class:`PIL.Image.Image` directly: + +image = tv_tensors.Image(PIL.Image.open("../assets/astronaut.jpg")) +print(image.shape, image.dtype) + +# %% +# Some TVTensors require additional metadata to be passed in ordered to be constructed. For example, +# :class:`~torchvision.tv_tensors.BoundingBoxes` requires the coordinate format as well as the size of the +# corresponding image (``canvas_size``) alongside the actual values. These +# metadata are required to properly transform the bounding boxes. + +bboxes = tv_tensors.BoundingBoxes( + [[17, 16, 344, 495], [0, 10, 0, 10]], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=image.shape[-2:] +) +print(bboxes) + +# %% +# Using ``tv_tensors.wrap()`` +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# You can also use the :func:`~torchvision.tv_tensors.wrap` function to wrap a tensor object +# into a TVTensor. This is useful when you already have an object of the +# desired type, which typically happens when writing transforms: you just want +# to wrap the output like the input. 
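+#
+# For instance, a custom transform could compute a plain tensor and then
+# re-wrap it. A hedged sketch (``ShiftBoxes`` is a made-up name, not a
+# torchvision class):
+#
+# .. code-block:: python
+#
+#     class ShiftBoxes(torch.nn.Module):
+#         def forward(self, boxes: tv_tensors.BoundingBoxes) -> tv_tensors.BoundingBoxes:
+#             shifted = boxes + 2  # plain torch.Tensor, see the unwrapping section below
+#             return tv_tensors.wrap(shifted, like=boxes)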
+ +new_bboxes = torch.tensor([0, 20, 30, 40]) +new_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes) +assert isinstance(new_bboxes, tv_tensors.BoundingBoxes) +assert new_bboxes.canvas_size == bboxes.canvas_size + +# %% +# The metadata of ``new_bboxes`` is the same as ``bboxes``, but you could pass +# it as a parameter to override it. +# +# .. _tv_tensor_unwrapping_behaviour: +# +# I had a TVTensor but now I have a Tensor. Help! +# ----------------------------------------------- +# +# By default, operations on :class:`~torchvision.tv_tensors.TVTensor` objects +# will return a pure Tensor: + + +assert isinstance(bboxes, tv_tensors.BoundingBoxes) + +# Shift bboxes by 3 pixels in both H and W +new_bboxes = bboxes + 3 + +assert isinstance(new_bboxes, torch.Tensor) +assert not isinstance(new_bboxes, tv_tensors.BoundingBoxes) + +# %% +# .. note:: +# +# This behavior only affects native ``torch`` operations. If you are using +# the built-in ``torchvision`` transforms or functionals, you will always get +# as output the same type that you passed as input (pure ``Tensor`` or +# ``TVTensor``). + +# %% +# But I want a TVTensor back! +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# +# You can re-wrap a pure tensor into a TVTensor by just calling the TVTensor +# constructor, or by using the :func:`~torchvision.tv_tensors.wrap` function +# (see more details above in :ref:`tv_tensor_creation`): + +new_bboxes = bboxes + 3 +new_bboxes = tv_tensors.wrap(new_bboxes, like=bboxes) +assert isinstance(new_bboxes, tv_tensors.BoundingBoxes) + +# %% +# Alternatively, you can use the :func:`~torchvision.tv_tensors.set_return_type` +# as a global config setting for the whole program, or as a context manager +# (read its docs to learn more about caveats): + +with tv_tensors.set_return_type("TVTensor"): + new_bboxes = bboxes + 3 +assert isinstance(new_bboxes, tv_tensors.BoundingBoxes) + +# %% +# Why is this happening? +# ^^^^^^^^^^^^^^^^^^^^^^ +# +# **For performance reasons**. :class:`~torchvision.tv_tensors.TVTensor` +# classes are Tensor subclasses, so any operation involving a +# :class:`~torchvision.tv_tensors.TVTensor` object will go through the +# `__torch_function__ +# `_ +# protocol. This induces a small overhead, which we want to avoid when possible. +# This doesn't matter for built-in ``torchvision`` transforms because we can +# avoid the overhead there, but it could be a problem in your model's +# ``forward``. +# +# **The alternative isn't much better anyway.** For every operation where +# preserving the :class:`~torchvision.tv_tensors.TVTensor` type makes +# sense, there are just as many operations where returning a pure Tensor is +# preferable: for example, is ``img.sum()`` still an :class:`~torchvision.tv_tensors.Image`? +# If we were to preserve :class:`~torchvision.tv_tensors.TVTensor` types all +# the way, even model's logits or the output of the loss function would end up +# being of type :class:`~torchvision.tv_tensors.Image`, and surely that's not +# desirable. +# +# .. note:: +# +# This behaviour is something we're actively seeking feedback on. If you find this surprising or if you +# have any suggestions on how to better support your use-cases, please reach out to us via this issue: +# https://github.com/pytorch/vision/issues/7319 +# +# Exceptions +# ^^^^^^^^^^ +# +# There are a few exceptions to this "unwrapping" rule: +# :meth:`~torch.Tensor.clone`, :meth:`~torch.Tensor.to`, +# :meth:`torch.Tensor.detach`, and :meth:`~torch.Tensor.requires_grad_` retain +# the TVTensor type. 
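+#
+# A quick hedged sketch of those exceptions (``torch.float64`` is just an
+# arbitrary target dtype):
+#
+# .. code-block:: python
+#
+#     img = tv_tensors.Image(torch.rand(3, 32, 32))
+#     assert isinstance(img.clone(), tv_tensors.Image)
+#     assert isinstance(img.to(torch.float64), tv_tensors.Image)
+#     assert isinstance(img.detach(), tv_tensors.Image)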
+# +# Inplace operations on TVTensors like ``obj.add_()`` will preserve the type of +# ``obj``. However, the **returned** value of inplace operations will be a pure +# tensor: + +image = tv_tensors.Image([[[0, 1], [1, 0]]]) + +new_image = image.add_(1).mul_(2) + +# image got transformed in-place and is still a TVTensor Image, but new_image +# is a Tensor. They share the same underlying data and they're equal, just +# different classes. +assert isinstance(image, tv_tensors.Image) +print(image) + +assert isinstance(new_image, torch.Tensor) and not isinstance(new_image, tv_tensors.Image) +assert (new_image == image).all() +assert new_image.data_ptr() == image.data_ptr() diff --git a/hubconf.py b/hubconf.py index 57ce7a0d12a3ebd200493a565bd2ab439ab16c00..637827127cab488eb0cf7d08ff9eb120a1989155 100644 --- a/hubconf.py +++ b/hubconf.py @@ -20,6 +20,7 @@ from torchvision.models.efficientnet import ( ) from torchvision.models.googlenet import googlenet from torchvision.models.inception import inception_v3 +from torchvision.models.maxvit import maxvit_t from torchvision.models.mnasnet import mnasnet0_5, mnasnet0_75, mnasnet1_0, mnasnet1_3 from torchvision.models.mobilenetv2 import mobilenet_v2 from torchvision.models.mobilenetv3 import mobilenet_v3_large, mobilenet_v3_small @@ -68,6 +69,17 @@ from torchvision.models.shufflenetv2 import ( shufflenet_v2_x2_0, ) from torchvision.models.squeezenet import squeezenet1_0, squeezenet1_1 -from torchvision.models.swin_transformer import swin_b, swin_s, swin_t +from torchvision.models.swin_transformer import swin_b, swin_s, swin_t, swin_v2_b, swin_v2_s, swin_v2_t from torchvision.models.vgg import vgg11, vgg11_bn, vgg13, vgg13_bn, vgg16, vgg16_bn, vgg19, vgg19_bn +from torchvision.models.video import ( + mc3_18, + mvit_v1_b, + mvit_v2_s, + r2plus1d_18, + r3d_18, + s3d, + swin3d_b, + swin3d_s, + swin3d_t, +) from torchvision.models.vision_transformer import vit_b_16, vit_b_32, vit_h_14, vit_l_16, vit_l_32 diff --git a/ios/CMakeLists.txt b/ios/CMakeLists.txt index 6b9fd3925b2972faa7e0691187fe1b8cfd2d810f..4201240a42725dc52e05e67859347c65459e7e8e 100644 --- a/ios/CMakeLists.txt +++ b/ios/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 3.4.1) set(TARGET torchvision_ops) project(${TARGET} CXX) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) set(LIBTORCH_HEADER_ROOT ${LIBTORCH_HEADER_ROOT}) set(LIBRARY_OUTPUT_PATH ../lib) diff --git a/ios/LibTorchvision.podspec b/ios/LibTorchvision.podspec index ba87820e142f7804d5824e5975a9f091cacfe63a..b88fb70ac40fe786a323e2eb40065ac0ad537dcc 100644 --- a/ios/LibTorchvision.podspec +++ b/ios/LibTorchvision.podspec @@ -1,8 +1,8 @@ -pytorch_version = '1.12.0' +pytorch_version = '2.0.0' Pod::Spec.new do |s| s.name = 'LibTorchvision' - s.version = '0.13.0' + s.version = '0.15.1' s.authors = 'PyTorch Team' s.license = { :type => 'BSD' } s.homepage = 'https://github.com/pytorch/vision' diff --git a/ios/README.md b/ios/README.md new file mode 100644 index 0000000000000000000000000000000000000000..0b50245f1ee819ede8dbdc177c10e06db48e8408 --- /dev/null +++ b/ios/README.md @@ -0,0 +1,3 @@ +## Status + +The iOS demo of TorchVision is currently unmaintained, untested and likely out-of-date. 
diff --git a/maintainer_guide.md b/maintainer_guide.md new file mode 100644 index 0000000000000000000000000000000000000000..24ac0943821d86a82ff2021add09799d730ca4b4 --- /dev/null +++ b/maintainer_guide.md @@ -0,0 +1,76 @@ +## Torchvision maintainers guide + +This document aims at documenting user-facing policies / principles used when +developing and maintaining torchvision. Other maintainer info (e.g. release +process) can be found in the meta-internal wiki. + +### What is public and what is private? + +For the Python API, torchvision largely follows the [PyTorch +policy](https://github.com/pytorch/pytorch/wiki/Public-API-definition-and-documentation) +which is consistent with other major packages +([numpy](https://numpy.org/neps/nep-0023-backwards-compatibility.html), +[scikit-learn](https://scikit-learn.org/dev/glossary.html#term-API) etc.). +We recognize that his policy is somewhat imperfect for some edge cases, and that +it's difficult to come up with an accurate technical definition. In broad terms, +which are usually well understood by users, the policy is that: + +- modules that can be accessed without leading underscore are public +- objects in a public file that don't have a leading underscore are public +- class attributes are public iff they have no leading underscore +- the rest of the modules / objects / class attributes are considered private + +The public API has backward-compatible (BC) guarantees defined in our +deprecation policy (see below). The private API has not BC guarantees. + +For C++, code is private. For Meta employees: if a C++ change breaks fbcode, fix +fbcode or revert the change. We should be careful about models running in +production and relying on torchvision ops. + +The `test` folder is not importable and is **private.** Even meta-internal +projects should *not* rely on it (it has happened in the past and is now +programmatically impossible). + +The training references do not have BC guarantees. Breaking changes are +possible, but we should make sure that the tutorials are still running properly, +and that their intended narrative is preserved (by e.g. checking outputs, +etc.). + +The rest of the folders (build, android, ios, etc.) are private and have no BC +guarantees. + +### Deprecation policy. + +Because they're disruptive, **deprecations should only be used sparingly**. + +We largely follow the [PyTorch +policy](https://github.com/pytorch/pytorch/wiki/PyTorch's-Python-Frontend-Backward-and-Forward-Compatibility-Policy): +breaking changes require a deprecation period of at least 2 versions. + +Deprecations should clearly indicate their deadline in the docs and warning +messages. Avoid not committing to a deadline, or keeping deprecated APIs for too +long: it gives no incentive for users to update their code, sends conflicting +messages ("why was this API removed while this other one is still around?"), and +accumulates debt in the project. + +### Should this attribute be public? Should this function be private? + +When designing an API it’s not always obvious what should be exposed as public, +and what should be kept as a private implementation detail. The following +guidelines can be useful: + +* Functional consistency throughout the library is a top priority, for users and + developers’ sake. In doubt and unless it’s clearly wrong, expose what other + similar classes expose. +* Think really hard about the users and their use-cases, and try to expose what + they would need to address those use-cases. Aggressively keep everything else + private. 
Remember that the “private -> public” direction is way smoother than + the “public -> private” one: in doubt, keep it private. +* When thinking about use-cases, the general API motto applies: make what’s + simple and common easy, and make what’s complex possible (80% / 20% rule). + There might be a ~1% left that’s not addressed: that’s OK. Also, **make what’s + wrong very hard**, if not impossible. + +As a good practice, always create new files and even classes with a leading +underscore in their name. This way, everything is private by default and the +only public surface is explicitly present in an `__init__.py` file. diff --git a/mypy.ini b/mypy.ini index aaeea57a6915aff7903a9f9c3b56975b354323dd..653f7c14ec2b6d63edb4512fc7379b18f1b22d84 100644 --- a/mypy.ini +++ b/mypy.ini @@ -7,6 +7,38 @@ allow_redefinition = True no_implicit_optional = True warn_redundant_casts = True +[mypy-torchvision.prototype.datapoints.*] + +; untyped definitions and calls +disallow_untyped_defs = True + +; None and Optional handling +no_implicit_optional = True + +; warnings +warn_unused_ignores = True + +; miscellaneous strictness flags +allow_redefinition = True + +[mypy-torchvision.prototype.transforms.*] + +; untyped definitions and calls +disallow_untyped_defs = True + +; None and Optional handling +no_implicit_optional = True + +; warnings +warn_unused_ignores = True + +; miscellaneous strictness flags +allow_redefinition = True + +[mypy-torchvision.prototype.datasets.*] + +ignore_errors = True + [mypy-torchvision.io.image.*] ignore_errors = True diff --git a/packaging/README.md b/packaging/README.md deleted file mode 100644 index 3ceac53030e7b89d1df93e03be86d18c667f49e6..0000000000000000000000000000000000000000 --- a/packaging/README.md +++ /dev/null @@ -1,6 +0,0 @@ -# Building torchvision packages for release - -TorchVision release packages are built by using `build_wheel.sh` and `build_conda.sh` for all permutations of -supported operating systems, compute platforms and python versions. - -OS/Python/Compute matrix is defined in https://github.com/pytorch/vision/blob/main/.circleci/regenerate.py diff --git a/packaging/build_cmake.sh b/packaging/build_cmake.sh deleted file mode 100755 index 99d98c67f1ad08355476f484b81fb812ab336502..0000000000000000000000000000000000000000 --- a/packaging/build_cmake.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/bin/bash -set -ex - -PARALLELISM=8 -if [ -n "$MAX_JOBS" ]; then - PARALLELISM=$MAX_JOBS -fi - -if [[ "$(uname)" != Darwin && "$OSTYPE" != "msys" ]]; then - eval "$(./conda/bin/conda shell.bash hook)" - conda activate ./env -fi - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. 
"$script_dir/pkg_helpers.bash" - -export BUILD_TYPE=conda -setup_env -export SOURCE_ROOT_DIR="$PWD" -setup_conda_pytorch_constraint -setup_conda_cudatoolkit_plain_constraint - -if [[ "$OSTYPE" == "msys" ]]; then - conda install -yq conda-build cmake future - pip install dataclasses -fi - -setup_visual_studio_constraint -setup_junit_results_folder - -if [[ "$(uname)" == Darwin ]]; then - # TODO: this can be removed as soon as mkl's CMake support works with clang - # see https://github.com/pytorch/vision/pull/4203 for details - MKL_CONSTRAINT='mkl==2021.2.0' -else - MKL_CONSTRAINT='' -fi - -if [[ $CONDA_BUILD_VARIANT == "cpu" ]]; then - PYTORCH_MUTEX_CONSTRAINT='pytorch-mutex=1.0=cpu' -else - PYTORCH_MUTEX_CONSTRAINT='' -fi - -conda install -yq \pytorch=$PYTORCH_VERSION $CONDA_CUDATOOLKIT_CONSTRAINT $PYTORCH_MUTEX_CONSTRAINT $MKL_CONSTRAINT numpy -c nvidia -c "pytorch-${UPLOAD_CHANNEL}" -TORCH_PATH=$(dirname $(python -c "import torch; print(torch.__file__)")) - -if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - conda install -yq libpng jpeg -else - yum install -y libpng-devel libjpeg-turbo-devel -fi - -if [[ "$OSTYPE" == "msys" ]]; then - source .circleci/unittest/windows/scripts/set_cuda_envs.sh -fi - -mkdir cpp_build -pushd cpp_build - -# Generate libtorchvision files -cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch -DWITH_CUDA=$CMAKE_USE_CUDA - -# Compile and install libtorchvision -if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_cmake.bat" $PARALLELISM - CONDA_PATH=$(dirname $(which python)) - cp -r "C:/Program Files (x86)/torchvision/include/torchvision" $CONDA_PATH/include -else - make -j$PARALLELISM - make install - - if [[ "$(uname)" == Darwin ]]; then - CONDA_PATH=$(dirname $(dirname $(which python))) - cp -r /usr/local/include/torchvision $CONDA_PATH/include/ - export C_INCLUDE_PATH=/usr/local/include - export CPLUS_INCLUDE_PATH=/usr/local/include - fi -fi - -popd - -# Install torchvision locally -python setup.py develop - -# Trace, compile and run project that uses Faster-RCNN -pushd test/tracing/frcnn -mkdir build - -# Trace model -python trace_model.py -cp fasterrcnn_resnet50_fpn.pt build - -cd build -cmake .. -DTorch_DIR=$TORCH_PATH/share/cmake/Torch -DWITH_CUDA=$CMAKE_USE_CUDA -if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_frcnn.bat" $PARALLELISM - mv fasterrcnn_resnet50_fpn.pt Release - cd Release - export PATH=$(cygpath -w "C:/Program Files/NVIDIA Corporation/NvToolsExt/bin/x64"):$(cygpath -w "C:/Program Files (x86)/torchvision/bin"):$(cygpath -w $TORCH_PATH)/lib:$PATH -else - make -j$PARALLELISM -fi - -# Run traced program -./test_frcnn_tracing - -# Compile and run the CPP example -popd -cd examples/cpp/hello_world -mkdir build - -# Trace model -python trace_model.py -cp resnet18.pt build - -cd build -cmake .. 
-DTorch_DIR=$TORCH_PATH/share/cmake/Torch - -if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" "$script_dir/windows/internal/build_cpp_example.bat" $PARALLELISM - mv resnet18.pt Release - cd Release -else - make -j$PARALLELISM -fi - -# Run CPP example -./hello-world diff --git a/packaging/build_conda.sh b/packaging/build_conda.sh deleted file mode 100755 index e80c7dfbe64c4da74fb535b09c78d94871aeb54b..0000000000000000000000000000000000000000 --- a/packaging/build_conda.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash -set -ex - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. "$script_dir/pkg_helpers.bash" - -export BUILD_TYPE=conda -setup_env -export SOURCE_ROOT_DIR="$PWD" -setup_conda_pytorch_constraint -setup_conda_cudatoolkit_constraint -setup_visual_studio_constraint -setup_junit_results_folder -export CUDATOOLKIT_CHANNEL="nvidia" - -conda build -c $CUDATOOLKIT_CHANNEL $CONDA_CHANNEL_FLAGS --no-anaconda-upload --python "$PYTHON_VERSION" packaging/torchvision diff --git a/packaging/build_wheel.sh b/packaging/build_wheel.sh deleted file mode 100755 index 3299d16ec92ab1c50b53ea586b25779fdd1f62ea..0000000000000000000000000000000000000000 --- a/packaging/build_wheel.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/bin/bash -set -ex - -script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" -. "$script_dir/pkg_helpers.bash" - -export BUILD_TYPE=wheel -setup_env -setup_wheel_python -pip_install numpy pyyaml future ninja -pip_install --upgrade setuptools -setup_pip_pytorch_version -python setup.py clean - -# Copy binaries to be included in the wheel distribution -if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - python_exec="$(which python)" - bin_path=$(dirname $python_exec) - env_path=$(dirname $bin_path) - if [[ "$(uname)" == Darwin ]]; then - # Install delocate to relocate the required binaries - pip_install "delocate>=0.9" - else - cp "$bin_path/Library/bin/libpng16.dll" torchvision - cp "$bin_path/Library/bin/libjpeg.dll" torchvision - fi -else - # Install auditwheel to get some inspection utilities - pip_install auditwheel - - # Point to custom libraries - export LD_LIBRARY_PATH=$(pwd)/ext_libraries/lib:$LD_LIBRARY_PATH - export TORCHVISION_INCLUDE=$(pwd)/ext_libraries/include - export TORCHVISION_LIBRARY=$(pwd)/ext_libraries/lib -fi - -download_copy_ffmpeg - -if [[ "$OSTYPE" == "msys" ]]; then - IS_WHEEL=1 "$script_dir/windows/internal/vc_env_helper.bat" python setup.py bdist_wheel -else - IS_WHEEL=1 python setup.py bdist_wheel -fi - - -if [[ "$(uname)" == Darwin ]]; then - pushd dist/ - python_exec="$(which python)" - bin_path=$(dirname $python_exec) - env_path=$(dirname $bin_path) - for whl in *.whl; do - DYLD_FALLBACK_LIBRARY_PATH="$env_path/lib/:$DYLD_FALLBACK_LIBRARY_PATH" delocate-wheel -v --ignore-missing-dependencies $whl - done -else - if [[ "$OSTYPE" == "msys" ]]; then - "$script_dir/windows/internal/vc_env_helper.bat" python $script_dir/wheel/relocate.py - else - LD_LIBRARY_PATH="/usr/local/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH" python $script_dir/wheel/relocate.py - fi -fi diff --git a/packaging/pkg_helpers.bash b/packaging/pkg_helpers.bash deleted file mode 100644 index 195bc3a4561025fb5a7ccdf37f2e03d9f99a3fcb..0000000000000000000000000000000000000000 --- a/packaging/pkg_helpers.bash +++ /dev/null @@ -1,339 +0,0 @@ -# A set of useful bash functions for common functionality we need to do in -# many build scripts - - -# Setup CUDA environment variables, based on CU_VERSION -# -# 
Inputs: -# CU_VERSION (cpu, cu92, cu100) -# NO_CUDA_PACKAGE (bool) -# BUILD_TYPE (conda, wheel) -# -# Outputs: -# VERSION_SUFFIX (e.g., "") -# PYTORCH_VERSION_SUFFIX (e.g., +cpu) -# WHEEL_DIR (e.g., cu100/) -# CUDA_HOME (e.g., /usr/local/cuda-9.2, respected by torch.utils.cpp_extension) -# FORCE_CUDA (respected by torchvision setup.py) -# NVCC_FLAGS (respected by torchvision setup.py) -# -# Precondition: CUDA versions are installed in their conventional locations in -# /usr/local/cuda-* -# -# NOTE: Why VERSION_SUFFIX versus PYTORCH_VERSION_SUFFIX? If you're building -# a package with CUDA on a platform we support CUDA on, VERSION_SUFFIX == -# PYTORCH_VERSION_SUFFIX and everyone is happy. However, if you are building a -# package with only CPU bits (e.g., torchaudio), then VERSION_SUFFIX is always -# empty, but PYTORCH_VERSION_SUFFIX is +cpu (because that's how you get a CPU -# version of a Python package. But that doesn't apply if you're on OS X, -# since the default CU_VERSION on OS X is cpu. -setup_cuda() { - - # First, compute version suffixes. By default, assume no version suffixes - export VERSION_SUFFIX="" - export PYTORCH_VERSION_SUFFIX="" - export WHEEL_DIR="" - # Wheel builds need suffixes (but not if they're on OS X, which never has suffix) - if [[ "$BUILD_TYPE" == "wheel" ]] && [[ "$(uname)" != Darwin ]]; then - export PYTORCH_VERSION_SUFFIX="+$CU_VERSION" - # Match the suffix scheme of pytorch, unless this package does not have - # CUDA builds (in which case, use default) - if [[ -z "$NO_CUDA_PACKAGE" ]]; then - export VERSION_SUFFIX="$PYTORCH_VERSION_SUFFIX" - export WHEEL_DIR="$CU_VERSION/" - fi - fi - - # Now work out the CUDA settings - case "$CU_VERSION" in - cu117) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.7" - else - export CUDA_HOME=/usr/local/cuda-11.7/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" - ;; - cu116) - if [[ "$OSTYPE" == "msys" ]]; then - export CUDA_HOME="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.6" - else - export CUDA_HOME=/usr/local/cuda-11.6/ - fi - export TORCH_CUDA_ARCH_LIST="3.5;5.0+PTX;6.0;7.0;7.5;8.0;8.6" - ;; - cpu) - ;; - rocm*) - export FORCE_CUDA=1 - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - if [[ -n "$CUDA_HOME" ]]; then - # Adds nvcc binary to the search path so that CMake's `find_package(CUDA)` will pick the right one - export PATH="$CUDA_HOME/bin:$PATH" - export FORCE_CUDA=1 - fi -} - -# Populate build version if necessary, and add version suffix -# -# Inputs: -# BUILD_VERSION (e.g., 0.2.0 or empty) -# VERSION_SUFFIX (e.g., +cpu) -# -# Outputs: -# BUILD_VERSION (e.g., 0.2.0.dev20190807+cpu) -# -# Fill BUILD_VERSION if it doesn't exist already with a nightly string -# Usage: setup_build_version 0.2.0 -setup_build_version() { - if [[ -z "$BUILD_VERSION" ]]; then - if [[ -z "$1" ]]; then - setup_base_build_version - else - BUILD_VERSION="$1" - fi - BUILD_VERSION="$BUILD_VERSION.dev$(date "+%Y%m%d")$VERSION_SUFFIX" - else - BUILD_VERSION="$BUILD_VERSION$VERSION_SUFFIX" - fi - - # Set build version based on tag if on tag - if [[ -n "${CIRCLE_TAG}" ]]; then - # Strip tag - BUILD_VERSION="$(echo "${CIRCLE_TAG}" | sed -e 's/^v//' -e 's/-.*$//')${VERSION_SUFFIX}" - fi - - export BUILD_VERSION -} - -setup_base_build_version() { - SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" - # version.txt for some reason has `a` character after major.minor.rev - # command below yields 
0.10.0 from version.txt containing 0.10.0a0 - BUILD_VERSION=$( cut -f 1 -d a "$SCRIPT_DIR/../version.txt" ) - export BUILD_VERSION -} - -# Set some useful variables for OS X, if applicable -setup_macos() { - if [[ "$(uname)" == Darwin ]]; then - export MACOSX_DEPLOYMENT_TARGET=10.9 CC=clang CXX=clang++ - fi -} - - -# Top-level entry point for things every package will need to do -# -# Usage: setup_env 0.2.0 -setup_env() { - setup_cuda - setup_build_version "$1" - setup_macos -} - -# Function to retry functions that sometimes timeout or have flaky failures -retry () { - $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) -} - -# Inputs: -# PYTHON_VERSION (3.7, 3.8, 3.9) -# UNICODE_ABI (bool) -# -# Outputs: -# PATH modified to put correct Python version in PATH -# -# Precondition: If Linux, you are in a soumith/manylinux-cuda* Docker image -setup_wheel_python() { - if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then - eval "$(conda shell.bash hook)" - conda env remove -n "env$PYTHON_VERSION" || true - conda create ${CONDA_CHANNEL_FLAGS} -yn "env$PYTHON_VERSION" python="$PYTHON_VERSION" - conda activate "env$PYTHON_VERSION" - # Install libpng from Anaconda (defaults) - conda install ${CONDA_CHANNEL_FLAGS} libpng "jpeg<=9b" -y - else - # Install native CentOS libJPEG, freetype and GnuTLS - yum install -y libjpeg-turbo-devel freetype gnutls - case "$PYTHON_VERSION" in - 3.7) python_abi=cp37-cp37m ;; - 3.8) python_abi=cp38-cp38 ;; - 3.9) python_abi=cp39-cp39 ;; - 3.10) python_abi=cp310-cp310 ;; - *) - echo "Unrecognized PYTHON_VERSION=$PYTHON_VERSION" - exit 1 - ;; - esac - # Download all the dependencies required to compile image and video_reader - # extensions - - mkdir -p ext_libraries - pushd ext_libraries - popd - export PATH="/opt/python/$python_abi/bin:$(pwd)/ext_libraries/bin:$PATH" - fi -} - -# Install with pip a bit more robustly than the default -pip_install() { - retry pip install --progress-bar off "$@" -} - -# Install torch with pip, respecting PYTORCH_VERSION, and record the installed -# version into PYTORCH_VERSION, if applicable -setup_pip_pytorch_version() { - if [[ -z "$PYTORCH_VERSION" ]]; then - # Install latest prerelease version of torch, per our nightlies, consistent - # with the requested cuda version - pip_install --pre torch -f "https://download.pytorch.org/whl/test/${WHEEL_DIR}torch_test.html" - if [[ "$CUDA_VERSION" == "cpu" ]]; then - # CUDA and CPU are ABI compatible on the CPU-only parts, so strip - # in this case - export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version: *//' | sed 's/+.\+//')" - else - export PYTORCH_VERSION="$(pip show torch | grep ^Version: | sed 's/Version: *//')" - fi - else - pip_install "torch==$PYTORCH_VERSION$PYTORCH_VERSION_SUFFIX" \ - -f "https://download.pytorch.org/whl/${CU_VERSION}/torch_stable.html" \ - -f "https://download.pytorch.org/whl/${UPLOAD_CHANNEL}/${CU_VERSION}/torch_${UPLOAD_CHANNEL}.html" - fi -} - -# Fill PYTORCH_VERSION with the latest conda nightly version, and -# CONDA_CHANNEL_FLAGS with appropriate flags to retrieve these versions -# -# You MUST have populated PYTORCH_VERSION_SUFFIX before hand. 
-setup_conda_pytorch_constraint() { - if [[ -z "$PYTORCH_VERSION" ]]; then - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch-test -c pytorch" - PYTHON="python" - # Check if we have python 3 instead and prefer that - if python3 --version >/dev/null 2>/dev/null; then - PYTHON="python3" - fi - export PYTORCH_VERSION="$(conda search --json 'pytorch[channel=pytorch-test]' | \ - ${PYTHON} -c "import os, sys, json, re; cuver = os.environ.get('CU_VERSION'); \ - cuver_1 = cuver.replace('cu', 'cuda') if cuver != 'cpu' else cuver; \ - cuver_2 = (cuver[:-1] + '.' + cuver[-1]).replace('cu', 'cuda') if cuver != 'cpu' else cuver; \ - print(re.sub(r'\\+.*$', '', \ - [x['version'] for x in json.load(sys.stdin)['pytorch'] \ - if (x['platform'] == 'darwin' or cuver_1 in x['fn'] or cuver_2 in x['fn']) \ - and 'py' + os.environ['PYTHON_VERSION'] in x['fn']][-1]))")" - if [[ -z "$PYTORCH_VERSION" ]]; then - echo "PyTorch version auto detection failed" - echo "No package found for CU_VERSION=$CU_VERSION and PYTHON_VERSION=$PYTHON_VERSION" - exit 1 - fi - else - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c pytorch -c pytorch-${UPLOAD_CHANNEL}" - fi - if [[ "$CU_VERSION" == cpu ]]; then - export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==$PYTORCH_VERSION${PYTORCH_VERSION_SUFFIX}" - export CONDA_PYTORCH_CONSTRAINT="- pytorch==$PYTORCH_VERSION" - else - export CONDA_PYTORCH_BUILD_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}" - export CONDA_PYTORCH_CONSTRAINT="- pytorch==${PYTORCH_VERSION}${PYTORCH_VERSION_SUFFIX}" - fi - if [[ "$OSTYPE" == msys && "$CU_VERSION" == cu92 ]]; then - export CONDA_CHANNEL_FLAGS="${CONDA_CHANNEL_FLAGS} -c defaults -c numba/label/dev" - fi -} - -# Translate CUDA_VERSION into CUDA_CUDATOOLKIT_CONSTRAINT -setup_conda_cudatoolkit_constraint() { - export CONDA_BUILD_VARIANT="cuda" - if [[ "$(uname)" == Darwin ]]; then - export CONDA_BUILD_VARIANT="cpu" - else - case "$CU_VERSION" in - cu117) - export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.7 # [not osx]" - ;; - cu116) - export CONDA_CUDATOOLKIT_CONSTRAINT="- pytorch-cuda=11.6 # [not osx]" - ;; - cpu) - export CONDA_CUDATOOLKIT_CONSTRAINT="" - export CONDA_BUILD_VARIANT="cpu" - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - fi -} - -setup_conda_cudatoolkit_plain_constraint() { - export CONDA_BUILD_VARIANT="cuda" - export CMAKE_USE_CUDA=1 - if [[ "$(uname)" == Darwin ]]; then - export CONDA_BUILD_VARIANT="cpu" - export CMAKE_USE_CUDA=0 - else - case "$CU_VERSION" in - cu117) - export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda=11.7" - ;; - cu116) - export CONDA_CUDATOOLKIT_CONSTRAINT="pytorch-cuda=11.6" - ;; - cpu) - export CONDA_CUDATOOLKIT_CONSTRAINT="" - export CONDA_BUILD_VARIANT="cpu" - export CMAKE_USE_CUDA=0 - ;; - *) - echo "Unrecognized CU_VERSION=$CU_VERSION" - exit 1 - ;; - esac - fi -} - -# Build the proper compiler package before building the final package -setup_visual_studio_constraint() { - if [[ "$OSTYPE" == "msys" ]]; then - export VSTOOLCHAIN_PACKAGE=vs$VC_YEAR - conda build $CONDA_CHANNEL_FLAGS --no-anaconda-upload packaging/$VSTOOLCHAIN_PACKAGE - cp packaging/$VSTOOLCHAIN_PACKAGE/conda_build_config.yaml packaging/torchvision/conda_build_config.yaml - fi -} - -setup_junit_results_folder() { - if [[ "$CI" == "true" ]]; then - export CONDA_PYTORCH_BUILD_RESULTS_DIRECTORY="${SOURCE_ROOT_DIR}/build_results/results.xml" - fi -} - - -download_copy_ffmpeg() { - if [[ "$OSTYPE" == "msys" ]]; then - # conda install -yq ffmpeg=4.2 -c pytorch - # curl -L -q 
https://anaconda.org/pytorch/ffmpeg/4.3/download/win-64/ffmpeg-4.3-ha925a31_0.tar.bz2 --output ffmpeg-4.3-ha925a31_0.tar.bz2 - # bzip2 --decompress --stdout ffmpeg-4.3-ha925a31_0.tar.bz2 | tar -x --file=- - # cp Library/bin/*.dll ../torchvision - echo "FFmpeg is disabled currently on Windows" - else - if [[ "$(uname)" == Darwin ]]; then - conda install -yq ffmpeg=4.2 -c pytorch - conda install -yq wget - else - # pushd ext_libraries - # wget -q https://anaconda.org/pytorch/ffmpeg/4.2/download/linux-64/ffmpeg-4.2-hf484d3e_0.tar.bz2 - # tar -xjvf ffmpeg-4.2-hf484d3e_0.tar.bz2 - # rm -rf ffmpeg-4.2-hf484d3e_0.tar.bz2 - # ldconfig - # which ffmpeg - # popd - echo "FFmpeg is disabled currently on Linux" - fi - fi -} diff --git a/packaging/post_build_script.sh b/packaging/post_build_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..ae7542f9f8a97a7e66d255dba0f7925b5c8584fe --- /dev/null +++ b/packaging/post_build_script.sh @@ -0,0 +1,2 @@ +#!/bin/bash +LD_LIBRARY_PATH="/usr/local/lib:$CUDA_HOME/lib64:$LD_LIBRARY_PATH" python packaging/wheel/relocate.py diff --git a/packaging/pre_build_script.sh b/packaging/pre_build_script.sh new file mode 100644 index 0000000000000000000000000000000000000000..e93a7267e651507f69787f05e1bde62cb1aef4a8 --- /dev/null +++ b/packaging/pre_build_script.sh @@ -0,0 +1,50 @@ +#!/bin/bash +if [[ "$(uname)" == Darwin ]]; then + # Uninstall Conflicting jpeg brew formulae + jpeg_packages=$(brew list | grep jpeg) + echo "Existing Jpeg-related Brew libraries" + echo $jpeg_packages + for pkg in $jpeg_packages; do + brew uninstall --ignore-dependencies --force $pkg || true + done + + conda install -yq wget +fi + +if [[ "$(uname)" == Darwin || "$OSTYPE" == "msys" ]]; then + # Install libpng from Anaconda (defaults) + conda install libpng -yq + conda install -yq ffmpeg=4.2 libjpeg-turbo -c pytorch + + # Copy binaries to be included in the wheel distribution + if [[ "$OSTYPE" == "msys" ]]; then + python_exec="$(which python)" + bin_path=$(dirname $python_exec) + cp "$bin_path/Library/bin/libjpeg.dll" torchvision + fi +else + + if [[ "$ARCH" == "aarch64" ]]; then + conda install libpng -yq + conda install -yq ffmpeg=4.2 libjpeg-turbo -c pytorch-nightly + fi + + # Install native CentOS libJPEG, freetype and GnuTLS + yum install -y libjpeg-turbo-devel freetype gnutls + + # Download all the dependencies required to compile image and video_reader + # extensions + mkdir -p ext_libraries + pushd ext_libraries + popd + export PATH="$(pwd)/ext_libraries/bin:$PATH" + pip install auditwheel + + # Point to custom libraries + export LD_LIBRARY_PATH=$(pwd)/ext_libraries/lib:$LD_LIBRARY_PATH + export TORCHVISION_INCLUDE=$(pwd)/ext_libraries/include + export TORCHVISION_LIBRARY=$(pwd)/ext_libraries/lib +fi + +pip install numpy pyyaml future ninja +pip install --upgrade setuptools diff --git a/packaging/torchvision/conda_build_config.yaml b/packaging/torchvision/conda_build_config.yaml index 52b95952ddf14f4812790e5ba316503cadc2b15b..a7c25c6d53475a8c3a32b0d30b9df35727c753f2 100644 --- a/packaging/torchvision/conda_build_config.yaml +++ b/packaging/torchvision/conda_build_config.yaml @@ -7,7 +7,7 @@ c_compiler: cxx_compiler: - vs2017 # [win] python: - - 3.7 + - 3.8 # This differs from target_platform in that it determines what subdir the compiler # will target, not what subdir the compiler package will be itself. 
# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 diff --git a/packaging/torchvision/meta.yaml b/packaging/torchvision/meta.yaml index 105e28c453e33b763333d4c3b3e5e19b56b20f96..9adc13b558bb9646dd27ee508fa5e27121997ded 100644 --- a/packaging/torchvision/meta.yaml +++ b/packaging/torchvision/meta.yaml @@ -10,7 +10,7 @@ requirements: build: - {{ compiler('c') }} # [win] - libpng - - jpeg + - libjpeg-turbo # NOTE: The only ffmpeg version that we build is actually 4.2 - ffmpeg >=4.2 # [not win] @@ -23,11 +23,12 @@ requirements: run: - python - - defaults::numpy >=1.11 + - defaults::numpy >=1.11 # [py <= 310] + - numpy >=1.23.5 # [py >= 311] - requests - libpng - ffmpeg >=4.2 # [not win] - - jpeg + - libjpeg-turbo - pillow >=5.3.0, !=8.3.* - pytorch-mutex 1.0 {{ build_variant }} # [not osx ] {{ environ.get('CONDA_PYTORCH_CONSTRAINT') }} @@ -61,7 +62,7 @@ test: requires: - pytest - scipy - - jpeg + - libjpeg-turbo - ca-certificates diff --git a/packaging/vs2017/activate.bat b/packaging/vs2017/activate.bat deleted file mode 100644 index ccecfc25442f0563990588edfb0e9f949a4b8af4..0000000000000000000000000000000000000000 --- a/packaging/vs2017/activate.bat +++ /dev/null @@ -1,44 +0,0 @@ -:: Set env vars that tell distutils to use the compiler that we put on path -SET DISTUTILS_USE_SDK=1 -SET MSSdk=1 - -SET "VS_VERSION=15.0" -SET "VS_MAJOR=15" -SET "VS_YEAR=2017" - -set "MSYS2_ARG_CONV_EXCL=/AI;/AL;/OUT;/out" -set "MSYS2_ENV_CONV_EXCL=CL" - -:: For Python 3.5+, ensure that we link with the dynamic runtime. See -:: http://stevedower.id.au/blog/building-for-python-3-5-part-two/ for more info -set "PY_VCRUNTIME_REDIST=%PREFIX%\\bin\\vcruntime140.dll" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VSINSTALLDIR=%%i\" - goto :vswhere - ) -) - -:vswhere - -:: Shorten PATH to avoid the `input line too long` error. -SET MyPath=%PATH% - -setlocal EnableDelayedExpansion - -SET TempPath="%MyPath:;=";"%" -SET var= -FOR %%a IN (%TempPath%) DO ( - IF EXIST %%~sa ( - SET "var=!var!;%%~sa" - ) -) - -set "TempPath=!var:~1!" -endlocal & set "PATH=%TempPath%" - -:: Shorten current directory too -FOR %%A IN (.) DO CD "%%~sA" - -:: other things added by install_activate.bat at package build time diff --git a/packaging/vs2017/conda_build_config.yaml b/packaging/vs2017/conda_build_config.yaml deleted file mode 100644 index 2479ceb3e762b561c6b5d7b4daa5bb4d2cfded59..0000000000000000000000000000000000000000 --- a/packaging/vs2017/conda_build_config.yaml +++ /dev/null @@ -1,23 +0,0 @@ -blas_impl: - - mkl # [x86_64] -c_compiler: - - vs2017 # [win] -cxx_compiler: - - vs2017 # [win] -python: - - 3.7 -# This differs from target_platform in that it determines what subdir the compiler -# will target, not what subdir the compiler package will be itself. -# For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 -# code on win-64 miniconda. 
-cross_compiler_target_platform: - - win-64 # [win] -target_platform: - - win-64 # [win] -vc: - - 14 -zip_keys: - - # [win] - - vc # [win] - - c_compiler # [win] - - cxx_compiler # [win] diff --git a/packaging/vs2017/install_activate.bat b/packaging/vs2017/install_activate.bat deleted file mode 100644 index 253d2f2c2c1d3431eeb1f7cb90f26260d9f71c9f..0000000000000000000000000000000000000000 --- a/packaging/vs2017/install_activate.bat +++ /dev/null @@ -1,29 +0,0 @@ -set YEAR=2017 -set VER=15 - -mkdir "%PREFIX%\etc\conda\activate.d" -COPY "%RECIPE_DIR%\activate.bat" "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - -IF "%cross_compiler_target_platform%" == "win-64" ( - set "target_platform=amd64" - echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR% Win64" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - IF "%VSDEVCMD_ARGS%" == "" ( - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) ELSE ( - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvarsall.bat" x86_amd64 %VSDEVCMD_ARGS% >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) - echo popd >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - ) else ( - set "target_platform=x86" - echo SET "CMAKE_GENERATOR=Visual Studio %VER% %YEAR%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo pushd "%%VSINSTALLDIR%%" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo CALL "VC\Auxiliary\Build\vcvars32.bat" >> "%PREFIX%\etc\conda\activate.d\vs%YEAR%_compiler_vars.bat" - echo popd - ) diff --git a/packaging/vs2017/install_runtime.bat b/packaging/vs2017/install_runtime.bat deleted file mode 100644 index 5163c16cf24d49092b6a4aa5cfb1d18a19cc1549..0000000000000000000000000000000000000000 --- a/packaging/vs2017/install_runtime.bat +++ /dev/null @@ -1,49 +0,0 @@ -set VC_PATH=x86 -if "%ARCH%"=="64" ( - set VC_PATH=x64 -) - -set MSC_VER=2017 - -rem :: This should always be present for VC installed with VS. Not sure about VC installed with Visual C++ Build Tools 2015 -rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO ( -rem set SP=%%A -rem ) - -rem if not "%SP%" == "%PKG_VERSION%" ( -rem echo "Version detected from registry: %SP%" -rem echo "does not match version of package being built (%PKG_VERSION%)" -rem echo "Do you have current updates for VS 2015 installed?" -rem exit 1 -rem ) - - -REM ========== REQUIRES Win 10 SDK be installed, or files otherwise copied to location below! 
-robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E -if %ERRORLEVEL% GEQ 8 exit 1 - -REM ========== This one comes from visual studio 2017 -set "VC_VER=141" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [15^,16^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" - goto :eof - ) -) - -@setlocal -call "%VS15VARSALL%" x64 - -set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%" - -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -@endlocal diff --git a/packaging/vs2017/meta.yaml b/packaging/vs2017/meta.yaml deleted file mode 100644 index 1f569525ee176da433857aa6ae5a565350320549..0000000000000000000000000000000000000000 --- a/packaging/vs2017/meta.yaml +++ /dev/null @@ -1,24 +0,0 @@ -{% set vcver="14.1" %} -{% set vcfeature="14" %} -{% set vsyear="2017" %} -{% set fullver="15.4.27004.2010" %} - -package: - name: vs{{ vsyear }} - version: {{ fullver }} - -build: - skip: True [not win] - script_env: - - VSDEVCMD_ARGS # [win] - -outputs: - - name: vs{{ vsyear }}_{{ cross_compiler_target_platform }} - script: install_activate.bat - track_features: - # VS 2017 is binary-compatible with VS 2015/vc14. Tools are "v141". - strong: - - vc{{ vcfeature }} - about: - summary: Activation and version verification of MSVC {{ vcver }} (VS {{ vsyear }}) compiler - license: BSD 3-clause diff --git a/packaging/vs2019/conda_build_config.yaml b/packaging/vs2019/conda_build_config.yaml index 7bd8de2ea5bd629fedb307852d566a918a9fd623..b4dc99341d07a245acdfaf4383235230449943a1 100644 --- a/packaging/vs2019/conda_build_config.yaml +++ b/packaging/vs2019/conda_build_config.yaml @@ -5,7 +5,7 @@ c_compiler: cxx_compiler: - vs2019 # [win] python: - - 3.7 + - 3.8 # This differs from target_platform in that it determines what subdir the compiler # will target, not what subdir the compiler package will be itself. # For example, we need a win-64 vs2008_win-32 package, so that we compile win-32 diff --git a/packaging/vs2019/install_runtime.bat b/packaging/vs2019/install_runtime.bat deleted file mode 100644 index e09a5ccfb0f42cc6de2a2f960d31faf2511ae094..0000000000000000000000000000000000000000 --- a/packaging/vs2019/install_runtime.bat +++ /dev/null @@ -1,49 +0,0 @@ -set VC_PATH=x86 -if "%ARCH%"=="64" ( - set VC_PATH=x64 -) - -set MSC_VER=2019 - -rem :: This should always be present for VC installed with VS. Not sure about VC installed with Visual C++ Build Tools 2015 -rem FOR /F "usebackq tokens=3*" %%A IN (`REG QUERY "HKEY_LOCAL_MACHINE\Software\Microsoft\DevDiv\VC\Servicing\14.0\IDE.x64" /v UpdateVersion`) DO ( -rem set SP=%%A -rem ) - -rem if not "%SP%" == "%PKG_VERSION%" ( -rem echo "Version detected from registry: %SP%" -rem echo "does not match version of package being built (%PKG_VERSION%)" -rem echo "Do you have current updates for VS 2015 installed?" 
-rem exit 1 -rem ) - - -REM ========== REQUIRES Win 10 SDK be installed, or files otherwise copied to location below! -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%LIBRARY_BIN%" *.dll /E -robocopy "C:\Program Files (x86)\Windows Kits\10\Redist\ucrt\DLLs\%VC_PATH%" "%PREFIX%" *.dll /E -if %ERRORLEVEL% GEQ 8 exit 1 - -REM ========== This one comes from visual studio 2019 -set "VC_VER=142" - -for /f "usebackq tokens=*" %%i in (`"%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe" -legacy -products * -version [16^,17^) -property installationPath`) do ( - if exist "%%i" if exist "%%i\VC\Auxiliary\Build\vcvarsall.bat" ( - set "VS15VCVARSALL=%%i\VC\Auxiliary\Build\vcvarsall.bat" - goto :eof - ) -) - -@setlocal -call "%VS15VARSALL%" x64 - -set "REDIST_ROOT=%VCToolsRedistDir%%VC_PATH%" - -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.CRT" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%LIBRARY_BIN%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -robocopy "%REDIST_ROOT%\Microsoft.VC%VC_VER%.OpenMP" "%PREFIX%" *.dll /E -if %ERRORLEVEL% LSS 8 exit 0 -@endlocal diff --git a/packaging/wheel/relocate.py b/packaging/wheel/relocate.py index e6a4ef9d458888bdbe3e63d757c54253c6a5c662..fb110abd873def571154fe219f1a589dccb9eb06 100644 --- a/packaging/wheel/relocate.py +++ b/packaging/wheel/relocate.py @@ -2,7 +2,6 @@ import glob import hashlib -import io # Standard library imports import os @@ -65,21 +64,12 @@ PLATFORM_ARCH = platform.machine() PYTHON_VERSION = sys.version_info -def read_chunks(file, size=io.DEFAULT_BUFFER_SIZE): - """Yield pieces of data from a file-like object until EOF.""" - while True: - chunk = file.read(size) - if not chunk: - break - yield chunk - - def rehash(path, blocksize=1 << 20): """Return (hash, length) for path using hashlib.sha256()""" h = hashlib.sha256() length = 0 with open(path, "rb") as f: - for block in read_chunks(f, size=blocksize): + while block := f.read(blocksize): length += len(block) h.update(block) digest = "sha256=" + urlsafe_b64encode(h.digest()).decode("latin1").rstrip("=") @@ -191,7 +181,7 @@ def relocate_elf_library(patchelf, output_dir, output_library, binary): print("Copying dependencies to wheel directory") new_libraries_path = osp.join(output_dir, "torchvision.libs") - os.makedirs(new_libraries_path) + os.makedirs(new_libraries_path, exist_ok=True) new_names = {binary: binary_path} diff --git a/packaging/windows/internal/cuda_install.bat b/packaging/windows/internal/cuda_install.bat deleted file mode 100644 index 66e922289956382e0a6f2189d5eec0d8072d1b66..0000000000000000000000000000000000000000 --- a/packaging/windows/internal/cuda_install.bat +++ /dev/null @@ -1,143 +0,0 @@ -@echo on - -if "%CU_VERSION%" == "cpu" ( - echo Skipping for CPU builds - exit /b 0 -) - -set SRC_DIR=%~dp0\.. 
- -if not exist "%SRC_DIR%\temp_build" mkdir "%SRC_DIR%\temp_build" - -rem in unit test workflow, we get CUDA_VERSION, for example 11.1 -if defined CUDA_VERSION ( - set CUDA_VER=%CUDA_VERSION:.=% -) else ( - set CUDA_VER=%CU_VERSION:cu=% -) - -set /a CUDA_VER=%CU_VERSION:cu=% -set CUDA_VER_MAJOR=%CUDA_VER:~0,-1% -set CUDA_VER_MINOR=%CUDA_VER:~-1,1% -set CUDA_VERSION_STR=%CUDA_VER_MAJOR%.%CUDA_VER_MINOR% -set CUDNN_FOLDER="cuda" -set CUDNN_LIB_FOLDER="lib\x64" - -if %CUDA_VER% EQU 116 goto cuda116 -if %CUDA_VER% EQU 117 goto cuda117 - -echo CUDA %CUDA_VERSION_STR% is not supported -exit /b 1 - -:cuda116 - -set CUDA_INSTALL_EXE=cuda_11.6.0_511.23_windows.exe -if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( - curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=thrust_11.6 nvcc_11.6 cuobjdump_11.6 nvprune_11.6 nvprof_11.6 cupti_11.6 cublas_11.6 cublas_dev_11.6 cudart_11.6 cufft_11.6 cufft_dev_11.6 curand_11.6 curand_dev_11.6 cusolver_11.6 cusolver_dev_11.6 cusparse_11.6 cusparse_dev_11.6 npp_11.6 npp_dev_11.6 nvjpeg_11.6 nvjpeg_dev_11.6 nvrtc_11.6 nvrtc_dev_11.6 nvml_dev_11.6" -) - -set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip -set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive -set CUDNN_LIB_FOLDER="lib" -if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( - curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - - rem Make sure windows path contains zlib dll - curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" - 7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" - xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" -) - -goto cuda_common - -:cuda117 - -set CUDA_INSTALL_EXE=cuda_11.7.0_516.01_windows.exe -if not exist "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" ( - curl -k -L "https://ossci-windows.s3.amazonaws.com/%CUDA_INSTALL_EXE%" --output "%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - if errorlevel 1 exit /b 1 - set "CUDA_SETUP_FILE=%SRC_DIR%\temp_build\%CUDA_INSTALL_EXE%" - set "ARGS=thrust_11.7 nvcc_11.7 cuobjdump_11.7 nvprune_11.7 nvprof_11.7 cupti_11.7 cublas_11.7 cublas_dev_11.7 cudart_11.7 cufft_11.7 cufft_dev_11.7 curand_11.7 curand_dev_11.7 cusolver_11.7 cusolver_dev_11.7 cusparse_11.7 cusparse_dev_11.7 npp_11.7 npp_dev_11.7 nvjpeg_11.7 nvjpeg_dev_11.7 nvrtc_11.7 nvrtc_dev_11.7 nvml_dev_11.7" -) - -set CUDNN_INSTALL_ZIP=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive.zip -set CUDNN_FOLDER=cudnn-windows-x86_64-8.3.2.44_cuda11.5-archive -set CUDNN_LIB_FOLDER="lib" -if not exist "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" ( - curl -k -L "http://s3.amazonaws.com/ossci-windows/%CUDNN_INSTALL_ZIP%" --output "%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - if errorlevel 1 exit /b 1 - set "CUDNN_SETUP_FILE=%SRC_DIR%\temp_build\%CUDNN_INSTALL_ZIP%" - - rem Make sure windows path contains zlib dll - curl -k -L "http://s3.amazonaws.com/ossci-windows/zlib123dllx64.zip" --output "%SRC_DIR%\temp_build\zlib123dllx64.zip" - 7z x "%SRC_DIR%\temp_build\zlib123dllx64.zip" -o"%SRC_DIR%\temp_build\zlib" - xcopy /Y "%SRC_DIR%\temp_build\zlib\dll_x64\*.dll" "C:\Windows\System32" -) - -goto cuda_common - 
-:cuda_common - -if not exist "%SRC_DIR%\temp_build\NvToolsExt.7z" ( - curl -k -L https://www.dropbox.com/s/9mcolalfdj4n979/NvToolsExt.7z?dl=1 --output "%SRC_DIR%\temp_build\NvToolsExt.7z" - if errorlevel 1 exit /b 1 -) - -echo Installing CUDA toolkit... -7z x %CUDA_SETUP_FILE% -o"%SRC_DIR%\temp_build\cuda" -pushd "%SRC_DIR%\temp_build\cuda" -sc config wuauserv start= disabled -sc stop wuauserv -sc query wuauserv - -start /wait setup.exe -s %ARGS% -loglevel:6 -log:"%cd%/cuda_install_logs" -echo %errorlevel% - -popd - -echo Installing VS integration... -rem It's for VS 2019 -if "%CUDA_VER_MAJOR%" == "10" ( - xcopy /Y "%SRC_DIR%\temp_build\cuda\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations" -) -if "%CUDA_VER_MAJOR%" == "11" ( - xcopy /Y "%SRC_DIR%\temp_build\cuda\visual_studio_integration\CUDAVisualStudioIntegration\extras\visual_studio_integration\MSBuildExtensions\*.*" "C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\MSBuild\Microsoft\VC\v160\BuildCustomizations" -) - -echo Installing NvToolsExt... -7z x %SRC_DIR%\temp_build\NvToolsExt.7z -o"%SRC_DIR%\temp_build\NvToolsExt" -mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" -mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" -mkdir "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" -xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\bin\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" -xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\include\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\include" -xcopy /Y "%SRC_DIR%\temp_build\NvToolsExt\lib\x64\*.*" "%ProgramFiles%\NVIDIA Corporation\NvToolsExt\lib\x64" - -echo Setting up environment... -set "PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin;%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\libnvvp;%PATH%" -set "CUDA_PATH=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" -set "CUDA_PATH_V%CUDA_VER_MAJOR%_%CUDA_VER_MINOR%=%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%" -set "NVTOOLSEXT_PATH=%ProgramFiles%\NVIDIA Corporation\NvToolsExt\bin\x64" - -if not exist "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin\nvcc.exe" ( - echo CUDA %CUDA_VERSION_STR% installed failed. - echo --------- RunDll32.exe.log - type "%SRC_DIR%\temp_build\cuda\cuda_install_logs\LOG.RunDll32.exe.log" - echo --------- setup.exe.log ------- - type "%SRC_DIR%\temp_build\cuda\cuda_install_logs\LOG.setup.exe.log" - exit /b 1 -) - -echo Installing cuDNN... 
-7z x %CUDNN_SETUP_FILE% -o"%SRC_DIR%\temp_build\cudnn" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\bin\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\bin" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\%CUDNN_LIB_FOLDER%\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\lib\x64" -xcopy /Y "%SRC_DIR%\temp_build\cudnn\%CUDNN_FOLDER%\include\*.*" "%ProgramFiles%\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION_STR%\include" - -echo Cleaning temp files -rd /s /q "%SRC_DIR%\temp_build" || ver > nul diff --git a/packaging/windows/internal/driver_update.bat b/packaging/windows/internal/driver_update.bat deleted file mode 100644 index 00b43affc01cc302a3d6c527be197f1adcc0ba2f..0000000000000000000000000000000000000000 --- a/packaging/windows/internal/driver_update.bat +++ /dev/null @@ -1,25 +0,0 @@ -set "DRIVER_DOWNLOAD_LINK=https://ossci-windows.s3.amazonaws.com/461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe" -curl --retry 3 -kL %DRIVER_DOWNLOAD_LINK% --output 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -if errorlevel 1 exit /b 1 - -start /wait 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe -s -noreboot -if errorlevel 1 exit /b 1 - -del 461.09-data-center-tesla-desktop-winserver-2019-2016-international.exe || ver > NUL - -setlocal EnableDelayedExpansion -set NVIDIA_GPU_EXISTS=0 -for /F "delims=" %%i in ('wmic path win32_VideoController get name') do ( - set GPUS=%%i - if not "x!GPUS:NVIDIA=!" == "x!GPUS!" ( - SET NVIDIA_GPU_EXISTS=1 - goto gpu_check_end - ) -) -:gpu_check_end -endlocal & set NVIDIA_GPU_EXISTS=%NVIDIA_GPU_EXISTS% - -if "%NVIDIA_GPU_EXISTS%" == "0" ( - echo "CUDA Driver installation Failed" - exit /b 1 -) diff --git a/packaging/windows/internal/vc_env_helper.bat b/packaging/windows/internal/vc_env_helper.bat index e85a372f93d58c87107c7dc1e2d7aa2a5e423445..d3484a66e9f9021a06512a4a7888c7d9329c1029 100644 --- a/packaging/windows/internal/vc_env_helper.bat +++ b/packaging/windows/internal/vc_env_helper.bat @@ -1,7 +1,11 @@ @echo on -set VC_VERSION_LOWER=16 -set VC_VERSION_UPPER=17 +set VC_VERSION_LOWER=17 +set VC_VERSION_UPPER=18 +if "%VC_YEAR%" == "2019" ( + set VC_VERSION_LOWER=16 + set VC_VERSION_UPPER=17 +) if "%VC_YEAR%" == "2017" ( set VC_VERSION_LOWER=15 set VC_VERSION_UPPER=16 diff --git a/packaging/windows/internal/vc_install_helper.sh b/packaging/windows/internal/vc_install_helper.sh deleted file mode 100644 index cdae18065b9f6e97e385fa2002131ef857562306..0000000000000000000000000000000000000000 --- a/packaging/windows/internal/vc_install_helper.sh +++ /dev/null @@ -1,16 +0,0 @@ -#!/bin/bash - -set -ex - -if [[ "$CU_VERSION" == "cu92" ]]; then - export VC_YEAR=2017 - export VSDEVCMD_ARGS="-vcvars_ver=14.13" - powershell packaging/windows/internal/vs2017_install.ps1 -elif [[ "$CU_VERSION" == "cu100" ]]; then - export VC_YEAR=2017 - export VSDEVCMD_ARGS="" - powershell packaging/windows/internal/vs2017_install.ps1 -else - export VC_YEAR=2019 - export VSDEVCMD_ARGS="" -fi diff --git a/packaging/windows/internal/vs2017_install.ps1 b/packaging/windows/internal/vs2017_install.ps1 deleted file mode 100644 index 3e953de1ab7a0fa33238e10fbcd80564246c1a55..0000000000000000000000000000000000000000 --- a/packaging/windows/internal/vs2017_install.ps1 +++ /dev/null @@ -1,25 +0,0 @@ -$VS_DOWNLOAD_LINK = "https://aka.ms/vs/15/release/vs_buildtools.exe" -$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", 
- "--add Microsoft.VisualStudio.Component.VC.Tools.14.13", - "--add Microsoft.Component.MSBuild", - "--add Microsoft.VisualStudio.Component.Roslyn.Compiler", - "--add Microsoft.VisualStudio.Component.TextTemplating", - "--add Microsoft.VisualStudio.Component.VC.CoreIde", - "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest", - "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core", - "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64", - "--add Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Win81") - -curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe -if ($LASTEXITCODE -ne 0) { - echo "Download of the VS 2017 installer failed" - exit 1 -} - -$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru -Remove-Item -Path vs_installer.exe -Force -$exitCode = $process.ExitCode -if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { - echo "VS 2017 installer exited with code $exitCode, which should be one of [0, 3010]." - exit 1 -} diff --git a/packaging/windows/internal/vs2019_install.ps1 b/packaging/windows/internal/vs2019_install.ps1 deleted file mode 100644 index e436051f0dbb2ce9361f3d1c33295249ba032bb2..0000000000000000000000000000000000000000 --- a/packaging/windows/internal/vs2019_install.ps1 +++ /dev/null @@ -1,21 +0,0 @@ -$VS_DOWNLOAD_LINK = "https://aka.ms/vs/16/release/vs_buildtools.exe" -$VS_INSTALL_ARGS = @("--nocache","--quiet","--wait", "--add Microsoft.VisualStudio.Workload.VCTools", - "--add Microsoft.Component.MSBuild", - "--add Microsoft.VisualStudio.Component.Roslyn.Compiler", - "--add Microsoft.VisualStudio.Component.VC.CoreBuildTools", - "--add Microsoft.VisualStudio.Component.VC.Redist.14.Latest", - "--add Microsoft.VisualStudio.Component.VC.Tools.x86.x64") - -curl.exe --retry 3 -kL $VS_DOWNLOAD_LINK --output vs_installer.exe -if ($LASTEXITCODE -ne 0) { - echo "Download of the VS 2019 installer failed" - exit 1 -} - -$process = Start-Process "${PWD}\vs_installer.exe" -ArgumentList $VS_INSTALL_ARGS -NoNewWindow -Wait -PassThru -Remove-Item -Path vs_installer.exe -Force -$exitCode = $process.ExitCode -if (($exitCode -ne 0) -and ($exitCode -ne 3010)) { - echo "VS 2019 installer exited with code $exitCode, which should be one of [0, 3010]." 
- exit 1 -} diff --git a/pyproject.toml b/pyproject.toml index 8f0be4245bd978c8d945bc14e3c276c3b017cf12..61e4a957fc563f9503eb1ef52bb93a701b1fbcb1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,7 +5,7 @@ first_party_detection = false [tool.black] line-length = 120 -target-version = ["py37"] +target-version = ["py38"] [tool.ufmt] diff --git a/pytest.ini b/pytest.ini index ca7539448595a47ec17404e7eb07be60dc142e35..594f14964a1868374b67506025df277f4b7cfbc7 100644 --- a/pytest.ini +++ b/pytest.ini @@ -1,9 +1,9 @@ [pytest] addopts = - # show summary of all tests that did not pass - -ra + # show tests that (f)ailed, (E)rror, or (X)passed in the summary + -rfEX # Make tracebacks shorter - --tb=native + --tb=short # enable all warnings -Wd --ignore=test/test_datasets_download.py diff --git a/references/classification/README.md b/references/classification/README.md index da5cd98867de85b6c6e6d2f9dbc73750e8a63646..66ae871aedeec5af1f275690f64885911b015c8d 100644 --- a/references/classification/README.md +++ b/references/classification/README.md @@ -298,7 +298,7 @@ Here `$MODEL` is one of `googlenet`, `inception_v3`, `resnet18`, `resnet50`, `re ### Quantized ShuffleNet V2 -Here are commands that we use to quantized the `shufflenet_v2_x1_5` and `shufflenet_v2_x2_0` models. +Here are commands that we use to quantize the `shufflenet_v2_x1_5` and `shufflenet_v2_x2_0` models. ``` # For shufflenet_v2_x1_5 python train_quantization.py --device='cpu' --post-training-quantize --backend='fbgemm' \ diff --git a/references/classification/presets.py b/references/classification/presets.py index 5d1bf1cc71455fcb043538fecbad54050671d015..8653957a57646925f2f028041e3fc4b2e422ee94 100644 --- a/references/classification/presets.py +++ b/references/classification/presets.py @@ -1,9 +1,23 @@ import torch -from torchvision.transforms import autoaugment, transforms from torchvision.transforms.functional import InterpolationMode +def get_module(use_v2): + # We need a protected import to avoid the V2 warning in case just V1 is used + if use_v2: + import torchvision.transforms.v2 + + return torchvision.transforms.v2 + else: + import torchvision.transforms + + return torchvision.transforms + + class ClassificationPresetTrain: + # Note: this transform assumes that the input to forward() are always PIL + # images, regardless of the backend parameter. We may change that in the + # future though, if we change the output type from the dataset. 
def __init__( self, *, @@ -16,31 +30,48 @@ class ClassificationPresetTrain: ra_magnitude=9, augmix_severity=3, random_erase_prob=0.0, + backend="pil", + use_v2=False, ): - trans = [transforms.RandomResizedCrop(crop_size, interpolation=interpolation)] + T = get_module(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tensor' or 'pil', but got {backend}") + + transforms.append(T.RandomResizedCrop(crop_size, interpolation=interpolation, antialias=True)) if hflip_prob > 0: - trans.append(transforms.RandomHorizontalFlip(hflip_prob)) + transforms.append(T.RandomHorizontalFlip(hflip_prob)) if auto_augment_policy is not None: if auto_augment_policy == "ra": - trans.append(autoaugment.RandAugment(interpolation=interpolation, magnitude=ra_magnitude)) + transforms.append(T.RandAugment(interpolation=interpolation, magnitude=ra_magnitude)) elif auto_augment_policy == "ta_wide": - trans.append(autoaugment.TrivialAugmentWide(interpolation=interpolation)) + transforms.append(T.TrivialAugmentWide(interpolation=interpolation)) elif auto_augment_policy == "augmix": - trans.append(autoaugment.AugMix(interpolation=interpolation, severity=augmix_severity)) + transforms.append(T.AugMix(interpolation=interpolation, severity=augmix_severity)) else: - aa_policy = autoaugment.AutoAugmentPolicy(auto_augment_policy) - trans.append(autoaugment.AutoAugment(policy=aa_policy, interpolation=interpolation)) - trans.extend( + aa_policy = T.AutoAugmentPolicy(auto_augment_policy) + transforms.append(T.AutoAugment(policy=aa_policy, interpolation=interpolation)) + + if backend == "pil": + transforms.append(T.PILToTensor()) + + transforms.extend( [ - transforms.PILToTensor(), - transforms.ConvertImageDtype(torch.float), - transforms.Normalize(mean=mean, std=std), + T.ToDtype(torch.float, scale=True) if use_v2 else T.ConvertImageDtype(torch.float), + T.Normalize(mean=mean, std=std), ] ) if random_erase_prob > 0: - trans.append(transforms.RandomErasing(p=random_erase_prob)) + transforms.append(T.RandomErasing(p=random_erase_prob)) - self.transforms = transforms.Compose(trans) + if use_v2: + transforms.append(T.ToPureTensor()) + + self.transforms = T.Compose(transforms) def __call__(self, img): return self.transforms(img) @@ -55,17 +86,34 @@ class ClassificationPresetEval: mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), interpolation=InterpolationMode.BILINEAR, + backend="pil", + use_v2=False, ): + T = get_module(use_v2) + transforms = [] + backend = backend.lower() + if backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tensor' or 'pil', but got {backend}") - self.transforms = transforms.Compose( - [ - transforms.Resize(resize_size, interpolation=interpolation), - transforms.CenterCrop(crop_size), - transforms.PILToTensor(), - transforms.ConvertImageDtype(torch.float), - transforms.Normalize(mean=mean, std=std), - ] - ) + transforms += [ + T.Resize(resize_size, interpolation=interpolation, antialias=True), + T.CenterCrop(crop_size), + ] + + if backend == "pil": + transforms.append(T.PILToTensor()) + + transforms += [ + T.ToDtype(torch.float, scale=True) if use_v2 else T.ConvertImageDtype(torch.float), + T.Normalize(mean=mean, std=std), + ] + + if use_v2: + transforms.append(T.ToPureTensor()) + + self.transforms = T.Compose(transforms) def __call__(self, img): return self.transforms(img) diff --git 
a/references/classification/train.py b/references/classification/train.py index 00af63018316588228ebfbe8629bd59799acca36..1bb0d86e9a592c84cae85839251997ad67db1130 100644 --- a/references/classification/train.py +++ b/references/classification/train.py @@ -7,12 +7,13 @@ import presets import torch import torch.utils.data import torchvision -import transforms +import torchvision.transforms import utils from sampler import RASampler from torch import nn from torch.utils.data.dataloader import default_collate from torchvision.transforms.functional import InterpolationMode +from transforms import get_mixup_cutmix def train_one_epoch(model, criterion, optimizer, data_loader, device, epoch, args, model_ema=None, scaler=None): @@ -128,10 +129,12 @@ def load_data(traindir, valdir, args): print(f"Loading dataset_train from {cache_path}") dataset, _ = torch.load(cache_path) else: + # We need a default value for the variables below because args may come + # from train_quantization.py which doesn't define them. auto_augment_policy = getattr(args, "auto_augment", None) random_erase_prob = getattr(args, "random_erase", 0.0) - ra_magnitude = args.ra_magnitude - augmix_severity = args.augmix_severity + ra_magnitude = getattr(args, "ra_magnitude", None) + augmix_severity = getattr(args, "augmix_severity", None) dataset = torchvision.datasets.ImageFolder( traindir, presets.ClassificationPresetTrain( @@ -141,6 +144,8 @@ def load_data(traindir, valdir, args): random_erase_prob=random_erase_prob, ra_magnitude=ra_magnitude, augmix_severity=augmix_severity, + backend=args.backend, + use_v2=args.use_v2, ), ) if args.cache_dataset: @@ -158,10 +163,17 @@ def load_data(traindir, valdir, args): else: if args.weights and args.test_only: weights = torchvision.models.get_weight(args.weights) - preprocessing = weights.transforms() + preprocessing = weights.transforms(antialias=True) + if args.backend == "tensor": + preprocessing = torchvision.transforms.Compose([torchvision.transforms.PILToTensor(), preprocessing]) + else: preprocessing = presets.ClassificationPresetEval( - crop_size=val_crop_size, resize_size=val_resize_size, interpolation=interpolation + crop_size=val_crop_size, + resize_size=val_resize_size, + interpolation=interpolation, + backend=args.backend, + use_v2=args.use_v2, ) dataset_test = torchvision.datasets.ImageFolder( @@ -206,18 +218,17 @@ def main(args): val_dir = os.path.join(args.data_path, "val") dataset, dataset_test, train_sampler, test_sampler = load_data(train_dir, val_dir, args) - collate_fn = None num_classes = len(dataset.classes) - mixup_transforms = [] - if args.mixup_alpha > 0.0: - mixup_transforms.append(transforms.RandomMixup(num_classes, p=1.0, alpha=args.mixup_alpha)) - if args.cutmix_alpha > 0.0: - mixup_transforms.append(transforms.RandomCutmix(num_classes, p=1.0, alpha=args.cutmix_alpha)) - if mixup_transforms: - mixupcutmix = torchvision.transforms.RandomChoice(mixup_transforms) + mixup_cutmix = get_mixup_cutmix( + mixup_alpha=args.mixup_alpha, cutmix_alpha=args.cutmix_alpha, num_categories=num_classes, use_v2=args.use_v2 + ) + if mixup_cutmix is not None: def collate_fn(batch): - return mixupcutmix(*default_collate(batch)) + return mixup_cutmix(*default_collate(batch)) + + else: + collate_fn = default_collate data_loader = torch.utils.data.DataLoader( dataset, @@ -314,11 +325,11 @@ def main(args): model_ema = None if args.model_ema: - # Decay adjustment that aims to keep the decay independent from other hyper-parameters originally proposed at: + # Decay adjustment that aims to 
keep the decay independent of other hyper-parameters originally proposed at: # https://github.com/facebookresearch/pycls/blob/f8cd9627/pycls/core/net.py#L123 # # total_ema_updates = (Dataset_size / n_GPUs) * epochs / (batch_size_per_gpu * EMA_steps) - # We consider constant = Dataset_size for a given dataset/setup and ommit it. Thus: + # We consider constant = Dataset_size for a given dataset/setup and omit it. Thus: # adjust = 1 / total_ema_updates ~= n_GPUs * batch_size_per_gpu * EMA_steps / epochs adjust = args.world_size * args.batch_size * args.model_ema_steps / args.epochs alpha = 1.0 - args.model_ema_decay @@ -505,6 +516,8 @@ def get_args_parser(add_help=True): "--ra-reps", default=3, type=int, help="number of repetitions for Repeated Augmentation (default: 3)" ) parser.add_argument("--weights", default=None, type=str, help="the weights enum name to load") + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") return parser diff --git a/references/classification/transforms.py b/references/classification/transforms.py index 9a8ef7877d6d5525aead106695d495b7a05cb009..3d10388c36fc8af1374d235049f4e2c690fe436f 100644 --- a/references/classification/transforms.py +++ b/references/classification/transforms.py @@ -2,12 +2,35 @@ import math from typing import Tuple import torch +from presets import get_module from torch import Tensor from torchvision.transforms import functional as F -class RandomMixup(torch.nn.Module): - """Randomly apply Mixup to the provided batch and targets. +def get_mixup_cutmix(*, mixup_alpha, cutmix_alpha, num_categories, use_v2): + transforms_module = get_module(use_v2) + + mixup_cutmix = [] + if mixup_alpha > 0: + mixup_cutmix.append( + transforms_module.MixUp(alpha=mixup_alpha, num_categories=num_categories) + if use_v2 + else RandomMixUp(num_classes=num_categories, p=1.0, alpha=mixup_alpha) + ) + if cutmix_alpha > 0: + mixup_cutmix.append( + transforms_module.CutMix(alpha=mixup_alpha, num_categories=num_categories) + if use_v2 + else RandomCutMix(num_classes=num_categories, p=1.0, alpha=mixup_alpha) + ) + if not mixup_cutmix: + return None + + return transforms_module.RandomChoice(mixup_cutmix) + + +class RandomMixUp(torch.nn.Module): + """Randomly apply MixUp to the provided batch and targets. The class implements the data augmentations as described in the paper `"mixup: Beyond Empirical Risk Minimization" `_. @@ -89,8 +112,8 @@ class RandomMixup(torch.nn.Module): return s -class RandomCutmix(torch.nn.Module): - """Randomly apply Cutmix to the provided batch and targets. +class RandomCutMix(torch.nn.Module): + """Randomly apply CutMix to the provided batch and targets. The class implements the data augmentations as described in the paper `"CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features" `_. diff --git a/references/classification/utils.py b/references/classification/utils.py index c31f3928e8641acabf725be129895f0533ecd29e..3e6c2e89e39341d0d15aac17948c1f284f5d8353 100644 --- a/references/classification/utils.py +++ b/references/classification/utils.py @@ -365,12 +365,12 @@ def store_model_weights(model, checkpoint_path, checkpoint_key="model", strict=T checkpoint_path = os.path.abspath(checkpoint_path) output_dir = os.path.dirname(checkpoint_path) - # Deep copy to avoid side-effects on the model object. + # Deep copy to avoid side effects on the model object. 
    model = copy.deepcopy(model)
    checkpoint = torch.load(checkpoint_path, map_location="cpu")
    # Load the weights to the model to validate that everything works
-    # and remove unnecessary weights (such as auxiliaries, etc)
+    # and remove unnecessary weights (such as auxiliaries, etc.)
    if checkpoint_key == "model_ema":
        del checkpoint[checkpoint_key]["n_averaged"]
        torch.nn.modules.utils.consume_prefix_in_state_dict_if_present(checkpoint[checkpoint_key], "module.")
diff --git a/references/depth/stereo/README.md b/references/depth/stereo/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..22bcae27ab0e81cbd9899e5db185819f27c1f115
--- /dev/null
+++ b/references/depth/stereo/README.md
@@ -0,0 +1,180 @@
+# Stereo Matching reference training scripts
+
+This folder contains reference training scripts for Stereo Matching.
+They serve as a log of how to train specific models, so as to provide baseline
+training and evaluation scripts to quickly bootstrap research.
+
+
+### CREStereo
+
+The CREStereo model was trained on a dataset mixture of **CREStereo**, **ETH3D** and the additional split from **Middlebury2014**.
+A ratio of **88-6-6** was used in order to train a baseline weight set. We provide a multi-set variant as well.
+Both used 8 A100 GPUs and a batch size of 2 (so the effective batch size is 16). The
+rest of the hyper-parameters loosely follow the recipe from https://github.com/megvii-research/CREStereo.
+The original recipe trains for **300000** updates (or steps) on the dataset mixture. We modify the learning rate
+schedule to one that starts decaying much sooner. Throughout the experiments we found that this reduces
+overfitting at evaluation time and that gradient clipping helps stabilize the loss during a premature learning rate change.
+
+```
+torchrun --nproc_per_node 8 --nnodes 1 train.py \
+    --dataset-root $dataset_root \
+    --name $name_cre \
+    --model crestereo_base \
+    --train-datasets crestereo eth3d-train middlebury2014-other \
+    --dataset-steps 264000 18000 18000 \
+    --batch-size 2 \
+    --lr 0.0004 \
+    --min-lr 0.00002 \
+    --lr-decay-method cosine \
+    --warmup-steps 6000 \
+    --decay-after-steps 30000 \
+    --clip-grad-norm 1.0
+```
+
+We employ a multi-set fine-tuning stage where we uniformly sample from multiple datasets. Given that some of these datasets have extremely large images (``2048x2048`` or more) we opt for a very aggressive scale-range ``[0.2 - 0.8]`` such that as much of the original frame composition as possible is captured inside the ``384x512`` crop.
+
+```
+torchrun --nproc_per_node 8 --nnodes 1 train.py \
+    --dataset-root $dataset_root \
+    --name $name_things \
+    --model crestereo_base \
+    --train-datasets crestereo eth3d-train middlebury2014-other instereo2k fallingthings carla-highres sintel sceneflow-monkaa sceneflow-driving \
+    --dataset-steps 12000 12000 12000 12000 12000 12000 12000 12000 12000 \
+    --batch-size 2 \
+    --scale-range 0.2 0.8 \
+    --lr 0.0004 \
+    --lr-decay-method cosine \
+    --decay-after-steps 0 \
+    --warmup-steps 0 \
+    --min-lr 0.00002 \
+    --resume-path $checkpoint_dir/$name_cre.pth
+```
+
+
+### Evaluation
+
+Evaluating the base weights:
+
+```
+torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.CRESTEREO_ETH_MBL_V1
+```
+
+This should give an **mae of about 1.416** on the train set of `Middlebury2014`. Results may vary slightly depending on the batch size and the number of GPUs.
For the most accurate results use 1 GPU and `--batch-size 1`. The created log file should look like this, where the first key is the number of cascades and the nested key is the number of recursive iterations: + +``` +Dataset: middlebury2014-train @size: [384, 512]: +{ + 1: { + 2: {'mae': 2.363, 'rmse': 4.352, '1px': 0.611, '3px': 0.828, '5px': 0.891, 'relepe': 0.176, 'fl-all': 64.511} + 5: {'mae': 1.618, 'rmse': 3.71, '1px': 0.761, '3px': 0.879, '5px': 0.918, 'relepe': 0.154, 'fl-all': 77.128} + 10: {'mae': 1.416, 'rmse': 3.53, '1px': 0.777, '3px': 0.896, '5px': 0.933, 'relepe': 0.148, 'fl-all': 78.388} + 20: {'mae': 1.448, 'rmse': 3.583, '1px': 0.771, '3px': 0.893, '5px': 0.931, 'relepe': 0.145, 'fl-all': 77.7} + }, +} +{ + 2: { + 2: {'mae': 1.972, 'rmse': 4.125, '1px': 0.73, '3px': 0.865, '5px': 0.908, 'relepe': 0.169, 'fl-all': 74.396} + 5: {'mae': 1.403, 'rmse': 3.448, '1px': 0.793, '3px': 0.905, '5px': 0.937, 'relepe': 0.151, 'fl-all': 80.186} + 10: {'mae': 1.312, 'rmse': 3.368, '1px': 0.799, '3px': 0.912, '5px': 0.943, 'relepe': 0.148, 'fl-all': 80.379} + 20: {'mae': 1.376, 'rmse': 3.542, '1px': 0.796, '3px': 0.91, '5px': 0.942, 'relepe': 0.149, 'fl-all': 80.054} + }, +} +``` + +You can also evaluate the Finetuned weights: + +``` +torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.CRESTEREO_FINETUNE_MULTI_V1 +``` + +``` +Dataset: middlebury2014-train @size: [384, 512]: +{ + 1: { + 2: {'mae': 1.85, 'rmse': 3.797, '1px': 0.673, '3px': 0.862, '5px': 0.917, 'relepe': 0.171, 'fl-all': 69.736} + 5: {'mae': 1.111, 'rmse': 3.166, '1px': 0.838, '3px': 0.93, '5px': 0.957, 'relepe': 0.134, 'fl-all': 84.596} + 10: {'mae': 1.02, 'rmse': 3.073, '1px': 0.854, '3px': 0.938, '5px': 0.96, 'relepe': 0.129, 'fl-all': 86.042} + 20: {'mae': 0.993, 'rmse': 3.059, '1px': 0.855, '3px': 0.942, '5px': 0.967, 'relepe': 0.126, 'fl-all': 85.784} + }, +} +{ + 2: { + 2: {'mae': 1.667, 'rmse': 3.867, '1px': 0.78, '3px': 0.891, '5px': 0.922, 'relepe': 0.165, 'fl-all': 78.89} + 5: {'mae': 1.158, 'rmse': 3.278, '1px': 0.843, '3px': 0.926, '5px': 0.955, 'relepe': 0.135, 'fl-all': 84.556} + 10: {'mae': 1.046, 'rmse': 3.13, '1px': 0.85, '3px': 0.934, '5px': 0.96, 'relepe': 0.13, 'fl-all': 85.464} + 20: {'mae': 1.021, 'rmse': 3.102, '1px': 0.85, '3px': 0.935, '5px': 0.963, 'relepe': 0.129, 'fl-all': 85.417} + }, +} +``` + +Evaluating the author provided weights: + +``` +torchrun --nproc_per_node 1 --nnodes 1 cascade_evaluation.py --dataset middlebury2014-train --batch-size 1 --dataset-root $dataset_root --model crestereo_base --weights CREStereo_Base_Weights.MEGVII_V1 +``` + +``` +Dataset: middlebury2014-train @size: [384, 512]: +{ + 1: { + 2: {'mae': 1.704, 'rmse': 3.738, '1px': 0.738, '3px': 0.896, '5px': 0.933, 'relepe': 0.157, 'fl-all': 76.464} + 5: {'mae': 0.956, 'rmse': 2.963, '1px': 0.88, '3px': 0.948, '5px': 0.965, 'relepe': 0.124, 'fl-all': 88.186} + 10: {'mae': 0.792, 'rmse': 2.765, '1px': 0.905, '3px': 0.958, '5px': 0.97, 'relepe': 0.114, 'fl-all': 90.429} + 20: {'mae': 0.749, 'rmse': 2.706, '1px': 0.907, '3px': 0.961, '5px': 0.972, 'relepe': 0.113, 'fl-all': 90.807} + }, +} +{ + 2: { + 2: {'mae': 1.702, 'rmse': 3.784, '1px': 0.784, '3px': 0.894, '5px': 0.924, 'relepe': 0.172, 'fl-all': 80.313} + 5: {'mae': 0.932, 'rmse': 2.907, '1px': 0.877, '3px': 0.944, '5px': 0.963, 'relepe': 0.125, 'fl-all': 87.979} + 10: {'mae': 0.773, 'rmse': 2.768, '1px': 0.901, '3px': 0.958, 
'5px': 0.972, 'relepe': 0.117, 'fl-all': 90.43}
+        20: {'mae': 0.854, 'rmse': 2.971, '1px': 0.9, '3px': 0.957, '5px': 0.97, 'relepe': 0.122, 'fl-all': 90.269}
+    },
+}
+```
+
+# Concerns when training
+
+We encourage users to be aware of the **aspect-ratio** and **disparity scale** they are targeting when doing any sort of training or fine-tuning. The model is highly sensitive to these two factors; as a consequence of naive multi-set fine-tuning, one can achieve `0.2 mae` relatively fast. We recommend that users pay close attention to how they **balance dataset sizing** when training such networks.
+
+ Ideally, dataset scaling should be treated at an individual level and a thorough **EDA** of the disparity distribution in random crops at the desired training / inference size should be performed prior to any large compute investments.
+
+### Disparity scaling
+
+##### Sample A
+ The top row contains a sample from `Sintel` whereas the bottom row contains one from `Middlebury`.
+
+![Disparity1](assets/disparity-domain-drift.jpg)
+
+From left to right (`left_image`, `right_image`, `valid_mask`, `valid_mask & ground_truth`, `prediction`). **Darker is further away, lighter is closer**. In the case of `Sintel`, which is more closely aligned to the original distribution of `CREStereo`, we notice that the model accurately predicts the background scale, whereas in the case of `Middlebury2014` it cannot correctly estimate the continuous disparity. Notice that the frame composition is similar for both examples. The blue skybox in the `Sintel` scene behaves similarly to the `Middlebury` black background. However, because the `Middlebury` sample comes from an extremely large scene, the crop size of `384x512` does not correctly capture the general training distribution.
+
+
+
+##### Sample B
+
+The top row contains a scene from `Sceneflow` using the `Monkaa` split whilst the bottom row is a scene from `Middlebury`. This sample exhibits the same issues when it comes to **background estimation**. Given the exaggerated size of the `Middlebury` samples, the model **collapses the smooth background** of the sample to what it considers to be a mean background disparity value.
+
+![Disparity2](assets/disparity-background-mode-collapse.jpg)
+
+
+For more detail on why this behaviour occurs based on the training distribution proportions, you can read more about the network at: https://github.com/pytorch/vision/pull/6629#discussion_r978160493
+
+
+### Metric overfitting
+
+##### Learning is critical in the beginning
+
+We also advise users to make use of faster training schedules, as the performance gain over long periods of time is marginal. Here we exhibit the difference between an earlier decay schedule and a later decay schedule.
+
+![Loss1](assets/Loss.jpg)
+
+In **grey** we set the lr decay to begin after `30000` steps whilst in **orange** we opt for a very late learning rate decay at around `180000` steps. Although exhibiting stronger variance, we can notice that starting the learning rate decay earlier whilst employing `gradient-norm` clipping outperforms the default configuration.
+
+##### Gradient norm saves time
+
+![Loss2](assets/gradient-norm-removal.jpg)
+
+In **grey** we keep ``gradient norm`` enabled whilst in **orange** we do not. We can notice that removing the gradient norm exacerbates the performance decrease in the early stages, whilst also showcasing an almost complete collapse around the `60000` steps mark where we started decaying the lr for **orange**.
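+
+The schedule and clipping discussed in this section boil down to a few lines of PyTorch. The snippet below is only an illustrative sketch of that setup (the model, the loss and the exact factors are placeholders, not the reference `train.py` implementation); the step counts mirror the `--warmup-steps`, `--decay-after-steps`, `--min-lr` and `--clip-grad-norm` flags used above.
+
+```
+import torch
+
+model = torch.nn.Conv2d(3, 2, 3)  # stand-in for the stereo network
+optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004)
+
+# warmup -> flat -> cosine decay, chained with SequentialLR
+warmup, decay_after, total = 6000, 30000, 300000
+scheduler = torch.optim.lr_scheduler.SequentialLR(
+    optimizer,
+    schedulers=[
+        torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.05, total_iters=warmup),
+        torch.optim.lr_scheduler.ConstantLR(optimizer, factor=1.0, total_iters=decay_after - warmup),
+        torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=total - decay_after, eta_min=0.00002),
+    ],
+    milestones=[warmup, decay_after],
+)
+
+for step in range(total):
+    loss = model(torch.rand(2, 3, 384, 512)).abs().mean()  # placeholder loss
+    optimizer.zero_grad()
+    loss.backward()
+    # gradient-norm clipping, i.e. what --clip-grad-norm 1.0 enables
+    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
+    optimizer.step()
+    scheduler.step()
+```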
+
+Although both runs achieve an improvement of about ``0.1`` mae after the lr decay start, the benefits are observable much faster when ``gradient norm`` is employed, as the recovery period is no longer accounted for.
diff --git a/references/depth/stereo/cascade_evaluation.py b/references/depth/stereo/cascade_evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee506ce398551537ad38a9a1f5eb01eb52c4ec7e
--- /dev/null
+++ b/references/depth/stereo/cascade_evaluation.py
@@ -0,0 +1,299 @@
+import os
+import warnings
+
+import torch
+import torchvision
+import torchvision.prototype.models.depth.stereo
+import utils
+from torch.nn import functional as F
+from train import make_eval_loader
+
+from utils.metrics import AVAILABLE_METRICS
+from visualization import make_prediction_image_side_to_side
+
+
+def get_args_parser(add_help=True):
+    import argparse
+
+    parser = argparse.ArgumentParser(description="PyTorch Stereo Matching Evaluation", add_help=add_help)
+    parser.add_argument("--dataset", type=str, default="middlebury2014-train", help="dataset to use")
+    parser.add_argument("--dataset-root", type=str, default="", help="root of the dataset")
+
+    parser.add_argument("--checkpoint", type=str, default="", help="path to weights")
+    parser.add_argument("--weights", type=str, default=None, help="torchvision API weight")
+    parser.add_argument(
+        "--model",
+        type=str,
+        default="crestereo_base",
+        help="which model to use if not specifying a training checkpoint",
+    )
+    parser.add_argument("--img-folder", type=str, default="images")
+
+    parser.add_argument("--batch-size", type=int, default=1, help="batch size")
+    parser.add_argument("--workers", type=int, default=0, help="number of workers")
+
+    parser.add_argument("--eval-size", type=int, nargs="+", default=[384, 512], help="resize size")
+    parser.add_argument(
+        "--norm-mean", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="mean for image normalization"
+    )
+    parser.add_argument(
+        "--norm-std", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="std for image normalization"
+    )
+    parser.add_argument(
+        "--use-grayscale", action="store_true", help="use grayscale images instead of RGB", default=False
+    )
+    parser.add_argument("--max-disparity", type=float, default=None, help="maximum disparity")
+    parser.add_argument(
+        "--interpolation-strategy",
+        type=str,
+        default="bilinear",
+        help="interpolation strategy",
+        choices=["bilinear", "bicubic", "mixed"],
+    )
+
+    parser.add_argument("--n_iterations", nargs="+", type=int, default=[10], help="number of recurrent iterations")
+    parser.add_argument("--n_cascades", nargs="+", type=int, default=[1], help="number of cascades")
+    parser.add_argument(
+        "--metrics",
+        type=str,
+        nargs="+",
+        default=["mae", "rmse", "1px", "3px", "5px", "relepe"],
+        help="metrics to log",
+        choices=AVAILABLE_METRICS,
+    )
+    parser.add_argument("--mixed-precision", action="store_true", help="use mixed precision training")
+
+    parser.add_argument("--world-size", type=int, default=1, help="number of distributed processes")
+    parser.add_argument("--dist-url", type=str, default="env://", help="url used to set up distributed training")
+    parser.add_argument("--device", type=str, default="cuda", help="device to use for training")
+
+    parser.add_argument("--save-images", action="store_true", help="save images of the predictions")
+    parser.add_argument("--padder-type", type=str, default="kitti", help="padder type", choices=["kitti", "sintel"])
+
+    return parser
+
+
+def
cascade_inference(model, image_left, image_right, iterations, cascades): + # check that image size is divisible by 16 * (2 ** (cascades - 1)) + for image in [image_left, image_right]: + if image.shape[-2] % ((2 ** (cascades - 1))) != 0: + raise ValueError( + f"image height is not divisible by {16 * (2 ** (cascades - 1))}. Image shape: {image.shape[-2]}" + ) + + if image.shape[-1] % ((2 ** (cascades - 1))) != 0: + raise ValueError( + f"image width is not divisible by {16 * (2 ** (cascades - 1))}. Image shape: {image.shape[-2]}" + ) + + left_image_pyramid = [image_left] + right_image_pyramid = [image_right] + for idx in range(0, cascades - 1): + ds_factor = int(2 ** (idx + 1)) + ds_shape = (image_left.shape[-2] // ds_factor, image_left.shape[-1] // ds_factor) + left_image_pyramid += F.interpolate(image_left, size=ds_shape, mode="bilinear", align_corners=True).unsqueeze(0) + right_image_pyramid += F.interpolate(image_right, size=ds_shape, mode="bilinear", align_corners=True).unsqueeze( + 0 + ) + + flow_init = None + for left_image, right_image in zip(reversed(left_image_pyramid), reversed(right_image_pyramid)): + flow_pred = model(left_image, right_image, flow_init, num_iters=iterations) + # flow pred is a list + flow_init = flow_pred[-1] + + return flow_init + + +@torch.inference_mode() +def _evaluate( + model, + args, + val_loader, + *, + padder_mode, + print_freq=10, + writer=None, + step=None, + iterations=10, + cascades=1, + batch_size=None, + header=None, + save_images=False, + save_path="", +): + """Helper function to compute various metrics (epe, etc.) for a model on a given dataset. + We process as many samples as possible with ddp. + """ + model.eval() + header = header or "Test:" + device = torch.device(args.device) + metric_logger = utils.MetricLogger(delimiter=" ") + + iterations = iterations or args.recurrent_updates + + logger = utils.MetricLogger() + for meter_name in args.metrics: + logger.add_meter(meter_name, fmt="{global_avg:.4f}") + if "fl-all" not in args.metrics: + logger.add_meter("fl-all", fmt="{global_avg:.4f}") + + num_processed_samples = 0 + with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): + batch_idx = 0 + for blob in metric_logger.log_every(val_loader, print_freq, header): + image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob) + padder = utils.InputPadder(image_left.shape, mode=padder_mode) + image_left, image_right = padder.pad(image_left, image_right) + + disp_pred = cascade_inference(model, image_left, image_right, iterations, cascades) + disp_pred = disp_pred[:, :1, :, :] + disp_pred = padder.unpad(disp_pred) + + if save_images: + if args.distributed: + rank_prefix = args.rank + else: + rank_prefix = 0 + make_prediction_image_side_to_side( + disp_pred, disp_gt, valid_disp_mask, save_path, prefix=f"batch_{rank_prefix}_{batch_idx}" + ) + + metrics, _ = utils.compute_metrics(disp_pred, disp_gt, valid_disp_mask, metrics=logger.meters.keys()) + num_processed_samples += image_left.shape[0] + for name in metrics: + logger.meters[name].update(metrics[name], n=1) + + batch_idx += 1 + + num_processed_samples = utils.reduce_across_processes(num_processed_samples) / args.world_size + + print("Num_processed_samples: ", num_processed_samples) + if ( + hasattr(val_loader.dataset, "__len__") + and len(val_loader.dataset) != num_processed_samples + and torch.distributed.get_rank() == 0 + ): + warnings.warn( + f"Number of processed samples {num_processed_samples} is different" + f"from the dataset size 
{len(val_loader.dataset)}. This may happen if" + "the dataset is not divisible by the batch size. Try lowering the batch size for more accurate results." + ) + + if writer is not None and args.rank == 0: + for meter_name, meter_value in logger.meters.items(): + scalar_name = f"{meter_name} {header}" + writer.add_scalar(scalar_name, meter_value.avg, step) + + logger.synchronize_between_processes() + print(header, logger) + + logger_metrics = {k: v.global_avg for k, v in logger.meters.items()} + return logger_metrics + + +def evaluate(model, loader, args, writer=None, step=None): + os.makedirs(args.img_folder, exist_ok=True) + checkpoint_name = os.path.basename(args.checkpoint) or args.weights + image_checkpoint_folder = os.path.join(args.img_folder, checkpoint_name) + + metrics = {} + base_image_folder = os.path.join(image_checkpoint_folder, args.dataset) + os.makedirs(base_image_folder, exist_ok=True) + + for n_cascades in args.n_cascades: + for n_iters in args.n_iterations: + + config = f"{n_cascades}c_{n_iters}i" + config_image_folder = os.path.join(base_image_folder, config) + os.makedirs(config_image_folder, exist_ok=True) + + metrics[config] = _evaluate( + model, + args, + loader, + padder_mode=args.padder_type, + header=f"{args.dataset} evaluation@ size:{args.eval_size} n_cascades:{n_cascades} n_iters:{n_iters}", + batch_size=args.batch_size, + writer=writer, + step=step, + iterations=n_iters, + cascades=n_cascades, + save_path=config_image_folder, + save_images=args.save_images, + ) + + metric_log = [] + metric_log_dict = {} + # print the final results + for config in metrics: + config_tokens = config.split("_") + config_iters = config_tokens[1][:-1] + config_cascades = config_tokens[0][:-1] + + metric_log_dict[config_cascades] = metric_log_dict.get(config_cascades, {}) + metric_log_dict[config_cascades][config_iters] = metrics[config] + + evaluation_str = f"{args.dataset} evaluation@ size:{args.eval_size} n_cascades:{config_cascades} recurrent_updates:{config_iters}" + metrics_str = f"Metrics: {metrics[config]}" + metric_log.extend([evaluation_str, metrics_str]) + + print(evaluation_str) + print(metrics_str) + + eval_log_name = f"{checkpoint_name.replace('.pth', '')}_eval.log" + print("Saving eval log to: ", eval_log_name) + with open(eval_log_name, "w") as f: + f.write(f"Dataset: {args.dataset} @size: {args.eval_size}:\n") + # write the dict line by line for each key, and each value in the keys + for config_cascades in metric_log_dict: + f.write("{\n") + f.write(f"\t{config_cascades}: {{\n") + for config_iters in metric_log_dict[config_cascades]: + # convert every metric to 4 decimal places + metrics = metric_log_dict[config_cascades][config_iters] + metrics = {k: float(f"{v:.3f}") for k, v in metrics.items()} + f.write(f"\t\t{config_iters}: {metrics}\n") + f.write("\t},\n") + f.write("}\n") + + +def load_checkpoint(args): + utils.setup_ddp(args) + + if not args.weights: + checkpoint = torch.load(args.checkpoint, map_location=torch.device("cpu")) + if "model" in checkpoint: + experiment_args = checkpoint["args"] + model = torchvision.prototype.models.depth.stereo.__dict__[experiment_args.model](weights=None) + model.load_state_dict(checkpoint["model"]) + else: + model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=None) + model.load_state_dict(checkpoint) + + # set the appropriate devices + if args.distributed and args.device == "cpu": + raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun") + device = 
torch.device(args.device) + else: + model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=args.weights) + + # convert to DDP if need be + if args.distributed: + model = model.to(args.device) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + else: + model.to(device) + + return model + + +def main(args): + model = load_checkpoint(args) + loader = make_eval_loader(args.dataset, args) + evaluate(model, loader, args) + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + main(args) diff --git a/references/depth/stereo/train.py b/references/depth/stereo/train.py new file mode 100644 index 0000000000000000000000000000000000000000..30c73628c619b9d0443640615e6c59a3cd678fad --- /dev/null +++ b/references/depth/stereo/train.py @@ -0,0 +1,788 @@ +import argparse +import os +import warnings +from pathlib import Path +from typing import List, Union + +import numpy as np +import torch +import torch.distributed as dist +import torchvision.models.optical_flow +import torchvision.prototype.models.depth.stereo +import utils +import visualization + +from parsing import make_dataset, make_eval_transform, make_train_transform, VALID_DATASETS +from torch import nn +from torchvision.transforms.functional import get_dimensions, InterpolationMode, resize +from utils.metrics import AVAILABLE_METRICS +from utils.norm import freeze_batch_norm + + +def make_stereo_flow(flow: Union[torch.Tensor, List[torch.Tensor]], model_out_channels: int) -> torch.Tensor: + """Helper function to make stereo flow from a given model output""" + if isinstance(flow, list): + return [make_stereo_flow(flow_i, model_out_channels) for flow_i in flow] + + B, C, H, W = flow.shape + # we need to add zero flow if the model outputs 2 channels + if C == 1 and model_out_channels == 2: + zero_flow = torch.zeros_like(flow) + # by convention the flow is X-Y axis, so we need the Y flow last + flow = torch.cat([flow, zero_flow], dim=1) + return flow + + +def make_lr_schedule(args: argparse.Namespace, optimizer: torch.optim.Optimizer) -> np.ndarray: + """Helper function to return a learning rate scheduler for CRE-stereo""" + if args.decay_after_steps < args.warmup_steps: + raise ValueError(f"decay_after_steps: {args.function} must be greater than warmup_steps: {args.warmup_steps}") + + warmup_steps = args.warmup_steps if args.warmup_steps else 0 + flat_lr_steps = args.decay_after_steps - warmup_steps if args.decay_after_steps else 0 + decay_lr_steps = args.total_iterations - flat_lr_steps + + max_lr = args.lr + min_lr = args.min_lr + + schedulers = [] + milestones = [] + + if warmup_steps > 0: + if args.lr_warmup_method == "linear": + warmup_lr_scheduler = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=args.lr_warmup_factor, total_iters=warmup_steps + ) + elif args.lr_warmup_method == "constant": + warmup_lr_scheduler = torch.optim.lr_scheduler.ConstantLR( + optimizer, factor=args.lr_warmup_factor, total_iters=warmup_steps + ) + else: + raise ValueError(f"Unknown lr warmup method {args.lr_warmup_method}") + schedulers.append(warmup_lr_scheduler) + milestones.append(warmup_steps) + + if flat_lr_steps > 0: + flat_lr_scheduler = torch.optim.lr_scheduler.ConstantLR(optimizer, factor=max_lr, total_iters=flat_lr_steps) + schedulers.append(flat_lr_scheduler) + milestones.append(flat_lr_steps + warmup_steps) + + if decay_lr_steps > 0: + if args.lr_decay_method == "cosine": + decay_lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( + optimizer, T_max=decay_lr_steps, 
eta_min=min_lr + ) + elif args.lr_decay_method == "linear": + decay_lr_scheduler = torch.optim.lr_scheduler.LinearLR( + optimizer, start_factor=max_lr, end_factor=min_lr, total_iters=decay_lr_steps + ) + elif args.lr_decay_method == "exponential": + decay_lr_scheduler = torch.optim.lr_scheduler.ExponentialLR( + optimizer, gamma=args.lr_decay_gamma, last_epoch=-1 + ) + else: + raise ValueError(f"Unknown lr decay method {args.lr_decay_method}") + schedulers.append(decay_lr_scheduler) + + scheduler = torch.optim.lr_scheduler.SequentialLR(optimizer, schedulers, milestones=milestones) + return scheduler + + +def shuffle_dataset(dataset): + """Shuffle the dataset""" + perm = torch.randperm(len(dataset)) + return torch.utils.data.Subset(dataset, perm) + + +def resize_dataset_to_n_steps( + dataset: torch.utils.data.Dataset, dataset_steps: int, samples_per_step: int, args: argparse.Namespace +) -> torch.utils.data.Dataset: + original_size = len(dataset) + if args.steps_is_epochs: + samples_per_step = original_size + target_size = dataset_steps * samples_per_step + + dataset_copies = [] + n_expands, remainder = divmod(target_size, original_size) + for idx in range(n_expands): + dataset_copies.append(dataset) + + if remainder > 0: + dataset_copies.append(torch.utils.data.Subset(dataset, list(range(remainder)))) + + if args.dataset_shuffle: + dataset_copies = [shuffle_dataset(dataset_copy) for dataset_copy in dataset_copies] + + dataset = torch.utils.data.ConcatDataset(dataset_copies) + return dataset + + +def get_train_dataset(dataset_root: str, args: argparse.Namespace) -> torch.utils.data.Dataset: + datasets = [] + for dataset_name in args.train_datasets: + transform = make_train_transform(args) + dataset = make_dataset(dataset_name, dataset_root, transform) + datasets.append(dataset) + + if len(datasets) == 0: + raise ValueError("No datasets specified for training") + + samples_per_step = args.world_size * args.batch_size + + for idx, (dataset, steps_per_dataset) in enumerate(zip(datasets, args.dataset_steps)): + datasets[idx] = resize_dataset_to_n_steps(dataset, steps_per_dataset, samples_per_step, args) + + dataset = torch.utils.data.ConcatDataset(datasets) + if args.dataset_order_shuffle: + dataset = shuffle_dataset(dataset) + + print(f"Training dataset: {len(dataset)} samples") + return dataset + + +@torch.inference_mode() +def _evaluate( + model, + args, + val_loader, + *, + padder_mode, + print_freq=10, + writer=None, + step=None, + iterations=None, + batch_size=None, + header=None, +): + """Helper function to compute various metrics (epe, etc.) 
for a model on a given dataset.""" + model.eval() + header = header or "Test:" + device = torch.device(args.device) + metric_logger = utils.MetricLogger(delimiter=" ") + + iterations = iterations or args.recurrent_updates + + logger = utils.MetricLogger() + for meter_name in args.metrics: + logger.add_meter(meter_name, fmt="{global_avg:.4f}") + if "fl-all" not in args.metrics: + logger.add_meter("fl-all", fmt="{global_avg:.4f}") + + num_processed_samples = 0 + with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): + for blob in metric_logger.log_every(val_loader, print_freq, header): + image_left, image_right, disp_gt, valid_disp_mask = (x.to(device) for x in blob) + padder = utils.InputPadder(image_left.shape, mode=padder_mode) + image_left, image_right = padder.pad(image_left, image_right) + + disp_predictions = model(image_left, image_right, flow_init=None, num_iters=iterations) + disp_pred = disp_predictions[-1][:, :1, :, :] + disp_pred = padder.unpad(disp_pred) + + metrics, _ = utils.compute_metrics(disp_pred, disp_gt, valid_disp_mask, metrics=logger.meters.keys()) + num_processed_samples += image_left.shape[0] + for name in metrics: + logger.meters[name].update(metrics[name], n=1) + + num_processed_samples = utils.reduce_across_processes(num_processed_samples) + + print("Num_processed_samples: ", num_processed_samples) + if ( + hasattr(val_loader.dataset, "__len__") + and len(val_loader.dataset) != num_processed_samples + and torch.distributed.get_rank() == 0 + ): + warnings.warn( + f"Number of processed samples {num_processed_samples} is different" + f"from the dataset size {len(val_loader.dataset)}. This may happen if" + "the dataset is not divisible by the batch size. Try lowering the batch size or GPU number for more accurate results." 
+ ) + + if writer is not None and args.rank == 0: + for meter_name, meter_value in logger.meters.items(): + scalar_name = f"{meter_name} {header}" + writer.add_scalar(scalar_name, meter_value.avg, step) + + logger.synchronize_between_processes() + print(header, logger) + + +def make_eval_loader(dataset_name: str, args: argparse.Namespace) -> torch.utils.data.DataLoader: + if args.weights: + weights = torchvision.models.get_weight(args.weights) + trans = weights.transforms() + + def preprocessing(image_left, image_right, disp, valid_disp_mask): + C_o, H_o, W_o = get_dimensions(image_left) + image_left, image_right = trans(image_left, image_right) + + C_t, H_t, W_t = get_dimensions(image_left) + scale_factor = W_t / W_o + + if disp is not None and not isinstance(disp, torch.Tensor): + disp = torch.from_numpy(disp) + if W_t != W_o: + disp = resize(disp, (H_t, W_t), mode=InterpolationMode.BILINEAR) * scale_factor + if valid_disp_mask is not None and not isinstance(valid_disp_mask, torch.Tensor): + valid_disp_mask = torch.from_numpy(valid_disp_mask) + if W_t != W_o: + valid_disp_mask = resize(valid_disp_mask, (H_t, W_t), mode=InterpolationMode.NEAREST) + return image_left, image_right, disp, valid_disp_mask + + else: + preprocessing = make_eval_transform(args) + + val_dataset = make_dataset(dataset_name, args.dataset_root, transforms=preprocessing) + if args.distributed: + sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=False) + else: + sampler = torch.utils.data.SequentialSampler(val_dataset) + + val_loader = torch.utils.data.DataLoader( + val_dataset, + sampler=sampler, + batch_size=args.batch_size, + pin_memory=True, + num_workers=args.workers, + ) + + return val_loader + + +def evaluate(model, loaders, args, writer=None, step=None): + for loader_name, loader in loaders.items(): + _evaluate( + model, + args, + loader, + iterations=args.recurrent_updates, + padder_mode=args.padder_type, + header=f"{loader_name} evaluation", + batch_size=args.batch_size, + writer=writer, + step=step, + ) + + +def run(model, optimizer, scheduler, train_loader, val_loaders, logger, writer, scaler, args): + device = torch.device(args.device) + # wrap the loader in a logger + loader = iter(logger.log_every(train_loader)) + # output channels + model_out_channels = model.module.output_channels if args.distributed else model.output_channels + + torch.set_num_threads(args.threads) + + sequence_criterion = utils.SequenceLoss( + gamma=args.gamma, + max_flow=args.max_disparity, + exclude_large_flows=args.flow_loss_exclude_large, + ).to(device) + + if args.consistency_weight: + consistency_criterion = utils.FlowSequenceConsistencyLoss( + args.gamma, + resize_factor=0.25, + rescale_factor=0.25, + rescale_mode="bilinear", + ).to(device) + else: + consistency_criterion = None + + if args.psnr_weight: + psnr_criterion = utils.PSNRLoss().to(device) + else: + psnr_criterion = None + + if args.smoothness_weight: + smoothness_criterion = utils.SmoothnessLoss().to(device) + else: + smoothness_criterion = None + + if args.photometric_weight: + photometric_criterion = utils.FlowPhotoMetricLoss( + ssim_weight=args.photometric_ssim_weight, + max_displacement_ratio=args.photometric_max_displacement_ratio, + ssim_use_padding=False, + ).to(device) + else: + photometric_criterion = None + + for step in range(args.start_step + 1, args.total_iterations + 1): + data_blob = next(loader) + optimizer.zero_grad() + + # unpack the data blob + image_left, image_right, disp_mask, valid_disp_mask = 
(x.to(device) for x in data_blob) + with torch.cuda.amp.autocast(enabled=args.mixed_precision, dtype=torch.float16): + disp_predictions = model(image_left, image_right, flow_init=None, num_iters=args.recurrent_updates) + # different models have different outputs, make sure we get the right ones for this task + disp_predictions = make_stereo_flow(disp_predictions, model_out_channels) + # should the architecture or training loop require it, we have to adjust the disparity mask + # target to possibly look like an optical flow mask + disp_mask = make_stereo_flow(disp_mask, model_out_channels) + # sequence loss on top of the model outputs + + loss = sequence_criterion(disp_predictions, disp_mask, valid_disp_mask) * args.flow_loss_weight + + if args.consistency_weight > 0: + loss_consistency = consistency_criterion(disp_predictions) + loss += loss_consistency * args.consistency_weight + + if args.psnr_weight > 0: + loss_psnr = 0.0 + for pred in disp_predictions: + # predictions might have 2 channels + loss_psnr += psnr_criterion( + pred * valid_disp_mask.unsqueeze(1), + disp_mask * valid_disp_mask.unsqueeze(1), + ).mean() # mean the psnr loss over the batch + loss += loss_psnr / len(disp_predictions) * args.psnr_weight + + if args.photometric_weight > 0: + loss_photometric = 0.0 + for pred in disp_predictions: + # predictions might have 1 channel, therefore we need to inpute 0s for the second channel + if model_out_channels == 1: + pred = torch.cat([pred, torch.zeros_like(pred)], dim=1) + + loss_photometric += photometric_criterion( + image_left, image_right, pred, valid_disp_mask + ) # photometric loss already comes out meaned over the batch + loss += loss_photometric / len(disp_predictions) * args.photometric_weight + + if args.smoothness_weight > 0: + loss_smoothness = 0.0 + for pred in disp_predictions: + # predictions might have 2 channels + loss_smoothness += smoothness_criterion( + image_left, pred[:, :1, :, :] + ).mean() # mean the smoothness loss over the batch + loss += loss_smoothness / len(disp_predictions) * args.smoothness_weight + + with torch.no_grad(): + metrics, _ = utils.compute_metrics( + disp_predictions[-1][:, :1, :, :], # predictions might have 2 channels + disp_mask[:, :1, :, :], # so does the ground truth + valid_disp_mask, + args.metrics, + ) + + metrics.pop("fl-all", None) + logger.update(loss=loss, **metrics) + + if scaler is not None: + scaler.scale(loss).backward() + scaler.unscale_(optimizer) + if args.clip_grad_norm: + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) + scaler.step(optimizer) + scaler.update() + else: + loss.backward() + if args.clip_grad_norm: + torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=args.clip_grad_norm) + optimizer.step() + + scheduler.step() + + if not dist.is_initialized() or dist.get_rank() == 0: + if writer is not None and step % args.tensorboard_log_frequency == 0: + # log the loss and metrics to tensorboard + + writer.add_scalar("loss", loss, step) + for name, value in logger.meters.items(): + writer.add_scalar(name, value.avg, step) + # log the images to tensorboard + pred_grid = visualization.make_training_sample_grid( + image_left, image_right, disp_mask, valid_disp_mask, disp_predictions + ) + writer.add_image("predictions", pred_grid, step, dataformats="HWC") + + # second thing we want to see is how relevant the iterative refinement is + pred_sequence_grid = visualization.make_disparity_sequence_grid(disp_predictions, disp_mask) + writer.add_image("sequence", pred_sequence_grid, 
step, dataformats="HWC") + + if step % args.save_frequency == 0: + if not args.distributed or args.rank == 0: + model_without_ddp = ( + model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model + ) + checkpoint = { + "model": model_without_ddp.state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + "step": step, + "args": args, + } + os.makedirs(args.checkpoint_dir, exist_ok=True) + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}_{step}.pth") + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}.pth") + + if step % args.valid_frequency == 0: + evaluate(model, val_loaders, args, writer, step) + model.train() + if args.freeze_batch_norm: + if isinstance(model, nn.parallel.DistributedDataParallel): + freeze_batch_norm(model.module) + else: + freeze_batch_norm(model) + + # one final save at the end + if not args.distributed or args.rank == 0: + model_without_ddp = model.module if isinstance(model, torch.nn.parallel.DistributedDataParallel) else model + checkpoint = { + "model": model_without_ddp.state_dict(), + "optimizer": optimizer.state_dict(), + "scheduler": scheduler.state_dict(), + "step": step, + "args": args, + } + os.makedirs(args.checkpoint_dir, exist_ok=True) + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}_{step}.pth") + torch.save(checkpoint, Path(args.checkpoint_dir) / f"{args.name}.pth") + + +def main(args): + args.total_iterations = sum(args.dataset_steps) + + # initialize DDP setting + utils.setup_ddp(args) + print(args) + + args.test_only = args.train_datasets is None + + # set the appropriate devices + if args.distributed and args.device == "cpu": + raise ValueError("The device must be cuda if we want to run in distributed mode using torchrun") + device = torch.device(args.device) + + # select model architecture + model = torchvision.prototype.models.depth.stereo.__dict__[args.model](weights=args.weights) + + # convert to DDP if need be + if args.distributed: + model = model.to(args.gpu) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = model.module + else: + model.to(device) + model_without_ddp = model + + os.makedirs(args.checkpoint_dir, exist_ok=True) + + val_loaders = {name: make_eval_loader(name, args) for name in args.test_datasets} + + # EVAL ONLY configurations + if args.test_only: + evaluate(model, val_loaders, args) + return + + # Sanity check for the parameter count + print(f"Parameter Count: {sum(p.numel() for p in model.parameters() if p.requires_grad)}") + + # Compose the training dataset + train_dataset = get_train_dataset(args.dataset_root, args) + + # initialize the optimizer + if args.optimizer == "adam": + optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) + elif args.optimizer == "sgd": + optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, weight_decay=args.weight_decay, momentum=0.9) + else: + raise ValueError(f"Unknown optimizer {args.optimizer}. 
Please choose between adam and sgd") + + # initialize the learning rate schedule + scheduler = make_lr_schedule(args, optimizer) + + # load them from checkpoint if needed + args.start_step = 0 + if args.resume_path is not None: + checkpoint = torch.load(args.resume_path, map_location="cpu") + if "model" in checkpoint: + # this means the user requested to resume from a training checkpoint + model_without_ddp.load_state_dict(checkpoint["model"]) + # this means the user wants to continue training from where it was left off + if args.resume_schedule: + optimizer.load_state_dict(checkpoint["optimizer"]) + scheduler.load_state_dict(checkpoint["scheduler"]) + args.start_step = checkpoint["step"] + 1 + # modify starting point of the dat + sample_start_step = args.start_step * args.batch_size * args.world_size + train_dataset = train_dataset[sample_start_step:] + + else: + # this means the user wants to finetune on top of a model state dict + # and that no other changes are required + model_without_ddp.load_state_dict(checkpoint) + + torch.backends.cudnn.benchmark = True + + # enable training mode + model.train() + if args.freeze_batch_norm: + freeze_batch_norm(model_without_ddp) + + # put dataloader on top of the dataset + # make sure to disable shuffling since the dataset is already shuffled + # in order to guarantee quasi randomness whilst retaining a deterministic + # dataset consumption order + if args.distributed: + # the train dataset is preshuffled in order to respect the iteration order + sampler = torch.utils.data.distributed.DistributedSampler(train_dataset, shuffle=False, drop_last=True) + else: + # the train dataset is already shuffled, so we can use a simple SequentialSampler + sampler = torch.utils.data.SequentialSampler(train_dataset) + + train_loader = torch.utils.data.DataLoader( + train_dataset, + sampler=sampler, + batch_size=args.batch_size, + pin_memory=True, + num_workers=args.workers, + ) + + # initialize the logger + if args.tensorboard_summaries: + from torch.utils.tensorboard import SummaryWriter + + tensorboard_path = Path(args.checkpoint_dir) / "tensorboard" + os.makedirs(tensorboard_path, exist_ok=True) + + tensorboard_run = tensorboard_path / f"{args.name}" + writer = SummaryWriter(tensorboard_run) + else: + writer = None + + logger = utils.MetricLogger(delimiter=" ") + + scaler = torch.cuda.amp.GradScaler() if args.mixed_precision else None + # run the training loop + # this will perform optimization, respectively logging and saving checkpoints + # when need be + run( + model=model, + optimizer=optimizer, + scheduler=scheduler, + train_loader=train_loader, + val_loaders=val_loaders, + logger=logger, + writer=writer, + scaler=scaler, + args=args, + ) + + +def get_args_parser(add_help=True): + import argparse + + parser = argparse.ArgumentParser(description="PyTorch Stereo Matching Training", add_help=add_help) + # checkpointing + parser.add_argument("--name", default="crestereo", help="name of the experiment") + parser.add_argument("--resume", type=str, default=None, help="from which checkpoint to resume") + parser.add_argument("--checkpoint-dir", type=str, default="checkpoints", help="path to the checkpoint directory") + + # dataset + parser.add_argument("--dataset-root", type=str, default="", help="path to the dataset root directory") + parser.add_argument( + "--train-datasets", + type=str, + nargs="+", + default=["crestereo"], + help="dataset(s) to train on", + choices=list(VALID_DATASETS.keys()), + ) + parser.add_argument( + "--dataset-steps", type=int, nargs="+", 
default=[300_000], help="number of steps for each dataset" + ) + parser.add_argument( + "--steps-is-epochs", action="store_true", help="if set, dataset-steps are interpreted as epochs" + ) + parser.add_argument( + "--test-datasets", + type=str, + nargs="+", + default=["middlebury2014-train"], + help="dataset(s) to test on", + choices=["middlebury2014-train"], + ) + parser.add_argument("--dataset-shuffle", type=bool, help="shuffle the dataset", default=True) + parser.add_argument("--dataset-order-shuffle", type=bool, help="shuffle the dataset order", default=True) + parser.add_argument("--batch-size", type=int, default=2, help="batch size per GPU") + parser.add_argument("--workers", type=int, default=4, help="number of workers per GPU") + parser.add_argument( + "--threads", + type=int, + default=16, + help="number of CPU threads per GPU. This can be changed around to speed-up transforms if needed. This can lead to worker thread contention so use with care.", + ) + + # model architecture + parser.add_argument( + "--model", + type=str, + default="crestereo_base", + help="model architecture", + choices=["crestereo_base", "raft_stereo"], + ) + parser.add_argument("--recurrent-updates", type=int, default=10, help="number of recurrent updates") + parser.add_argument("--freeze-batch-norm", action="store_true", help="freeze batch norm parameters") + + # loss parameters + parser.add_argument("--gamma", type=float, default=0.8, help="gamma parameter for the flow sequence loss") + parser.add_argument("--flow-loss-weight", type=float, default=1.0, help="weight for the flow loss") + parser.add_argument( + "--flow-loss-exclude-large", + action="store_true", + help="exclude large flow values from the loss. A large value is defined as a value greater than the ground truth flow norm", + default=False, + ) + parser.add_argument("--consistency-weight", type=float, default=0.0, help="consistency loss weight") + parser.add_argument( + "--consistency-resize-factor", + type=float, + default=0.25, + help="consistency loss resize factor to account for the fact that the flow is computed on a downsampled image", + ) + parser.add_argument("--psnr-weight", type=float, default=0.0, help="psnr loss weight") + parser.add_argument("--smoothness-weight", type=float, default=0.0, help="smoothness loss weight") + parser.add_argument("--photometric-weight", type=float, default=0.0, help="photometric loss weight") + parser.add_argument( + "--photometric-max-displacement-ratio", + type=float, + default=0.15, + help="Only pixels with a displacement smaller than this ratio of the image width will be considered for the photometric loss", + ) + parser.add_argument("--photometric-ssim-weight", type=float, default=0.85, help="photometric ssim loss weight") + + # transforms parameters + parser.add_argument("--gpu-transforms", action="store_true", help="use GPU transforms") + parser.add_argument( + "--eval-size", type=int, nargs="+", default=[384, 512], help="size of the images for evaluation" + ) + parser.add_argument("--resize-size", type=int, nargs=2, default=None, help="resize size") + parser.add_argument("--crop-size", type=int, nargs=2, default=[384, 512], help="crop size") + parser.add_argument("--scale-range", type=float, nargs=2, default=[0.6, 1.0], help="random scale range") + parser.add_argument("--rescale-prob", type=float, default=1.0, help="probability of resizing the image") + parser.add_argument( + "--scaling-type", type=str, default="linear", help="scaling type", choices=["exponential", "linear"] + ) + 
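# Illustrative note on the scale/crop arguments above (see RandomRescaleAndCrop in transforms.py below):
# with the default --scaling-type "linear" the value sampled from --scale-range is used directly as the
# resize factor, so [0.6, 1.0] shrinks images to 60-100% of their size before the --crop-size crop;
# with "exponential" the factor becomes 2**value, so the same range would instead enlarge images to
# roughly 1.5x-2.0x. In both cases the factor is clamped from below so the crop still fits.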
parser.add_argument("--flip-prob", type=float, default=0.5, help="probability of flipping the image") + parser.add_argument( + "--norm-mean", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="mean for image normalization" + ) + parser.add_argument( + "--norm-std", type=float, nargs="+", default=[0.5, 0.5, 0.5], help="std for image normalization" + ) + parser.add_argument( + "--use-grayscale", action="store_true", help="use grayscale images instead of RGB", default=False + ) + parser.add_argument("--max-disparity", type=float, default=None, help="maximum disparity") + parser.add_argument( + "--interpolation-strategy", + type=str, + default="bilinear", + help="interpolation strategy", + choices=["bilinear", "bicubic", "mixed"], + ) + parser.add_argument("--spatial-shift-prob", type=float, default=1.0, help="probability of shifting the image") + parser.add_argument( + "--spatial-shift-max-angle", type=float, default=0.1, help="maximum angle for the spatial shift" + ) + parser.add_argument( + "--spatial-shift-max-displacement", type=float, default=2.0, help="maximum displacement for the spatial shift" + ) + parser.add_argument("--gamma-range", type=float, nargs="+", default=[0.8, 1.2], help="range for gamma correction") + parser.add_argument( + "--brightness-range", type=float, nargs="+", default=[0.8, 1.2], help="range for brightness correction" + ) + parser.add_argument( + "--contrast-range", type=float, nargs="+", default=[0.8, 1.2], help="range for contrast correction" + ) + parser.add_argument( + "--saturation-range", type=float, nargs="+", default=0.0, help="range for saturation correction" + ) + parser.add_argument("--hue-range", type=float, nargs="+", default=0.0, help="range for hue correction") + parser.add_argument( + "--asymmetric-jitter-prob", + type=float, + default=1.0, + help="probability of using asymmetric jitter instead of symmetric jitter", + ) + parser.add_argument("--occlusion-prob", type=float, default=0.5, help="probability of occluding the rightimage") + parser.add_argument( + "--occlusion-px-range", type=int, nargs="+", default=[50, 100], help="range for the number of occluded pixels" + ) + parser.add_argument("--erase-prob", type=float, default=0.0, help="probability of erasing in both images") + parser.add_argument( + "--erase-px-range", type=int, nargs="+", default=[50, 100], help="range for the number of erased pixels" + ) + parser.add_argument( + "--erase-num-repeats", type=int, default=1, help="number of times to repeat the erase operation" + ) + + # optimizer parameters + parser.add_argument("--optimizer", type=str, default="adam", help="optimizer", choices=["adam", "sgd"]) + parser.add_argument("--lr", type=float, default=4e-4, help="learning rate") + parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay") + parser.add_argument("--clip-grad-norm", type=float, default=0.0, help="clip grad norm") + + # lr_scheduler parameters + parser.add_argument("--min-lr", type=float, default=2e-5, help="minimum learning rate") + parser.add_argument("--warmup-steps", type=int, default=6_000, help="number of warmup steps") + parser.add_argument( + "--decay-after-steps", type=int, default=180_000, help="number of steps after which to start decay the lr" + ) + parser.add_argument( + "--lr-warmup-method", type=str, default="linear", help="warmup method", choices=["linear", "cosine"] + ) + parser.add_argument("--lr-warmup-factor", type=float, default=0.02, help="warmup factor for the learning rate") + parser.add_argument( + "--lr-decay-method", + 
type=str, + default="linear", + help="decay method", + choices=["linear", "cosine", "exponential"], + ) + parser.add_argument("--lr-decay-gamma", type=float, default=0.8, help="decay factor for the learning rate") + + # deterministic behaviour + parser.add_argument("--seed", type=int, default=42, help="seed for random number generators") + + # mixed precision training + parser.add_argument("--mixed-precision", action="store_true", help="use mixed precision training") + + # logging + parser.add_argument("--tensorboard-summaries", action="store_true", help="log to tensorboard") + parser.add_argument("--tensorboard-log-frequency", type=int, default=100, help="log frequency") + parser.add_argument("--save-frequency", type=int, default=1_000, help="save frequency") + parser.add_argument("--valid-frequency", type=int, default=1_000, help="validation frequency") + parser.add_argument( + "--metrics", + type=str, + nargs="+", + default=["mae", "rmse", "1px", "3px", "5px", "relepe"], + help="metrics to log", + choices=AVAILABLE_METRICS, + ) + + # distributed parameters + parser.add_argument("--world-size", type=int, default=8, help="number of distributed processes") + parser.add_argument("--dist-url", type=str, default="env://", help="url used to set up distributed training") + parser.add_argument("--device", type=str, default="cuda", help="device to use for training") + + # weights API + parser.add_argument("--weights", type=str, default=None, help="weights API url") + parser.add_argument( + "--resume-path", type=str, default=None, help="a path from which to resume or start fine-tuning" + ) + parser.add_argument("--resume-schedule", action="store_true", help="resume optimizer state") + + # padder parameters + parser.add_argument("--padder-type", type=str, default="kitti", help="padder type", choices=["kitti", "sintel"]) + return parser + + +if __name__ == "__main__": + args = get_args_parser().parse_args() + main(args) diff --git a/references/depth/stereo/transforms.py b/references/depth/stereo/transforms.py new file mode 100644 index 0000000000000000000000000000000000000000..9c4a6bab6d3b56922ea5339d4424356ef9eca292 --- /dev/null +++ b/references/depth/stereo/transforms.py @@ -0,0 +1,650 @@ +import random +from typing import Callable, List, Optional, Sequence, Tuple, Union + +import numpy as np +import PIL.Image +import torch +import torchvision.transforms as T +import torchvision.transforms.functional as F +from torch import Tensor + +T_FLOW = Union[Tensor, np.ndarray, None] +T_MASK = Union[Tensor, np.ndarray, None] +T_STEREO_TENSOR = Tuple[Tensor, Tensor] +T_COLOR_AUG_PARAM = Union[float, Tuple[float, float]] + + +def rand_float_range(size: Sequence[int], low: float, high: float) -> Tensor: + return (low - high) * torch.rand(size) + high + + +class InterpolationStrategy: + + _valid_modes: List[str] = ["mixed", "bicubic", "bilinear"] + + def __init__(self, mode: str = "mixed") -> None: + if mode not in self._valid_modes: + raise ValueError(f"Invalid interpolation mode: {mode}. 
Valid modes are: {self._valid_modes}") + + if mode == "mixed": + self.strategies = [F.InterpolationMode.BILINEAR, F.InterpolationMode.BICUBIC] + elif mode == "bicubic": + self.strategies = [F.InterpolationMode.BICUBIC] + elif mode == "bilinear": + self.strategies = [F.InterpolationMode.BILINEAR] + + def __call__(self) -> F.InterpolationMode: + return random.choice(self.strategies) + + @classmethod + def is_valid(mode: str) -> bool: + return mode in InterpolationStrategy._valid_modes + + @property + def valid_modes() -> List[str]: + return InterpolationStrategy._valid_modes + + +class ValidateModelInput(torch.nn.Module): + # Pass-through transform that checks the shape and dtypes to make sure the model gets what it expects + def forward(self, images: T_STEREO_TENSOR, disparities: T_FLOW, masks: T_MASK): + if images[0].shape != images[1].shape: + raise ValueError("img1 and img2 should have the same shape.") + h, w = images[0].shape[-2:] + if disparities[0] is not None and disparities[0].shape != (1, h, w): + raise ValueError(f"disparities[0].shape should be (1, {h}, {w}) instead of {disparities[0].shape}") + if masks[0] is not None: + if masks[0].shape != (h, w): + raise ValueError(f"masks[0].shape should be ({h}, {w}) instead of {masks[0].shape}") + if masks[0].dtype != torch.bool: + raise TypeError(f"masks[0] should be of dtype torch.bool instead of {masks[0].dtype}") + + return images, disparities, masks + + +class ConvertToGrayscale(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward( + self, + images: Tuple[PIL.Image.Image, PIL.Image.Image], + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + img_left = F.rgb_to_grayscale(images[0], num_output_channels=3) + img_right = F.rgb_to_grayscale(images[1], num_output_channels=3) + + return (img_left, img_right), disparities, masks + + +class MakeValidDisparityMask(torch.nn.Module): + def __init__(self, max_disparity: Optional[int] = 256) -> None: + super().__init__() + self.max_disparity = max_disparity + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + valid_masks = tuple( + torch.ones(images[idx].shape[-2:], dtype=torch.bool, device=images[idx].device) if mask is None else mask + for idx, mask in enumerate(masks) + ) + + valid_masks = tuple( + torch.logical_and(mask, disparity > 0).squeeze(0) if disparity is not None else mask + for mask, disparity in zip(valid_masks, disparities) + ) + + if self.max_disparity is not None: + valid_masks = tuple( + torch.logical_and(mask, disparity < self.max_disparity).squeeze(0) if disparity is not None else mask + for mask, disparity in zip(valid_masks, disparities) + ) + + return images, disparities, valid_masks + + +class ToGPU(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + dev_images = tuple(image.cuda() for image in images) + dev_disparities = tuple(map(lambda x: x.cuda() if x is not None else None, disparities)) + dev_masks = tuple(map(lambda x: x.cuda() if x is not None else None, masks)) + return dev_images, dev_disparities, dev_masks + + +class ConvertImageDtype(torch.nn.Module): + def __init__(self, 
dtype: torch.dtype): + super().__init__() + self.dtype = dtype + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + img_left = F.convert_image_dtype(images[0], dtype=self.dtype) + img_right = F.convert_image_dtype(images[1], dtype=self.dtype) + + img_left = img_left.contiguous() + img_right = img_right.contiguous() + + return (img_left, img_right), disparities, masks + + +class Normalize(torch.nn.Module): + def __init__(self, mean: List[float], std: List[float]) -> None: + super().__init__() + self.mean = mean + self.std = std + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + img_left = F.normalize(images[0], mean=self.mean, std=self.std) + img_right = F.normalize(images[1], mean=self.mean, std=self.std) + + img_left = img_left.contiguous() + img_right = img_right.contiguous() + + return (img_left, img_right), disparities, masks + + +class ToTensor(torch.nn.Module): + def forward( + self, + images: Tuple[PIL.Image.Image, PIL.Image.Image], + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + if images[0] is None: + raise ValueError("img_left is None") + if images[1] is None: + raise ValueError("img_right is None") + + img_left = F.pil_to_tensor(images[0]) + img_right = F.pil_to_tensor(images[1]) + disparity_tensors = () + mask_tensors = () + + for idx in range(2): + disparity_tensors += (torch.from_numpy(disparities[idx]),) if disparities[idx] is not None else (None,) + mask_tensors += (torch.from_numpy(masks[idx]),) if masks[idx] is not None else (None,) + + return (img_left, img_right), disparity_tensors, mask_tensors + + +class AsymmetricColorJitter(T.ColorJitter): + # p determines the probability of doing asymmetric vs symmetric color jittering + def __init__( + self, + brightness: T_COLOR_AUG_PARAM = 0, + contrast: T_COLOR_AUG_PARAM = 0, + saturation: T_COLOR_AUG_PARAM = 0, + hue: T_COLOR_AUG_PARAM = 0, + p: float = 0.2, + ): + super().__init__(brightness=brightness, contrast=contrast, saturation=saturation, hue=hue) + self.p = p + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + if torch.rand(1) < self.p: + # asymmetric: different transform for img1 and img2 + img_left = super().forward(images[0]) + img_right = super().forward(images[1]) + else: + # symmetric: same transform for img1 and img2 + batch = torch.stack(images) + batch = super().forward(batch) + img_left, img_right = batch[0], batch[1] + + return (img_left, img_right), disparities, masks + + +class AsymetricGammaAdjust(torch.nn.Module): + def __init__(self, p: float, gamma_range: Tuple[float, float], gain: float = 1) -> None: + super().__init__() + self.gamma_range = gamma_range + self.gain = gain + self.p = p + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + gamma = rand_float_range((1,), low=self.gamma_range[0], high=self.gamma_range[1]).item() + + if torch.rand(1) < self.p: + # asymmetric: different transform for img1 and img2 + 
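# For reference, F.adjust_gamma maps float pixels to gain * img ** gamma, so gamma < 1 brightens and
# gamma > 1 darkens. Note that a single gamma is sampled above the branch, so as written both images
# receive the same correction here; the asymmetric/symmetric split only changes whether the two images
# are processed separately or as a stacked batch.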
img_left = F.adjust_gamma(images[0], gamma, gain=self.gain) + img_right = F.adjust_gamma(images[1], gamma, gain=self.gain) + else: + # symmetric: same transform for img1 and img2 + batch = torch.stack(images) + batch = F.adjust_gamma(batch, gamma, gain=self.gain) + img_left, img_right = batch[0], batch[1] + + return (img_left, img_right), disparities, masks + + +class RandomErase(torch.nn.Module): + # Produces multiple symmetric random erasures + # these can be viewed as occlusions present in both camera views. + # Similarly to Optical Flow occlusion prediction tasks, we mask these pixels in the disparity map + def __init__( + self, + p: float = 0.5, + erase_px_range: Tuple[int, int] = (50, 100), + value: Union[Tensor, float] = 0, + inplace: bool = False, + max_erase: int = 2, + ): + super().__init__() + self.min_px_erase = erase_px_range[0] + self.max_px_erase = erase_px_range[1] + if self.max_px_erase < 0: + raise ValueError("erase_px_range[1] should be equal or greater than 0") + if self.min_px_erase < 0: + raise ValueError("erase_px_range[0] should be equal or greater than 0") + if self.min_px_erase > self.max_px_erase: + raise ValueError("erase_prx_range[0] should be equal or lower than erase_px_range[1]") + + self.p = p + self.value = value + self.inplace = inplace + self.max_erase = max_erase + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: T_STEREO_TENSOR, + masks: T_STEREO_TENSOR, + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + if torch.rand(1) < self.p: + return images, disparities, masks + + image_left, image_right = images + mask_left, mask_right = masks + for _ in range(torch.randint(self.max_erase, size=(1,)).item()): + y, x, h, w, v = self._get_params(image_left) + image_right = F.erase(image_right, y, x, h, w, v, self.inplace) + image_left = F.erase(image_left, y, x, h, w, v, self.inplace) + # similarly to optical flow occlusion prediction, we consider + # any erasure pixels that are in both images to be occluded therefore + # we mark them as invalid + if mask_left is not None: + mask_left = F.erase(mask_left, y, x, h, w, False, self.inplace) + if mask_right is not None: + mask_right = F.erase(mask_right, y, x, h, w, False, self.inplace) + + return (image_left, image_right), disparities, (mask_left, mask_right) + + def _get_params(self, img: torch.Tensor) -> Tuple[int, int, int, int, float]: + img_h, img_w = img.shape[-2:] + crop_h, crop_w = ( + random.randint(self.min_px_erase, self.max_px_erase), + random.randint(self.min_px_erase, self.max_px_erase), + ) + crop_x, crop_y = (random.randint(0, img_w - crop_w), random.randint(0, img_h - crop_h)) + + return crop_y, crop_x, crop_h, crop_w, self.value + + +class RandomOcclusion(torch.nn.Module): + # This adds an occlusion in the right image + # the occluded patch works as a patch erase where the erase value is the mean + # of the pixels from the selected zone + def __init__(self, p: float = 0.5, occlusion_px_range: Tuple[int, int] = (50, 100), inplace: bool = False): + super().__init__() + + self.min_px_occlusion = occlusion_px_range[0] + self.max_px_occlusion = occlusion_px_range[1] + + if self.max_px_occlusion < 0: + raise ValueError("occlusion_px_range[1] should be greater or equal than 0") + if self.min_px_occlusion < 0: + raise ValueError("occlusion_px_range[0] should be greater or equal than 0") + if self.min_px_occlusion > self.max_px_occlusion: + raise ValueError("occlusion_px_range[0] should be lower than occlusion_px_range[1]") + + self.p = p + self.inplace = inplace 
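# Sketch of what forward() below does: a rectangle whose side lengths are drawn from occlusion_px_range
# is filled in the *right* image only, using that rectangle's per-channel mean value (computed in
# _get_params), so the patch looks like a flat, untextured region seen by one camera. As written, the
# early `return` fires when torch.rand(1) < p, so the occlusion is actually applied with probability (1 - p).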
+ + def forward( + self, + images: T_STEREO_TENSOR, + disparities: T_STEREO_TENSOR, + masks: T_STEREO_TENSOR, + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + left_image, right_image = images + + if torch.rand(1) < self.p: + return images, disparities, masks + + y, x, h, w, v = self._get_params(right_image) + right_image = F.erase(right_image, y, x, h, w, v, self.inplace) + + return ((left_image, right_image), disparities, masks) + + def _get_params(self, img: torch.Tensor) -> Tuple[int, int, int, int, float]: + img_h, img_w = img.shape[-2:] + crop_h, crop_w = ( + random.randint(self.min_px_occlusion, self.max_px_occlusion), + random.randint(self.min_px_occlusion, self.max_px_occlusion), + ) + + crop_x, crop_y = (random.randint(0, img_w - crop_w), random.randint(0, img_h - crop_h)) + occlusion_value = img[..., crop_y : crop_y + crop_h, crop_x : crop_x + crop_w].mean(dim=(-2, -1), keepdim=True) + + return (crop_y, crop_x, crop_h, crop_w, occlusion_value) + + +class RandomSpatialShift(torch.nn.Module): + # This transform applies a vertical shift and a slight angle rotation and the same time + def __init__( + self, p: float = 0.5, max_angle: float = 0.1, max_px_shift: int = 2, interpolation_type: str = "bilinear" + ) -> None: + super().__init__() + self.p = p + self.max_angle = max_angle + self.max_px_shift = max_px_shift + self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: T_STEREO_TENSOR, + masks: T_STEREO_TENSOR, + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + # the transform is applied only on the right image + # in order to mimic slight calibration issues + img_left, img_right = images + + INTERP_MODE = self._interpolation_mode_strategy() + + if torch.rand(1) < self.p: + # [0, 1] -> [-a, a] + shift = rand_float_range((1,), low=-self.max_px_shift, high=self.max_px_shift).item() + angle = rand_float_range((1,), low=-self.max_angle, high=self.max_angle).item() + # sample center point for the rotation matrix + y = torch.randint(size=(1,), low=0, high=img_right.shape[-2]).item() + x = torch.randint(size=(1,), low=0, high=img_right.shape[-1]).item() + # apply affine transformations + img_right = F.affine( + img_right, + angle=angle, + translate=[0, shift], # translation only on the y-axis + center=[x, y], + scale=1.0, + shear=0.0, + interpolation=INTERP_MODE, + ) + + return ((img_left, img_right), disparities, masks) + + +class RandomHorizontalFlip(torch.nn.Module): + def __init__(self, p: float = 0.5) -> None: + super().__init__() + self.p = p + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + img_left, img_right = images + dsp_left, dsp_right = disparities + mask_left, mask_right = masks + + if dsp_right is not None and torch.rand(1) < self.p: + img_left, img_right = F.hflip(img_left), F.hflip(img_right) + dsp_left, dsp_right = F.hflip(dsp_left), F.hflip(dsp_right) + if mask_left is not None and mask_right is not None: + mask_left, mask_right = F.hflip(mask_left), F.hflip(mask_right) + return ((img_right, img_left), (dsp_right, dsp_left), (mask_right, mask_left)) + + return images, disparities, masks + + +class Resize(torch.nn.Module): + def __init__(self, resize_size: Tuple[int, ...], interpolation_type: str = "bilinear") -> None: + super().__init__() + self.resize_size = 
list(resize_size) # doing this to keep mypy happy + self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + resized_images = () + resized_disparities = () + resized_masks = () + + INTERP_MODE = self._interpolation_mode_strategy() + + for img in images: + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the stereo models with antialias=True? + resized_images += (F.resize(img, self.resize_size, interpolation=INTERP_MODE, antialias=False),) + + for dsp in disparities: + if dsp is not None: + # rescale disparity to match the new image size + scale_x = self.resize_size[1] / dsp.shape[-1] + resized_disparities += (F.resize(dsp, self.resize_size, interpolation=INTERP_MODE) * scale_x,) + else: + resized_disparities += (None,) + + for mask in masks: + if mask is not None: + resized_masks += ( + # we squeeze and unsqueeze because the API requires > 3D tensors + F.resize( + mask.unsqueeze(0), + self.resize_size, + interpolation=F.InterpolationMode.NEAREST, + ).squeeze(0), + ) + else: + resized_masks += (None,) + + return resized_images, resized_disparities, resized_masks + + +class RandomRescaleAndCrop(torch.nn.Module): + # This transform will resize the input with a given proba, and then crop it. + # These are the reversed operations of the built-in RandomResizedCrop, + # although the order of the operations doesn't matter too much: resizing a + # crop would give the same result as cropping a resized image, up to + # interpolation artifact at the borders of the output. + # + # The reason we don't rely on RandomResizedCrop is because of a significant + # difference in the parametrization of both transforms, in particular, + # because of the way the random parameters are sampled in both transforms, + # which leads to fairly different results (and different epe). For more details see + # https://github.com/pytorch/vision/pull/5026/files#r762932579 + def __init__( + self, + crop_size: Tuple[int, int], + scale_range: Tuple[float, float] = (-0.2, 0.5), + rescale_prob: float = 0.8, + scaling_type: str = "exponential", + interpolation_type: str = "bilinear", + ) -> None: + super().__init__() + self.crop_size = crop_size + self.min_scale = scale_range[0] + self.max_scale = scale_range[1] + self.rescale_prob = rescale_prob + self.scaling_type = scaling_type + self._interpolation_mode_strategy = InterpolationStrategy(interpolation_type) + + if self.scaling_type == "linear" and self.min_scale < 0: + raise ValueError("min_scale must be >= 0 for linear scaling") + + def forward( + self, + images: T_STEREO_TENSOR, + disparities: Tuple[T_FLOW, T_FLOW], + masks: Tuple[T_MASK, T_MASK], + ) -> Tuple[T_STEREO_TENSOR, Tuple[T_FLOW, T_FLOW], Tuple[T_MASK, T_MASK]]: + + img_left, img_right = images + dsp_left, dsp_right = disparities + mask_left, mask_right = masks + INTERP_MODE = self._interpolation_mode_strategy() + + # randomly sample scale + h, w = img_left.shape[-2:] + # Note: in original code, they use + 1 instead of + 8 for sparse datasets (e.g. 
Kitti) + # It shouldn't matter much + min_scale = max((self.crop_size[0] + 8) / h, (self.crop_size[1] + 8) / w) + + # exponential scaling will draw a random scale in (min_scale, max_scale) and then raise + # 2 to the power of that random value. This final scale distribution will have a different + # mean and variance than a uniform distribution. Note that a scale of 1 will result in + # a rescaling of 2X the original size, whereas a scale of -1 will result in a rescaling + # of 0.5X the original size. + if self.scaling_type == "exponential": + scale = 2 ** torch.empty(1, dtype=torch.float32).uniform_(self.min_scale, self.max_scale).item() + # linear scaling will draw a random scale in (min_scale, max_scale) + elif self.scaling_type == "linear": + scale = torch.empty(1, dtype=torch.float32).uniform_(self.min_scale, self.max_scale).item() + + scale = max(scale, min_scale) + + new_h, new_w = round(h * scale), round(w * scale) + + if torch.rand(1).item() < self.rescale_prob: + # rescale the images + img_left = F.resize(img_left, size=(new_h, new_w), interpolation=INTERP_MODE) + img_right = F.resize(img_right, size=(new_h, new_w), interpolation=INTERP_MODE) + + resized_masks, resized_disparities = (), () + + for disparity, mask in zip(disparities, masks): + if disparity is not None: + if mask is None: + resized_disparity = F.resize(disparity, size=(new_h, new_w), interpolation=INTERP_MODE) + # rescale the disparity + resized_disparity = ( + resized_disparity * torch.tensor([scale], device=resized_disparity.device)[:, None, None] + ) + resized_mask = None + else: + resized_disparity, resized_mask = _resize_sparse_flow( + disparity, mask, scale_x=scale, scale_y=scale + ) + resized_masks += (resized_mask,) + resized_disparities += (resized_disparity,) + + else: + resized_disparities = disparities + resized_masks = masks + + disparities = resized_disparities + masks = resized_masks + + # Note: For sparse datasets (Kitti), the original code uses a "margin" + # See e.g. 
https://github.com/princeton-vl/RAFT/blob/master/core/utils/augmentor.py#L220:L220 + # We don't, not sure if it matters much + y0 = torch.randint(0, img_left.shape[1] - self.crop_size[0], size=(1,)).item() + x0 = torch.randint(0, img_right.shape[2] - self.crop_size[1], size=(1,)).item() + + img_left = F.crop(img_left, y0, x0, self.crop_size[0], self.crop_size[1]) + img_right = F.crop(img_right, y0, x0, self.crop_size[0], self.crop_size[1]) + if dsp_left is not None: + dsp_left = F.crop(disparities[0], y0, x0, self.crop_size[0], self.crop_size[1]) + if dsp_right is not None: + dsp_right = F.crop(disparities[1], y0, x0, self.crop_size[0], self.crop_size[1]) + + cropped_masks = () + for mask in masks: + if mask is not None: + mask = F.crop(mask, y0, x0, self.crop_size[0], self.crop_size[1]) + cropped_masks += (mask,) + + return ((img_left, img_right), (dsp_left, dsp_right), cropped_masks) + + +def _resize_sparse_flow( + flow: Tensor, valid_flow_mask: Tensor, scale_x: float = 1.0, scale_y: float = 0.0 +) -> Tuple[Tensor, Tensor]: + # This resizes both the flow and the valid_flow_mask mask (which is assumed to be reasonably sparse) + # There are as-many non-zero values in the original flow as in the resized flow (up to OOB) + # So for example if scale_x = scale_y = 2, the sparsity of the output flow is multiplied by 4 + + h, w = flow.shape[-2:] + + h_new = int(round(h * scale_y)) + w_new = int(round(w * scale_x)) + flow_new = torch.zeros(size=[1, h_new, w_new], dtype=flow.dtype) + valid_new = torch.zeros(size=[h_new, w_new], dtype=valid_flow_mask.dtype) + + jj, ii = torch.meshgrid(torch.arange(w), torch.arange(h), indexing="xy") + + ii_valid, jj_valid = ii[valid_flow_mask], jj[valid_flow_mask] + + ii_valid_new = torch.round(ii_valid.to(float) * scale_y).to(torch.long) + jj_valid_new = torch.round(jj_valid.to(float) * scale_x).to(torch.long) + + within_bounds_mask = (0 <= ii_valid_new) & (ii_valid_new < h_new) & (0 <= jj_valid_new) & (jj_valid_new < w_new) + + ii_valid = ii_valid[within_bounds_mask] + jj_valid = jj_valid[within_bounds_mask] + ii_valid_new = ii_valid_new[within_bounds_mask] + jj_valid_new = jj_valid_new[within_bounds_mask] + + valid_flow_new = flow[:, ii_valid, jj_valid] + valid_flow_new *= scale_x + + flow_new[:, ii_valid_new, jj_valid_new] = valid_flow_new + valid_new[ii_valid_new, jj_valid_new] = valid_flow_mask[ii_valid, jj_valid] + + return flow_new, valid_new.bool() + + +class Compose(torch.nn.Module): + def __init__(self, transforms: List[Callable]): + super().__init__() + self.transforms = transforms + + @torch.inference_mode() + def forward(self, images, disparities, masks): + for t in self.transforms: + images, disparities, masks = t(images, disparities, masks) + return images, disparities, masks diff --git a/references/depth/stereo/utils/losses.py b/references/depth/stereo/utils/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..c809cc74d0f49f1b87277dd0ecece413a0f079c2 --- /dev/null +++ b/references/depth/stereo/utils/losses.py @@ -0,0 +1,503 @@ +from typing import List, Optional + +import torch +from torch import nn, Tensor +from torch.nn import functional as F +from torchvision.prototype.models.depth.stereo.raft_stereo import grid_sample, make_coords_grid + + +def make_gaussian_kernel(kernel_size: int, sigma: float) -> torch.Tensor: + """Function to create a 2D Gaussian kernel.""" + + x = torch.arange(kernel_size, dtype=torch.float32) + y = torch.arange(kernel_size, dtype=torch.float32) + x = x - (kernel_size - 1) / 2 + y = y - 
(kernel_size - 1) / 2 + x, y = torch.meshgrid(x, y) + grid = (x**2 + y**2) / (2 * sigma**2) + kernel = torch.exp(-grid) + kernel = kernel / kernel.sum() + return kernel + + +def _sequence_loss_fn( + flow_preds: List[Tensor], + flow_gt: Tensor, + valid_flow_mask: Optional[Tensor], + gamma: Tensor, + max_flow: int = 256, + exclude_large: bool = False, + weights: Optional[Tensor] = None, +): + """Loss function defined over sequence of flow predictions""" + torch._assert( + gamma < 1, + "sequence_loss: `gamma` must be lower than 1, but got {}".format(gamma), + ) + + if exclude_large: + # exclude invalid pixels and extremely large diplacements + flow_norm = torch.sum(flow_gt**2, dim=1).sqrt() + if valid_flow_mask is not None: + valid_flow_mask = valid_flow_mask & (flow_norm < max_flow) + else: + valid_flow_mask = flow_norm < max_flow + + if valid_flow_mask is not None: + valid_flow_mask = valid_flow_mask.unsqueeze(1) + flow_preds = torch.stack(flow_preds) # shape = (num_flow_updates, batch_size, 2, H, W) + + abs_diff = (flow_preds - flow_gt).abs() + if valid_flow_mask is not None: + abs_diff = abs_diff * valid_flow_mask.unsqueeze(0) + + abs_diff = abs_diff.mean(axis=(1, 2, 3, 4)) + num_predictions = flow_preds.shape[0] + + # allocating on CPU and moving to device during run-time can force + # an unwanted GPU synchronization that produces a large overhead + if weights is None or len(weights) != num_predictions: + weights = gamma ** torch.arange(num_predictions - 1, -1, -1, device=flow_preds.device, dtype=flow_preds.dtype) + + flow_loss = (abs_diff * weights).sum() + return flow_loss, weights + + +class SequenceLoss(nn.Module): + def __init__(self, gamma: float = 0.8, max_flow: int = 256, exclude_large_flows: bool = False) -> None: + """ + Args: + gamma: value for the exponential weighting of the loss across frames + max_flow: maximum flow value to exclude + exclude_large_flows: whether to exclude large flows + """ + + super().__init__() + self.max_flow = max_flow + self.excluding_large = exclude_large_flows + self.register_buffer("gamma", torch.tensor([gamma])) + # cache the scale factor for the loss + self._weights = None + + def forward(self, flow_preds: List[Tensor], flow_gt: Tensor, valid_flow_mask: Optional[Tensor]) -> Tensor: + """ + Args: + flow_preds: list of flow predictions of shape (batch_size, C, H, W) + flow_gt: ground truth flow of shape (batch_size, C, H, W) + valid_flow_mask: mask of valid flow pixels of shape (batch_size, H, W) + """ + loss, weights = _sequence_loss_fn( + flow_preds, flow_gt, valid_flow_mask, self.gamma, self.max_flow, self.excluding_large, self._weights + ) + self._weights = weights + return loss + + def set_gamma(self, gamma: float) -> None: + self.gamma.fill_(gamma) + # reset the cached scale factor + self._weights = None + + +def _ssim_loss_fn( + source: Tensor, + reference: Tensor, + kernel: Tensor, + eps: float = 1e-8, + c1: float = 0.01**2, + c2: float = 0.03**2, + use_padding: bool = False, +) -> Tensor: + # ref: Algorithm section: https://en.wikipedia.org/wiki/Structural_similarity + # ref: Alternative implementation: https://kornia.readthedocs.io/en/latest/_modules/kornia/metrics/ssim.html#ssim + + torch._assert( + source.ndim == reference.ndim == 4, + "SSIM: `source` and `reference` must be 4-dimensional tensors", + ) + + torch._assert( + source.shape == reference.shape, + "SSIM: `source` and `reference` must have the same shape, but got {} and {}".format( + source.shape, reference.shape + ), + ) + + B, C, H, W = source.shape + kernel = 
kernel.unsqueeze(0).unsqueeze(0).repeat(C, 1, 1, 1) + if use_padding: + pad_size = kernel.shape[2] // 2 + source = F.pad(source, (pad_size, pad_size, pad_size, pad_size), "reflect") + reference = F.pad(reference, (pad_size, pad_size, pad_size, pad_size), "reflect") + + mu1 = F.conv2d(source, kernel, groups=C) + mu2 = F.conv2d(reference, kernel, groups=C) + + mu1_sq = mu1.pow(2) + mu2_sq = mu2.pow(2) + + mu1_mu2 = mu1 * mu2 + mu_img1_sq = F.conv2d(source.pow(2), kernel, groups=C) + mu_img2_sq = F.conv2d(reference.pow(2), kernel, groups=C) + mu_img1_mu2 = F.conv2d(source * reference, kernel, groups=C) + + sigma1_sq = mu_img1_sq - mu1_sq + sigma2_sq = mu_img2_sq - mu2_sq + sigma12 = mu_img1_mu2 - mu1_mu2 + + numerator = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2) + denominator = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2) + ssim = numerator / (denominator + eps) + + # doing 1 - ssim because we want to maximize the ssim + return 1 - ssim.mean(dim=(1, 2, 3)) + + +class SSIM(nn.Module): + def __init__( + self, + kernel_size: int = 11, + max_val: float = 1.0, + sigma: float = 1.5, + eps: float = 1e-12, + use_padding: bool = True, + ) -> None: + """SSIM loss function. + + Args: + kernel_size: size of the Gaussian kernel + max_val: constant scaling factor + sigma: sigma of the Gaussian kernel + eps: constant for division by zero + use_padding: whether to pad the input tensor such that we have a score for each pixel + """ + super().__init__() + + self.kernel_size = kernel_size + self.max_val = max_val + self.sigma = sigma + + gaussian_kernel = make_gaussian_kernel(kernel_size, sigma) + self.register_buffer("gaussian_kernel", gaussian_kernel) + + self.c1 = (0.01 * self.max_val) ** 2 + self.c2 = (0.03 * self.max_val) ** 2 + + self.use_padding = use_padding + self.eps = eps + + def forward(self, source: torch.Tensor, reference: torch.Tensor) -> torch.Tensor: + """ + Args: + source: source image of shape (batch_size, C, H, W) + reference: reference image of shape (batch_size, C, H, W) + + Returns: + SSIM loss of shape (batch_size,) + """ + return _ssim_loss_fn( + source, + reference, + kernel=self.gaussian_kernel, + c1=self.c1, + c2=self.c2, + use_padding=self.use_padding, + eps=self.eps, + ) + + +def _smoothness_loss_fn(img_gx: Tensor, img_gy: Tensor, val_gx: Tensor, val_gy: Tensor): + # ref: https://github.com/nianticlabs/monodepth2/blob/b676244e5a1ca55564eb5d16ab521a48f823af31/layers.py#L202 + + torch._assert( + img_gx.ndim >= 3, + "smoothness_loss: `img_gx` must be at least 3-dimensional tensor of shape (..., C, H, W)", + ) + + torch._assert( + img_gx.ndim == val_gx.ndim, + "smoothness_loss: `img_gx` and `depth_gx` must have the same dimensionality, but got {} and {}".format( + img_gx.ndim, val_gx.ndim + ), + ) + + for idx in range(img_gx.ndim): + torch._assert( + (img_gx.shape[idx] == val_gx.shape[idx] or (img_gx.shape[idx] == 1 or val_gx.shape[idx] == 1)), + "smoothness_loss: `img_gx` and `depth_gx` must have either the same shape or broadcastable shape, but got {} and {}".format( + img_gx.shape, val_gx.shape + ), + ) + + # -3 is channel dimension + weights_x = torch.exp(-torch.mean(torch.abs(val_gx), axis=-3, keepdim=True)) + weights_y = torch.exp(-torch.mean(torch.abs(val_gy), axis=-3, keepdim=True)) + + smoothness_x = img_gx * weights_x + smoothness_y = img_gy * weights_y + + smoothness = (torch.abs(smoothness_x) + torch.abs(smoothness_y)).mean(axis=(-3, -2, -1)) + return smoothness + + +class SmoothnessLoss(nn.Module): + def __init__(self) -> None: + super().__init__() + + def 
_x_gradient(self, img: Tensor) -> Tensor: + if img.ndim > 4: + original_shape = img.shape + is_reshaped = True + img = img.reshape(-1, *original_shape[-3:]) + else: + is_reshaped = False + + padded = F.pad(img, (0, 1, 0, 0), mode="replicate") + grad = padded[..., :, :-1] - padded[..., :, 1:] + if is_reshaped: + grad = grad.reshape(original_shape) + return grad + + def _y_gradient(self, x: torch.Tensor) -> torch.Tensor: + if x.ndim > 4: + original_shape = x.shape + is_reshaped = True + x = x.reshape(-1, *original_shape[-3:]) + else: + is_reshaped = False + + padded = F.pad(x, (0, 0, 0, 1), mode="replicate") + grad = padded[..., :-1, :] - padded[..., 1:, :] + if is_reshaped: + grad = grad.reshape(original_shape) + return grad + + def forward(self, images: Tensor, vals: Tensor) -> Tensor: + """ + Args: + images: tensor of shape (D1, D2, ..., DN, C, H, W) + vals: tensor of shape (D1, D2, ..., DN, 1, H, W) + + Returns: + smoothness loss of shape (D1, D2, ..., DN) + """ + img_gx = self._x_gradient(images) + img_gy = self._y_gradient(images) + + val_gx = self._x_gradient(vals) + val_gy = self._y_gradient(vals) + + return _smoothness_loss_fn(img_gx, img_gy, val_gx, val_gy) + + +def _flow_sequence_consistency_loss_fn( + flow_preds: List[Tensor], + gamma: float = 0.8, + resize_factor: float = 0.25, + rescale_factor: float = 0.25, + rescale_mode: str = "bilinear", + weights: Optional[Tensor] = None, +): + """Loss function defined over sequence of flow predictions""" + + # Simplified version of ref: https://arxiv.org/pdf/2006.11242.pdf + # In the original paper, an additional refinement network is used to refine a flow prediction. + # Each step performed by the recurrent module in Raft or CREStereo is a refinement step using a delta_flow update. + # which should be consistent with the previous step. In this implementation, we simplify the overall loss + # term and ignore left-right consistency loss or photometric loss which can be treated separately. 
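# In effect, the code below (optionally) downsamples every prediction by `resize_factor` and rescales the
# flow magnitudes by `rescale_factor`, then computes
#   loss = sum_k gamma**(K - 1 - k) * mean((flow[k+1] - flow[k])**2),  k = 0..K-1,  K = len(flow_preds) - 1
# i.e. a squared difference between consecutive refinement steps; with gamma < 1 (default 0.8) the later
# steps carry the largest weights, so the final updates are pushed hardest toward consistency.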
+ + torch._assert( + rescale_factor <= 1.0, + "sequence_consistency_loss: `rescale_factor` must be less than or equal to 1, but got {}".format( + rescale_factor + ), + ) + + flow_preds = torch.stack(flow_preds) # shape = (num_flow_updates, batch_size, 2, H, W) + N, B, C, H, W = flow_preds.shape + + # rescale flow predictions to account for bilinear upsampling artifacts + if rescale_factor: + flow_preds = ( + F.interpolate( + flow_preds.view(N * B, C, H, W), scale_factor=resize_factor, mode=rescale_mode, align_corners=True + ) + ) * rescale_factor + flow_preds = torch.stack(torch.chunk(flow_preds, N, dim=0), dim=0) + + # force the next prediction to be similar to the previous prediction + abs_diff = (flow_preds[1:] - flow_preds[:-1]).square() + abs_diff = abs_diff.mean(axis=(1, 2, 3, 4)) + + num_predictions = flow_preds.shape[0] - 1 # because we are comparing differences + if weights is None or len(weights) != num_predictions: + weights = gamma ** torch.arange(num_predictions - 1, -1, -1, device=flow_preds.device, dtype=flow_preds.dtype) + + flow_loss = (abs_diff * weights).sum() + return flow_loss, weights + + +class FlowSequenceConsistencyLoss(nn.Module): + def __init__( + self, + gamma: float = 0.8, + resize_factor: float = 0.25, + rescale_factor: float = 0.25, + rescale_mode: str = "bilinear", + ) -> None: + super().__init__() + self.gamma = gamma + self.resize_factor = resize_factor + self.rescale_factor = rescale_factor + self.rescale_mode = rescale_mode + self._weights = None + + def forward(self, flow_preds: List[Tensor]) -> Tensor: + """ + Args: + flow_preds: list of tensors of shape (batch_size, C, H, W) + + Returns: + sequence consistency loss of shape (batch_size,) + """ + loss, weights = _flow_sequence_consistency_loss_fn( + flow_preds, + gamma=self.gamma, + resize_factor=self.resize_factor, + rescale_factor=self.rescale_factor, + rescale_mode=self.rescale_mode, + weights=self._weights, + ) + self._weights = weights + return loss + + def set_gamma(self, gamma: float) -> None: + self.gamma.fill_(gamma) + # reset the cached scale factor + self._weights = None + + +def _psnr_loss_fn(source: torch.Tensor, target: torch.Tensor, max_val: float) -> torch.Tensor: + torch._assert( + source.shape == target.shape, + "psnr_loss: source and target must have the same shape, but got {} and {}".format(source.shape, target.shape), + ) + + # ref https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio + return 10 * torch.log10(max_val**2 / ((source - target).pow(2).mean(axis=(-3, -2, -1)))) + + +class PSNRLoss(nn.Module): + def __init__(self, max_val: float = 256) -> None: + """ + Args: + max_val: maximum value of the input tensor. This refers to the maximum domain value of the input tensor. 
+ + """ + super().__init__() + self.max_val = max_val + + def forward(self, source: Tensor, target: Tensor) -> Tensor: + """ + Args: + source: tensor of shape (D1, D2, ..., DN, C, H, W) + target: tensor of shape (D1, D2, ..., DN, C, H, W) + + Returns: + psnr loss of shape (D1, D2, ..., DN) + """ + + # multiply by -1 as we want to maximize the psnr + return -1 * _psnr_loss_fn(source, target, self.max_val) + + +class FlowPhotoMetricLoss(nn.Module): + def __init__( + self, + ssim_weight: float = 0.85, + ssim_window_size: int = 11, + ssim_max_val: float = 1.0, + ssim_sigma: float = 1.5, + ssim_eps: float = 1e-12, + ssim_use_padding: bool = True, + max_displacement_ratio: float = 0.15, + ) -> None: + super().__init__() + + self._ssim_loss = SSIM( + kernel_size=ssim_window_size, + max_val=ssim_max_val, + sigma=ssim_sigma, + eps=ssim_eps, + use_padding=ssim_use_padding, + ) + + self._L1_weight = 1 - ssim_weight + self._SSIM_weight = ssim_weight + self._max_displacement_ratio = max_displacement_ratio + + def forward( + self, + source: Tensor, + reference: Tensor, + flow_pred: Tensor, + valid_mask: Optional[Tensor] = None, + ): + """ + Args: + source: tensor of shape (B, C, H, W) + reference: tensor of shape (B, C, H, W) + flow_pred: tensor of shape (B, 2, H, W) + valid_mask: tensor of shape (B, H, W) or None + + Returns: + photometric loss of shape + + """ + torch._assert( + source.ndim == 4, + "FlowPhotoMetricLoss: source must have 4 dimensions, but got {}".format(source.ndim), + ) + torch._assert( + reference.ndim == source.ndim, + "FlowPhotoMetricLoss: source and other must have the same number of dimensions, but got {} and {}".format( + source.ndim, reference.ndim + ), + ) + torch._assert( + flow_pred.shape[1] == 2, + "FlowPhotoMetricLoss: flow_pred must have 2 channels, but got {}".format(flow_pred.shape[1]), + ) + torch._assert( + flow_pred.ndim == 4, + "FlowPhotoMetricLoss: flow_pred must have 4 dimensions, but got {}".format(flow_pred.ndim), + ) + + B, C, H, W = source.shape + flow_channels = flow_pred.shape[1] + + max_displacements = [] + for dim in range(flow_channels): + shape_index = -1 - dim + max_displacements.append(int(self._max_displacement_ratio * source.shape[shape_index])) + + # mask out all pixels that have larger flow than the max flow allowed + max_flow_mask = torch.logical_and( + *[flow_pred[:, dim, :, :] < max_displacements[dim] for dim in range(flow_channels)] + ) + + if valid_mask is not None: + valid_mask = torch.logical_and(valid_mask, max_flow_mask).unsqueeze(1) + else: + valid_mask = max_flow_mask.unsqueeze(1) + + grid = make_coords_grid(B, H, W, device=str(source.device)) + resampled_grids = grid - flow_pred + resampled_grids = resampled_grids.permute(0, 2, 3, 1) + resampled_source = grid_sample(reference, resampled_grids, mode="bilinear") + + # compute SSIM loss + ssim_loss = self._ssim_loss(resampled_source * valid_mask, source * valid_mask) + l1_loss = (resampled_source * valid_mask - source * valid_mask).abs().mean(axis=(-3, -2, -1)) + loss = self._L1_weight * l1_loss + self._SSIM_weight * ssim_loss + + return loss.mean() diff --git a/references/depth/stereo/utils/metrics.py b/references/depth/stereo/utils/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..05b149fb048b70a95e32b485ac91de7de45c237a --- /dev/null +++ b/references/depth/stereo/utils/metrics.py @@ -0,0 +1,49 @@ +from typing import Dict, List, Optional, Tuple + +from torch import Tensor + +AVAILABLE_METRICS = ["mae", "rmse", "epe", "bad1", "bad2", "epe", "1px", "3px", 
"5px", "fl-all", "relepe"] + + +def compute_metrics( + flow_pred: Tensor, flow_gt: Tensor, valid_flow_mask: Optional[Tensor], metrics: List[str] +) -> Tuple[Dict[str, float], int]: + for m in metrics: + if m not in AVAILABLE_METRICS: + raise ValueError(f"Invalid metric: {m}. Valid metrics are: {AVAILABLE_METRICS}") + + metrics_dict = {} + + pixels_diffs = (flow_pred - flow_gt).abs() + # there is no Y flow in Stereo Matching, therefore flow.abs() = flow.pow(2).sum(dim=1).sqrt() + flow_norm = flow_gt.abs() + + if valid_flow_mask is not None: + valid_flow_mask = valid_flow_mask.unsqueeze(1) + pixels_diffs = pixels_diffs[valid_flow_mask] + flow_norm = flow_norm[valid_flow_mask] + + num_pixels = pixels_diffs.numel() + if "bad1" in metrics: + metrics_dict["bad1"] = (pixels_diffs > 1).float().mean().item() + if "bad2" in metrics: + metrics_dict["bad2"] = (pixels_diffs > 2).float().mean().item() + + if "mae" in metrics: + metrics_dict["mae"] = pixels_diffs.mean().item() + if "rmse" in metrics: + metrics_dict["rmse"] = pixels_diffs.pow(2).mean().sqrt().item() + if "epe" in metrics: + metrics_dict["epe"] = pixels_diffs.mean().item() + if "1px" in metrics: + metrics_dict["1px"] = (pixels_diffs < 1).float().mean().item() + if "3px" in metrics: + metrics_dict["3px"] = (pixels_diffs < 3).float().mean().item() + if "5px" in metrics: + metrics_dict["5px"] = (pixels_diffs < 5).float().mean().item() + if "fl-all" in metrics: + metrics_dict["fl-all"] = ((pixels_diffs < 3) & ((pixels_diffs / flow_norm) < 0.05)).float().mean().item() * 100 + if "relepe" in metrics: + metrics_dict["relepe"] = (pixels_diffs / flow_norm).mean().item() + + return metrics_dict, num_pixels diff --git a/references/depth/stereo/visualization.py b/references/depth/stereo/visualization.py new file mode 100644 index 0000000000000000000000000000000000000000..d043d274614969d206159e5bc4973e6844cf39f7 --- /dev/null +++ b/references/depth/stereo/visualization.py @@ -0,0 +1,126 @@ +import os +from typing import List + +import numpy as np +import torch +from torch import Tensor +from torchvision.utils import make_grid + + +@torch.no_grad() +def make_disparity_image(disparity: Tensor): + # normalize image to [0, 1] + disparity = disparity.detach().cpu() + disparity = (disparity - disparity.min()) / (disparity.max() - disparity.min()) + return disparity + + +@torch.no_grad() +def make_disparity_image_pairs(disparity: Tensor, image: Tensor): + disparity = make_disparity_image(disparity) + # image is in [-1, 1], bring it to [0, 1] + image = image.detach().cpu() + image = image * 0.5 + 0.5 + return disparity, image + + +@torch.no_grad() +def make_disparity_sequence(disparities: List[Tensor]): + # convert each disparity to [0, 1] + for idx, disparity_batch in enumerate(disparities): + disparities[idx] = torch.stack(list(map(make_disparity_image, disparity_batch))) + # make the list into a batch + disparity_sequences = torch.stack(disparities) + return disparity_sequences + + +@torch.no_grad() +def make_pair_grid(*inputs, orientation="horizontal"): + # make a grid of images with the outputs and references side by side + if orientation == "horizontal": + # interleave the outputs and references + canvas = torch.zeros_like(inputs[0]) + canvas = torch.cat([canvas] * len(inputs), dim=0) + size = len(inputs) + for idx, inp in enumerate(inputs): + canvas[idx::size, ...] 
= inp + grid = make_grid(canvas, nrow=len(inputs), padding=16, normalize=True, scale_each=True) + elif orientation == "vertical": + # interleave the outputs and references + canvas = torch.cat(inputs, dim=0) + size = len(inputs) + for idx, inp in enumerate(inputs): + canvas[idx::size, ...] = inp + grid = make_grid(canvas, nrow=len(inputs[0]), padding=16, normalize=True, scale_each=True) + else: + raise ValueError("Unknown orientation: {}".format(orientation)) + return grid + + +@torch.no_grad() +def make_training_sample_grid( + left_images: Tensor, + right_images: Tensor, + disparities: Tensor, + masks: Tensor, + predictions: List[Tensor], +) -> np.ndarray: + # detach images and renormalize to [0, 1] + images_left = left_images.detach().cpu() * 0.5 + 0.5 + images_right = right_images.detach().cpu() * 0.5 + 0.5 + # detach the disparities and predictions + disparities = disparities.detach().cpu() + predictions = predictions[-1].detach().cpu() + # keep only the first channel of pixels, and repeat it 3 times + disparities = disparities[:, :1, ...].repeat(1, 3, 1, 1) + predictions = predictions[:, :1, ...].repeat(1, 3, 1, 1) + # unsqueeze and repeat the masks + masks = masks.detach().cpu().unsqueeze(1).repeat(1, 3, 1, 1) + # make a grid that will self normalize across the batch + pred_grid = make_pair_grid(images_left, images_right, masks, disparities, predictions, orientation="horizontal") + pred_grid = pred_grid.permute(1, 2, 0).numpy() + pred_grid = (pred_grid * 255).astype(np.uint8) + return pred_grid + + +@torch.no_grad() +def make_disparity_sequence_grid(predictions: List[Tensor], disparities: Tensor) -> np.ndarray: + # right most we will be adding the ground truth + seq_len = len(predictions) + 1 + predictions = list(map(lambda x: x[:, :1, :, :].detach().cpu(), predictions + [disparities])) + sequence = make_disparity_sequence(predictions) + # swap axes to have them in the correct order for each batch sample + sequence = torch.swapaxes(sequence, 0, 1).contiguous().reshape(-1, 1, disparities.shape[-2], disparities.shape[-1]) + sequence = make_grid(sequence, nrow=seq_len, padding=16, normalize=True, scale_each=True) + sequence = sequence.permute(1, 2, 0).numpy() + sequence = (sequence * 255).astype(np.uint8) + return sequence + + +@torch.no_grad() +def make_prediction_image_side_to_side( + predictions: Tensor, disparities: Tensor, valid_mask: Tensor, save_path: str, prefix: str +) -> None: + import matplotlib.pyplot as plt + + # normalize the predictions and disparities in [0, 1] + predictions = (predictions - predictions.min()) / (predictions.max() - predictions.min()) + disparities = (disparities - disparities.min()) / (disparities.max() - disparities.min()) + predictions = predictions * valid_mask + disparities = disparities * valid_mask + + predictions = predictions.detach().cpu() + disparities = disparities.detach().cpu() + + for idx, (pred, gt) in enumerate(zip(predictions, disparities)): + pred = pred.permute(1, 2, 0).numpy() + gt = gt.permute(1, 2, 0).numpy() + # plot pred and gt side by side + fig, ax = plt.subplots(1, 2, figsize=(10, 5)) + ax[0].imshow(pred) + ax[0].set_title("Prediction") + ax[1].imshow(gt) + ax[1].set_title("Ground Truth") + save_name = os.path.join(save_path, "{}_{}.png".format(prefix, idx)) + plt.savefig(save_name) + plt.close() diff --git a/references/detection/coco_utils.py b/references/detection/coco_utils.py index 396de63297ba2e38cc307d9aff1334704edb0298..f40dcdff783d7a2ce26d5e453c13dd23b52cc212 100644 --- a/references/detection/coco_utils.py +++ 
b/references/detection/coco_utils.py @@ -1,4 +1,3 @@ -import copy import os import torch @@ -9,24 +8,6 @@ from pycocotools import mask as coco_mask from pycocotools.coco import COCO -class FilterAndRemapCocoCategories: - def __init__(self, categories, remap=True): - self.categories = categories - self.remap = remap - - def __call__(self, image, target): - anno = target["annotations"] - anno = [obj for obj in anno if obj["category_id"] in self.categories] - if not self.remap: - target["annotations"] = anno - return image, target - anno = copy.deepcopy(anno) - for obj in anno: - obj["category_id"] = self.categories.index(obj["category_id"]) - target["annotations"] = anno - return image, target - - def convert_coco_poly_to_mask(segmentations, height, width): masks = [] for polygons in segmentations: @@ -49,7 +30,6 @@ class ConvertCocoPolysToMask: w, h = image.size image_id = target["image_id"] - image_id = torch.tensor([image_id]) anno = target["annotations"] @@ -116,7 +96,7 @@ def _coco_remove_images_without_annotations(dataset, cat_list=None): # if all boxes have close to zero area, there is no annotation if _has_only_empty_bbox(anno): return False - # keypoints task have a slight different critera for considering + # keypoints task have a slight different criteria for considering # if an annotation is valid if "keypoints" not in anno[0]: return True @@ -126,10 +106,6 @@ def _coco_remove_images_without_annotations(dataset, cat_list=None): return True return False - if not isinstance(dataset, torchvision.datasets.CocoDetection): - raise TypeError( - f"This function expects dataset of type torchvision.datasets.CocoDetection, instead got {type(dataset)}" - ) ids = [] for ds_idx, img_id in enumerate(dataset.ids): ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) @@ -153,7 +129,7 @@ def convert_to_coco_api(ds): # find better way to get target # targets = ds.get_annotations(img_idx) img, targets = ds[img_idx] - image_id = targets["image_id"].item() + image_id = targets["image_id"] img_dict = {} img_dict["id"] = image_id img_dict["height"] = img.shape[-2] @@ -196,6 +172,7 @@ def convert_to_coco_api(ds): def get_coco_api_from_dataset(dataset): + # FIXME: This is... awful? 
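The `# FIXME` above refers to the unwrapping loop that follows: it peels `torch.utils.data.Subset` layers until it reaches the underlying `CocoDetection` dataset and its COCO API handle. A hedged sketch of that idea in isolation (the helper name is made up):

```python
import torch.utils.data
import torchvision

def find_coco_api(dataset, max_depth: int = 10):
    # walk down nested Subset wrappers looking for a CocoDetection instance
    for _ in range(max_depth):
        if isinstance(dataset, torchvision.datasets.CocoDetection):
            return dataset.coco  # the pycocotools COCO object
        if isinstance(dataset, torch.utils.data.Subset):
            dataset = dataset.dataset
        else:
            break
    return None  # caller falls back to converting the dataset into COCO format
```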
for _ in range(10): if isinstance(dataset, torchvision.datasets.CocoDetection): break @@ -220,7 +197,7 @@ class CocoDetection(torchvision.datasets.CocoDetection): return img, target -def get_coco(root, image_set, transforms, mode="instances"): +def get_coco(root, image_set, transforms, mode="instances", use_v2=False, with_masks=False): anno_file_template = "{}_{}2017.json" PATHS = { "train": ("train2017", os.path.join("annotations", anno_file_template.format(mode, "train"))), @@ -228,17 +205,26 @@ def get_coco(root, image_set, transforms, mode="instances"): # "train": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))) } - t = [ConvertCocoPolysToMask()] - - if transforms is not None: - t.append(transforms) - transforms = T.Compose(t) - img_folder, ann_file = PATHS[image_set] img_folder = os.path.join(root, img_folder) ann_file = os.path.join(root, ann_file) - dataset = CocoDetection(img_folder, ann_file, transforms=transforms) + if use_v2: + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + target_keys = ["boxes", "labels", "image_id"] + if with_masks: + target_keys += ["masks"] + dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + else: + # TODO: handle with_masks for V1? + t = [ConvertCocoPolysToMask()] + if transforms is not None: + t.append(transforms) + transforms = T.Compose(t) + + dataset = CocoDetection(img_folder, ann_file, transforms=transforms) if image_set == "train": dataset = _coco_remove_images_without_annotations(dataset) @@ -246,7 +232,3 @@ def get_coco(root, image_set, transforms, mode="instances"): # dataset = torch.utils.data.Subset(dataset, [i for i in range(500)]) return dataset - - -def get_coco_kp(root, image_set, transforms): - return get_coco(root, image_set, transforms, mode="person_keypoints") diff --git a/references/detection/engine.py b/references/detection/engine.py index 0e5d55f189d482d34371f49c719c51660228244e..0e9bfffdf8af566c4bc13436361005c1e7b84dcb 100644 --- a/references/detection/engine.py +++ b/references/detection/engine.py @@ -26,7 +26,7 @@ def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq, sc for images, targets in metric_logger.log_every(data_loader, print_freq, header): images = list(image.to(device) for image in images) - targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + targets = [{k: v.to(device) if isinstance(v, torch.Tensor) else v for k, v in t.items()} for t in targets] with torch.cuda.amp.autocast(enabled=scaler is not None): loss_dict = model(images, targets) losses = sum(loss for loss in loss_dict.values()) @@ -97,7 +97,7 @@ def evaluate(model, data_loader, device): outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] model_time = time.time() - model_time - res = {target["image_id"].item(): output for target, output in zip(targets, outputs)} + res = {target["image_id"]: output for target, output in zip(targets, outputs)} evaluator_time = time.time() coco_evaluator.update(res) evaluator_time = time.time() - evaluator_time diff --git a/references/detection/group_by_aspect_ratio.py b/references/detection/group_by_aspect_ratio.py index 5312cc036d61f730771dbf3df9b6b1c4416dc2dd..d12e14b540cc788abb98f40134ca9738dcd88a9a 100644 --- a/references/detection/group_by_aspect_ratio.py +++ b/references/detection/group_by_aspect_ratio.py @@ -63,7 +63,7 @@ class GroupedBatchSampler(BatchSampler): expected_num_batches = 
len(self) num_remaining = expected_num_batches - num_batches if num_remaining > 0: - # for the remaining batches, take first the buffers with largest number + # for the remaining batches, take first the buffers with the largest number # of elements for group_id, _ in sorted(buffer_per_group.items(), key=lambda x: len(x[1]), reverse=True): remaining = self.batch_size - len(buffer_per_group[group_id]) diff --git a/references/detection/presets.py b/references/detection/presets.py index 779f3f218ca0b4092c2d32374ccfdfb9a41369f9..e9b6d56c8861263fbe70acc1f6e01bb56f172e2b 100644 --- a/references/detection/presets.py +++ b/references/detection/presets.py @@ -1,73 +1,114 @@ +from collections import defaultdict + import torch -import transforms as T +import transforms as reference_transforms + + +def get_modules(use_v2): + # We need a protected import to avoid the V2 warning in case just V1 is used + if use_v2: + import torchvision.transforms.v2 + import torchvision.tv_tensors + + return torchvision.transforms.v2, torchvision.tv_tensors + else: + return reference_transforms, None class DetectionPresetTrain: - def __init__(self, *, data_augmentation, hflip_prob=0.5, mean=(123.0, 117.0, 104.0)): + # Note: this transform assumes that the input to forward() are always PIL + # images, regardless of the backend parameter. + def __init__( + self, + *, + data_augmentation, + hflip_prob=0.5, + mean=(123.0, 117.0, 104.0), + backend="pil", + use_v2=False, + ): + + T, tv_tensors = get_modules(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tv_tensor": + transforms.append(T.ToImage()) + elif backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + if data_augmentation == "hflip": - self.transforms = T.Compose( - [ - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [T.RandomHorizontalFlip(p=hflip_prob)] elif data_augmentation == "lsj": - self.transforms = T.Compose( - [ - T.ScaleJitter(target_size=(1024, 1024)), - T.FixedSizeCrop(size=(1024, 1024), fill=mean), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [ + T.ScaleJitter(target_size=(1024, 1024), antialias=True), + # TODO: FixedSizeCrop below doesn't work on tensors! 
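Stepping back from the individual augmentation policies: the new `backend` argument only decides which transform opens the pipeline, everything else is appended after it. A hedged, standalone sketch of that dispatch with `torchvision.transforms.v2` (the helper below is illustrative, not part of the reference code):

```python
import torch
from torchvision.transforms import v2 as T

def leading_transforms(backend: str):
    backend = backend.lower()
    if backend == "tv_tensor":
        return [T.ToImage()]      # wrap the input into a tv_tensors.Image right away
    if backend == "tensor":
        return [T.PILToTensor()]  # plain uint8 tensor, converted from PIL up front
    if backend == "pil":
        return []                 # stay in PIL until the end of the pipeline
    raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}")

pipeline = T.Compose(
    leading_transforms("tv_tensor")
    + [T.RandomHorizontalFlip(p=0.5), T.ToDtype(torch.float, scale=True)]
)
```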
+ reference_transforms.FixedSizeCrop(size=(1024, 1024), fill=mean), + T.RandomHorizontalFlip(p=hflip_prob), + ] elif data_augmentation == "multiscale": - self.transforms = T.Compose( - [ - T.RandomShortestSize( - min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 - ), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [ + T.RandomShortestSize(min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333), + T.RandomHorizontalFlip(p=hflip_prob), + ] elif data_augmentation == "ssd": - self.transforms = T.Compose( - [ - T.RandomPhotometricDistort(), - T.RandomZoomOut(fill=list(mean)), - T.RandomIoUCrop(), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + fill = defaultdict(lambda: mean, {tv_tensors.Mask: 0}) if use_v2 else list(mean) + transforms += [ + T.RandomPhotometricDistort(), + T.RandomZoomOut(fill=fill), + T.RandomIoUCrop(), + T.RandomHorizontalFlip(p=hflip_prob), + ] elif data_augmentation == "ssdlite": - self.transforms = T.Compose( - [ - T.RandomIoUCrop(), - T.RandomHorizontalFlip(p=hflip_prob), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + transforms += [ + T.RandomIoUCrop(), + T.RandomHorizontalFlip(p=hflip_prob), + ] else: raise ValueError(f'Unknown data augmentation policy "{data_augmentation}"') + if backend == "pil": + # Note: we could just convert to pure tensors even in v2. + transforms += [T.ToImage() if use_v2 else T.PILToTensor()] + + transforms += [T.ToDtype(torch.float, scale=True)] + + if use_v2: + transforms += [ + T.ConvertBoundingBoxFormat(tv_tensors.BoundingBoxFormat.XYXY), + T.SanitizeBoundingBoxes(), + T.ToPureTensor(), + ] + + self.transforms = T.Compose(transforms) + def __call__(self, img, target): return self.transforms(img, target) class DetectionPresetEval: - def __init__(self): - self.transforms = T.Compose( - [ - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - ] - ) + def __init__(self, backend="pil", use_v2=False): + T, _ = get_modules(use_v2) + transforms = [] + backend = backend.lower() + if backend == "pil": + # Note: we could just convert to pure tensors even in v2? 
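For orientation, the v2 training pipeline assembled above for the plain `hflip` policy with the `tv_tensor` backend composes to roughly the following (a sketch, not the exact object the preset builds):

```python
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2 as T

hflip_train_v2 = T.Compose([
    T.ToImage(),                                                    # tv_tensor backend
    T.RandomHorizontalFlip(p=0.5),
    T.ToDtype(torch.float, scale=True),                             # uint8 [0, 255] -> float [0, 1]
    T.ConvertBoundingBoxFormat(tv_tensors.BoundingBoxFormat.XYXY),
    T.SanitizeBoundingBoxes(),                                      # drop degenerate boxes and their labels
    T.ToPureTensor(),                                               # hand plain tensors to the training loop
])
```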
+ transforms += [T.ToImage() if use_v2 else T.PILToTensor()] + elif backend == "tensor": + transforms += [T.PILToTensor()] + elif backend == "tv_tensor": + transforms += [T.ToImage()] + else: + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + + transforms += [T.ToDtype(torch.float, scale=True)] + + if use_v2: + transforms += [T.ToPureTensor()] + + self.transforms = T.Compose(transforms) def __call__(self, img, target): return self.transforms(img, target) diff --git a/references/detection/train.py b/references/detection/train.py index dea483c5f7537ded5da48bd523be8394928714e4..d165a2d3598da093b41c3df145cd3a732d04d56a 100644 --- a/references/detection/train.py +++ b/references/detection/train.py @@ -28,7 +28,7 @@ import torchvision import torchvision.models.detection import torchvision.models.detection.mask_rcnn import utils -from coco_utils import get_coco, get_coco_kp +from coco_utils import get_coco from engine import evaluate, train_one_epoch from group_by_aspect_ratio import create_aspect_ratio_groups, GroupedBatchSampler from torchvision.transforms import InterpolationMode @@ -40,23 +40,32 @@ def copypaste_collate_fn(batch): return copypaste(*utils.collate_fn(batch)) -def get_dataset(name, image_set, transform, data_path): - paths = {"coco": (data_path, get_coco, 91), "coco_kp": (data_path, get_coco_kp, 2)} - p, ds_fn, num_classes = paths[name] - - ds = ds_fn(p, image_set=image_set, transforms=transform) +def get_dataset(is_train, args): + image_set = "train" if is_train else "val" + num_classes, mode = {"coco": (91, "instances"), "coco_kp": (2, "person_keypoints")}[args.dataset] + with_masks = "mask" in args.model + ds = get_coco( + root=args.data_path, + image_set=image_set, + transforms=get_transform(is_train, args), + mode=mode, + use_v2=args.use_v2, + with_masks=with_masks, + ) return ds, num_classes -def get_transform(train, args): - if train: - return presets.DetectionPresetTrain(data_augmentation=args.data_augmentation) +def get_transform(is_train, args): + if is_train: + return presets.DetectionPresetTrain( + data_augmentation=args.data_augmentation, backend=args.backend, use_v2=args.use_v2 + ) elif args.weights and args.test_only: weights = torchvision.models.get_weight(args.weights) trans = weights.transforms() return lambda img, target: (trans(img), target) else: - return presets.DetectionPresetEval() + return presets.DetectionPresetEval(backend=args.backend, use_v2=args.use_v2) def get_args_parser(add_help=True): @@ -65,7 +74,12 @@ def get_args_parser(add_help=True): parser = argparse.ArgumentParser(description="PyTorch Detection Training", add_help=add_help) parser.add_argument("--data-path", default="/datasets01/COCO/022719/", type=str, help="dataset path") - parser.add_argument("--dataset", default="coco", type=str, help="dataset name") + parser.add_argument( + "--dataset", + default="coco", + type=str, + help="dataset name. Use coco for object detection and instance segmentation and coco_kp for Keypoint detection", + ) parser.add_argument("--model", default="maskrcnn_resnet50_fpn", type=str, help="model name") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( @@ -159,10 +173,22 @@ def get_args_parser(add_help=True): help="Use CopyPaste data augmentation. 
Works only with data-augmentation='lsj'.", ) + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") + return parser def main(args): + if args.backend.lower() == "tv_tensor" and not args.use_v2: + raise ValueError("Use --use-v2 if you want to use the tv_tensor backend.") + if args.dataset not in ("coco", "coco_kp"): + raise ValueError(f"Dataset should be coco or coco_kp, got {args.dataset}") + if "keypoint" in args.model and args.dataset != "coco_kp": + raise ValueError("Oops, if you want Keypoint detection, set --dataset coco_kp") + if args.dataset == "coco_kp" and args.use_v2: + raise ValueError("KeyPoint detection doesn't support V2 transforms yet") + if args.output_dir: utils.mkdir(args.output_dir) @@ -177,8 +203,8 @@ def main(args): # Data loading code print("Loading data") - dataset, num_classes = get_dataset(args.dataset, "train", get_transform(True, args), args.data_path) - dataset_test, _ = get_dataset(args.dataset, "val", get_transform(False, args), args.data_path) + dataset, num_classes = get_dataset(is_train=True, args=args) + dataset_test, _ = get_dataset(is_train=False, args=args) print("Creating data loaders") if args.distributed: diff --git a/references/detection/transforms.py b/references/detection/transforms.py index d26bf6eac8566e21f6c9e7a50f1c38f28af20aa5..e07ccfc992153960b5360b59f24b33585ec62130 100644 --- a/references/detection/transforms.py +++ b/references/detection/transforms.py @@ -53,14 +53,17 @@ class PILToTensor(nn.Module): return image, target -class ConvertImageDtype(nn.Module): - def __init__(self, dtype: torch.dtype) -> None: +class ToDtype(nn.Module): + def __init__(self, dtype: torch.dtype, scale: bool = False) -> None: super().__init__() self.dtype = dtype + self.scale = scale def forward( self, image: Tensor, target: Optional[Dict[str, Tensor]] = None ) -> Tuple[Tensor, Optional[Dict[str, Tensor]]]: + if not self.scale: + return image.to(dtype=self.dtype), target image = F.convert_image_dtype(image, self.dtype) return image, target @@ -293,11 +296,13 @@ class ScaleJitter(nn.Module): target_size: Tuple[int, int], scale_range: Tuple[float, float] = (0.1, 2.0), interpolation: InterpolationMode = InterpolationMode.BILINEAR, + antialias=True, ): super().__init__() self.target_size = target_size self.scale_range = scale_range self.interpolation = interpolation + self.antialias = antialias def forward( self, image: Tensor, target: Optional[Dict[str, Tensor]] = None @@ -315,14 +320,17 @@ class ScaleJitter(nn.Module): new_width = int(orig_width * r) new_height = int(orig_height * r) - image = F.resize(image, [new_height, new_width], interpolation=self.interpolation) + image = F.resize(image, [new_height, new_width], interpolation=self.interpolation, antialias=self.antialias) if target is not None: target["boxes"][:, 0::2] *= new_width / orig_width target["boxes"][:, 1::2] *= new_height / orig_height if "masks" in target: target["masks"] = F.resize( - target["masks"], [new_height, new_width], interpolation=InterpolationMode.NEAREST + target["masks"], + [new_height, new_width], + interpolation=InterpolationMode.NEAREST, + antialias=self.antialias, ) return image, target diff --git a/references/optical_flow/README.md b/references/optical_flow/README.md index a7ac0223739ab76bffe1999bd701dd74f37ee93b..6ad1d4079f7629d92421accc9bcce2ee391afd9f 100644 --- a/references/optical_flow/README.md +++ b/references/optical_flow/README.md @@ -56,7 
+56,7 @@ torchrun --nproc_per_node 1 --nnodes 1 train.py --val-dataset sintel --batch-siz This should give an epe of about 1.3822 on the clean pass and 2.7161 on the final pass of Sintel-train. Results may vary slightly depending on the batch -size and the number of GPUs. For the most accurate resuts use 1 GPU and +size and the number of GPUs. For the most accurate results use 1 GPU and `--batch-size 1`: ``` diff --git a/references/optical_flow/train.py b/references/optical_flow/train.py index be6ffe4ccefaa8ad2dd3544893b66f08f83449ce..ab99cc3ae55bef3f8c110b357d44543752abfee7 100644 --- a/references/optical_flow/train.py +++ b/references/optical_flow/train.py @@ -82,7 +82,7 @@ def _evaluate(model, args, val_dataset, *, padder_mode, num_flow_updates=None, b def inner_loop(blob): if blob[0].dim() == 3: - # input is not batched so we add an extra dim for consistency + # input is not batched, so we add an extra dim for consistency blob = [x[None, :, :, :] if x is not None else None for x in blob] image1, image2, flow_gt = blob[:3] @@ -150,7 +150,7 @@ def evaluate(model, args): for name in val_datasets: if name == "kitti": - # Kitti has different image sizes so we need to individually pad them, we can't batch. + # Kitti has different image sizes, so we need to individually pad them, we can't batch. # see comment in InputPadder if args.batch_size != 1 and (not args.distributed or args.rank == 0): warnings.warn( diff --git a/references/optical_flow/transforms.py b/references/optical_flow/transforms.py index 6011608183a7e5d8d1a7ecb1675b96fe0c430972..bc831a2ee52cb7ad1b87162c3035d134249ba633 100644 --- a/references/optical_flow/transforms.py +++ b/references/optical_flow/transforms.py @@ -164,7 +164,7 @@ class RandomResizeAndCrop(torch.nn.Module): # The reason we don't rely on RandomResizedCrop is because of a significant # difference in the parametrization of both transforms, in particular, # because of the way the random parameters are sampled in both transforms, - # which leads to fairly different resuts (and different epe). For more details see + # which leads to fairly different results (and different epe). For more details see # https://github.com/pytorch/vision/pull/5026/files#r762932579 def __init__(self, crop_size, min_scale=-0.2, max_scale=0.5, stretch_prob=0.8): super().__init__() @@ -196,8 +196,12 @@ class RandomResizeAndCrop(torch.nn.Module): if torch.rand(1).item() < self.resize_prob: # rescale the images - img1 = F.resize(img1, size=(new_h, new_w)) - img2 = F.resize(img2, size=(new_h, new_w)) + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the OF models with antialias=True? + img1 = F.resize(img1, size=(new_h, new_w), antialias=False) + img2 = F.resize(img2, size=(new_h, new_w), antialias=False) if valid_flow_mask is None: flow = F.resize(flow, size=(new_h, new_w)) flow = flow * torch.tensor([scale_x, scale_y])[:, None, None] @@ -208,7 +212,7 @@ class RandomResizeAndCrop(torch.nn.Module): # Note: For sparse datasets (Kitti), the original code uses a "margin" # See e.g. 
https://github.com/princeton-vl/RAFT/blob/master/core/utils/augmentor.py#L220:L220 - # We don't, not sure it matters much + # We don't, not sure if it matters much y0 = torch.randint(0, img1.shape[1] - self.crop_size[0], size=(1,)).item() x0 = torch.randint(0, img1.shape[2] - self.crop_size[1], size=(1,)).item() diff --git a/references/optical_flow/utils.py b/references/optical_flow/utils.py index 8b07e9de35c075424e9756a8ebe7ec13d4afb520..cd4b16eb0d8c9ed773d284e8702e6d87e687733f 100644 --- a/references/optical_flow/utils.py +++ b/references/optical_flow/utils.py @@ -181,7 +181,7 @@ def sequence_loss(flow_preds, flow_gt, valid_flow_mask, gamma=0.8, max_flow=400) if gamma > 1: raise ValueError(f"Gamma should be < 1, got {gamma}.") - # exlude invalid pixels and extremely large diplacements + # exclude invalid pixels and extremely large diplacements flow_norm = torch.sum(flow_gt**2, dim=1).sqrt() valid_flow_mask = valid_flow_mask & (flow_norm < max_flow) @@ -248,7 +248,7 @@ def setup_ddp(args): # https://discuss.pytorch.org/t/what-is-the-difference-between-rank-and-local-rank/61940/2 if all(key in os.environ for key in ("LOCAL_RANK", "RANK", "WORLD_SIZE")): - # if we're here, the script was called with torchrun. Otherwise + # if we're here, the script was called with torchrun. Otherwise, # these args will be set already by the run_with_submitit script args.local_rank = int(os.environ["LOCAL_RANK"]) args.rank = int(os.environ["RANK"]) diff --git a/references/segmentation/coco_utils.py b/references/segmentation/coco_utils.py index e02434012f1fc3517a284be04aa058a3ac32b79d..6a15dbefb526c1b01085ed05de0452b5e24d7c30 100644 --- a/references/segmentation/coco_utils.py +++ b/references/segmentation/coco_utils.py @@ -68,11 +68,6 @@ def _coco_remove_images_without_annotations(dataset, cat_list=None): # if more than 1k pixels occupied in the image return sum(obj["area"] for obj in anno) > 1000 - if not isinstance(dataset, torchvision.datasets.CocoDetection): - raise TypeError( - f"This function expects dataset of type torchvision.datasets.CocoDetection, instead got {type(dataset)}" - ) - ids = [] for ds_idx, img_id in enumerate(dataset.ids): ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) @@ -86,7 +81,7 @@ def _coco_remove_images_without_annotations(dataset, cat_list=None): return dataset -def get_coco(root, image_set, transforms): +def get_coco(root, image_set, transforms, use_v2=False): PATHS = { "train": ("train2017", os.path.join("annotations", "instances_train2017.json")), "val": ("val2017", os.path.join("annotations", "instances_val2017.json")), @@ -94,13 +89,24 @@ def get_coco(root, image_set, transforms): } CAT_LIST = [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72] - transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms]) - img_folder, ann_file = PATHS[image_set] img_folder = os.path.join(root, img_folder) ann_file = os.path.join(root, ann_file) - dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + # The 2 "Compose" below achieve the same thing: converting coco detection + # samples into segmentation-compatible samples. They just do it with + # slightly different implementations. 
We could refactor and unify, but + # keeping them separate helps keeping the v2 version clean + if use_v2: + import v2_extras + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + transforms = Compose([v2_extras.CocoDetectionToVOCSegmentation(), transforms]) + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) + dataset = wrap_dataset_for_transforms_v2(dataset, target_keys={"masks", "labels"}) + else: + transforms = Compose([FilterAndRemapCocoCategories(CAT_LIST, remap=True), ConvertCocoPolysToMask(), transforms]) + dataset = torchvision.datasets.CocoDetection(img_folder, ann_file, transforms=transforms) if image_set == "train": dataset = _coco_remove_images_without_annotations(dataset, CAT_LIST) diff --git a/references/segmentation/presets.py b/references/segmentation/presets.py index ed02ae660e4e4272b37cf1970ed3e1d8dcfeba2c..803769fcafce82d15f25637e67918dfa0f2d003b 100644 --- a/references/segmentation/presets.py +++ b/references/segmentation/presets.py @@ -1,39 +1,109 @@ import torch -import transforms as T + + +def get_modules(use_v2): + # We need a protected import to avoid the V2 warning in case just V1 is used + if use_v2: + import torchvision.transforms.v2 + import torchvision.tv_tensors + import v2_extras + + return torchvision.transforms.v2, torchvision.tv_tensors, v2_extras + else: + import transforms + + return transforms, None, None class SegmentationPresetTrain: - def __init__(self, *, base_size, crop_size, hflip_prob=0.5, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)): - min_size = int(0.5 * base_size) - max_size = int(2.0 * base_size) + def __init__( + self, + *, + base_size, + crop_size, + hflip_prob=0.5, + mean=(0.485, 0.456, 0.406), + std=(0.229, 0.224, 0.225), + backend="pil", + use_v2=False, + ): + T, tv_tensors, v2_extras = get_modules(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tv_tensor": + transforms.append(T.ToImage()) + elif backend == "tensor": + transforms.append(T.PILToTensor()) + elif backend != "pil": + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + + transforms += [T.RandomResize(min_size=int(0.5 * base_size), max_size=int(2.0 * base_size))] - trans = [T.RandomResize(min_size, max_size)] if hflip_prob > 0: - trans.append(T.RandomHorizontalFlip(hflip_prob)) - trans.extend( - [ - T.RandomCrop(crop_size), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - T.Normalize(mean=mean, std=std), + transforms += [T.RandomHorizontalFlip(hflip_prob)] + + if use_v2: + # We need a custom pad transform here, since the padding we want to perform here is fundamentally + # different from the padding in `RandomCrop` if `pad_if_needed=True`. 
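The custom pad used right below relies on the v2 convention that `fill` may be a mapping from input type to fill value, so images and masks are padded differently (masks get the ignore index 255). A minimal, hedged illustration of that convention with the public `v2.Pad` transform (sizes and values here are made up):

```python
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2 as T

image = tv_tensors.Image(torch.ones(3, 400, 400, dtype=torch.uint8))
mask = tv_tensors.Mask(torch.zeros(400, 400, dtype=torch.uint8))

# pad 80 pixels on the right and bottom: images are filled with 0, masks with 255
pad = T.Pad(padding=[0, 0, 80, 80], fill={tv_tensors.Mask: 255, "others": 0})
padded_image, padded_mask = pad(image, mask)

assert padded_image.shape[-2:] == (480, 480)
assert int(padded_mask[-1, -1]) == 255 and int(padded_image[0, -1, -1]) == 0
```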
+ transforms += [v2_extras.PadIfSmaller(crop_size, fill={tv_tensors.Mask: 255, "others": 0})] + + transforms += [T.RandomCrop(crop_size)] + + if backend == "pil": + transforms += [T.PILToTensor()] + + if use_v2: + img_type = tv_tensors.Image if backend == "tv_tensor" else torch.Tensor + transforms += [ + T.ToDtype(dtype={img_type: torch.float32, tv_tensors.Mask: torch.int64, "others": None}, scale=True) ] - ) - self.transforms = T.Compose(trans) + else: + # No need to explicitly convert masks as they're magically int64 already + transforms += [T.ToDtype(torch.float, scale=True)] + + transforms += [T.Normalize(mean=mean, std=std)] + if use_v2: + transforms += [T.ToPureTensor()] + + self.transforms = T.Compose(transforms) def __call__(self, img, target): return self.transforms(img, target) class SegmentationPresetEval: - def __init__(self, *, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)): - self.transforms = T.Compose( - [ - T.RandomResize(base_size, base_size), - T.PILToTensor(), - T.ConvertImageDtype(torch.float), - T.Normalize(mean=mean, std=std), - ] - ) + def __init__( + self, *, base_size, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225), backend="pil", use_v2=False + ): + T, _, _ = get_modules(use_v2) + + transforms = [] + backend = backend.lower() + if backend == "tensor": + transforms += [T.PILToTensor()] + elif backend == "tv_tensor": + transforms += [T.ToImage()] + elif backend != "pil": + raise ValueError(f"backend can be 'tv_tensor', 'tensor' or 'pil', but got {backend}") + + if use_v2: + transforms += [T.Resize(size=(base_size, base_size))] + else: + transforms += [T.RandomResize(min_size=base_size, max_size=base_size)] + + if backend == "pil": + # Note: we could just convert to pure tensors even in v2? + transforms += [T.ToImage() if use_v2 else T.PILToTensor()] + + transforms += [ + T.ToDtype(torch.float, scale=True), + T.Normalize(mean=mean, std=std), + ] + if use_v2: + transforms += [T.ToPureTensor()] + + self.transforms = T.Compose(transforms) def __call__(self, img, target): return self.transforms(img, target) diff --git a/references/segmentation/train.py b/references/segmentation/train.py index bb57e65b801c5776ee9f2b950cc6c789c5f86da6..35ece7264a336a8ec1d97ce206bcf681b7a21027 100644 --- a/references/segmentation/train.py +++ b/references/segmentation/train.py @@ -14,24 +14,30 @@ from torch.optim.lr_scheduler import PolynomialLR from torchvision.transforms import functional as F, InterpolationMode -def get_dataset(dir_path, name, image_set, transform): +def get_dataset(args, is_train): def sbd(*args, **kwargs): + kwargs.pop("use_v2") return torchvision.datasets.SBDataset(*args, mode="segmentation", **kwargs) + def voc(*args, **kwargs): + kwargs.pop("use_v2") + return torchvision.datasets.VOCSegmentation(*args, **kwargs) + paths = { - "voc": (dir_path, torchvision.datasets.VOCSegmentation, 21), - "voc_aug": (dir_path, sbd, 21), - "coco": (dir_path, get_coco, 21), + "voc": (args.data_path, voc, 21), + "voc_aug": (args.data_path, sbd, 21), + "coco": (args.data_path, get_coco, 21), } - p, ds_fn, num_classes = paths[name] + p, ds_fn, num_classes = paths[args.dataset] - ds = ds_fn(p, image_set=image_set, transforms=transform) + image_set = "train" if is_train else "val" + ds = ds_fn(p, image_set=image_set, transforms=get_transform(is_train, args), use_v2=args.use_v2) return ds, num_classes -def get_transform(train, args): - if train: - return presets.SegmentationPresetTrain(base_size=520, crop_size=480) +def get_transform(is_train, args): + if is_train: 
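The v2 training preset above also converts each sample component to its own dtype in a single `ToDtype` call (float, scaled, for the image; int64 for the mask). A hedged sketch of that per-type conversion in isolation:

```python
import torch
from torchvision import tv_tensors
from torchvision.transforms import v2 as T

to_dtype = T.ToDtype(
    dtype={tv_tensors.Image: torch.float32, tv_tensors.Mask: torch.int64, "others": None},
    scale=True,  # scaling is applied to image-like inputs; masks are simply cast
)

image = tv_tensors.Image(torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8))
mask = tv_tensors.Mask(torch.randint(0, 21, (64, 64), dtype=torch.uint8))
image, mask = to_dtype(image, mask)

assert image.dtype == torch.float32 and image.max() <= 1.0
assert mask.dtype == torch.int64
```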
+ return presets.SegmentationPresetTrain(base_size=520, crop_size=480, backend=args.backend, use_v2=args.use_v2) elif args.weights and args.test_only: weights = torchvision.models.get_weight(args.weights) trans = weights.transforms() @@ -44,7 +50,7 @@ def get_transform(train, args): return preprocessing else: - return presets.SegmentationPresetEval(base_size=520) + return presets.SegmentationPresetEval(base_size=520, backend=args.backend, use_v2=args.use_v2) def criterion(inputs, target): @@ -120,6 +126,12 @@ def train_one_epoch(model, criterion, optimizer, data_loader, lr_scheduler, devi def main(args): + if args.backend.lower() != "pil" and not args.use_v2: + # TODO: Support tensor backend in V1? + raise ValueError("Use --use-v2 if you want to use the tv_tensor or tensor backend.") + if args.use_v2 and args.dataset != "coco": + raise ValueError("v2 is only support supported for coco dataset for now.") + if args.output_dir: utils.mkdir(args.output_dir) @@ -134,8 +146,8 @@ def main(args): else: torch.backends.cudnn.benchmark = True - dataset, num_classes = get_dataset(args.data_path, args.dataset, "train", get_transform(True, args)) - dataset_test, _ = get_dataset(args.data_path, args.dataset, "val", get_transform(False, args)) + dataset, num_classes = get_dataset(args, is_train=True) + dataset_test, _ = get_dataset(args, is_train=False) if args.distributed: train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) @@ -260,7 +272,7 @@ def get_args_parser(add_help=True): parser.add_argument("--data-path", default="/datasets01/COCO/022719/", type=str, help="dataset path") parser.add_argument("--dataset", default="coco", type=str, help="dataset name") parser.add_argument("--model", default="fcn_resnet101", type=str, help="model name") - parser.add_argument("--aux-loss", action="store_true", help="auxiliar loss") + parser.add_argument("--aux-loss", action="store_true", help="auxiliary loss") parser.add_argument("--device", default="cuda", type=str, help="device (Use cuda or cpu Default: cuda)") parser.add_argument( "-b", "--batch-size", default=8, type=int, help="images per gpu, the total batch size is $NGPU x batch_size" @@ -307,6 +319,8 @@ def get_args_parser(add_help=True): # Mixed precision training parameters parser.add_argument("--amp", action="store_true", help="Use torch.cuda.amp for mixed precision training") + parser.add_argument("--backend", default="PIL", type=str.lower, help="PIL or tensor - case insensitive") + parser.add_argument("--use-v2", action="store_true", help="Use V2 transforms") return parser diff --git a/references/segmentation/transforms.py b/references/segmentation/transforms.py index 518048db2faef36297c4a47c700d7235434fcc0b..6934b9f862ea62984c4e505e6544dac39dd1ab18 100644 --- a/references/segmentation/transforms.py +++ b/references/segmentation/transforms.py @@ -35,7 +35,7 @@ class RandomResize: def __call__(self, image, target): size = random.randint(self.min_size, self.max_size) - image = F.resize(image, size) + image = F.resize(image, size, antialias=True) target = F.resize(target, size, interpolation=T.InterpolationMode.NEAREST) return image, target @@ -81,11 +81,14 @@ class PILToTensor: return image, target -class ConvertImageDtype: - def __init__(self, dtype): +class ToDtype: + def __init__(self, dtype, scale=False): self.dtype = dtype + self.scale = scale def __call__(self, image, target): + if not self.scale: + return image.to(dtype=self.dtype), target image = F.convert_image_dtype(image, self.dtype) return image, target diff --git 
a/references/segmentation/utils.py b/references/segmentation/utils.py index 4ea24db83ed99dcc9b1edb6a646cadb6cfd07bb3..cb200f23d766b92440108cb05fe0fe093f89a08c 100644 --- a/references/segmentation/utils.py +++ b/references/segmentation/utils.py @@ -267,9 +267,9 @@ def init_distributed_mode(args): args.rank = int(os.environ["RANK"]) args.world_size = int(os.environ["WORLD_SIZE"]) args.gpu = int(os.environ["LOCAL_RANK"]) - elif "SLURM_PROCID" in os.environ: - args.rank = int(os.environ["SLURM_PROCID"]) - args.gpu = args.rank % torch.cuda.device_count() + # elif "SLURM_PROCID" in os.environ: + # args.rank = int(os.environ["SLURM_PROCID"]) + # args.gpu = args.rank % torch.cuda.device_count() elif hasattr(args, "rank"): pass else: diff --git a/references/segmentation/v2_extras.py b/references/segmentation/v2_extras.py new file mode 100644 index 0000000000000000000000000000000000000000..e1a8b53e02ba016a49e5c96f3ea5c70a87bb5c47 --- /dev/null +++ b/references/segmentation/v2_extras.py @@ -0,0 +1,83 @@ +"""This file only exists to be lazy-imported and avoid V2-related import warnings when just using V1.""" +import torch +from torchvision import tv_tensors +from torchvision.transforms import v2 + + +class PadIfSmaller(v2.Transform): + def __init__(self, size, fill=0): + super().__init__() + self.size = size + self.fill = v2._utils._setup_fill_arg(fill) + + def _get_params(self, sample): + _, height, width = v2._utils.query_chw(sample) + padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] + needs_padding = any(padding) + return dict(padding=padding, needs_padding=needs_padding) + + def _transform(self, inpt, params): + if not params["needs_padding"]: + return inpt + + fill = v2._utils._get_fill(self.fill, type(inpt)) + fill = v2._utils._convert_fill_arg(fill) + + return v2.functional.pad(inpt, padding=params["padding"], fill=fill) + + +class CocoDetectionToVOCSegmentation(v2.Transform): + """Turn samples from datasets.CocoDetection into the same format as VOCSegmentation. + + This is achieved in two steps: + + 1. COCO differentiates between 91 categories while VOC only supports 21, including background for both. Fortunately, + the COCO categories are a superset of the VOC ones and thus can be mapped. Instances of the 70 categories not + present in VOC are dropped and replaced by background. + 2. COCO only offers detection masks, i.e. a (N, H, W) bool-ish tensor, where the truthy values in each individual + mask denote the instance. However, a segmentation mask is a (H, W) integer tensor (typically torch.uint8), where + the value of each pixel denotes the category it belongs to. The detection masks are merged into one segmentation + mask while pixels that belong to multiple detection masks are marked as invalid. 
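Step 2 of the conversion described above boils down to a weighted max over the stacked instance masks, with pixels covered by more than one instance marked as invalid, as implemented by the helper further below. A tiny worked example (the labels are arbitrary VOC indices):

```python
import torch

instance_masks = torch.tensor(
    [[[1, 1, 0],
      [0, 1, 0],
      [0, 0, 0]],
     [[0, 1, 1],
      [0, 0, 1],
      [0, 0, 0]]],
    dtype=torch.uint8,
)                                                  # (N=2 instances, H=3, W=3)
labels = torch.tensor([5, 7], dtype=torch.uint8)   # hypothetical VOC class indices

segmentation, _ = (instance_masks * labels.reshape(-1, 1, 1)).max(dim=0)
segmentation[instance_masks.sum(dim=0) > 1] = 255  # overlapping instances -> invalid
# segmentation is now [[5, 255, 7], [0, 5, 7], [0, 0, 0]]
```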
+ """ + + COCO_TO_VOC_LABEL_MAP = dict( + zip( + [0, 5, 2, 16, 9, 44, 6, 3, 17, 62, 21, 67, 18, 19, 4, 1, 64, 20, 63, 7, 72], + range(21), + ) + ) + INVALID_VALUE = 255 + + def _coco_detection_masks_to_voc_segmentation_mask(self, target): + if "masks" not in target: + return None + + instance_masks, instance_labels_coco = target["masks"], target["labels"] + + valid_labels_voc = [ + (idx, label_voc) + for idx, label_coco in enumerate(instance_labels_coco.tolist()) + if (label_voc := self.COCO_TO_VOC_LABEL_MAP.get(label_coco)) is not None + ] + + if not valid_labels_voc: + return None + + valid_voc_category_idcs, instance_labels_voc = zip(*valid_labels_voc) + + instance_masks = instance_masks[list(valid_voc_category_idcs)].to(torch.uint8) + instance_labels_voc = torch.tensor(instance_labels_voc, dtype=torch.uint8) + + # Calling `.max()` on the stacked detection masks works fine to separate background from foreground as long as + # there is at most a single instance per pixel. Overlapping instances will be filtered out in the next step. + segmentation_mask, _ = (instance_masks * instance_labels_voc.reshape(-1, 1, 1)).max(dim=0) + segmentation_mask[instance_masks.sum(dim=0) > 1] = self.INVALID_VALUE + + return segmentation_mask + + def forward(self, image, target): + segmentation_mask = self._coco_detection_masks_to_voc_segmentation_mask(target) + if segmentation_mask is None: + segmentation_mask = torch.zeros(v2.functional.get_size(image), dtype=torch.uint8) + + return image, tv_tensors.Mask(segmentation_mask) diff --git a/references/similarity/sampler.py b/references/similarity/sampler.py index f4564eca33e22c2d1f6fea1cee8a49236497295e..fe6517418ab092f1b859bc5802268e774411c40b 100644 --- a/references/similarity/sampler.py +++ b/references/similarity/sampler.py @@ -48,7 +48,7 @@ class PKSampler(Sampler): # Ensures there are enough classes to sample from if len(self.groups) < p: - raise ValueError("There are not enought classes to sample from") + raise ValueError("There are not enough classes to sample from") def __iter__(self): # Shuffle samples within groups diff --git a/references/video_classification/README.md b/references/video_classification/README.md index cbd303275e5e82667b730c8e6a523159f2de3827..39c5d8f1bbaee7a6dcde7145f928b10b0f030616 100644 --- a/references/video_classification/README.md +++ b/references/video_classification/README.md @@ -76,7 +76,7 @@ Input data augmentations at validation time (with optional parameters): 5. Convert BCHW to CBHW This translates in the following set of command-line arguments. Please note that `--batch-size` parameter controls the -batch size per GPU. Moreover note that our default `--lr` is configured for 64 GPUs which is how many we used for the +batch size per GPU. 
Moreover, note that our default `--lr` is configured for 64 GPUs which is how many we used for the Video resnet models: ``` # number of frames per clip diff --git a/references/video_classification/presets.py b/references/video_classification/presets.py index ef77405225786883990d110c3ed726870f0eaa16..f73802c9666cca56411e8b1c7b2483719c578c31 100644 --- a/references/video_classification/presets.py +++ b/references/video_classification/presets.py @@ -15,7 +15,11 @@ class VideoClassificationPresetTrain: ): trans = [ transforms.ConvertImageDtype(torch.float32), - transforms.Resize(resize_size), + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the video models with antialias=True? + transforms.Resize(resize_size, antialias=False), ] if hflip_prob > 0: trans.append(transforms.RandomHorizontalFlip(hflip_prob)) @@ -31,7 +35,11 @@ class VideoClassificationPresetEval: self.transforms = transforms.Compose( [ transforms.ConvertImageDtype(torch.float32), - transforms.Resize(resize_size), + # We hard-code antialias=False to preserve results after we changed + # its default from None to True (see + # https://github.com/pytorch/vision/pull/7160) + # TODO: we could re-train the video models with antialias=True? + transforms.Resize(resize_size, antialias=False), transforms.Normalize(mean=mean, std=std), transforms.CenterCrop(crop_size), ConvertBCHWtoCBHW(), diff --git a/scripts/download_model_urls.py b/scripts/download_model_urls.py new file mode 100644 index 0000000000000000000000000000000000000000..f5f53d71e98f1c0c82d74bdb5b6cca122c4090c2 --- /dev/null +++ b/scripts/download_model_urls.py @@ -0,0 +1,41 @@ +import asyncio +import sys +from pathlib import Path +from time import perf_counter +from urllib.parse import urlsplit + +import aiofiles +import aiohttp +from torchvision import models +from tqdm.asyncio import tqdm + + +async def main(download_root): + download_root.mkdir(parents=True, exist_ok=True) + urls = {weight.url for name in models.list_models() for weight in iter(models.get_model_weights(name))} + + async with aiohttp.ClientSession(timeout=aiohttp.ClientTimeout(total=None)) as session: + await tqdm.gather(*[download(download_root, session, url) for url in urls]) + + +async def download(download_root, session, url): + response = await session.get(url, params=dict(source="ci")) + + assert response.ok + + file_name = Path(urlsplit(url).path).name + async with aiofiles.open(download_root / file_name, "wb") as f: + async for data in response.content.iter_any(): + await f.write(data) + + +if __name__ == "__main__": + download_root = ( + (Path(sys.argv[1]) if len(sys.argv) > 1 else Path("~/.cache/torch/hub/checkpoints")).expanduser().resolve() + ) + print(f"Downloading model weights to {download_root}") + start = perf_counter() + asyncio.get_event_loop().run_until_complete(main(download_root)) + stop = perf_counter() + minutes, seconds = divmod(stop - start, 60) + print(f"Download took {minutes:2.0f}m {seconds:2.0f}s") diff --git a/setup.cfg b/setup.cfg index f36195194cd058f3d95689eebc25d7b775acaabb..0f4ddbfab10c11315a9de75f7dcc35cf7ddeae52 100644 --- a/setup.cfg +++ b/setup.cfg @@ -2,7 +2,7 @@ universal=1 [metadata] -license_file = LICENSE +license_files = LICENSE [pep8] max-line-length = 120 @@ -10,7 +10,7 @@ max-line-length = 120 [flake8] # note: we ignore all 501s (line too long) anyway as they're taken care of by black max-line-length = 120 -ignore = E203, E402, 
W503, W504, F821, E501 +ignore = E203, E402, W503, W504, F821, E501, B, C4, EXE per-file-ignores = __init__.py: F401, F403, F405 ./hubconf.py: F401 diff --git a/setup.py b/setup.py index be1a29609a805c7a4f122e3ff8040bb330046e6b..f0aa3f4ab2a13a19270bb3cc2109cde6a1001210 100644 --- a/setup.py +++ b/setup.py @@ -86,7 +86,6 @@ if os.getenv("PYTORCH_VERSION"): pytorch_dep += "==" + os.getenv("PYTORCH_VERSION") requirements = [ - "typing_extensions", "numpy", "requests", pytorch_dep, @@ -166,10 +165,13 @@ def get_extensions(): + glob.glob(os.path.join(extensions_dir, "ops", "cpu", "*.cpp")) + glob.glob(os.path.join(extensions_dir, "ops", "quantized", "cpu", "*.cpp")) ) + source_mps = glob.glob(os.path.join(extensions_dir, "ops", "mps", "*.mm")) print("Compiling extensions with following flags:") force_cuda = os.getenv("FORCE_CUDA", "0") == "1" print(f" FORCE_CUDA: {force_cuda}") + force_mps = os.getenv("FORCE_MPS", "0") == "1" + print(f" FORCE_MPS: {force_mps}") debug_mode = os.getenv("DEBUG", "0") == "1" print(f" DEBUG: {debug_mode}") use_png = os.getenv("TORCHVISION_USE_PNG", "1") == "1" @@ -231,6 +233,8 @@ def get_extensions(): define_macros += [("WITH_HIP", None)] nvcc_flags = [] extra_compile_args["nvcc"] = nvcc_flags + elif torch.backends.mps.is_available() or force_mps: + sources += source_mps if sys.platform == "win32": define_macros += [("torchvision_EXPORTS", None)] @@ -247,6 +251,9 @@ def get_extensions(): extra_compile_args["nvcc"] = [f for f in nvcc_flags if not ("-O" in f or "-g" in f)] extra_compile_args["nvcc"].append("-O0") extra_compile_args["nvcc"].append("-g") + else: + print("Compiling with debug mode OFF") + extra_compile_args["cxx"].append("-g0") sources = [os.path.join(extensions_dir, s) for s in sources] @@ -327,6 +334,8 @@ def get_extensions(): use_jpeg = use_jpeg and jpeg_found if use_jpeg: print("Building torchvision with JPEG image support") + print(f" libjpeg include path: {jpeg_include}") + print(f" libjpeg lib path: {jpeg_lib}") image_link_flags.append("jpeg") if jpeg_conda: image_library += [jpeg_lib] @@ -352,11 +361,14 @@ def get_extensions(): image_macros += [("NVJPEG_FOUND", str(int(use_nvjpeg)))] image_path = os.path.join(extensions_dir, "io", "image") - image_src = ( - glob.glob(os.path.join(image_path, "*.cpp")) - + glob.glob(os.path.join(image_path, "cpu", "*.cpp")) - + glob.glob(os.path.join(image_path, "cuda", "*.cpp")) - ) + image_src = glob.glob(os.path.join(image_path, "*.cpp")) + glob.glob(os.path.join(image_path, "cpu", "*.cpp")) + + if is_rocm_pytorch: + image_src += glob.glob(os.path.join(image_path, "hip", "*.cpp")) + # we need to exclude this in favor of the hipified source + image_src.remove(os.path.join(image_path, "image.cpp")) + else: + image_src += glob.glob(os.path.join(image_path, "cuda", "*.cpp")) if use_png or use_jpeg: ext_modules.append( @@ -464,8 +476,8 @@ def get_extensions(): "swresample", "swscale", ], - extra_compile_args=["-std=c++14"] if os.name != "nt" else ["/std:c++14", "/MP"], - extra_link_args=["-std=c++14" if os.name != "nt" else "/std:c++14"], + extra_compile_args=["-std=c++17"] if os.name != "nt" else ["/std:c++17", "/MP"], + extra_link_args=["-std=c++17" if os.name != "nt" else "/std:c++17"], ) ) @@ -564,6 +576,7 @@ if __name__ == "__main__": url="https://github.com/pytorch/vision", description="image and video datasets and models for torch deep learning", long_description=readme, + long_description_content_type="text/markdown", license="BSD", # Package info packages=find_packages(exclude=("test",)), @@ -574,7 +587,7 
@@ if __name__ == "__main__": "scipy": ["scipy"], }, ext_modules=get_extensions(), - python_requires=">=3.7", + python_requires=">=3.8", cmdclass={ "build_ext": BuildExtension.with_options(no_python_abi_suffix=True), "clean": clean, diff --git a/test/assets/toosmall_png/heapbof.png b/test/assets/toosmall_png/heapbof.png new file mode 100644 index 0000000000000000000000000000000000000000..e720d1833423d20f7df5a5bab5411956ed01a879 Binary files /dev/null and b/test/assets/toosmall_png/heapbof.png differ diff --git a/test/builtin_dataset_mocks.py b/test/builtin_dataset_mocks.py new file mode 100644 index 0000000000000000000000000000000000000000..ef5d5e1ec9690f4731059628db485df8a04a33f0 --- /dev/null +++ b/test/builtin_dataset_mocks.py @@ -0,0 +1,1582 @@ +import bz2 +import collections.abc +import csv +import functools +import gzip +import io +import itertools +import json +import lzma +import pathlib +import pickle +import random +import shutil +import unittest.mock +import xml.etree.ElementTree as ET +from collections import Counter, defaultdict + +import numpy as np +import pytest +import torch +from common_utils import combinations_grid +from datasets_utils import create_image_file, create_image_folder, make_tar, make_zip +from torch.nn.functional import one_hot +from torch.testing import make_tensor as _make_tensor +from torchvision.prototype import datasets + +make_tensor = functools.partial(_make_tensor, device="cpu") +make_scalar = functools.partial(make_tensor, ()) + + +__all__ = ["DATASET_MOCKS", "parametrize_dataset_mocks"] + + +class DatasetMock: + def __init__(self, name, *, mock_data_fn, configs): + # FIXME: error handling for unknown names + self.name = name + self.mock_data_fn = mock_data_fn + self.configs = configs + + def _parse_mock_info(self, mock_info): + if mock_info is None: + raise pytest.UsageError( + f"The mock data function for dataset '{self.name}' returned nothing. It needs to at least return an " + f"integer indicating the number of samples for the current `config`." + ) + elif isinstance(mock_info, int): + mock_info = dict(num_samples=mock_info) + elif not isinstance(mock_info, dict): + raise pytest.UsageError( + f"The mock data function for dataset '{self.name}' returned a {type(mock_info)}. The returned object " + f"should be a dictionary containing at least the number of samples for the key `'num_samples'`. If no " + f"additional information is required for specific tests, the number of samples can also be returned as " + f"an integer." + ) + elif "num_samples" not in mock_info: + raise pytest.UsageError( + f"The dictionary returned by the mock data function for dataset '{self.name}' has to contain a " + f"`'num_samples'` entry indicating the number of samples." + ) + + return mock_info + + def load(self, config): + # `datasets.home()` is patched to a temporary directory through the autouse fixture `test_home` in + # test/test_prototype_builtin_datasets.py + root = pathlib.Path(datasets.home()) / self.name + # We cannot place the mock data upfront in `root`. Loading a dataset calls `OnlineResource.load`. In turn, + # this will only download **and** preprocess if the file is not present. In other words, if we already place + # the file in `root` before the resource is loaded, we are effectively skipping the preprocessing. + # To avoid that we first place the mock data in a temporary directory and patch the download logic to move it to + # `root` only when it is requested. 
+ tmp_mock_data_folder = root / "__mock__" + tmp_mock_data_folder.mkdir(parents=True) + + mock_info = self._parse_mock_info(self.mock_data_fn(tmp_mock_data_folder, config)) + + def patched_download(resource, root, **kwargs): + src = tmp_mock_data_folder / resource.file_name + if not src.exists(): + raise pytest.UsageError( + f"Dataset '{self.name}' requires the file {resource.file_name} for {config}" + f"but it was not created by the mock data function." + ) + + dst = root / resource.file_name + shutil.move(str(src), str(root)) + + return dst + + with unittest.mock.patch( + "torchvision.prototype.datasets.utils._resource.OnlineResource.download", new=patched_download + ): + dataset = datasets.load(self.name, **config) + + extra_files = list(tmp_mock_data_folder.glob("**/*")) + if extra_files: + raise pytest.UsageError( + ( + f"Dataset '{self.name}' created the following files for {config} in the mock data function, " + f"but they were not loaded:\n\n" + ) + + "\n".join(str(file.relative_to(tmp_mock_data_folder)) for file in extra_files) + ) + + tmp_mock_data_folder.rmdir() + + return dataset, mock_info + + +def config_id(name, config): + parts = [name] + for name, value in config.items(): + if isinstance(value, bool): + part = ("" if value else "no_") + name + else: + part = str(value) + parts.append(part) + return "-".join(parts) + + +def parametrize_dataset_mocks(*dataset_mocks, marks=None): + mocks = {} + for mock in dataset_mocks: + if isinstance(mock, DatasetMock): + mocks[mock.name] = mock + elif isinstance(mock, collections.abc.Mapping): + mocks.update(mock) + else: + raise pytest.UsageError( + f"The positional arguments passed to `parametrize_dataset_mocks` can either be a `DatasetMock`, " + f"a sequence of `DatasetMock`'s, or a mapping of names to `DatasetMock`'s, " + f"but got {mock} instead." 
+ ) + dataset_mocks = mocks + + if marks is None: + marks = {} + elif not isinstance(marks, collections.abc.Mapping): + raise pytest.UsageError() + + return pytest.mark.parametrize( + ("dataset_mock", "config"), + [ + pytest.param(dataset_mock, config, id=config_id(name, config), marks=marks.get(name, ())) + for name, dataset_mock in dataset_mocks.items() + for config in dataset_mock.configs + ], + ) + + +DATASET_MOCKS = {} + + +def register_mock(name=None, *, configs): + def wrapper(mock_data_fn): + nonlocal name + if name is None: + name = mock_data_fn.__name__ + DATASET_MOCKS[name] = DatasetMock(name, mock_data_fn=mock_data_fn, configs=configs) + + return mock_data_fn + + return wrapper + + +class MNISTMockData: + _DTYPES_ID = { + torch.uint8: 8, + torch.int8: 9, + torch.int16: 11, + torch.int32: 12, + torch.float32: 13, + torch.float64: 14, + } + + @classmethod + def _magic(cls, dtype, ndim): + return cls._DTYPES_ID[dtype] * 256 + ndim + 1 + + @staticmethod + def _encode(t): + return torch.tensor(t, dtype=torch.int32).numpy().tobytes()[::-1] + + @staticmethod + def _big_endian_dtype(dtype): + np_dtype = getattr(np, str(dtype).replace("torch.", ""))().dtype + return np.dtype(f">{np_dtype.kind}{np_dtype.itemsize}") + + @classmethod + def _create_binary_file(cls, root, filename, *, num_samples, shape, dtype, compressor, low=0, high): + with compressor(root / filename, "wb") as fh: + for meta in (cls._magic(dtype, len(shape)), num_samples, *shape): + fh.write(cls._encode(meta)) + + data = make_tensor((num_samples, *shape), dtype=dtype, low=low, high=high) + + fh.write(data.numpy().astype(cls._big_endian_dtype(dtype)).tobytes()) + + @classmethod + def generate( + cls, + root, + *, + num_categories, + num_samples=None, + images_file, + labels_file, + image_size=(28, 28), + image_dtype=torch.uint8, + label_size=(), + label_dtype=torch.uint8, + compressor=None, + ): + if num_samples is None: + num_samples = num_categories + if compressor is None: + compressor = gzip.open + + cls._create_binary_file( + root, + images_file, + num_samples=num_samples, + shape=image_size, + dtype=image_dtype, + compressor=compressor, + high=float("inf"), + ) + cls._create_binary_file( + root, + labels_file, + num_samples=num_samples, + shape=label_size, + dtype=label_dtype, + compressor=compressor, + high=num_categories, + ) + + return num_samples + + +def mnist(root, config): + prefix = "train" if config["split"] == "train" else "t10k" + return MNISTMockData.generate( + root, + num_categories=10, + images_file=f"{prefix}-images-idx3-ubyte.gz", + labels_file=f"{prefix}-labels-idx1-ubyte.gz", + ) + + +DATASET_MOCKS.update( + { + name: DatasetMock(name, mock_data_fn=mnist, configs=combinations_grid(split=("train", "test"))) + for name in ["mnist", "fashionmnist", "kmnist"] + } +) + + +@register_mock( + configs=combinations_grid( + split=("train", "test"), + image_set=("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST"), + ) +) +def emnist(root, config): + num_samples_map = {} + file_names = set() + for split, image_set in itertools.product( + ("train", "test"), + ("Balanced", "By_Merge", "By_Class", "Letters", "Digits", "MNIST"), + ): + prefix = f"emnist-{image_set.replace('_', '').lower()}-{split}" + images_file = f"{prefix}-images-idx3-ubyte.gz" + labels_file = f"{prefix}-labels-idx1-ubyte.gz" + file_names.update({images_file, labels_file}) + num_samples_map[(split, image_set)] = MNISTMockData.generate( + root, + # The image sets that merge some lower case letters in their respective upper case 
variant, still use dense + # labels in the data files. Thus, num_categories != len(categories) there. + num_categories=47 if config["image_set"] in ("Balanced", "By_Merge") else 62, + images_file=images_file, + labels_file=labels_file, + ) + + make_zip(root, "emnist-gzip.zip", *file_names) + + return num_samples_map[(config["split"], config["image_set"])] + + +@register_mock(configs=combinations_grid(split=("train", "test", "test10k", "test50k", "nist"))) +def qmnist(root, config): + num_categories = 10 + if config["split"] == "train": + num_samples = num_samples_gen = num_categories + 2 + prefix = "qmnist-train" + suffix = ".gz" + compressor = gzip.open + elif config["split"].startswith("test"): + # The split 'test50k' is defined as the last 50k images beginning at index 10000. Thus, we need to create + # more than 10000 images for the dataset to not be empty. + num_samples_gen = 10001 + num_samples = { + "test": num_samples_gen, + "test10k": min(num_samples_gen, 10_000), + "test50k": num_samples_gen - 10_000, + }[config["split"]] + prefix = "qmnist-test" + suffix = ".gz" + compressor = gzip.open + else: # config["split"] == "nist" + num_samples = num_samples_gen = num_categories + 3 + prefix = "xnist" + suffix = ".xz" + compressor = lzma.open + + MNISTMockData.generate( + root, + num_categories=num_categories, + num_samples=num_samples_gen, + images_file=f"{prefix}-images-idx3-ubyte{suffix}", + labels_file=f"{prefix}-labels-idx2-int{suffix}", + label_size=(8,), + label_dtype=torch.int32, + compressor=compressor, + ) + return num_samples + + +class CIFARMockData: + NUM_PIXELS = 32 * 32 * 3 + + @classmethod + def _create_batch_file(cls, root, name, *, num_categories, labels_key, num_samples=1): + content = { + "data": make_tensor((num_samples, cls.NUM_PIXELS), dtype=torch.uint8).numpy(), + labels_key: torch.randint(0, num_categories, size=(num_samples,)).tolist(), + } + with open(pathlib.Path(root) / name, "wb") as fh: + pickle.dump(content, fh) + + @classmethod + def generate( + cls, + root, + name, + *, + folder, + train_files, + test_files, + num_categories, + labels_key, + ): + folder = root / folder + folder.mkdir() + files = (*train_files, *test_files) + for file in files: + cls._create_batch_file( + folder, + file, + num_categories=num_categories, + labels_key=labels_key, + ) + + make_tar(root, name, folder, compression="gz") + + +@register_mock(configs=combinations_grid(split=("train", "test"))) +def cifar10(root, config): + train_files = [f"data_batch_{idx}" for idx in range(1, 6)] + test_files = ["test_batch"] + + CIFARMockData.generate( + root=root, + name="cifar-10-python.tar.gz", + folder=pathlib.Path("cifar-10-batches-py"), + train_files=train_files, + test_files=test_files, + num_categories=10, + labels_key="labels", + ) + + return len(train_files if config["split"] == "train" else test_files) + + +@register_mock(configs=combinations_grid(split=("train", "test"))) +def cifar100(root, config): + train_files = ["train"] + test_files = ["test"] + + CIFARMockData.generate( + root=root, + name="cifar-100-python.tar.gz", + folder=pathlib.Path("cifar-100-python"), + train_files=train_files, + test_files=test_files, + num_categories=100, + labels_key="fine_labels", + ) + + return len(train_files if config["split"] == "train" else test_files) + + +@register_mock(configs=[dict()]) +def caltech101(root, config): + def create_ann_file(root, name): + import scipy.io + + box_coord = make_tensor((1, 4), dtype=torch.int32, low=0).numpy().astype(np.uint16) + obj_contour = make_tensor((2, 
int(torch.randint(3, 6, size=()))), dtype=torch.float64, low=0).numpy() + + scipy.io.savemat(str(pathlib.Path(root) / name), dict(box_coord=box_coord, obj_contour=obj_contour)) + + def create_ann_folder(root, name, file_name_fn, num_examples): + root = pathlib.Path(root) / name + root.mkdir(parents=True) + + for idx in range(num_examples): + create_ann_file(root, file_name_fn(idx)) + + images_root = root / "101_ObjectCategories" + anns_root = root / "Annotations" + + image_category_map = { + "Faces": "Faces_2", + "Faces_easy": "Faces_3", + "Motorbikes": "Motorbikes_16", + "airplanes": "Airplanes_Side_2", + } + + categories = ["Faces", "Faces_easy", "Motorbikes", "airplanes", "yin_yang"] + + num_images_per_category = 2 + for category in categories: + create_image_folder( + root=images_root, + name=category, + file_name_fn=lambda idx: f"image_{idx + 1:04d}.jpg", + num_examples=num_images_per_category, + ) + create_ann_folder( + root=anns_root, + name=image_category_map.get(category, category), + file_name_fn=lambda idx: f"annotation_{idx + 1:04d}.mat", + num_examples=num_images_per_category, + ) + + (images_root / "BACKGROUND_Goodle").mkdir() + make_tar(root, f"{images_root.name}.tar.gz", images_root, compression="gz") + + make_tar(root, f"{anns_root.name}.tar", anns_root) + + return num_images_per_category * len(categories) + + +@register_mock(configs=[dict()]) +def caltech256(root, config): + dir = root / "256_ObjectCategories" + num_images_per_category = 2 + + categories = [ + (1, "ak47"), + (127, "laptop-101"), + (198, "spider"), + (257, "clutter"), + ] + + for category_idx, category in categories: + files = create_image_folder( + dir, + name=f"{category_idx:03d}.{category}", + file_name_fn=lambda image_idx: f"{category_idx:03d}_{image_idx + 1:04d}.jpg", + num_examples=num_images_per_category, + ) + if category == "spider": + open(files[0].parent / "RENAME2", "w").close() + + make_tar(root, f"{dir.name}.tar", dir) + + return num_images_per_category * len(categories) + + +@register_mock(configs=combinations_grid(split=("train", "val", "test"))) +def imagenet(root, config): + from scipy.io import savemat + + info = datasets.info("imagenet") + + if config["split"] == "train": + num_samples = len(info["wnids"]) + archive_name = "ILSVRC2012_img_train.tar" + + files = [] + for wnid in info["wnids"]: + create_image_folder( + root=root, + name=wnid, + file_name_fn=lambda image_idx: f"{wnid}_{image_idx:04d}.JPEG", + num_examples=1, + ) + files.append(make_tar(root, f"{wnid}.tar")) + elif config["split"] == "val": + num_samples = 3 + archive_name = "ILSVRC2012_img_val.tar" + files = [create_image_file(root, f"ILSVRC2012_val_{idx + 1:08d}.JPEG") for idx in range(num_samples)] + + devkit_root = root / "ILSVRC2012_devkit_t12" + data_root = devkit_root / "data" + data_root.mkdir(parents=True) + + with open(data_root / "ILSVRC2012_validation_ground_truth.txt", "w") as file: + for label in torch.randint(0, len(info["wnids"]), (num_samples,)).tolist(): + file.write(f"{label}\n") + + num_children = 0 + synsets = [ + (idx, wnid, category, "", num_children, [], 0, 0) + for idx, (category, wnid) in enumerate(zip(info["categories"], info["wnids"]), 1) + ] + num_children = 1 + synsets.extend((0, "", "", "", num_children, [], 0, 0) for _ in range(5)) + synsets = np.array( + synsets, + dtype=np.dtype( + [ + ("ILSVRC2012_ID", "O"), + ("WNID", "O"), + ("words", "O"), + ("gloss", "O"), + ("num_children", "O"), + ("children", "O"), + ("wordnet_height", "O"), + ("num_train_images", "O"), + ] + ), + ) + 
savemat(data_root / "meta.mat", dict(synsets=synsets)) + + make_tar(root, devkit_root.with_suffix(".tar.gz").name, compression="gz") + else: # config["split"] == "test" + num_samples = 5 + archive_name = "ILSVRC2012_img_test_v10102019.tar" + files = [create_image_file(root, f"ILSVRC2012_test_{idx + 1:08d}.JPEG") for idx in range(num_samples)] + + make_tar(root, archive_name, *files) + + return num_samples + + +class CocoMockData: + @classmethod + def _make_annotations_json( + cls, + root, + name, + *, + images_meta, + fn, + ): + num_anns_per_image = torch.randint(1, 5, (len(images_meta),)) + num_anns_total = int(num_anns_per_image.sum()) + ann_ids_iter = iter(torch.arange(num_anns_total)[torch.randperm(num_anns_total)]) + + anns_meta = [] + for image_meta, num_anns in zip(images_meta, num_anns_per_image): + for _ in range(num_anns): + ann_id = int(next(ann_ids_iter)) + anns_meta.append(dict(fn(ann_id, image_meta), id=ann_id, image_id=image_meta["id"])) + anns_meta.sort(key=lambda ann: ann["id"]) + + with open(root / name, "w") as file: + json.dump(dict(images=images_meta, annotations=anns_meta), file) + + return num_anns_per_image + + @staticmethod + def _make_instances_data(ann_id, image_meta): + def make_rle_segmentation(): + height, width = image_meta["height"], image_meta["width"] + numel = height * width + counts = [] + while sum(counts) <= numel: + counts.append(int(torch.randint(5, 8, ()))) + if sum(counts) > numel: + counts[-1] -= sum(counts) - numel + return dict(counts=counts, size=[height, width]) + + return dict( + segmentation=make_rle_segmentation(), + bbox=make_tensor((4,), dtype=torch.float32, low=0).tolist(), + iscrowd=True, + area=float(make_scalar(dtype=torch.float32)), + category_id=int(make_scalar(dtype=torch.int64)), + ) + + @staticmethod + def _make_captions_data(ann_id, image_meta): + return dict(caption=f"Caption {ann_id} describing image {image_meta['id']}.") + + @classmethod + def _make_annotations(cls, root, name, *, images_meta): + num_anns_per_image = torch.zeros((len(images_meta),), dtype=torch.int64) + for annotations, fn in ( + ("instances", cls._make_instances_data), + ("captions", cls._make_captions_data), + ): + num_anns_per_image += cls._make_annotations_json( + root, f"{annotations}_{name}.json", images_meta=images_meta, fn=fn + ) + + return int(num_anns_per_image.sum()) + + @classmethod + def generate( + cls, + root, + *, + split, + year, + num_samples, + ): + annotations_dir = root / "annotations" + annotations_dir.mkdir() + + for split_ in ("train", "val"): + config_name = f"{split_}{year}" + + images_meta = [ + dict( + file_name=f"{idx:012d}.jpg", + id=idx, + width=width, + height=height, + ) + for idx, (height, width) in enumerate( + torch.randint(3, 11, size=(num_samples, 2), dtype=torch.int).tolist() + ) + ] + + if split_ == split: + create_image_folder( + root, + config_name, + file_name_fn=lambda idx: images_meta[idx]["file_name"], + num_examples=num_samples, + size=lambda idx: (3, images_meta[idx]["height"], images_meta[idx]["width"]), + ) + make_zip(root, f"{config_name}.zip") + + cls._make_annotations( + annotations_dir, + config_name, + images_meta=images_meta, + ) + + make_zip(root, f"annotations_trainval{year}.zip", annotations_dir) + + return num_samples + + +@register_mock( + configs=combinations_grid( + split=("train", "val"), + year=("2017", "2014"), + annotations=("instances", "captions", None), + ) +) +def coco(root, config): + return CocoMockData.generate(root, split=config["split"], year=config["year"], num_samples=5) + + +class 
SBDMockData: + _NUM_CATEGORIES = 20 + + @classmethod + def _make_split_files(cls, root_map, *, split): + splits_and_idcs = [ + ("train", [0, 1, 2]), + ("val", [3]), + ] + if split == "train_noval": + splits_and_idcs.append(("train_noval", [0, 2])) + + ids_map = {split: [f"2008_{idx:06d}" for idx in idcs] for split, idcs in splits_and_idcs} + + for split, ids in ids_map.items(): + with open(root_map[split] / f"{split}.txt", "w") as fh: + fh.writelines(f"{id}\n" for id in ids) + + return sorted(set(itertools.chain(*ids_map.values()))), {split: len(ids) for split, ids in ids_map.items()} + + @classmethod + def _make_anns_folder(cls, root, name, ids): + from scipy.io import savemat + + anns_folder = root / name + anns_folder.mkdir() + + sizes = torch.randint(1, 9, size=(len(ids), 2)).tolist() + for id, size in zip(ids, sizes): + savemat( + anns_folder / f"{id}.mat", + { + "GTcls": { + "Boundaries": cls._make_boundaries(size), + "Segmentation": cls._make_segmentation(size), + } + }, + ) + return sizes + + @classmethod + def _make_boundaries(cls, size): + from scipy.sparse import csc_matrix + + return [ + [csc_matrix(torch.randint(0, 2, size=size, dtype=torch.uint8).numpy())] for _ in range(cls._NUM_CATEGORIES) + ] + + @classmethod + def _make_segmentation(cls, size): + return torch.randint(0, cls._NUM_CATEGORIES + 1, size=size, dtype=torch.uint8).numpy() + + @classmethod + def generate(cls, root, *, split): + archive_folder = root / "benchmark_RELEASE" + dataset_folder = archive_folder / "dataset" + dataset_folder.mkdir(parents=True, exist_ok=True) + + ids, num_samples_map = cls._make_split_files( + defaultdict(lambda: dataset_folder, {"train_noval": root}), split=split + ) + sizes = cls._make_anns_folder(dataset_folder, "cls", ids) + create_image_folder( + dataset_folder, "img", lambda idx: f"{ids[idx]}.jpg", num_examples=len(ids), size=lambda idx: sizes[idx] + ) + + make_tar(root, "benchmark.tgz", archive_folder, compression="gz") + + return num_samples_map[split] + + +@register_mock(configs=combinations_grid(split=("train", "val", "train_noval"))) +def sbd(root, config): + return SBDMockData.generate(root, split=config["split"]) + + +@register_mock(configs=[dict()]) +def semeion(root, config): + num_samples = 3 + num_categories = 10 + + images = torch.rand(num_samples, 256) + labels = one_hot(torch.randint(num_categories, size=(num_samples,)), num_classes=num_categories) + with open(root / "semeion.data", "w") as fh: + for image, one_hot_label in zip(images, labels): + image_columns = " ".join([f"{pixel.item():.4f}" for pixel in image]) + labels_columns = " ".join([str(label.item()) for label in one_hot_label]) + fh.write(f"{image_columns} {labels_columns} \n") + + return num_samples + + +class VOCMockData: + _TRAIN_VAL_FILE_NAMES = { + "2007": "VOCtrainval_06-Nov-2007.tar", + "2008": "VOCtrainval_14-Jul-2008.tar", + "2009": "VOCtrainval_11-May-2009.tar", + "2010": "VOCtrainval_03-May-2010.tar", + "2011": "VOCtrainval_25-May-2011.tar", + "2012": "VOCtrainval_11-May-2012.tar", + } + _TEST_FILE_NAMES = { + "2007": "VOCtest_06-Nov-2007.tar", + } + + @classmethod + def _make_split_files(cls, root, *, year, trainval): + split_folder = root / "ImageSets" + + if trainval: + idcs_map = { + "train": [0, 1, 2], + "val": [3, 4], + } + idcs_map["trainval"] = [*idcs_map["train"], *idcs_map["val"]] + else: + idcs_map = { + "test": [5], + } + ids_map = {split: [f"{year}_{idx:06d}" for idx in idcs] for split, idcs in idcs_map.items()} + + for task_sub_folder in ("Main", "Segmentation"): + task_folder = 
split_folder / task_sub_folder + task_folder.mkdir(parents=True, exist_ok=True) + for split, ids in ids_map.items(): + with open(task_folder / f"{split}.txt", "w") as fh: + fh.writelines(f"{id}\n" for id in ids) + + return sorted(set(itertools.chain(*ids_map.values()))), {split: len(ids) for split, ids in ids_map.items()} + + @classmethod + def _make_detection_anns_folder(cls, root, name, *, file_name_fn, num_examples): + folder = root / name + folder.mkdir(parents=True, exist_ok=True) + + for idx in range(num_examples): + cls._make_detection_ann_file(folder, file_name_fn(idx)) + + @classmethod + def _make_detection_ann_file(cls, root, name): + def add_child(parent, name, text=None): + child = ET.SubElement(parent, name) + child.text = str(text) + return child + + def add_name(obj, name="dog"): + add_child(obj, "name", name) + + def add_size(obj): + obj = add_child(obj, "size") + size = {"width": 0, "height": 0, "depth": 3} + for name, text in size.items(): + add_child(obj, name, text) + + def add_bndbox(obj): + obj = add_child(obj, "bndbox") + bndbox = {"xmin": 1, "xmax": 2, "ymin": 3, "ymax": 4} + for name, text in bndbox.items(): + add_child(obj, name, text) + + annotation = ET.Element("annotation") + add_size(annotation) + obj = add_child(annotation, "object") + add_name(obj) + add_bndbox(obj) + + with open(root / name, "wb") as fh: + fh.write(ET.tostring(annotation)) + + @classmethod + def generate(cls, root, *, year, trainval): + archive_folder = root + if year == "2011": + archive_folder = root / "TrainVal" + data_folder = archive_folder / "VOCdevkit" + else: + archive_folder = data_folder = root / "VOCdevkit" + data_folder = data_folder / f"VOC{year}" + data_folder.mkdir(parents=True, exist_ok=True) + + ids, num_samples_map = cls._make_split_files(data_folder, year=year, trainval=trainval) + for make_folder_fn, name, suffix in [ + (create_image_folder, "JPEGImages", ".jpg"), + (create_image_folder, "SegmentationClass", ".png"), + (cls._make_detection_anns_folder, "Annotations", ".xml"), + ]: + make_folder_fn(data_folder, name, file_name_fn=lambda idx: ids[idx] + suffix, num_examples=len(ids)) + make_tar(root, (cls._TRAIN_VAL_FILE_NAMES if trainval else cls._TEST_FILE_NAMES)[year], archive_folder) + + return num_samples_map + + +@register_mock( + configs=[ + *combinations_grid( + split=("train", "val", "trainval"), + year=("2007", "2008", "2009", "2010", "2011", "2012"), + task=("detection", "segmentation"), + ), + *combinations_grid( + split=("test",), + year=("2007",), + task=("detection", "segmentation"), + ), + ], +) +def voc(root, config): + trainval = config["split"] != "test" + return VOCMockData.generate(root, year=config["year"], trainval=trainval)[config["split"]] + + +class CelebAMockData: + @classmethod + def _make_ann_file(cls, root, name, data, *, field_names=None): + with open(root / name, "w") as file: + if field_names: + file.write(f"{len(data)}\r\n") + file.write(" ".join(field_names) + "\r\n") + file.writelines(" ".join(str(item) for item in row) + "\r\n" for row in data) + + _SPLIT_TO_IDX = { + "train": 0, + "val": 1, + "test": 2, + } + + @classmethod + def _make_split_file(cls, root): + num_samples_map = {"train": 4, "val": 3, "test": 2} + + data = [ + (f"{idx:06d}.jpg", cls._SPLIT_TO_IDX[split]) + for split, num_samples in num_samples_map.items() + for idx in range(num_samples) + ] + cls._make_ann_file(root, "list_eval_partition.txt", data) + + image_file_names, _ = zip(*data) + return image_file_names, num_samples_map + + @classmethod + def 
_make_identity_file(cls, root, image_file_names): + cls._make_ann_file( + root, "identity_CelebA.txt", [(name, int(make_scalar(low=1, dtype=torch.int))) for name in image_file_names] + ) + + @classmethod + def _make_attributes_file(cls, root, image_file_names): + field_names = ("5_o_Clock_Shadow", "Young") + data = [ + [name, *[" 1" if attr else "-1" for attr in make_tensor((len(field_names),), dtype=torch.bool)]] + for name in image_file_names + ] + cls._make_ann_file(root, "list_attr_celeba.txt", data, field_names=(*field_names, "")) + + @classmethod + def _make_bounding_boxes_file(cls, root, image_file_names): + field_names = ("image_id", "x_1", "y_1", "width", "height") + data = [ + [f"{name} ", *[f"{coord:3d}" for coord in make_tensor((4,), low=0, dtype=torch.int).tolist()]] + for name in image_file_names + ] + cls._make_ann_file(root, "list_bbox_celeba.txt", data, field_names=field_names) + + @classmethod + def _make_landmarks_file(cls, root, image_file_names): + field_names = ("lefteye_x", "lefteye_y", "rightmouth_x", "rightmouth_y") + data = [ + [ + name, + *[ + f"{coord:4d}" if idx else coord + for idx, coord in enumerate(make_tensor((len(field_names),), low=0, dtype=torch.int).tolist()) + ], + ] + for name in image_file_names + ] + cls._make_ann_file(root, "list_landmarks_align_celeba.txt", data, field_names=field_names) + + @classmethod + def generate(cls, root): + image_file_names, num_samples_map = cls._make_split_file(root) + + image_files = create_image_folder( + root, "img_align_celeba", file_name_fn=lambda idx: image_file_names[idx], num_examples=len(image_file_names) + ) + make_zip(root, image_files[0].parent.with_suffix(".zip").name) + + for make_ann_file_fn in ( + cls._make_identity_file, + cls._make_attributes_file, + cls._make_bounding_boxes_file, + cls._make_landmarks_file, + ): + make_ann_file_fn(root, image_file_names) + + return num_samples_map + + +@register_mock(configs=combinations_grid(split=("train", "val", "test"))) +def celeba(root, config): + return CelebAMockData.generate(root)[config["split"]] + + +@register_mock(configs=combinations_grid(split=("train", "val", "test"))) +def country211(root, config): + split_folder = pathlib.Path(root, "country211", "valid" if config["split"] == "val" else config["split"]) + split_folder.mkdir(parents=True, exist_ok=True) + + num_examples = { + "train": 3, + "val": 4, + "test": 5, + }[config["split"]] + + classes = ("AD", "BS", "GR") + for cls in classes: + create_image_folder( + split_folder, + name=cls, + file_name_fn=lambda idx: f"{idx}.jpg", + num_examples=num_examples, + ) + make_tar(root, f"{split_folder.parent.name}.tgz", split_folder.parent, compression="gz") + return num_examples * len(classes) + + +@register_mock(configs=combinations_grid(split=("train", "test"))) +def food101(root, config): + data_folder = root / "food-101" + + num_images_per_class = 3 + image_folder = data_folder / "images" + categories = ["apple_pie", "baby_back_ribs", "waffles"] + image_ids = [] + for category in categories: + image_files = create_image_folder( + image_folder, + category, + file_name_fn=lambda idx: f"{idx:04d}.jpg", + num_examples=num_images_per_class, + ) + image_ids.extend(path.relative_to(path.parents[1]).with_suffix("").as_posix() for path in image_files) + + meta_folder = data_folder / "meta" + meta_folder.mkdir() + + with open(meta_folder / "classes.txt", "w") as file: + for category in categories: + file.write(f"{category}\n") + + splits = ["train", "test"] + num_samples_map = {} + for offset, split in 
enumerate(splits): + image_ids_in_split = image_ids[offset :: len(splits)] + num_samples_map[split] = len(image_ids_in_split) + with open(meta_folder / f"{split}.txt", "w") as file: + for image_id in image_ids_in_split: + file.write(f"{image_id}\n") + + make_tar(root, f"{data_folder.name}.tar.gz", compression="gz") + + return num_samples_map[config["split"]] + + +@register_mock(configs=combinations_grid(split=("train", "val", "test"), fold=(1, 4, 10))) +def dtd(root, config): + data_folder = root / "dtd" + + num_images_per_class = 3 + image_folder = data_folder / "images" + categories = {"banded", "marbled", "zigzagged"} + image_ids_per_category = { + category: [ + str(path.relative_to(path.parents[1]).as_posix()) + for path in create_image_folder( + image_folder, + category, + file_name_fn=lambda idx: f"{category}_{idx:04d}.jpg", + num_examples=num_images_per_class, + ) + ] + for category in categories + } + + meta_folder = data_folder / "labels" + meta_folder.mkdir() + + with open(meta_folder / "labels_joint_anno.txt", "w") as file: + for cls, image_ids in image_ids_per_category.items(): + for image_id in image_ids: + joint_categories = random.choices( + list(categories - {cls}), k=int(torch.randint(len(categories) - 1, ())) + ) + file.write(" ".join([image_id, *sorted([cls, *joint_categories])]) + "\n") + + image_ids = list(itertools.chain(*image_ids_per_category.values())) + splits = ("train", "val", "test") + num_samples_map = {} + for fold in range(1, 11): + random.shuffle(image_ids) + for offset, split in enumerate(splits): + image_ids_in_config = image_ids[offset :: len(splits)] + with open(meta_folder / f"{split}{fold}.txt", "w") as file: + file.write("\n".join(image_ids_in_config) + "\n") + + num_samples_map[(split, fold)] = len(image_ids_in_config) + + make_tar(root, "dtd-r1.0.1.tar.gz", data_folder, compression="gz") + + return num_samples_map[config["split"], config["fold"]] + + +@register_mock(configs=combinations_grid(split=("train", "test"))) +def fer2013(root, config): + split = config["split"] + num_samples = 5 if split == "train" else 3 + + path = root / f"{split}.csv" + with open(path, "w", newline="") as file: + field_names = ["emotion"] if split == "train" else [] + field_names.append("pixels") + + file.write(",".join(field_names) + "\n") + + writer = csv.DictWriter(file, fieldnames=field_names, quotechar='"', quoting=csv.QUOTE_NONNUMERIC) + for _ in range(num_samples): + rowdict = { + "pixels": " ".join([str(int(pixel)) for pixel in torch.randint(256, (48 * 48,), dtype=torch.uint8)]) + } + if split == "train": + rowdict["emotion"] = int(torch.randint(7, ())) + writer.writerow(rowdict) + + make_zip(root, f"{path.name}.zip", path) + + return num_samples + + +@register_mock(configs=combinations_grid(split=("train", "test"))) +def gtsrb(root, config): + num_examples_per_class = 5 if config["split"] == "train" else 3 + classes = ("00000", "00042", "00012") + num_examples = num_examples_per_class * len(classes) + + csv_columns = ["Filename", "Width", "Height", "Roi.X1", "Roi.Y1", "Roi.X2", "Roi.Y2", "ClassId"] + + def _make_ann_file(path, num_examples, class_idx): + if class_idx == "random": + class_idx = torch.randint(1, len(classes) + 1, size=(1,)).item() + + with open(path, "w") as csv_file: + writer = csv.DictWriter(csv_file, fieldnames=csv_columns, delimiter=";") + writer.writeheader() + for image_idx in range(num_examples): + writer.writerow( + { + "Filename": f"{image_idx:05d}.ppm", + "Width": torch.randint(1, 100, size=()).item(), + "Height": torch.randint(1, 100, 
size=()).item(), + "Roi.X1": torch.randint(1, 100, size=()).item(), + "Roi.Y1": torch.randint(1, 100, size=()).item(), + "Roi.X2": torch.randint(1, 100, size=()).item(), + "Roi.Y2": torch.randint(1, 100, size=()).item(), + "ClassId": class_idx, + } + ) + + archive_folder = root / "GTSRB" + + if config["split"] == "train": + train_folder = archive_folder / "Training" + train_folder.mkdir(parents=True) + + for class_idx in classes: + create_image_folder( + train_folder, + name=class_idx, + file_name_fn=lambda image_idx: f"{class_idx}_{image_idx:05d}.ppm", + num_examples=num_examples_per_class, + ) + _make_ann_file( + path=train_folder / class_idx / f"GT-{class_idx}.csv", + num_examples=num_examples_per_class, + class_idx=int(class_idx), + ) + make_zip(root, "GTSRB-Training_fixed.zip", archive_folder) + else: + test_folder = archive_folder / "Final_Test" + test_folder.mkdir(parents=True) + + create_image_folder( + test_folder, + name="Images", + file_name_fn=lambda image_idx: f"{image_idx:05d}.ppm", + num_examples=num_examples, + ) + + make_zip(root, "GTSRB_Final_Test_Images.zip", archive_folder) + + _make_ann_file( + path=root / "GT-final_test.csv", + num_examples=num_examples, + class_idx="random", + ) + + make_zip(root, "GTSRB_Final_Test_GT.zip", "GT-final_test.csv") + + return num_examples + + +@register_mock(configs=combinations_grid(split=("train", "val", "test"))) +def clevr(root, config): + data_folder = root / "CLEVR_v1.0" + + num_samples_map = { + "train": 3, + "val": 2, + "test": 1, + } + + images_folder = data_folder / "images" + image_files = { + split: create_image_folder( + images_folder, + split, + file_name_fn=lambda idx: f"CLEVR_{split}_{idx:06d}.jpg", + num_examples=num_samples, + ) + for split, num_samples in num_samples_map.items() + } + + scenes_folder = data_folder / "scenes" + scenes_folder.mkdir() + for split in ["train", "val"]: + with open(scenes_folder / f"CLEVR_{split}_scenes.json", "w") as file: + json.dump( + { + "scenes": [ + { + "image_filename": image_file.name, + # We currently only return the number of objects in a scene. + # Thus, it is sufficient for now to only mock the number of elements. 
+ "objects": [None] * int(torch.randint(1, 5, ())), + } + for image_file in image_files[split] + ] + }, + file, + ) + + make_zip(root, f"{data_folder.name}.zip", data_folder) + + return num_samples_map[config["split"]] + + +class OxfordIIITPetMockData: + @classmethod + def _meta_to_split_and_classification_ann(cls, meta, idx): + image_id = "_".join( + [ + *[(str.title if meta["species"] == "cat" else str.lower)(part) for part in meta["cls"].split()], + str(idx), + ] + ) + class_id = str(meta["label"] + 1) + species = "1" if meta["species"] == "cat" else "2" + breed_id = "-1" + return (image_id, class_id, species, breed_id) + + @classmethod + def generate(self, root): + classification_anns_meta = ( + dict(cls="Abyssinian", label=0, species="cat"), + dict(cls="Keeshond", label=18, species="dog"), + dict(cls="Yorkshire Terrier", label=36, species="dog"), + ) + split_and_classification_anns = [ + self._meta_to_split_and_classification_ann(meta, idx) + for meta, idx in itertools.product(classification_anns_meta, (1, 2, 10)) + ] + image_ids, *_ = zip(*split_and_classification_anns) + + image_files = create_image_folder( + root, "images", file_name_fn=lambda idx: f"{image_ids[idx]}.jpg", num_examples=len(image_ids) + ) + + anns_folder = root / "annotations" + anns_folder.mkdir() + random.shuffle(split_and_classification_anns) + splits = ("trainval", "test") + num_samples_map = {} + for offset, split in enumerate(splits): + split_and_classification_anns_in_split = split_and_classification_anns[offset :: len(splits)] + with open(anns_folder / f"{split}.txt", "w") as file: + writer = csv.writer(file, delimiter=" ") + for split_and_classification_ann in split_and_classification_anns_in_split: + writer.writerow(split_and_classification_ann) + + num_samples_map[split] = len(split_and_classification_anns_in_split) + + segmentation_files = create_image_folder( + anns_folder, "trimaps", file_name_fn=lambda idx: f"{image_ids[idx]}.png", num_examples=len(image_ids) + ) + + # The dataset has some rogue files + for path in image_files[:3]: + path.with_suffix(".mat").touch() + for path in segmentation_files: + path.with_name(f".{path.name}").touch() + + make_tar(root, "images.tar.gz", compression="gz") + make_tar(root, anns_folder.with_suffix(".tar.gz").name, compression="gz") + + return num_samples_map + + +@register_mock(name="oxford-iiit-pet", configs=combinations_grid(split=("trainval", "test"))) +def oxford_iiit_pet(root, config): + return OxfordIIITPetMockData.generate(root)[config["split"]] + + +class _CUB200MockData: + @classmethod + def _category_folder(cls, category, idx): + return f"{idx:03d}.{category}" + + @classmethod + def _file_stem(cls, category, idx): + return f"{category}_{idx:04d}" + + @classmethod + def _make_images(cls, images_folder): + image_files = [] + for category_idx, category in [ + (1, "Black_footed_Albatross"), + (100, "Brown_Pelican"), + (200, "Common_Yellowthroat"), + ]: + image_files.extend( + create_image_folder( + images_folder, + cls._category_folder(category, category_idx), + lambda image_idx: f"{cls._file_stem(category, image_idx)}.jpg", + num_examples=5, + ) + ) + + return image_files + + +class CUB2002011MockData(_CUB200MockData): + @classmethod + def _make_archive(cls, root): + archive_folder = root / "CUB_200_2011" + + images_folder = archive_folder / "images" + image_files = cls._make_images(images_folder) + image_ids = list(range(1, len(image_files) + 1)) + + with open(archive_folder / "images.txt", "w") as file: + file.write( + "\n".join( + f"{id} 
{path.relative_to(images_folder).as_posix()}" for id, path in zip(image_ids, image_files) + ) + ) + + split_ids = torch.randint(2, (len(image_ids),)).tolist() + counts = Counter(split_ids) + num_samples_map = {"train": counts[1], "test": counts[0]} + with open(archive_folder / "train_test_split.txt", "w") as file: + file.write("\n".join(f"{image_id} {split_id}" for image_id, split_id in zip(image_ids, split_ids))) + + with open(archive_folder / "bounding_boxes.txt", "w") as file: + file.write( + "\n".join( + " ".join( + str(item) + for item in [image_id, *make_tensor((4,), dtype=torch.int, low=0).to(torch.float).tolist()] + ) + for image_id in image_ids + ) + ) + + make_tar(root, archive_folder.with_suffix(".tgz").name, compression="gz") + + return image_files, num_samples_map + + @classmethod + def _make_segmentations(cls, root, image_files): + segmentations_folder = root / "segmentations" + for image_file in image_files: + folder = segmentations_folder.joinpath(image_file.relative_to(image_file.parents[1])) + folder.mkdir(exist_ok=True, parents=True) + create_image_file( + folder, + image_file.with_suffix(".png").name, + size=[1, *make_tensor((2,), low=3, dtype=torch.int).tolist()], + ) + + make_tar(root, segmentations_folder.with_suffix(".tgz").name, compression="gz") + + @classmethod + def generate(cls, root): + image_files, num_samples_map = cls._make_archive(root) + cls._make_segmentations(root, image_files) + return num_samples_map + + +class CUB2002010MockData(_CUB200MockData): + @classmethod + def _make_hidden_rouge_file(cls, *files): + for file in files: + (file.parent / f"._{file.name}").touch() + + @classmethod + def _make_splits(cls, root, image_files): + split_folder = root / "lists" + split_folder.mkdir() + random.shuffle(image_files) + splits = ("train", "test") + num_samples_map = {} + for offset, split in enumerate(splits): + image_files_in_split = image_files[offset :: len(splits)] + + split_file = split_folder / f"{split}.txt" + with open(split_file, "w") as file: + file.write( + "\n".join( + sorted( + str(image_file.relative_to(image_file.parents[1]).as_posix()) + for image_file in image_files_in_split + ) + ) + ) + + cls._make_hidden_rouge_file(split_file) + num_samples_map[split] = len(image_files_in_split) + + make_tar(root, split_folder.with_suffix(".tgz").name, compression="gz") + + return num_samples_map + + @classmethod + def _make_anns(cls, root, image_files): + from scipy.io import savemat + + anns_folder = root / "annotations-mat" + for image_file in image_files: + ann_file = anns_folder / image_file.with_suffix(".mat").relative_to(image_file.parents[1]) + ann_file.parent.mkdir(parents=True, exist_ok=True) + + savemat( + ann_file, + { + "seg": torch.randint( + 256, make_tensor((2,), low=3, dtype=torch.int).tolist(), dtype=torch.uint8 + ).numpy(), + "bbox": dict( + zip(("left", "top", "right", "bottom"), make_tensor((4,), dtype=torch.uint8).tolist()) + ), + }, + ) + + readme_file = anns_folder / "README.txt" + readme_file.touch() + cls._make_hidden_rouge_file(readme_file) + + make_tar(root, "annotations.tgz", anns_folder, compression="gz") + + @classmethod + def generate(cls, root): + images_folder = root / "images" + image_files = cls._make_images(images_folder) + cls._make_hidden_rouge_file(*image_files) + make_tar(root, images_folder.with_suffix(".tgz").name, compression="gz") + + num_samples_map = cls._make_splits(root, image_files) + cls._make_anns(root, image_files) + + return num_samples_map + + +@register_mock(configs=combinations_grid(split=("train", 
"test"), year=("2010", "2011"))) +def cub200(root, config): + num_samples_map = (CUB2002011MockData if config["year"] == "2011" else CUB2002010MockData).generate(root) + return num_samples_map[config["split"]] + + +@register_mock(configs=[dict()]) +def eurosat(root, config): + data_folder = root / "2750" + data_folder.mkdir(parents=True) + + num_examples_per_class = 3 + categories = ["AnnualCrop", "Forest"] + for category in categories: + create_image_folder( + root=data_folder, + name=category, + file_name_fn=lambda idx: f"{category}_{idx + 1}.jpg", + num_examples=num_examples_per_class, + ) + make_zip(root, "EuroSAT.zip", data_folder) + return len(categories) * num_examples_per_class + + +@register_mock(configs=combinations_grid(split=("train", "test", "extra"))) +def svhn(root, config): + import scipy.io as sio + + num_samples = { + "train": 2, + "test": 3, + "extra": 4, + }[config["split"]] + + sio.savemat( + root / f"{config['split']}_32x32.mat", + { + "X": np.random.randint(256, size=(32, 32, 3, num_samples), dtype=np.uint8), + "y": np.random.randint(10, size=(num_samples,), dtype=np.uint8), + }, + ) + return num_samples + + +@register_mock(configs=combinations_grid(split=("train", "val", "test"))) +def pcam(root, config): + import h5py + + num_images = {"train": 2, "test": 3, "val": 4}[config["split"]] + + split = "valid" if config["split"] == "val" else config["split"] + + images_io = io.BytesIO() + with h5py.File(images_io, "w") as f: + f["x"] = np.random.randint(0, 256, size=(num_images, 10, 10, 3), dtype=np.uint8) + + targets_io = io.BytesIO() + with h5py.File(targets_io, "w") as f: + f["y"] = np.random.randint(0, 2, size=(num_images, 1, 1, 1), dtype=np.uint8) + + # Create .gz compressed files + images_file = root / f"camelyonpatch_level_2_split_{split}_x.h5.gz" + targets_file = root / f"camelyonpatch_level_2_split_{split}_y.h5.gz" + for compressed_file_name, uncompressed_file_io in ((images_file, images_io), (targets_file, targets_io)): + compressed_data = gzip.compress(uncompressed_file_io.getbuffer()) + with open(compressed_file_name, "wb") as compressed_file: + compressed_file.write(compressed_data) + + return num_images + + +@register_mock(name="stanford-cars", configs=combinations_grid(split=("train", "test"))) +def stanford_cars(root, config): + import scipy.io as io + from numpy.core.records import fromarrays + + split = config["split"] + num_samples = {"train": 5, "test": 7}[split] + num_categories = 3 + + if split == "train": + images_folder_name = "cars_train" + devkit = root / "devkit" + devkit.mkdir() + annotations_mat_path = devkit / "cars_train_annos.mat" + else: + images_folder_name = "cars_test" + annotations_mat_path = root / "cars_test_annos_withlabels.mat" + + create_image_folder( + root=root, + name=images_folder_name, + file_name_fn=lambda image_index: f"{image_index:5d}.jpg", + num_examples=num_samples, + ) + + make_tar(root, f"cars_{split}.tgz", images_folder_name) + bbox = np.random.randint(1, 200, num_samples, dtype=np.uint8) + classes = np.random.randint(1, num_categories + 1, num_samples, dtype=np.uint8) + fnames = [f"{i:5d}.jpg" for i in range(num_samples)] + rec_array = fromarrays( + [bbox, bbox, bbox, bbox, classes, fnames], + names=["bbox_x1", "bbox_y1", "bbox_x2", "bbox_y2", "class", "fname"], + ) + + io.savemat(annotations_mat_path, {"annotations": rec_array}) + if split == "train": + make_tar(root, "car_devkit.tgz", devkit, compression="gz") + + return num_samples + + +@register_mock(configs=combinations_grid(split=("train", "test"))) +def 
usps(root, config): + num_samples = {"train": 15, "test": 7}[config["split"]] + + with bz2.open(root / f"usps{'.t' if not config['split'] == 'train' else ''}.bz2", "wb") as fh: + lines = [] + for _ in range(num_samples): + label = make_tensor(1, low=1, high=11, dtype=torch.int) + values = make_tensor(256, low=-1, high=1, dtype=torch.float) + lines.append( + " ".join([f"{int(label)}", *(f"{idx}:{float(value):.6f}" for idx, value in enumerate(values, 1))]) + ) + + fh.write("\n".join(lines).encode()) + + return num_samples diff --git a/test/common_extended_utils.py b/test/common_extended_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..a34e15629bba78961accaddc91be82e1ec08386c --- /dev/null +++ b/test/common_extended_utils.py @@ -0,0 +1,310 @@ +import os +from collections import defaultdict +from numbers import Number +from typing import Any, List + +import torch +from torch.utils._python_dispatch import TorchDispatchMode + +from torch.utils._pytree import tree_map + +from torchvision.models._api import Weights + +aten = torch.ops.aten +quantized = torch.ops.quantized + + +def get_shape(i): + if isinstance(i, torch.Tensor): + return i.shape + elif hasattr(i, "weight"): + return i.weight().shape + else: + raise ValueError(f"Unknown type {type(i)}") + + +def prod(x): + res = 1 + for i in x: + res *= i + return res + + +def matmul_flop(inputs: List[Any], outputs: List[Any]) -> Number: + """ + Count flops for matmul. + """ + # Inputs should be a list of length 2. + # Inputs contains the shapes of two matrices. + input_shapes = [get_shape(v) for v in inputs] + assert len(input_shapes) == 2, input_shapes + assert input_shapes[0][-1] == input_shapes[1][-2], input_shapes + flop = prod(input_shapes[0]) * input_shapes[-1][-1] + return flop + + +def addmm_flop(inputs: List[Any], outputs: List[Any]) -> Number: + """ + Count flops for fully connected layers. + """ + # Count flop for nn.Linear + # inputs is a list of length 3. + input_shapes = [get_shape(v) for v in inputs[1:3]] + # input_shapes[0]: [batch size, input feature dimension] + # input_shapes[1]: [batch size, output feature dimension] + assert len(input_shapes[0]) == 2, input_shapes[0] + assert len(input_shapes[1]) == 2, input_shapes[1] + batch_size, input_dim = input_shapes[0] + output_dim = input_shapes[1][1] + flops = batch_size * input_dim * output_dim + return flops + + +def bmm_flop(inputs: List[Any], outputs: List[Any]) -> Number: + """ + Count flops for the bmm operation. + """ + # Inputs should be a list of length 2. + # Inputs contains the shapes of two tensor. + assert len(inputs) == 2, len(inputs) + input_shapes = [get_shape(v) for v in inputs] + n, c, t = input_shapes[0] + d = input_shapes[-1][-1] + flop = n * c * t * d + return flop + + +def conv_flop_count( + x_shape: List[int], + w_shape: List[int], + out_shape: List[int], + transposed: bool = False, +) -> Number: + """ + Count flops for convolution. Note only multiplication is + counted. Computation for addition and bias is ignored. + Flops for a transposed convolution are calculated as + flops = (x_shape[2:] * prod(w_shape) * batch_size). + Args: + x_shape (list(int)): The input shape before convolution. + w_shape (list(int)): The filter shape. + out_shape (list(int)): The output shape after convolution. 
+ transposed (bool): is the convolution transposed + Returns: + int: the number of flops + """ + batch_size = x_shape[0] + conv_shape = (x_shape if transposed else out_shape)[2:] + flop = batch_size * prod(w_shape) * prod(conv_shape) + return flop + + +def conv_flop(inputs: List[Any], outputs: List[Any]): + """ + Count flops for convolution. + """ + x, w = inputs[:2] + x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), get_shape(outputs[0])) + transposed = inputs[6] + + return conv_flop_count(x_shape, w_shape, out_shape, transposed=transposed) + + +def quant_conv_flop(inputs: List[Any], outputs: List[Any]): + """ + Count flops for quantized convolution. + """ + x, w = inputs[:2] + x_shape, w_shape, out_shape = (get_shape(x), get_shape(w), get_shape(outputs[0])) + + return conv_flop_count(x_shape, w_shape, out_shape, transposed=False) + + +def transpose_shape(shape): + return [shape[1], shape[0]] + list(shape[2:]) + + +def conv_backward_flop(inputs: List[Any], outputs: List[Any]): + grad_out_shape, x_shape, w_shape = [get_shape(i) for i in inputs[:3]] + output_mask = inputs[-1] + fwd_transposed = inputs[7] + flop_count = 0 + + if output_mask[0]: + grad_input_shape = get_shape(outputs[0]) + flop_count += conv_flop_count(grad_out_shape, w_shape, grad_input_shape, not fwd_transposed) + if output_mask[1]: + grad_weight_shape = get_shape(outputs[1]) + flop_count += conv_flop_count(transpose_shape(x_shape), grad_out_shape, grad_weight_shape, fwd_transposed) + + return flop_count + + +def scaled_dot_product_flash_attention_flop(inputs: List[Any], outputs: List[Any]): + # FIXME: this needs to count the flops of this kernel + # https://github.com/pytorch/pytorch/blob/207b06d099def9d9476176a1842e88636c1f714f/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp#L52-L267 + return 0 + + +flop_mapping = { + aten.mm: matmul_flop, + aten.matmul: matmul_flop, + aten.addmm: addmm_flop, + aten.bmm: bmm_flop, + aten.convolution: conv_flop, + aten._convolution: conv_flop, + aten.convolution_backward: conv_backward_flop, + quantized.conv2d: quant_conv_flop, + quantized.conv2d_relu: quant_conv_flop, + aten._scaled_dot_product_flash_attention: scaled_dot_product_flash_attention_flop, +} + +unmapped_ops = set() + + +def normalize_tuple(x): + if not isinstance(x, tuple): + return (x,) + return x + + +class FlopCounterMode(TorchDispatchMode): + def __init__(self, model=None): + self.flop_counts = defaultdict(lambda: defaultdict(int)) + self.parents = ["Global"] + # global mod + if model is not None: + for name, module in dict(model.named_children()).items(): + module.register_forward_pre_hook(self.enter_module(name)) + module.register_forward_hook(self.exit_module(name)) + + def enter_module(self, name): + def f(module, inputs): + self.parents.append(name) + inputs = normalize_tuple(inputs) + out = self.create_backwards_pop(name)(*inputs) + return out + + return f + + def exit_module(self, name): + def f(module, inputs, outputs): + assert self.parents[-1] == name + self.parents.pop() + outputs = normalize_tuple(outputs) + return self.create_backwards_push(name)(*outputs) + + return f + + def create_backwards_push(self, name): + class PushState(torch.autograd.Function): + @staticmethod + def forward(ctx, *args): + args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args) + if len(args) == 1: + return args[0] + return args + + @staticmethod + def backward(ctx, *grad_outs): + self.parents.append(name) + return grad_outs + + return PushState.apply + + def create_backwards_pop(self, name): + 
class PopState(torch.autograd.Function): + @staticmethod + def forward(ctx, *args): + args = tree_map(lambda x: x.clone() if isinstance(x, torch.Tensor) else x, args) + if len(args) == 1: + return args[0] + return args + + @staticmethod + def backward(ctx, *grad_outs): + assert self.parents[-1] == name + self.parents.pop() + return grad_outs + + return PopState.apply + + def __enter__(self): + self.flop_counts.clear() + super().__enter__() + + def __exit__(self, *args): + # print(f"Total: {sum(self.flop_counts['Global'].values()) / 1e9} GFLOPS") + # for mod in self.flop_counts.keys(): + # print(f"Module: ", mod) + # for k, v in self.flop_counts[mod].items(): + # print(f"{k}: {v / 1e9} GFLOPS") + # print() + super().__exit__(*args) + + def __torch_dispatch__(self, func, types, args=(), kwargs=None): + kwargs = kwargs if kwargs else {} + + out = func(*args, **kwargs) + func_packet = func._overloadpacket + if func_packet in flop_mapping: + flop_count = flop_mapping[func_packet](args, normalize_tuple(out)) + for par in self.parents: + self.flop_counts[par][func_packet] += flop_count + else: + unmapped_ops.add(func_packet) + + return out + + def get_flops(self): + return sum(self.flop_counts["Global"].values()) / 1e9 + + +def get_dims(module_name, height, width): + # detection models have curated input sizes + if module_name == "detection": + # we can feed a batch of 1 for detection model instead of a list of 1 image + dims = (3, height, width) + elif module_name == "video": + # hard-coding the time dimension to size 16 + dims = (1, 16, 3, height, width) + else: + dims = (1, 3, height, width) + + return dims + + +def get_ops(model: torch.nn.Module, weight: Weights, height=512, width=512): + module_name = model.__module__.split(".")[-2] + dims = get_dims(module_name=module_name, height=height, width=width) + + input_tensor = torch.randn(dims) + + # try: + preprocess = weight.transforms() + if module_name == "optical_flow": + inp = preprocess(input_tensor, input_tensor) + else: + # hack to enable mod(*inp) for optical_flow models + inp = [preprocess(input_tensor)] + + model.eval() + + flop_counter = FlopCounterMode(model) + with flop_counter: + # detection models expect a list of 3d tensors as inputs + if module_name == "detection": + model(inp) + else: + model(*inp) + + flops = flop_counter.get_flops() + + return round(flops, 3) + + +def get_file_size_mb(weight): + weights_path = os.path.join(os.getenv("HOME"), ".cache/torch/hub/checkpoints", weight.url.split("/")[-1]) + weights_size_mb = os.path.getsize(weights_path) / 1024 / 1024 + + return round(weights_size_mb, 3) diff --git a/test/common_utils.py b/test/common_utils.py index 8f07e91d144e6a5495ecf4ab33b3174c5d27e3b0..a1d188efdaed686e129bef2844f1f24e4a1d5abe 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -1,23 +1,35 @@ import contextlib import functools +import itertools import os +import pathlib import random +import re import shutil +import sys import tempfile +import warnings +from subprocess import CalledProcessError, check_output, STDOUT import numpy as np +import PIL.Image +import pytest import torch +import torch.testing from PIL import Image -from torchvision import io -import __main__ # noqa: 401 +from torch.testing._comparison import BooleanPair, NonePair, not_close_error_metas, NumberPair, TensorLikePair +from torchvision import io, tv_tensors +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.v2.functional import to_image, to_pil_image -IN_CIRCLE_CI = 
os.getenv("CIRCLECI", False) == "true" +IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"]) IN_RE_WORKER = os.environ.get("INSIDE_RE_WORKER") is not None IN_FBCODE = os.environ.get("IN_FBCODE_TORCHVISION") == "1" CUDA_NOT_AVAILABLE_MSG = "CUDA device not available" -CIRCLECI_GPU_NO_CUDA_MSG = "We're in a CircleCI GPU machine, and this test doesn't need cuda." +MPS_NOT_AVAILABLE_MSG = "MPS device not available" +OSS_CI_GPU_NO_CUDA_MSG = "We're in an OSS GPU machine, and this test doesn't need cuda." @contextlib.contextmanager @@ -107,18 +119,28 @@ def disable_console_output(): yield -def cpu_and_gpu(): +def cpu_and_cuda(): import pytest # noqa return ("cpu", pytest.param("cuda", marks=pytest.mark.needs_cuda)) +def cpu_and_cuda_and_mps(): + return cpu_and_cuda() + (pytest.param("mps", marks=pytest.mark.needs_mps),) + + def needs_cuda(test_func): import pytest # noqa return pytest.mark.needs_cuda(test_func) +def needs_mps(test_func): + import pytest # noqa + + return pytest.mark.needs_mps(test_func) + + def _create_data(height=3, width=3, channels=3, device="cpu"): # TODO: When all relevant tests are ported to pytest, turn this into a module-level fixture tensor = torch.randint(0, 256, (channels, height, width), dtype=torch.uint8, device=device) @@ -137,9 +159,6 @@ def _create_data_batch(height=3, width=3, channels=3, num_samples=4, device="cpu return batch_tensor -assert_equal = functools.partial(torch.testing.assert_close, rtol=0, atol=0) - - def get_list_of_videos(tmpdir, num_videos=5, sizes=None, fps=None): names = [] for i in range(num_videos): @@ -160,6 +179,7 @@ def get_list_of_videos(tmpdir, num_videos=5, sizes=None, fps=None): def _assert_equal_tensor_to_pil(tensor, pil_image, msg=None): + # FIXME: this is handled automatically by `assert_equal` below. Let's remove this in favor of it np_pil_image = np.array(pil_image) if np_pil_image.ndim == 2: np_pil_image = np_pil_image[:, :, None] @@ -172,6 +192,7 @@ def _assert_equal_tensor_to_pil(tensor, pil_image, msg=None): def _assert_approx_equal_tensor_to_pil( tensor, pil_image, tol=1e-5, msg=None, agg_method="mean", allowed_percentage_diff=None ): + # FIXME: this is handled automatically by `assert_close` below. Let's remove this in favor of it # TODO: we could just merge this into _assert_equal_tensor_to_pil np_pil_image = np.array(pil_image) if np_pil_image.ndim == 2: @@ -210,7 +231,7 @@ def cache(fn): """ sentinel = object() out_cache = {} - exc_cache = {} + exc_tb_cache = {} @functools.wraps(fn) def wrapper(*args, **kwargs): @@ -220,17 +241,280 @@ def cache(fn): if out is not sentinel: return out - exc = exc_cache.get(key, sentinel) - if exc is not sentinel: - raise exc + exc_tb = exc_tb_cache.get(key, sentinel) + if exc_tb is not sentinel: + raise exc_tb[0].with_traceback(exc_tb[1]) try: out = fn(*args, **kwargs) except Exception as exc: - exc_cache[key] = exc + # We need to cache the traceback here as well. Otherwise, each re-raise will add the internal pytest + # traceback frames anew, but they will only be removed once. Thus, the traceback will be ginormous hiding + # the actual information in the noise. See https://github.com/pytest-dev/pytest/issues/10363 for details. + exc_tb_cache[key] = exc, exc.__traceback__ raise exc out_cache[key] = out return out return wrapper + + +def combinations_grid(**kwargs): + """Creates a grid of input combinations. + + Each element in the returned sequence is a dictionary containing one possible combination as values. 
+ + Example: + >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) + [ + {'foo': 'bar', 'spam': 'eggs'}, + {'foo': 'bar', 'spam': 'ham'}, + {'foo': 'baz', 'spam': 'eggs'}, + {'foo': 'baz', 'spam': 'ham'} + ] + """ + return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] + + +class ImagePair(TensorLikePair): + def __init__( + self, + actual, + expected, + *, + mae=False, + **other_parameters, + ): + if all(isinstance(input, PIL.Image.Image) for input in [actual, expected]): + actual, expected = [to_image(input) for input in [actual, expected]] + + super().__init__(actual, expected, **other_parameters) + self.mae = mae + + def compare(self) -> None: + actual, expected = self.actual, self.expected + + self._compare_attributes(actual, expected) + actual, expected = self._equalize_attributes(actual, expected) + + if self.mae: + if actual.dtype is torch.uint8: + actual, expected = actual.to(torch.int), expected.to(torch.int) + mae = float(torch.abs(actual - expected).float().mean()) + if mae > self.atol: + self._fail( + AssertionError, + f"The MAE of the images is {mae}, but only {self.atol} is allowed.", + ) + else: + super()._compare_values(actual, expected) + + +def assert_close( + actual, + expected, + *, + allow_subclasses=True, + rtol=None, + atol=None, + equal_nan=False, + check_device=True, + check_dtype=True, + check_layout=True, + check_stride=False, + msg=None, + **kwargs, +): + """Superset of :func:`torch.testing.assert_close` with support for PIL vs. tensor image comparison""" + __tracebackhide__ = True + + error_metas = not_close_error_metas( + actual, + expected, + pair_types=( + NonePair, + BooleanPair, + NumberPair, + ImagePair, + TensorLikePair, + ), + allow_subclasses=allow_subclasses, + rtol=rtol, + atol=atol, + equal_nan=equal_nan, + check_device=check_device, + check_dtype=check_dtype, + check_layout=check_layout, + check_stride=check_stride, + **kwargs, + ) + + if error_metas: + raise error_metas[0].to_error(msg) + + +assert_equal = functools.partial(assert_close, rtol=0, atol=0) + + +DEFAULT_SIZE = (17, 11) + + +NUM_CHANNELS_MAP = { + "GRAY": 1, + "GRAY_ALPHA": 2, + "RGB": 3, + "RGBA": 4, +} + + +def make_image( + size=DEFAULT_SIZE, + *, + color_space="RGB", + batch_dims=(), + dtype=None, + device="cpu", + memory_format=torch.contiguous_format, +): + num_channels = NUM_CHANNELS_MAP[color_space] + dtype = dtype or torch.uint8 + max_value = get_max_value(dtype) + data = torch.testing.make_tensor( + (*batch_dims, num_channels, *size), + low=0, + high=max_value, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + if color_space in {"GRAY_ALPHA", "RGBA"}: + data[..., -1, :, :] = max_value + + return tv_tensors.Image(data) + + +def make_image_tensor(*args, **kwargs): + return make_image(*args, **kwargs).as_subclass(torch.Tensor) + + +def make_image_pil(*args, **kwargs): + return to_pil_image(make_image(*args, **kwargs)) + + +def make_bounding_boxes( + canvas_size=DEFAULT_SIZE, + *, + format=tv_tensors.BoundingBoxFormat.XYXY, + dtype=None, + device="cpu", +): + def sample_position(values, max_value): + # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. + # However, if we have batch_dims, we need tensors as limits. 
+ return torch.stack([torch.randint(max_value - v, ()) for v in values.tolist()]) + + if isinstance(format, str): + format = tv_tensors.BoundingBoxFormat[format] + + dtype = dtype or torch.float32 + + num_objects = 1 + h, w = [torch.randint(1, c, (num_objects,)) for c in canvas_size] + y = sample_position(h, canvas_size[0]) + x = sample_position(w, canvas_size[1]) + + if format is tv_tensors.BoundingBoxFormat.XYWH: + parts = (x, y, w, h) + elif format is tv_tensors.BoundingBoxFormat.XYXY: + x1, y1 = x, y + x2 = x1 + w + y2 = y1 + h + parts = (x1, y1, x2, y2) + elif format is tv_tensors.BoundingBoxFormat.CXCYWH: + cx = x + w / 2 + cy = y + h / 2 + parts = (cx, cy, w, h) + else: + raise ValueError(f"Format {format} is not supported") + + return tv_tensors.BoundingBoxes( + torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size + ) + + +def make_detection_mask(size=DEFAULT_SIZE, *, dtype=None, device="cpu"): + """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" + num_objects = 1 + return tv_tensors.Mask( + torch.testing.make_tensor( + (num_objects, *size), + low=0, + high=2, + dtype=dtype or torch.bool, + device=device, + ) + ) + + +def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): + """Make a "segmentation" mask, i.e. (*, H, W), where the category is encoded as pixel value""" + return tv_tensors.Mask( + torch.testing.make_tensor( + (*batch_dims, *size), + low=0, + high=num_categories, + dtype=dtype or torch.uint8, + device=device, + ) + ) + + +def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): + return tv_tensors.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) + + +def make_video_tensor(*args, **kwargs): + return make_video(*args, **kwargs).as_subclass(torch.Tensor) + + +def assert_run_python_script(source_code): + """Utility to check assertions in an independent Python subprocess. + + The script provided in the source code should return 0 and not print + anything on stderr or stdout. Modified from scikit-learn test utils. + + Args: + source_code (str): The Python source code to execute. + """ + with get_tmp_dir() as root: + path = pathlib.Path(root) / "main.py" + with open(path, "w") as file: + file.write(source_code) + + try: + out = check_output([sys.executable, str(path)], stderr=STDOUT) + except CalledProcessError as e: + raise RuntimeError(f"script errored with output:\n{e.output.decode()}") + if out != b"": + raise AssertionError(out.decode()) + + +@contextlib.contextmanager +def assert_no_warnings(): + # The name `catch_warnings` is a misnomer as the context manager does **not** catch any warnings, but rather scopes + # the warning filters. All changes that are made to the filters while in this context, will be reset upon exit. + with warnings.catch_warnings(): + warnings.simplefilter("error") + yield + + +@contextlib.contextmanager +def ignore_jit_no_profile_information_warning(): + # Calling a scripted object often triggers a warning like + # `UserWarning: operator() profile_node %$INT1 : int[] = prim::profile_ivalue($INT2) does not have profile information` + # with varying `INT1` and `INT2`. Since these are uninteresting for us and only clutter the test summary, we ignore + # them. 
+ with warnings.catch_warnings(): + warnings.filterwarnings("ignore", message=re.escape("operator() profile_node %"), category=UserWarning) + yield diff --git a/test/conftest.py b/test/conftest.py index 1a9b2db7f5cd2f0e2513beb84d9d977e861b3f5f..ea73b09b906d6373e28f9f03fa4c082d54df2809 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,12 +3,21 @@ import random import numpy as np import pytest import torch -from common_utils import CIRCLECI_GPU_NO_CUDA_MSG, CUDA_NOT_AVAILABLE_MSG, IN_CIRCLE_CI, IN_FBCODE, IN_RE_WORKER + +from common_utils import ( + CUDA_NOT_AVAILABLE_MSG, + IN_FBCODE, + IN_OSS_CI, + IN_RE_WORKER, + MPS_NOT_AVAILABLE_MSG, + OSS_CI_GPU_NO_CUDA_MSG, +) def pytest_configure(config): # register an additional marker (see pytest_collection_modifyitems) config.addinivalue_line("markers", "needs_cuda: mark for tests that rely on a CUDA device") + config.addinivalue_line("markers", "needs_mps: mark for tests that rely on a MPS device") config.addinivalue_line("markers", "dont_collect: mark for tests that should not be collected") @@ -16,9 +25,9 @@ def pytest_collection_modifyitems(items): # This hook is called by pytest after it has collected the tests (google its name to check out its doc!) # We can ignore some tests as we see fit here, or add marks, such as a skip mark. # - # Typically here, we try to optimize CI time. In particular, the GPU CI instances don't need to run the + # Typically, here, we try to optimize CI time. In particular, the GPU CI instances don't need to run the # tests that don't need CUDA, because those tests are extensively tested in the CPU CI instances already. - # This is true for both CircleCI and the fbcode internal CI. + # This is true for both OSS CI and the fbcode internal CI. # In the fbcode CI, we have an additional constraint: we try to avoid skipping tests. So instead of relying on # pytest.mark.skip, in fbcode we literally just remove those tests from the `items` list, and it's as if # these tests never existed. @@ -28,16 +37,20 @@ def pytest_collection_modifyitems(items): # The needs_cuda mark will exist if the test was explicitly decorated with # the @needs_cuda decorator. It will also exist if it was parametrized with a # parameter that has the mark: for example if a test is parametrized with - # @pytest.mark.parametrize('device', cpu_and_gpu()) + # @pytest.mark.parametrize('device', cpu_and_cuda()) # the "instances" of the tests where device == 'cuda' will have the 'needs_cuda' mark, # and the ones with device == 'cpu' won't have the mark. needs_cuda = item.get_closest_marker("needs_cuda") is not None + needs_mps = item.get_closest_marker("needs_mps") is not None if needs_cuda and not torch.cuda.is_available(): # In general, we skip cuda tests on machines without a GPU # There are special cases though, see below item.add_marker(pytest.mark.skip(reason=CUDA_NOT_AVAILABLE_MSG)) + if needs_mps and not torch.backends.mps.is_available(): + item.add_marker(pytest.mark.skip(reason=MPS_NOT_AVAILABLE_MSG)) + if IN_FBCODE: # fbcode doesn't like skipping tests, so instead we just don't collect the test # so that they don't even "exist", hence the continue statements. 
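To make the marker plumbing above concrete: the needs_cuda/needs_mps marks that pytest_collection_modifyitems inspects are attached either by the decorators or by parametrizing over the device helpers from common_utils. A minimal sketch follows (the test names and bodies are hypothetical; only cpu_and_cuda_and_mps, needs_cuda, and the markers registered in pytest_configure come from this change):

import pytest
import torch

from common_utils import cpu_and_cuda_and_mps, needs_cuda


# The "cuda" and "mps" instances carry the corresponding marker, so the
# collection hook can skip them (or drop them entirely in fbcode) when the
# device is unavailable; the "cpu" instance carries no marker.
@pytest.mark.parametrize("device", cpu_and_cuda_and_mps())
def test_add_runs_on_device(device):
    x = torch.ones(2, device=device)
    assert (x + x).sum().item() == 4


# Explicitly decorated tests are skipped wholesale when CUDA is missing.
@needs_cuda
def test_cuda_only_behavior():
    assert torch.cuda.is_available()
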
@@ -49,15 +62,18 @@ def pytest_collection_modifyitems(items): # TODO: something more robust would be to do that only in a sandcastle instance, # so that we can still see the test being skipped when testing locally from a devvm continue - elif IN_CIRCLE_CI: + if needs_mps and not torch.backends.mps.is_available(): + # Same as above, but for MPS + continue + elif IN_OSS_CI: # Here we're not in fbcode, so we can safely collect and skip tests. if not needs_cuda and torch.cuda.is_available(): - # Similar to what happens in RE workers: we don't need the CircleCI GPU machines + # Similar to what happens in RE workers: we don't need the OSS CI GPU machines # to run the CPU-only tests. - item.add_marker(pytest.mark.skip(reason=CIRCLECI_GPU_NO_CUDA_MSG)) + item.add_marker(pytest.mark.skip(reason=OSS_CI_GPU_NO_CUDA_MSG)) if item.get_closest_marker("dont_collect") is not None: - # currently, this is only used for some tests we're sure we dont want to run on fbcode + # currently, this is only used for some tests we're sure we don't want to run on fbcode continue out_items.append(item) diff --git a/test/datasets_utils.py b/test/datasets_utils.py index c232e7132b4beb3a138b704e32687b7b749a5f0a..bd9f7ea3a0f8cd1b0ef5211b4cd4667475fb9b62 100644 --- a/test/datasets_utils.py +++ b/test/datasets_utils.py @@ -5,6 +5,7 @@ import inspect import itertools import os import pathlib +import platform import random import shutil import string @@ -25,6 +26,7 @@ import torch import torchvision.datasets import torchvision.io from common_utils import disable_console_output, get_tmp_dir +from torch.utils._pytree import tree_any from torchvision.transforms.functional import get_dimensions @@ -137,7 +139,7 @@ def test_all_configs(test): .. note:: - This will try to remove duplicate configurations. During this process it will not not preserve a potential + This will try to remove duplicate configurations. During this process it will not preserve a potential ordering of the configurations or an inner ordering of a configuration. """ @@ -146,7 +148,7 @@ def test_all_configs(test): return [dict(config_) for config_ in {tuple(sorted(config.items())) for config in configs}] except TypeError: # A TypeError will be raised if a value of any config is not hashable, e.g. a list. In that case duplicate - # removal would be a lot more elaborate and we simply bail out. + # removal would be a lot more elaborate, and we simply bail out. return configs @functools.wraps(test) @@ -169,23 +171,6 @@ def test_all_configs(test): return wrapper -def combinations_grid(**kwargs): - """Creates a grid of input combinations. - - Each element in the returned sequence is a dictionary containing one possible combination as values. - - Example: - >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) - [ - {'foo': 'bar', 'spam': 'eggs'}, - {'foo': 'bar', 'spam': 'ham'}, - {'foo': 'baz', 'spam': 'eggs'}, - {'foo': 'baz', 'spam': 'ham'} - ] - """ - return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] - - class DatasetTestCase(unittest.TestCase): """Abstract base class for all dataset testcases. @@ -297,7 +282,7 @@ class DatasetTestCase(unittest.TestCase): .. note:: The default behavior is only valid if the dataset to be tested has ``root`` as the only required parameter. - Otherwise you need to overwrite this method. + Otherwise, you need to overwrite this method. Args: tmpdir (str): Path to a temporary directory. 
For most cases this acts as root directory for the dataset @@ -564,7 +549,7 @@ class DatasetTestCase(unittest.TestCase): @test_all_configs def test_num_examples(self, config): with self.create_dataset(config) as (dataset, info): - assert len(dataset) == info["num_examples"] + assert len(list(dataset)) == len(dataset) == info["num_examples"] @test_all_configs def test_transforms(self, config): @@ -581,6 +566,42 @@ class DatasetTestCase(unittest.TestCase): mock.assert_called() + @test_all_configs + def test_transforms_v2_wrapper(self, config): + from torchvision import tv_tensors + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + try: + with self.create_dataset(config) as (dataset, info): + for target_keys in [None, "all"]: + if target_keys is not None and self.DATASET_CLASS not in { + torchvision.datasets.CocoDetection, + torchvision.datasets.VOCDetection, + torchvision.datasets.Kitti, + torchvision.datasets.WIDERFace, + }: + with self.assertRaisesRegex(ValueError, "`target_keys` is currently only supported for"): + wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + continue + + wrapped_dataset = wrap_dataset_for_transforms_v2(dataset, target_keys=target_keys) + assert isinstance(wrapped_dataset, self.DATASET_CLASS) + assert len(wrapped_dataset) == info["num_examples"] + + wrapped_sample = wrapped_dataset[0] + assert tree_any( + lambda item: isinstance(item, (tv_tensors.TVTensor, PIL.Image.Image)), wrapped_sample + ) + except TypeError as error: + msg = f"No wrapper exists for dataset class {type(dataset).__name__}" + if str(error).startswith(msg): + pytest.skip(msg) + raise error + except RuntimeError as error: + if "currently not supported by this wrapper" in str(error): + pytest.skip("Config is currently not supported by this wrapper") + raise error + class ImageDatasetTestCase(DatasetTestCase): """Abstract base class for image dataset testcases. @@ -604,7 +625,7 @@ class ImageDatasetTestCase(DatasetTestCase): patch_checks=patch_checks, **kwargs, ) as (dataset, info): - # PIL.Image.open() only loads the image meta data upfront and keeps the file open until the first access + # PIL.Image.open() only loads the image metadata upfront and keeps the file open until the first access # to the pixel data occurs. Trying to delete such a file results in an PermissionError on Windows. Thus, we # force-load opened images. # This problem only occurs during testing since some tests, e.g. 
DatasetTestCase.test_feature_types open an @@ -641,27 +662,73 @@ class VideoDatasetTestCase(DatasetTestCase): FEATURE_TYPES = (torch.Tensor, torch.Tensor, int) REQUIRED_PACKAGES = ("av",) - DEFAULT_FRAMES_PER_CLIP = 1 + FRAMES_PER_CLIP = 1 def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.dataset_args = self._set_default_frames_per_clip(self.dataset_args) - def _set_default_frames_per_clip(self, inject_fake_data): + def _set_default_frames_per_clip(self, dataset_args): argspec = inspect.getfullargspec(self.DATASET_CLASS.__init__) args_without_default = argspec.args[1 : (-len(argspec.defaults) if argspec.defaults else None)] frames_per_clip_last = args_without_default[-1] == "frames_per_clip" - @functools.wraps(inject_fake_data) + @functools.wraps(dataset_args) def wrapper(tmpdir, config): - args = inject_fake_data(tmpdir, config) + args = dataset_args(tmpdir, config) if frames_per_clip_last and len(args) == len(args_without_default) - 1: - args = (*args, self.DEFAULT_FRAMES_PER_CLIP) + args = (*args, self.FRAMES_PER_CLIP) return args return wrapper + def test_output_format(self): + for output_format in ["TCHW", "THWC"]: + with self.create_dataset(output_format=output_format) as (dataset, _): + for video, *_ in dataset: + if output_format == "TCHW": + num_frames, num_channels, *_ = video.shape + else: # output_format == "THWC": + num_frames, *_, num_channels = video.shape + + assert num_frames == self.FRAMES_PER_CLIP + assert num_channels == 3 + + @test_all_configs + def test_transforms_v2_wrapper(self, config): + # `output_format == "THWC"` is not supported by the wrapper. Thus, we skip the `config` if it is set explicitly + # or use the supported `"TCHW"` + if config.setdefault("output_format", "TCHW") == "THWC": + return + + super().test_transforms_v2_wrapper.__wrapped__(self, config) + + +def _no_collate(batch): + return batch + + +def check_transforms_v2_wrapper_spawn(dataset): + # On Linux and Windows, the DataLoader forks the main process by default. This is not available on macOS, so new + # subprocesses are spawned. This requires the whole pipeline including the dataset to be pickleable, which is what + # we are enforcing here. + if platform.system() != "Darwin": + pytest.skip("Multiprocessing spawning is only checked on macOS.") + + from torch.utils.data import DataLoader + from torchvision import tv_tensors + from torchvision.datasets import wrap_dataset_for_transforms_v2 + + wrapped_dataset = wrap_dataset_for_transforms_v2(dataset) + + dataloader = DataLoader(wrapped_dataset, num_workers=2, multiprocessing_context="spawn", collate_fn=_no_collate) + + for wrapped_sample in dataloader: + assert tree_any( + lambda item: isinstance(item, (tv_tensors.Image, tv_tensors.Video, PIL.Image.Image)), wrapped_sample + ) + def create_image_or_video_tensor(size: Sequence[int]) -> torch.Tensor: r"""Create a random uint8 tensor. @@ -786,7 +853,7 @@ def create_video_file( fps: float = 25, **kwargs: Any, ) -> pathlib.Path: - """Create an video file from random data. + """Create a video file from random data. Args: root (Union[str, pathlib.Path]): Root directory the video file will be placed in. @@ -951,7 +1018,7 @@ def create_random_string(length: int, *digits: str) -> str: Args: length (int): Number of characters in the generated string. - *characters (str): Characters to sample from. If omitted defaults to :attr:`string.ascii_lowercase`. + *digits (str): Characters to sample from. If omitted defaults to :attr:`string.ascii_lowercase`. 
""" if not digits: digits = string.ascii_lowercase diff --git a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl index e95ba5f53985e3773c6a625bed929b406350aa90..862af2185c75bd90734b068981e298cf94d11cc8 100644 Binary files a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl index c2875679efd98e7d3454084ddc67054a8dec047e..1d317eb791515686c7294d8c0663f798df6fb71c 100644 Binary files a/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl and b/test/expect/ModelTester.test_fasterrcnn_resnet50_fpn_v2_expect.pkl differ diff --git a/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl index 0657261d96cefe0d09b93efcd81c21b4cb56b3da..3d4e3e63f280c79044706fa5ac4e9c1c448fdefe 100644 Binary files a/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_fcos_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl index 2f1ff941abae5994144c73dcfd361e963ab28cb9..54dfb7cd206f1e420915bb5703f13971a4055cbe 100644 Binary files a/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_keypointrcnn_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl index 36b680816726017ffafc262ca38861df2737087a..f52b77a8dd8eb18ec2d4b0c85a52968bf6d7d92b 100644 Binary files a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl index c6d1fd14081505a25ffea7ddc8e279078a917b3b..23e841bf8749504030baca953e351dd9b7f146b0 100644 Binary files a/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl and b/test/expect/ModelTester.test_maskrcnn_resnet50_fpn_v2_expect.pkl differ diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl index 7fb8d66b080dfdcebb4bed386cd752b99398b779..f188ee7b911cc7a024563f7572eb71062a0f97e7 100644 Binary files a/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl and b/test/expect/ModelTester.test_retinanet_resnet50_fpn_expect.pkl differ diff --git a/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl b/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl index 9c74f2e9b9940de50adb68253c2b3d2bf9b41ba2..beaf6c8e84b1dee9a3748c0cc08dcaab2cf15c07 100644 Binary files a/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl and b/test/expect/ModelTester.test_retinanet_resnet50_fpn_v2_expect.pkl differ diff --git a/test/expect/ModelTester.test_swin3d_b_expect.pkl b/test/expect/ModelTester.test_swin3d_b_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..1efc513c91166243925d9f32cc2ae2d35de2f019 Binary files /dev/null and b/test/expect/ModelTester.test_swin3d_b_expect.pkl differ diff --git 
a/test/expect/ModelTester.test_swin3d_s_expect.pkl b/test/expect/ModelTester.test_swin3d_s_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..0c1e594993e01c3610c395608fe46ec6bde16214 Binary files /dev/null and b/test/expect/ModelTester.test_swin3d_s_expect.pkl differ diff --git a/test/expect/ModelTester.test_swin3d_t_expect.pkl b/test/expect/ModelTester.test_swin3d_t_expect.pkl new file mode 100644 index 0000000000000000000000000000000000000000..5e658ff16b7352da3748eebeabd03a7c4fb5a8dc Binary files /dev/null and b/test/expect/ModelTester.test_swin3d_t_expect.pkl differ diff --git a/test/prototype_common_utils.py b/test/prototype_common_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..b26bcff32466b33004883b0acaca9b124e819485 --- /dev/null +++ b/test/prototype_common_utils.py @@ -0,0 +1,82 @@ +import collections.abc +import dataclasses +from typing import Optional, Sequence + +import pytest +import torch +from torch.nn.functional import one_hot + +from torchvision.prototype import tv_tensors + +from transforms_v2_legacy_utils import combinations_grid, DEFAULT_EXTRA_DIMS, from_loader, from_loaders, TensorLoader + + +@dataclasses.dataclass +class LabelLoader(TensorLoader): + categories: Optional[Sequence[str]] + + +def _parse_categories(categories): + if categories is None: + num_categories = int(torch.randint(1, 11, ())) + elif isinstance(categories, int): + num_categories = categories + categories = [f"category{idx}" for idx in range(num_categories)] + elif isinstance(categories, collections.abc.Sequence) and all(isinstance(category, str) for category in categories): + categories = list(categories) + num_categories = len(categories) + else: + raise pytest.UsageError( + f"`categories` can either be `None` (default), an integer, or a sequence of strings, " + f"but got '{categories}' instead." + ) + return categories, num_categories + + +def make_label_loader(*, extra_dims=(), categories=None, dtype=torch.int64): + categories, num_categories = _parse_categories(categories) + + def fn(shape, dtype, device): + # The idiom `make_tensor(..., dtype=torch.int64).to(dtype)` is intentional to only get integer values, + # regardless of the requested dtype, e.g. 
0 or 0.0 rather than 0 or 0.123 + data = torch.testing.make_tensor(shape, low=0, high=num_categories, dtype=torch.int64, device=device).to(dtype) + return tv_tensors.Label(data, categories=categories) + + return LabelLoader(fn, shape=extra_dims, dtype=dtype, categories=categories) + + +make_label = from_loader(make_label_loader) + + +@dataclasses.dataclass +class OneHotLabelLoader(TensorLoader): + categories: Optional[Sequence[str]] + + +def make_one_hot_label_loader(*, categories=None, extra_dims=(), dtype=torch.int64): + categories, num_categories = _parse_categories(categories) + + def fn(shape, dtype, device): + if num_categories == 0: + data = torch.empty(shape, dtype=dtype, device=device) + else: + # The idiom `make_label_loader(..., dtype=torch.int64); ...; one_hot(...).to(dtype)` is intentional + # since `one_hot` only supports int64 + label = make_label_loader(extra_dims=extra_dims, categories=num_categories, dtype=torch.int64).load(device) + data = one_hot(label, num_classes=num_categories).to(dtype) + return tv_tensors.OneHotLabel(data, categories=categories) + + return OneHotLabelLoader(fn, shape=(*extra_dims, num_categories), dtype=dtype, categories=categories) + + +def make_one_hot_label_loaders( + *, + categories=(1, 0, None), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.int64, torch.float32), +): + for params in combinations_grid(categories=categories, extra_dims=extra_dims, dtype=dtypes): + yield make_one_hot_label_loader(**params) + + +make_one_hot_labels = from_loaders(make_one_hot_label_loaders) diff --git a/test/smoke_test.py b/test/smoke_test.py index c3a4bdd19d6431250591c8376bf1d2c785c2cb10..6cc07c00aedcb72c4476a335e37e726af576eb5a 100644 --- a/test/smoke_test.py +++ b/test/smoke_test.py @@ -1,4 +1,102 @@ +"""Run smoke tests""" + +import sys +from pathlib import Path + import torch import torchvision -import torchvision.datasets as dset -import torchvision.transforms +from torchvision.io import decode_jpeg, read_file, read_image +from torchvision.models import resnet50, ResNet50_Weights + +SCRIPT_DIR = Path(__file__).parent + + +def smoke_test_torchvision() -> None: + print( + "Is torchvision usable?", + all(x is not None for x in [torch.ops.image.decode_png, torch.ops.torchvision.roi_align]), + ) + + +def smoke_test_torchvision_read_decode() -> None: + img_jpg = read_image(str(SCRIPT_DIR / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg")) + if img_jpg.shape != (3, 606, 517): + raise RuntimeError(f"Unexpected shape of img_jpg: {img_jpg.shape}") + img_png = read_image(str(SCRIPT_DIR / "assets" / "interlaced_png" / "wizard_low.png")) + if img_png.shape != (4, 471, 354): + raise RuntimeError(f"Unexpected shape of img_png: {img_png.shape}") + + +def smoke_test_torchvision_decode_jpeg(device: str = "cpu"): + img_jpg_data = read_file(str(SCRIPT_DIR / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg")) + img_jpg = decode_jpeg(img_jpg_data, device=device) + if img_jpg.shape != (3, 606, 517): + raise RuntimeError(f"Unexpected shape of img_jpg: {img_jpg.shape}") + + +def smoke_test_compile() -> None: + try: + model = resnet50().cuda() + model = torch.compile(model) + x = torch.randn(1, 3, 224, 224, device="cuda") + out = model(x) + print(f"torch.compile model output: {out.shape}") + except RuntimeError: + if sys.platform == "win32": + print("Successfully caught torch.compile RuntimeError on win") + elif sys.version_info >= (3, 11, 0): + print("Successfully caught torch.compile RuntimeError on Python 3.11") + else: + raise + + +def 
smoke_test_torchvision_resnet50_classify(device: str = "cpu") -> None: + img = read_image(str(SCRIPT_DIR / ".." / "gallery" / "assets" / "dog2.jpg")).to(device) + + # Step 1: Initialize model with the best available weights + weights = ResNet50_Weights.DEFAULT + model = resnet50(weights=weights).to(device) + model.eval() + + # Step 2: Initialize the inference transforms + preprocess = weights.transforms() + + # Step 3: Apply inference preprocessing transforms + batch = preprocess(img).unsqueeze(0) + + # Step 4: Use the model and print the predicted category + prediction = model(batch).squeeze(0).softmax(0) + class_id = prediction.argmax().item() + score = prediction[class_id].item() + category_name = weights.meta["categories"][class_id] + expected_category = "German shepherd" + print(f"{category_name} ({device}): {100 * score:.1f}%") + if category_name != expected_category: + raise RuntimeError(f"Failed ResNet50 classify {category_name} Expected: {expected_category}") + + +def main() -> None: + print(f"torchvision: {torchvision.__version__}") + print(f"torch.cuda.is_available: {torch.cuda.is_available()}") + + # Turn 1.11.0aHASH into 1.11 (major.minor only) + version = ".".join(torchvision.__version__.split(".")[:2]) + if version >= "0.16": + print(f"{torch.ops.image._jpeg_version() = }") + assert torch.ops.image._is_compiled_against_turbo() + + smoke_test_torchvision() + smoke_test_torchvision_read_decode() + smoke_test_torchvision_resnet50_classify() + smoke_test_torchvision_decode_jpeg() + if torch.cuda.is_available(): + smoke_test_torchvision_decode_jpeg("cuda") + smoke_test_torchvision_resnet50_classify("cuda") + smoke_test_compile() + + if torch.backends.mps.is_available(): + smoke_test_torchvision_resnet50_classify("mps") + + +if __name__ == "__main__": + main() diff --git a/test/test_architecture_ops.py b/test/test_architecture_ops.py index 9f254c7942bd9cd33ec5d71904addbbcd4d6a63b..32ad1a32f897e11a3c1e05050f1c1f691b7a6936 100644 --- a/test/test_architecture_ops.py +++ b/test/test_architecture_ops.py @@ -20,7 +20,7 @@ class MaxvitTester(unittest.TestCase): x_hat = partition(x, partition_size) x_hat = departition(x_hat, partition_size, n_partitions, n_partitions) - assert torch.allclose(x, x_hat) + torch.testing.assert_close(x, x_hat) def test_maxvit_grid_partition(self): input_shape = (1, 3, 224, 224) @@ -39,7 +39,7 @@ class MaxvitTester(unittest.TestCase): x_hat = post_swap(x_hat) x_hat = departition(x_hat, n_partitions, partition_size, partition_size) - assert torch.allclose(x, x_hat) + torch.testing.assert_close(x, x_hat) if __name__ == "__main__": diff --git a/test/test_backbone_utils.py b/test/test_backbone_utils.py index 4fba3c3d09838661e0886a5e1bd1faa45ad2c67c..befceca020e0b8d0d9b8608ca161c114c7b762ba 100644 --- a/test/test_backbone_utils.py +++ b/test/test_backbone_utils.py @@ -194,7 +194,7 @@ class TestFxFeatureExtraction: assert n1 == n2 assert p1.equal(p2) - # And that ouputs match + # And that outputs match with torch.no_grad(): ilg_out = ilg_model(self.inp) fgn_out = fx_model(self.inp) diff --git a/test/test_datasets.py b/test/test_datasets.py index dbce7853effa27f595a0c76f71aaddaacedf311e..1270201d53e059437560dce28239a5ab93305e6d 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -8,6 +8,7 @@ import os import pathlib import pickle import random +import re import shutil import string import unittest @@ -21,12 +22,13 @@ import PIL import pytest import torch import torch.nn.functional as F +from common_utils import combinations_grid from torchvision import 
datasets class STL10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.STL10 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test", "unlabeled", "train+unlabeled")) @staticmethod def _make_binary_file(num_elements, root, name): @@ -112,9 +114,7 @@ class Caltech101TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Caltech101 FEATURE_TYPES = (PIL.Image.Image, (int, np.ndarray, tuple)) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - target_type=("category", "annotation", ["category", "annotation"]) - ) + ADDITIONAL_CONFIGS = combinations_grid(target_type=("category", "annotation", ["category", "annotation"])) REQUIRED_PACKAGES = ("scipy",) def inject_fake_data(self, tmpdir, config): @@ -183,6 +183,10 @@ class Caltech101TestCase(datasets_utils.ImageDatasetTestCase): ), "Type of the combined target does not match the type of the corresponding individual target: " f"{actual} is not {expected}", + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset(target_type="category") as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Caltech256 @@ -190,7 +194,7 @@ class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): def inject_fake_data(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) / "caltech256" / "256_ObjectCategories" - categories = ((1, "ak47"), (127, "laptop-101"), (257, "clutter")) + categories = ((1, "ak47"), (2, "american-flag"), (3, "backpack")) num_images_per_category = 2 for idx, category in categories: @@ -207,7 +211,7 @@ class Caltech256TestCase(datasets_utils.ImageDatasetTestCase): class WIDERFaceTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.WIDERFace FEATURE_TYPES = (PIL.Image.Image, (dict, type(None))) # test split returns None as target - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) def inject_fake_data(self, tmpdir, config): widerface_dir = pathlib.Path(tmpdir) / "widerface" @@ -258,6 +262,10 @@ class WIDERFaceTestCase(datasets_utils.ImageDatasetTestCase): return split_to_num_examples[config["split"]] + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Cityscapes @@ -268,8 +276,8 @@ class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): "color", ) ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES), - *datasets_utils.combinations_grid( + *combinations_grid(mode=("fine",), split=("train", "test", "val"), target_type=TARGET_TYPES), + *combinations_grid( mode=("coarse",), split=("train", "train_extra", "val"), target_type=TARGET_TYPES, @@ -382,11 +390,16 @@ class CityScapesTestCase(datasets_utils.ImageDatasetTestCase): assert isinstance(polygon_img, PIL.Image.Image) (polygon_target, info["expected_polygon_target"]) + def test_transforms_v2_wrapper_spawn(self): + for target_type in ["instance", "semantic", ["instance", "semantic"]]: + with self.create_dataset(target_type=target_type) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + 
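The test_transforms_v2_wrapper_spawn hooks added throughout this file all delegate to datasets_utils.check_transforms_v2_wrapper_spawn. For orientation, outside of the test harness the wrapper is used roughly as follows; this is a minimal sketch, the dataset paths are placeholders, and the choice of target_keys is illustrative only:

from torchvision import datasets, tv_tensors
from torchvision.datasets import wrap_dataset_for_transforms_v2

# Placeholder paths -- substitute a real COCO-style download.
dataset = datasets.CocoDetection("path/to/images", "path/to/instances.json")
wrapped = wrap_dataset_for_transforms_v2(dataset, target_keys={"boxes", "labels"})

img, target = wrapped[0]
# The wrapper re-packages the raw annotation dicts into transforms-v2 friendly
# containers, e.g. target["boxes"] comes back as tv_tensors.BoundingBoxes.
print(type(target["boxes"]), type(target["labels"]))
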
class ImageNetTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.ImageNet REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val")) def inject_fake_data(self, tmpdir, config): tmpdir = pathlib.Path(tmpdir) @@ -413,10 +426,14 @@ class ImageNetTestCase(datasets_utils.ImageDatasetTestCase): torch.save((wnid_to_classes, None), tmpdir / "meta.bin") return num_examples + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class CIFAR10TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CIFAR10 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) _VERSION_CONFIG = dict( base_folder="cifar-10-batches-py", @@ -489,7 +506,7 @@ class CelebATestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CelebA FEATURE_TYPES = (PIL.Image.Image, (torch.Tensor, int, tuple, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "valid", "test", "all"), target_type=("attr", "identity", "bbox", "landmarks", ["attr", "identity"]), ) @@ -607,15 +624,18 @@ class CelebATestCase(datasets_utils.ImageDatasetTestCase): assert merged_imgs_names == all_imgs_names + def test_transforms_v2_wrapper_spawn(self): + for target_type in ["identity", "bbox", ["identity", "bbox"]]: + with self.create_dataset(target_type=target_type) as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class VOCSegmentationTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.VOCSegmentation FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image) ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid( - year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval") - ), + *combinations_grid(year=[f"20{year:02d}" for year in range(7, 13)], image_set=("train", "val", "trainval")), dict(year="2007", image_set="test"), ) @@ -696,6 +716,10 @@ class VOCSegmentationTestCase(datasets_utils.ImageDatasetTestCase): return data + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class VOCDetectionTestCase(VOCSegmentationTestCase): DATASET_CLASS = datasets.VOCDetection @@ -716,6 +740,10 @@ class VOCDetectionTestCase(VOCSegmentationTestCase): assert object == info["annotation"] + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class CocoDetectionTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CocoDetection @@ -763,11 +791,21 @@ class CocoDetectionTestCase(datasets_utils.ImageDatasetTestCase): return info def _create_annotations(self, image_ids, num_annotations_per_image): - annotations = datasets_utils.combinations_grid( - image_id=image_ids, bbox=([1.0, 2.0, 3.0, 4.0],) * num_annotations_per_image - ) - for id, annotation in enumerate(annotations): - annotation["id"] = id + annotations = [] + annotion_id = 0 + for image_id in itertools.islice(itertools.cycle(image_ids), len(image_ids) * num_annotations_per_image): + annotations.append( + dict( + image_id=image_id, + id=annotion_id, + bbox=torch.rand(4).tolist(), + 
segmentation=[torch.rand(8).tolist()], + category_id=int(torch.randint(91, ())), + area=float(torch.rand(1)), + iscrowd=int(torch.randint(2, size=(1,))), + ) + ) + annotion_id += 1 return annotations, dict() def _create_json(self, root, name, content): @@ -776,13 +814,17 @@ class CocoDetectionTestCase(datasets_utils.ImageDatasetTestCase): json.dump(content, fh) return file + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class CocoCaptionsTestCase(CocoDetectionTestCase): DATASET_CLASS = datasets.CocoCaptions def _create_annotations(self, image_ids, num_annotations_per_image): captions = [str(idx) for idx in range(num_annotations_per_image)] - annotations = datasets_utils.combinations_grid(image_id=image_ids, caption=captions) + annotations = combinations_grid(image_id=image_ids, caption=captions) for id, annotation in enumerate(annotations): annotation["id"] = id return annotations, dict(captions=captions) @@ -792,11 +834,16 @@ class CocoCaptionsTestCase(CocoDetectionTestCase): _, captions = dataset[0] assert tuple(captions) == tuple(info["captions"]) + def test_transforms_v2_wrapper_spawn(self): + # We need to define this method, because otherwise the test from the super class will + # be run + pytest.skip("CocoCaptions is currently not supported by the v2 wrapper.") + class UCF101TestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.UCF101 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(fold=(1, 2, 3), train=(True, False)) _VIDEO_FOLDER = "videos" _ANNOTATIONS_FOLDER = "annotations" @@ -857,9 +904,7 @@ class LSUNTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.LSUN REQUIRED_PACKAGES = ("lmdb",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( - classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"]) - ) + ADDITIONAL_CONFIGS = combinations_grid(classes=("train", "test", "val", ["bedroom_train", "church_outdoor_train"])) _CATEGORIES = ( "bedroom", @@ -944,7 +989,7 @@ class LSUNTestCase(datasets_utils.ImageDatasetTestCase): class KineticsTestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.Kinetics - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val"), num_classes=("400", "600", "700")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val"), num_classes=("400", "600", "700")) def inject_fake_data(self, tmpdir, config): classes = ("Abseiling", "Zumba") @@ -960,11 +1005,15 @@ class KineticsTestCase(datasets_utils.VideoDatasetTestCase): ) return num_videos_per_class * len(classes) + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset(output_format="TCHW") as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class HMDB51TestCase(datasets_utils.VideoDatasetTestCase): DATASET_CLASS = datasets.HMDB51 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(fold=(1, 2, 3), train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(fold=(1, 2, 3), train=(True, False)) _VIDEO_FOLDER = "videos" _SPLITS_FOLDER = "splits" @@ -1024,7 +1073,7 @@ class HMDB51TestCase(datasets_utils.VideoDatasetTestCase): class OmniglotTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Omniglot - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(background=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(background=(True, False)) def 
inject_fake_data(self, tmpdir, config): target_folder = ( @@ -1104,7 +1153,7 @@ class SEMEIONTestCase(datasets_utils.ImageDatasetTestCase): class USPSTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.USPS - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) def inject_fake_data(self, tmpdir, config): num_images = 2 if config["train"] else 1 @@ -1126,7 +1175,7 @@ class SBDatasetTestCase(datasets_utils.ImageDatasetTestCase): REQUIRED_PACKAGES = ("scipy.io", "scipy.sparse") - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( image_set=("train", "val", "train_noval"), mode=("boundaries", "segmentation") ) @@ -1187,6 +1236,10 @@ class SBDatasetTestCase(datasets_utils.ImageDatasetTestCase): def _file_stem(self, idx): return f"2008_{idx:06d}" + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset(mode="segmentation") as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class FakeDataTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FakeData @@ -1212,7 +1265,7 @@ class PhotoTourTestCase(datasets_utils.ImageDatasetTestCase): _TRAIN_FEATURE_TYPES = (torch.Tensor,) _TEST_FEATURE_TYPES = (torch.Tensor, torch.Tensor, torch.Tensor) - datasets_utils.combinations_grid(train=(True, False)) + combinations_grid(train=(True, False)) _NAME = "liberty" @@ -1371,7 +1424,7 @@ class Flickr30kTestCase(Flickr8kTestCase): class MNISTTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.MNIST - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) _MAGIC_DTYPES = { torch.uint8: 8, @@ -1441,7 +1494,7 @@ class EMNISTTestCase(MNISTTestCase): DATASET_CLASS = datasets.EMNIST DEFAULT_CONFIG = dict(split="byclass") - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("byclass", "bymerge", "balanced", "letters", "digits", "mnist"), train=(True, False) ) @@ -1452,7 +1505,7 @@ class EMNISTTestCase(MNISTTestCase): class QMNISTTestCase(MNISTTestCase): DATASET_CLASS = datasets.QMNIST - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(what=("train", "test", "test10k", "nist")) + ADDITIONAL_CONFIGS = combinations_grid(what=("train", "test", "test10k", "nist")) _LABELS_SIZE = (8,) _LABELS_DTYPE = torch.int32 @@ -1494,30 +1547,51 @@ class QMNISTTestCase(MNISTTestCase): assert len(dataset) == info["num_examples"] - 10000 +class MovingMNISTTestCase(datasets_utils.DatasetTestCase): + DATASET_CLASS = datasets.MovingMNIST + FEATURE_TYPES = (torch.Tensor,) + + ADDITIONAL_CONFIGS = combinations_grid(split=(None, "train", "test"), split_ratio=(10, 1, 19)) + + _NUM_FRAMES = 20 + + def inject_fake_data(self, tmpdir, config): + base_folder = os.path.join(tmpdir, self.DATASET_CLASS.__name__) + os.makedirs(base_folder, exist_ok=True) + num_samples = 5 + data = np.concatenate( + [ + np.zeros((config["split_ratio"], num_samples, 64, 64)), + np.ones((self._NUM_FRAMES - config["split_ratio"], num_samples, 64, 64)), + ] + ) + np.save(os.path.join(base_folder, "mnist_test_seq.npy"), data) + return num_samples + + @datasets_utils.test_all_configs + def test_split(self, config): + with self.create_dataset(config) as (dataset, _): + if config["split"] == "train": + assert (dataset.data == 0).all() + elif config["split"] == "test": + assert (dataset.data == 1).all() + else: + assert 
dataset.data.size()[1] == self._NUM_FRAMES + + class DatasetFolderTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.DatasetFolder - # The dataset has no fixed return type since it is defined by the loader parameter. For testing, we use a loader - # that simply returns the path as type 'str' instead of loading anything. See the 'dataset_args()' method. - FEATURE_TYPES = (str, int) - - _IMAGE_EXTENSIONS = ("jpg", "png") - _VIDEO_EXTENSIONS = ("avi", "mp4") - _EXTENSIONS = (*_IMAGE_EXTENSIONS, *_VIDEO_EXTENSIONS) + _EXTENSIONS = ("jpg", "png") # DatasetFolder has two mutually exclusive parameters: 'extensions' and 'is_valid_file'. One of both is required. # We only iterate over different 'extensions' here and handle the tests for 'is_valid_file' in the # 'test_is_valid_file()' method. DEFAULT_CONFIG = dict(extensions=_EXTENSIONS) - ADDITIONAL_CONFIGS = ( - *datasets_utils.combinations_grid(extensions=[(ext,) for ext in _IMAGE_EXTENSIONS]), - dict(extensions=_IMAGE_EXTENSIONS), - *datasets_utils.combinations_grid(extensions=[(ext,) for ext in _VIDEO_EXTENSIONS]), - dict(extensions=_VIDEO_EXTENSIONS), - ) + ADDITIONAL_CONFIGS = combinations_grid(extensions=[(ext,) for ext in _EXTENSIONS]) def dataset_args(self, tmpdir, config): - return tmpdir, lambda x: x + return tmpdir, datasets.folder.pil_loader def inject_fake_data(self, tmpdir, config): extensions = config["extensions"] or self._is_valid_file_to_extensions(config["is_valid_file"]) @@ -1528,14 +1602,8 @@ class DatasetFolderTestCase(datasets_utils.ImageDatasetTestCase): if ext not in extensions: continue - create_example_folder = ( - datasets_utils.create_image_folder - if ext in self._IMAGE_EXTENSIONS - else datasets_utils.create_video_folder - ) - num_examples = torch.randint(1, 3, size=()).item() - create_example_folder(tmpdir, cls, lambda idx: self._file_name_fn(cls, ext, idx), num_examples) + datasets_utils.create_image_folder(tmpdir, cls, lambda idx: self._file_name_fn(cls, ext, idx), num_examples) num_examples_total += num_examples classes.append(cls) @@ -1589,7 +1657,7 @@ class ImageFolderTestCase(datasets_utils.ImageDatasetTestCase): class KittiTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Kitti FEATURE_TYPES = (PIL.Image.Image, (list, type(None))) # test split returns None as target - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(train=(True, False)) + ADDITIONAL_CONFIGS = combinations_grid(train=(True, False)) def inject_fake_data(self, tmpdir, config): kitti_dir = os.path.join(tmpdir, "Kitti", "raw") @@ -1621,11 +1689,15 @@ class KittiTestCase(datasets_utils.ImageDatasetTestCase): return split_to_num_examples[config["train"]] + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class SvhnTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SVHN REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test", "extra")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test", "extra")) def inject_fake_data(self, tmpdir, config): import scipy.io as sio @@ -1646,7 +1718,7 @@ class SvhnTestCase(datasets_utils.ImageDatasetTestCase): class Places365TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Places365 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train-standard", "train-challenge", "val"), small=(False, True), ) @@ -1738,7 
+1810,7 @@ class INaturalistTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.INaturalist FEATURE_TYPES = (PIL.Image.Image, (int, tuple)) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( target_type=("kingdom", "full", "genus", ["kingdom", "phylum", "class", "order", "family", "genus", "full"]), version=("2021_train",), ) @@ -1775,7 +1847,7 @@ class INaturalistTestCase(datasets_utils.ImageDatasetTestCase): class LFWPeopleTestCase(datasets_utils.DatasetTestCase): DATASET_CLASS = datasets.LFWPeople FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("10fold", "train", "test"), image_set=("original", "funneled", "deepfunneled") ) _IMAGES_DIR = {"original": "lfw", "funneled": "lfw_funneled", "deepfunneled": "lfw-deepfunneled"} @@ -1851,7 +1923,7 @@ class LFWPairsTestCase(LFWPeopleTestCase): class SintelTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Sintel - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test"), pass_name=("clean", "final", "both")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test"), pass_name=("clean", "final", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) FLOW_H, FLOW_W = 3, 4 @@ -1919,7 +1991,7 @@ class SintelTestCase(datasets_utils.ImageDatasetTestCase): class KittiFlowTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.KittiFlow - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -1979,7 +2051,7 @@ class KittiFlowTestCase(datasets_utils.ImageDatasetTestCase): class FlyingChairsTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FlyingChairs - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) FLOW_H, FLOW_W = 3, 4 @@ -2034,7 +2106,7 @@ class FlyingChairsTestCase(datasets_utils.ImageDatasetTestCase): class FlyingThings3DTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FlyingThings3D - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "test"), pass_name=("clean", "final", "both"), camera=("left", "right", "both") ) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @@ -2171,7 +2243,7 @@ class Food101TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Food101 FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) def inject_fake_data(self, tmpdir: str, config): root_folder = pathlib.Path(tmpdir) / "food-101" @@ -2206,7 +2278,7 @@ class Food101TestCase(datasets_utils.ImageDatasetTestCase): class FGVCAircraftTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FGVCAircraft - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "val", "trainval", "test"), annotation_level=("variant", "family", "manufacturer") ) @@ -2289,7 +2361,7 @@ 
class DTDTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.DTD FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "test", "val"), # There is no need to test the whole matrix here, since each fold is treated exactly the same partition=(1, 5, 10), @@ -2323,7 +2395,7 @@ class DTDTestCase(datasets_utils.ImageDatasetTestCase): class FER2013TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FER2013 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) @@ -2358,7 +2430,7 @@ class GTSRBTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.GTSRB FEATURE_TYPES = (PIL.Image.Image, int) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) def inject_fake_data(self, tmpdir: str, config): root_folder = os.path.join(tmpdir, "gtsrb") @@ -2408,7 +2480,7 @@ class CLEVRClassificationTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.CLEVRClassification FEATURE_TYPES = (PIL.Image.Image, (int, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) def inject_fake_data(self, tmpdir, config): data_folder = pathlib.Path(tmpdir) / "clevr" / "CLEVR_v1.0" @@ -2440,7 +2512,7 @@ class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.OxfordIIITPet FEATURE_TYPES = (PIL.Image.Image, (int, PIL.Image.Image, tuple, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("trainval", "test"), target_types=("category", "segmentation", ["category", "segmentation"], []), ) @@ -2495,11 +2567,15 @@ class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase): breed_id = "-1" return (image_id, class_id, species, breed_id) + def test_transforms_v2_wrapper_spawn(self): + with self.create_dataset() as (dataset, _): + datasets_utils.check_transforms_v2_wrapper_spawn(dataset) + class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.StanfordCars REQUIRED_PACKAGES = ("scipy",) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) def inject_fake_data(self, tmpdir, config): import scipy.io as io @@ -2543,7 +2619,7 @@ class StanfordCarsTestCase(datasets_utils.ImageDatasetTestCase): class Country211TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Country211 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "valid", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "valid", "test")) def inject_fake_data(self, tmpdir: str, config): split_folder = pathlib.Path(tmpdir) / "country211" / config["split"] @@ -2570,7 +2646,7 @@ class Country211TestCase(datasets_utils.ImageDatasetTestCase): class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Flowers102 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) REQUIRED_PACKAGES = ("scipy",) def inject_fake_data(self, tmpdir: str, config): @@ 
-2606,7 +2682,7 @@ class Flowers102TestCase(datasets_utils.ImageDatasetTestCase): class PCAMTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.PCAM - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) REQUIRED_PACKAGES = ("h5py",) def inject_fake_data(self, tmpdir: str, config): @@ -2628,7 +2704,7 @@ class PCAMTestCase(datasets_utils.ImageDatasetTestCase): class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.RenderedSST2 - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "val", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "val", "test")) SPLIT_TO_FOLDER = {"train": "train", "val": "valid", "test": "test"} def inject_fake_data(self, tmpdir: str, config): @@ -2650,7 +2726,7 @@ class RenderedSST2TestCase(datasets_utils.ImageDatasetTestCase): class Kitti2012StereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Kitti2012Stereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -2712,7 +2788,7 @@ class Kitti2012StereoTestCase(datasets_utils.ImageDatasetTestCase): class Kitti2015StereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Kitti2015Stereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -2850,7 +2926,7 @@ class CREStereoTestCase(datasets_utils.ImageDatasetTestCase): class FallingThingsStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.FallingThingsStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(variant=("single", "mixed", "both")) + ADDITIONAL_CONFIGS = combinations_grid(variant=("single", "mixed", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @staticmethod @@ -2924,7 +3000,7 @@ class FallingThingsStereoTestCase(datasets_utils.ImageDatasetTestCase): class SceneFlowStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SceneFlowStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( variant=("FlyingThings3D", "Driving", "Monkaa"), pass_name=("clean", "final", "both") ) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) @@ -3011,7 +3087,7 @@ class SceneFlowStereoTestCase(datasets_utils.ImageDatasetTestCase): class InStereo2k(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.InStereo2k FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None))) - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) @staticmethod def _make_scene_folder(root: str, name: str, size: Tuple[int, int]): @@ -3053,7 +3129,7 @@ class InStereo2k(datasets_utils.ImageDatasetTestCase): class SintelStereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.SintelStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(pass_name=("final", "clean", "both")) + 
ADDITIONAL_CONFIGS = combinations_grid(pass_name=("final", "clean", "both")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) def inject_fake_data(self, tmpdir, config): @@ -3129,7 +3205,7 @@ class SintelStereoTestCase(datasets_utils.ImageDatasetTestCase): class ETH3DStereoestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.ETH3DStereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid(split=("train", "test")) + ADDITIONAL_CONFIGS = combinations_grid(split=("train", "test")) FEATURE_TYPES = (PIL.Image.Image, PIL.Image.Image, (np.ndarray, type(None)), (np.ndarray, type(None))) @staticmethod @@ -3196,7 +3272,7 @@ class ETH3DStereoestCase(datasets_utils.ImageDatasetTestCase): class Middlebury2014StereoTestCase(datasets_utils.ImageDatasetTestCase): DATASET_CLASS = datasets.Middlebury2014Stereo - ADDITIONAL_CONFIGS = datasets_utils.combinations_grid( + ADDITIONAL_CONFIGS = combinations_grid( split=("train", "additional"), calibration=("perfect", "imperfect", "both"), use_ambient_views=(True, False), @@ -3287,5 +3363,47 @@ class Middlebury2014StereoTestCase(datasets_utils.ImageDatasetTestCase): pass +class TestDatasetWrapper: + def test_unknown_type(self): + unknown_object = object() + with pytest.raises( + TypeError, match=re.escape("is meant for subclasses of `torchvision.datasets.VisionDataset`") + ): + datasets.wrap_dataset_for_transforms_v2(unknown_object) + + def test_unknown_dataset(self): + class MyVisionDataset(datasets.VisionDataset): + pass + + dataset = MyVisionDataset("root") + + with pytest.raises(TypeError, match="No wrapper exist"): + datasets.wrap_dataset_for_transforms_v2(dataset) + + def test_missing_wrapper(self): + dataset = datasets.FakeData() + + with pytest.raises(TypeError, match="please open an issue"): + datasets.wrap_dataset_for_transforms_v2(dataset) + + def test_subclass(self, mocker): + from torchvision import tv_tensors + + sentinel = object() + mocker.patch.dict( + tv_tensors._dataset_wrapper.WRAPPER_FACTORIES, + clear=False, + values={datasets.FakeData: lambda dataset, target_keys: lambda idx, sample: sentinel}, + ) + + class MyFakeData(datasets.FakeData): + pass + + dataset = MyFakeData() + wrapped_dataset = datasets.wrap_dataset_for_transforms_v2(dataset) + + assert wrapped_dataset[0] is sentinel + + if __name__ == "__main__": unittest.main() diff --git a/test/test_datasets_download.py b/test/test_datasets_download.py index b44d954241baad190af69bd20f38c863171d9089..e99017d8b5589360ca2c60283b09fb54e4509f8a 100644 --- a/test/test_datasets_download.py +++ b/test/test_datasets_download.py @@ -2,6 +2,7 @@ import contextlib import itertools import tempfile import time +import traceback import unittest.mock import warnings from datetime import datetime @@ -13,13 +14,7 @@ from urllib.request import Request, urlopen import pytest from torchvision import datasets -from torchvision.datasets.utils import ( - _get_redirect_url, - check_integrity, - download_file_from_google_drive, - download_url, - USER_AGENT, -) +from torchvision.datasets.utils import _get_redirect_url, USER_AGENT def limit_requests_per_time(min_secs_between_requests=2.0): @@ -83,63 +78,65 @@ urlopen = resolve_redirects()(urlopen) @contextlib.contextmanager def log_download_attempts( - urls_and_md5s=None, - file="utils", - patch=True, - mock_auxiliaries=None, + urls, + *, + dataset_module, ): - def add_mock(stack, name, file, **kwargs): + def maybe_add_mock(*, module, name, stack, lst=None): + patcher = 
unittest.mock.patch(f"torchvision.datasets.{module}.{name}") + try: - return stack.enter_context(unittest.mock.patch(f"torchvision.datasets.{file}.{name}", **kwargs)) - except AttributeError as error: - if file != "utils": - return add_mock(stack, name, "utils", **kwargs) - else: - raise pytest.UsageError from error - - if urls_and_md5s is None: - urls_and_md5s = set() - if mock_auxiliaries is None: - mock_auxiliaries = patch + mock = stack.enter_context(patcher) + except AttributeError: + return - with contextlib.ExitStack() as stack: - url_mock = add_mock(stack, "download_url", file, wraps=None if patch else download_url) - google_drive_mock = add_mock( - stack, "download_file_from_google_drive", file, wraps=None if patch else download_file_from_google_drive - ) + if lst is not None: + lst.append(mock) - if mock_auxiliaries: - add_mock(stack, "extract_archive", file) + with contextlib.ExitStack() as stack: + download_url_mocks = [] + download_file_from_google_drive_mocks = [] + for module in [dataset_module, "utils"]: + maybe_add_mock(module=module, name="download_url", stack=stack, lst=download_url_mocks) + maybe_add_mock( + module=module, + name="download_file_from_google_drive", + stack=stack, + lst=download_file_from_google_drive_mocks, + ) + maybe_add_mock(module=module, name="extract_archive", stack=stack) try: - yield urls_and_md5s + yield finally: - for args, kwargs in url_mock.call_args_list: - url = args[0] - md5 = args[-1] if len(args) == 4 else kwargs.get("md5") - urls_and_md5s.add((url, md5)) + for download_url_mock in download_url_mocks: + for args, kwargs in download_url_mock.call_args_list: + urls.append(args[0] if args else kwargs["url"]) - for args, kwargs in google_drive_mock.call_args_list: - id = args[0] - url = f"https://drive.google.com/file/d/{id}" - md5 = args[3] if len(args) == 4 else kwargs.get("md5") - urls_and_md5s.add((url, md5)) + for download_file_from_google_drive_mock in download_file_from_google_drive_mocks: + for args, kwargs in download_file_from_google_drive_mock.call_args_list: + file_id = args[0] if args else kwargs["file_id"] + urls.append(f"https://drive.google.com/file/d/{file_id}") def retry(fn, times=1, wait=5.0): - msgs = [] + tbs = [] for _ in range(times + 1): try: return fn() except AssertionError as error: - msgs.append(str(error)) + tbs.append("".join(traceback.format_exception(type(error), error, error.__traceback__))) time.sleep(wait) else: raise AssertionError( "\n".join( ( - f"Assertion failed {times + 1} times with {wait:.1f} seconds intermediate wait time.\n", - *(f"{idx}: {error}" for idx, error in enumerate(msgs, 1)), + "\n", + *[f"{'_' * 40} {idx:2d} {'_' * 40}\n\n{tb}" for idx, tb in enumerate(tbs, 1)], + ( + f"Assertion failed {times + 1} times with {wait:.1f} seconds intermediate wait time. " + f"You can find the full tracebacks above." + ), ) ) ) @@ -149,10 +146,12 @@ def retry(fn, times=1, wait=5.0): def assert_server_response_ok(): try: yield - except URLError as error: - raise AssertionError("The request timed out.") from error except HTTPError as error: raise AssertionError(f"The server returned {error.code}: {error.reason}.") from error + except URLError as error: + raise AssertionError( + "Connection not possible due to SSL." if "SSL" in str(error) else "The request timed out."
+ ) from error except RecursionError as error: raise AssertionError(str(error)) from error @@ -163,45 +162,14 @@ def assert_url_is_accessible(url, timeout=5.0): urlopen(request, timeout=timeout) -def assert_file_downloads_correctly(url, md5, tmpdir, timeout=5.0): - file = path.join(tmpdir, path.basename(url)) - with assert_server_response_ok(): - with open(file, "wb") as fh: - request = Request(url, headers={"User-Agent": USER_AGENT}) - response = urlopen(request, timeout=timeout) - fh.write(response.read()) - - assert check_integrity(file, md5=md5), "The MD5 checksums mismatch" - - -class DownloadConfig: - def __init__(self, url, md5=None, id=None): - self.url = url - self.md5 = md5 - self.id = id or url +def collect_urls(dataset_cls, *args, **kwargs): + urls = [] + with contextlib.suppress(Exception), log_download_attempts( + urls, dataset_module=dataset_cls.__module__.split(".")[-1] + ): + dataset_cls(*args, **kwargs) - def __repr__(self) -> str: - return self.id - - -def make_download_configs(urls_and_md5s, name=None): - return [ - DownloadConfig(url, md5=md5, id=f"{name}, {url}" if name is not None else None) for url, md5 in urls_and_md5s - ] - - -def collect_download_configs(dataset_loader, name=None, **kwargs): - urls_and_md5s = set() - try: - with log_download_attempts(urls_and_md5s=urls_and_md5s, **kwargs): - dataset = dataset_loader() - except Exception: - dataset = None - - if name is None and dataset is not None: - name = type(dataset).__name__ - - return make_download_configs(urls_and_md5s, name) + return [(url, f"{dataset_cls.__name__}, {url}") for url in urls] # This is a workaround since fixtures, such as the built-in tmp_dir, can only be used within a test but not within a @@ -216,12 +184,14 @@ def root(): def places365(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.Places365(ROOT, split=split, small=small, download=True), - name=f"Places365, {split}, {'small' if small else 'large'}", - file="places365", + return itertools.chain.from_iterable( + [ + collect_urls( + datasets.Places365, + ROOT, + split=split, + small=small, + download=True, ) for split, small in itertools.product(("train-standard", "train-challenge", "val"), (False, True)) ] @@ -229,30 +199,26 @@ def places365(): def caltech101(): - return collect_download_configs(lambda: datasets.Caltech101(ROOT, download=True), name="Caltech101") + return collect_urls(datasets.Caltech101, ROOT, download=True) def caltech256(): - return collect_download_configs(lambda: datasets.Caltech256(ROOT, download=True), name="Caltech256") + return collect_urls(datasets.Caltech256, ROOT, download=True) def cifar10(): - return collect_download_configs(lambda: datasets.CIFAR10(ROOT, download=True), name="CIFAR10") + return collect_urls(datasets.CIFAR10, ROOT, download=True) def cifar100(): - return collect_download_configs(lambda: datasets.CIFAR100(ROOT, download=True), name="CIFAR100") + return collect_urls(datasets.CIFAR100, ROOT, download=True) def voc(): # TODO: Also test the "2007-test" key - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.VOCSegmentation(ROOT, year=year, download=True), - name=f"VOC, {year}", - file="voc", - ) + return itertools.chain.from_iterable( + [ + collect_urls(datasets.VOCSegmentation, ROOT, year=year, download=True) for year in ("2007", "2008", "2009", "2010", "2011", "2012") ] ) @@ -260,55 +226,42 @@ def voc(): def mnist(): with unittest.mock.patch.object(datasets.MNIST, "mirrors", datasets.MNIST.mirrors[-1:]): - return 
collect_download_configs(lambda: datasets.MNIST(ROOT, download=True), name="MNIST") + return collect_urls(datasets.MNIST, ROOT, download=True) def fashion_mnist(): - return collect_download_configs(lambda: datasets.FashionMNIST(ROOT, download=True), name="FashionMNIST") + return collect_urls(datasets.FashionMNIST, ROOT, download=True) def kmnist(): - return collect_download_configs(lambda: datasets.KMNIST(ROOT, download=True), name="KMNIST") + return collect_urls(datasets.KMNIST, ROOT, download=True) def emnist(): # the 'split' argument can be any valid one, since everything is downloaded anyway - return collect_download_configs(lambda: datasets.EMNIST(ROOT, split="byclass", download=True), name="EMNIST") + return collect_urls(datasets.EMNIST, ROOT, split="byclass", download=True) def qmnist(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.QMNIST(ROOT, what=what, download=True), - name=f"QMNIST, {what}", - file="mnist", - ) - for what in ("train", "test", "nist") - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.QMNIST, ROOT, what=what, download=True) for what in ("train", "test", "nist")] ) +def moving_mnist(): + return collect_urls(datasets.MovingMNIST, ROOT, download=True) + + def omniglot(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.Omniglot(ROOT, background=background, download=True), - name=f"Omniglot, {'background' if background else 'evaluation'}", - ) - for background in (True, False) - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.Omniglot, ROOT, background=background, download=True) for background in (True, False)] ) def phototour(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.PhotoTour(ROOT, name=name, download=True), - name=f"PhotoTour, {name}", - file="phototour", - ) + return itertools.chain.from_iterable( + [ + collect_urls(datasets.PhotoTour, ROOT, name=name, download=True) # The names postfixed with '_harris' point to the domain 'matthewalunbrown.com'. For some reason all # requests timeout from within CI. They are disabled until this is resolved. 
for name in ("notredame", "yosemite", "liberty") # "notredame_harris", "yosemite_harris", "liberty_harris" @@ -317,91 +270,51 @@ def phototour(): def sbdataset(): - return collect_download_configs( - lambda: datasets.SBDataset(ROOT, download=True), - name="SBDataset", - file="voc", - ) + return collect_urls(datasets.SBDataset, ROOT, download=True) def sbu(): - return collect_download_configs( - lambda: datasets.SBU(ROOT, download=True), - name="SBU", - file="sbu", - ) + return collect_urls(datasets.SBU, ROOT, download=True) def semeion(): - return collect_download_configs( - lambda: datasets.SEMEION(ROOT, download=True), - name="SEMEION", - file="semeion", - ) + return collect_urls(datasets.SEMEION, ROOT, download=True) def stl10(): - return collect_download_configs( - lambda: datasets.STL10(ROOT, download=True), - name="STL10", - ) + return collect_urls(datasets.STL10, ROOT, download=True) def svhn(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.SVHN(ROOT, split=split, download=True), - name=f"SVHN, {split}", - file="svhn", - ) - for split in ("train", "test", "extra") - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.SVHN, ROOT, split=split, download=True) for split in ("train", "test", "extra")] ) def usps(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.USPS(ROOT, train=train, download=True), - name=f"USPS, {'train' if train else 'test'}", - file="usps", - ) - for train in (True, False) - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.USPS, ROOT, train=train, download=True) for train in (True, False)] ) def celeba(): - return collect_download_configs( - lambda: datasets.CelebA(ROOT, download=True), - name="CelebA", - file="celeba", - ) + return collect_urls(datasets.CelebA, ROOT, download=True) def widerface(): - return collect_download_configs( - lambda: datasets.WIDERFace(ROOT, download=True), - name="WIDERFace", - file="widerface", - ) + return collect_urls(datasets.WIDERFace, ROOT, download=True) def kinetics(): - return itertools.chain( - *[ - collect_download_configs( - lambda: datasets.Kinetics( - path.join(ROOT, f"Kinetics{num_classes}"), - frames_per_clip=1, - num_classes=num_classes, - split=split, - download=True, - ), - name=f"Kinetics, {num_classes}, {split}", - file="kinetics", + return itertools.chain.from_iterable( + [ + collect_urls( + datasets.Kinetics, + path.join(ROOT, f"Kinetics{num_classes}"), + frames_per_clip=1, + num_classes=num_classes, + split=split, + download=True, ) for num_classes, split in itertools.product(("400", "600", "700"), ("train", "val")) ] @@ -409,58 +322,55 @@ def kinetics(): def kitti(): - return itertools.chain( - *[ - collect_download_configs( - lambda train=train: datasets.Kitti(ROOT, train=train, download=True), - name=f"Kitti, {'train' if train else 'test'}", - file="kitti", - ) - for train in (True, False) - ] + return itertools.chain.from_iterable( + [collect_urls(datasets.Kitti, ROOT, train=train, download=True) for train in (True, False)] ) -def make_parametrize_kwargs(download_configs): - argvalues = [] - ids = [] - for config in download_configs: - argvalues.append((config.url, config.md5)) - ids.append(config.id) - - return dict(argnames=("url", "md5"), argvalues=argvalues, ids=ids) - - -@pytest.mark.parametrize( - **make_parametrize_kwargs( - itertools.chain( - caltech101(), - caltech256(), - cifar10(), - cifar100(), - # The VOC download server is unstable. See https://github.com/pytorch/vision/issues/2953 for details. 
- # voc(), - mnist(), - fashion_mnist(), - kmnist(), - emnist(), - qmnist(), - omniglot(), - phototour(), - sbdataset(), - sbu(), - semeion(), - stl10(), - svhn(), - usps(), - celeba(), - widerface(), - kinetics(), - kitti(), - ) +def stanford_cars(): + return itertools.chain.from_iterable( + [collect_urls(datasets.StanfordCars, ROOT, split=split, download=True) for split in ["train", "test"]] + ) + + +def url_parametrization(*dataset_urls_and_ids_fns): + return pytest.mark.parametrize( + "url", + [ + pytest.param(url, id=id) + for dataset_urls_and_ids_fn in dataset_urls_and_ids_fns + for url, id in sorted(set(dataset_urls_and_ids_fn())) + ], ) + + +@url_parametrization( + caltech101, + caltech256, + cifar10, + cifar100, + # The VOC download server is unstable. See https://github.com/pytorch/vision/issues/2953 for details. + # voc, + mnist, + fashion_mnist, + kmnist, + emnist, + qmnist, + omniglot, + phototour, + sbdataset, + semeion, + stl10, + svhn, + usps, + celeba, + widerface, + kinetics, + kitti, + places365, + sbu, ) -def test_url_is_accessible(url, md5): +def test_url_is_accessible(url): """ If you see this test failing, find the offending dataset in the parametrization and move it to ``test_url_is_not_accessible`` and link an issue detailing the problem. @@ -468,15 +378,11 @@ def test_url_is_accessible(url, md5): retry(lambda: assert_url_is_accessible(url)) -@pytest.mark.parametrize( - **make_parametrize_kwargs( - itertools.chain( - places365(), # https://github.com/pytorch/vision/issues/6268 - ) - ) +@url_parametrization( + stanford_cars, # https://github.com/pytorch/vision/issues/7545 ) @pytest.mark.xfail -def test_url_is_not_accessible(url, md5): +def test_url_is_not_accessible(url): """ As the name implies, this test is the 'inverse' of ``test_url_is_accessible``. Since the download servers are beyond our control, some files might not be accessible for longer stretches of time. Still, we want to know if they @@ -486,8 +392,3 @@ def test_url_is_not_accessible(url, md5): ``test_url_is_accessible``. 
""" retry(lambda: assert_url_is_accessible(url)) - - -@pytest.mark.parametrize(**make_parametrize_kwargs(itertools.chain())) -def test_file_downloads_correctly(url, md5): - retry(lambda: assert_file_downloads_correctly(url, md5)) diff --git a/test/test_datasets_utils.py b/test/test_datasets_utils.py index ec68fd72a5be464c0b3fb7d2b1ef1e48e98f66fc..4e30dfab2cc6e99424da8f66d87ceccfa1acbbd4 100644 --- a/test/test_datasets_utils.py +++ b/test/test_datasets_utils.py @@ -7,7 +7,9 @@ import tarfile import zipfile import pytest +import torch import torchvision.datasets.utils as utils +from common_utils import assert_equal from torch._utils_internal import get_file_path_2 from torchvision.datasets.folder import make_dataset from torchvision.datasets.utils import _COMPRESSED_FILE_OPENERS @@ -215,6 +217,24 @@ class TestDatasetsUtils: pytest.raises(ValueError, utils.verify_str_arg, 0, ("a",), "arg") pytest.raises(ValueError, utils.verify_str_arg, "b", ("a",), "arg") + @pytest.mark.parametrize( + ("dtype", "actual_hex", "expected_hex"), + [ + (torch.uint8, "01 23 45 67 89 AB CD EF", "01 23 45 67 89 AB CD EF"), + (torch.float16, "01 23 45 67 89 AB CD EF", "23 01 67 45 AB 89 EF CD"), + (torch.int32, "01 23 45 67 89 AB CD EF", "67 45 23 01 EF CD AB 89"), + (torch.float64, "01 23 45 67 89 AB CD EF", "EF CD AB 89 67 45 23 01"), + ], + ) + def test_flip_byte_order(self, dtype, actual_hex, expected_hex): + def to_tensor(hex): + return torch.frombuffer(bytes.fromhex(hex), dtype=dtype) + + assert_equal( + utils._flip_byte_order(to_tensor(actual_hex)), + to_tensor(expected_hex), + ) + @pytest.mark.parametrize( ("kwargs", "expected_error_msg"), diff --git a/test/test_extended_models.py b/test/test_extended_models.py index c467564c9c4a46af9e546430770f3ff38d2d511f..0c918c0afd1a64c3762e6d816b991d5a4f726f88 100644 --- a/test/test_extended_models.py +++ b/test/test_extended_models.py @@ -1,12 +1,15 @@ +import copy import os +import pickle import pytest import test_models as TM import torch +from common_extended_utils import get_file_size_mb, get_ops from torchvision import models -from torchvision.models._api import get_model_weights, Weights, WeightsEnum +from torchvision.models import get_model_weights, Weights, WeightsEnum from torchvision.models._utils import handle_legacy_interface - +from torchvision.models.detection.backbone_utils import mobilenet_backbone, resnet_fpn_backbone run_if_test_with_extended = pytest.mark.skipif( os.getenv("PYTORCH_TEST_WITH_EXTENDED", "0") != "1", @@ -59,17 +62,59 @@ def test_get_model_weights(name, weight): assert models.get_model_weights(name) == weight +@pytest.mark.parametrize("copy_fn", [copy.copy, copy.deepcopy]) +@pytest.mark.parametrize( + "name", + [ + "resnet50", + "retinanet_resnet50_fpn_v2", + "raft_large", + "quantized_resnet50", + "lraspp_mobilenet_v3_large", + "mvit_v1_b", + ], +) +def test_weights_copyable(copy_fn, name): + for weights in list(models.get_model_weights(name)): + # It is somewhat surprising that (deep-)copying is an identity operation here, but this is the default behavior + # of enums: https://docs.python.org/3/howto/enum.html#enum-members-aka-instances + # Checking for equality, i.e. `==`, is sufficient (and even preferable) for our use case, should we need to drop + # support for the identity operation in the future. 
+ assert copy_fn(weights) is weights + + +@pytest.mark.parametrize( + "name", + [ + "resnet50", + "retinanet_resnet50_fpn_v2", + "raft_large", + "quantized_resnet50", + "lraspp_mobilenet_v3_large", + "mvit_v1_b", + ], +) +def test_weights_deserializable(name): + for weights in list(models.get_model_weights(name)): + # It is somewhat surprising that deserialization is an identity operation here, but this is the default behavior + # of enums: https://docs.python.org/3/howto/enum.html#enum-members-aka-instances + # Checking for equality, i.e. `==`, is sufficient (and even preferable) for our use case, should we need to drop + # support for the identity operation in the future. + assert pickle.loads(pickle.dumps(weights)) is weights + + +def get_models_from_module(module): + return [ + v.__name__ + for k, v in module.__dict__.items() + if callable(v) and k[0].islower() and k[0] != "_" and k not in models._api.__all__ + ] + + @pytest.mark.parametrize( "module", [models, models.detection, models.quantization, models.segmentation, models.video, models.optical_flow] ) def test_list_models(module): - def get_models_from_module(module): - return [ - v.__name__ - for k, v in module.__dict__.items() - if callable(v) and k[0].islower() and k[0] != "_" and k not in models._api.__all__ - ] - a = set(get_models_from_module(module)) b = set(x.replace("quantized_", "") for x in models.list_models(module)) @@ -77,6 +122,65 @@ def test_list_models(module): assert a == b +@pytest.mark.parametrize( + "include_filters", + [ + None, + [], + (), + "", + "*resnet*", + ["*alexnet*"], + "*not-existing-model-for-test?", + ["*resnet*", "*alexnet*"], + ["*resnet*", "*alexnet*", "*not-existing-model-for-test?"], + ("*resnet*", "*alexnet*"), + set(["*resnet*", "*alexnet*"]), + ], +) +@pytest.mark.parametrize( + "exclude_filters", + [ + None, + [], + (), + "", + "*resnet*", + ["*alexnet*"], + ["*not-existing-model-for-test?"], + ["resnet34", "*not-existing-model-for-test?"], + ["resnet34", "*resnet1*"], + ("resnet34", "*resnet1*"), + set(["resnet34", "*resnet1*"]), + ], +) +def test_list_models_filters(include_filters, exclude_filters): + actual = set(models.list_models(models, include=include_filters, exclude=exclude_filters)) + classification_models = set(get_models_from_module(models)) + + if isinstance(include_filters, str): + include_filters = [include_filters] + if isinstance(exclude_filters, str): + exclude_filters = [exclude_filters] + + if include_filters: + expected = set() + for include_f in include_filters: + include_f = include_f.strip("*?") + expected = expected | set(x for x in classification_models if include_f in x) + else: + expected = classification_models + + if exclude_filters: + for exclude_f in exclude_filters: + exclude_f = exclude_f.strip("*?") + if exclude_f != "": + a_exclude = set(x for x in classification_models if exclude_f in x) + expected = expected - a_exclude + + assert expected == actual + + @pytest.mark.parametrize( "name, weight", [ @@ -111,6 +215,22 @@ def test_naming_conventions(model_fn): assert len(weights_enum) == 0 or hasattr(weights_enum, "DEFAULT") +detection_models_input_dims = { + "fasterrcnn_mobilenet_v3_large_320_fpn": (320, 320), + "fasterrcnn_mobilenet_v3_large_fpn": (800, 800), + "fasterrcnn_resnet50_fpn": (800, 800), + "fasterrcnn_resnet50_fpn_v2": (800, 800), + "fcos_resnet50_fpn": (800, 800), + "keypointrcnn_resnet50_fpn": (1333, 1333), + "maskrcnn_resnet50_fpn": (800, 800), + "maskrcnn_resnet50_fpn_v2": (800, 800), + "retinanet_resnet50_fpn": (800, 800), + 
"retinanet_resnet50_fpn_v2": (800, 800), + "ssd300_vgg16": (300, 300), + "ssdlite320_mobilenet_v3_large": (320, 320), +} + + @pytest.mark.parametrize( "model_fn", TM.list_model_fns(models) @@ -122,6 +242,9 @@ def test_naming_conventions(model_fn): ) @run_if_test_with_extended def test_schema_meta_validation(model_fn): + if model_fn.__name__ == "maskrcnn_resnet50_fpn_v2": + pytest.skip(reason="FIXME https://github.com/pytorch/vision/issues/7349") + # list of all possible supported high-level fields for weights meta-data permitted_fields = { "backend", @@ -135,11 +258,13 @@ def test_schema_meta_validation(model_fn): "recipe", "unquantized", "_docs", + "_ops", + "_file_size", } # mandatory fields for each computer vision task classification_fields = {"categories", ("_metrics", "ImageNet-1K", "acc@1"), ("_metrics", "ImageNet-1K", "acc@5")} defaults = { - "all": {"_metrics", "min_size", "num_params", "recipe", "_docs"}, + "all": {"_metrics", "min_size", "num_params", "recipe", "_docs", "_file_size", "_ops"}, "models": classification_fields, "detection": {"categories", ("_metrics", "COCO-val2017", "box_map")}, "quantization": classification_fields | {"backend", "unquantized"}, @@ -160,7 +285,7 @@ def test_schema_meta_validation(model_fn): pytest.skip(f"Model '{model_name}' doesn't have any pre-trained weights.") problematic_weights = {} - incorrect_params = [] + incorrect_meta = [] bad_names = [] for w in weights_enum: actual_fields = set(w.meta.keys()) @@ -173,24 +298,47 @@ def test_schema_meta_validation(model_fn): unsupported_fields = set(w.meta.keys()) - permitted_fields if missing_fields or unsupported_fields: problematic_weights[w] = {"missing": missing_fields, "unsupported": unsupported_fields} - if w == weights_enum.DEFAULT: + + if w == weights_enum.DEFAULT or any(w.meta[k] != weights_enum.DEFAULT.meta[k] for k in ["num_params", "_ops"]): if module_name == "quantization": # parameters() count doesn't work well with quantization, so we check against the non-quantized unquantized_w = w.meta.get("unquantized") - if unquantized_w is not None and w.meta.get("num_params") != unquantized_w.meta.get("num_params"): - incorrect_params.append(w) + if unquantized_w is not None: + if w.meta.get("num_params") != unquantized_w.meta.get("num_params"): + incorrect_meta.append((w, "num_params")) + + # the methodology for quantized ops count doesn't work as well, so we take unquantized FLOPs + # instead + if w.meta["_ops"] != unquantized_w.meta.get("_ops"): + incorrect_meta.append((w, "_ops")) + else: - if w.meta.get("num_params") != sum(p.numel() for p in model_fn(weights=w).parameters()): - incorrect_params.append(w) - else: - if w.meta.get("num_params") != weights_enum.DEFAULT.meta.get("num_params"): - if w.meta.get("num_params") != sum(p.numel() for p in model_fn(weights=w).parameters()): - incorrect_params.append(w) + # loading the model and using it for parameter and ops verification + model = model_fn(weights=w) + + if w.meta.get("num_params") != sum(p.numel() for p in model.parameters()): + incorrect_meta.append((w, "num_params")) + + kwargs = {} + if model_name in detection_models_input_dims: + # detection models have non default height and width + height, width = detection_models_input_dims[model_name] + kwargs = {"height": height, "width": width} + + if not model_fn.__name__.startswith("vit"): + # FIXME: https://github.com/pytorch/vision/issues/7871 + calculated_ops = get_ops(model=model, weight=w, **kwargs) + if calculated_ops != w.meta["_ops"]: + incorrect_meta.append((w, "_ops")) + if not 
w.name.isupper(): bad_names.append(w) + if get_file_size_mb(w) != w.meta.get("_file_size"): + incorrect_meta.append((w, "_file_size")) + assert not problematic_weights - assert not incorrect_params + assert not incorrect_meta assert not bad_names @@ -343,7 +491,11 @@ class TestHandleLegacyInterface: + TM.list_model_fns(models.quantization) + TM.list_model_fns(models.segmentation) + TM.list_model_fns(models.video) - + TM.list_model_fns(models.optical_flow), + + TM.list_model_fns(models.optical_flow) + + [ + lambda pretrained: resnet_fpn_backbone(backbone_name="resnet50", pretrained=pretrained), + lambda pretrained: mobilenet_backbone(backbone_name="mobilenet_v2", fpn=False, pretrained=pretrained), + ], ) @run_if_test_with_extended def test_pretrained_deprecation(self, model_fn): diff --git a/test/test_functional_tensor.py b/test/test_functional_tensor.py index 9bdd4ab83a54e377da116bad73e8b1ee753d67e8..fb3f5744e54875561ec2981de1a35f1c792931ad 100644 --- a/test/test_functional_tensor.py +++ b/test/test_functional_tensor.py @@ -2,17 +2,18 @@ import colorsys import itertools import math import os -import re +import warnings from functools import partial from typing import Sequence import numpy as np +import PIL.Image import pytest import torch import torchvision.transforms as T +import torchvision.transforms._functional_pil as F_pil +import torchvision.transforms._functional_tensor as F_t import torchvision.transforms.functional as F -import torchvision.transforms.functional_pil as F_pil -import torchvision.transforms.functional_tensor as F_t from common_utils import ( _assert_approx_equal_tensor_to_pil, _assert_equal_tensor_to_pil, @@ -20,15 +21,20 @@ from common_utils import ( _create_data_batch, _test_fn_on_batch, assert_equal, - cpu_and_gpu, + cpu_and_cuda, needs_cuda, ) from torchvision.transforms import InterpolationMode -NEAREST, BILINEAR, BICUBIC = InterpolationMode.NEAREST, InterpolationMode.BILINEAR, InterpolationMode.BICUBIC +NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC = ( + InterpolationMode.NEAREST, + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR, + InterpolationMode.BICUBIC, +) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("fn", [F.get_image_size, F.get_image_num_channels, F.get_dimensions]) def test_image_sizes(device, fn): script_F = torch.jit.script(fn) @@ -66,7 +72,7 @@ class TestRotate: scripted_rotate = torch.jit.script(F.rotate) IMG_W = 26 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(7, 33), (26, IMG_W), (32, IMG_W)]) @pytest.mark.parametrize( "center", @@ -125,7 +131,7 @@ class TestRotate: f"{out_pil_tensor[0, :7, :7]}" ) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", ALL_DTYPES) def test_rotate_batch(self, device, dt): if dt == torch.float16 and device == "cpu": @@ -141,17 +147,9 @@ class TestRotate: def test_rotate_interpolation_type(self): tensor, _ = _create_data(26, 26) - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - res1 = F.rotate(tensor, 45, interpolation=2) - res2 = F.rotate(tensor, 45, interpolation=BILINEAR) - assert_equal(res1, res2) + res1 = F.rotate(tensor, 45, interpolation=PIL.Image.BILINEAR) + res2 = F.rotate(tensor, 45, interpolation=BILINEAR) + assert_equal(res1, res2) class TestAffine: @@ -159,7 +157,7 @@ class TestAffine: ALL_DTYPES = [None, torch.float32, torch.float64, torch.float16] scripted_affine = torch.jit.script(F.affine) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26), (32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) def test_identity_map(self, device, height, width, dt): @@ -182,7 +180,7 @@ class TestAffine: ) assert_equal(tensor, out_tensor, msg=f"{out_tensor[0, :5, :5]} vs {tensor[0, :5, :5]}") - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize( @@ -226,7 +224,7 @@ class TestAffine: # Tolerance : less than 6% of different pixels assert ratio_diff_pixels < 0.06 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize("angle", [90, 45, 15, -30, -60, -120]) @@ -260,7 +258,7 @@ class TestAffine: # Tolerance : less than 3% of different pixels assert ratio_diff_pixels < 0.03 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26), (32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize("t", [[10, 12], (-12, -13)]) @@ -285,7 +283,7 @@ class TestAffine: _assert_equal_tensor_to_pil(out_tensor, out_pil_img) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("height, width", [(26, 26), (32, 26)]) @pytest.mark.parametrize("dt", ALL_DTYPES) @pytest.mark.parametrize( @@ -295,24 +293,8 @@ class TestAffine: (33, (5, -4), 1.0, [0.0, 0.0], [0, 0, 0]), (45, [-5, 4], 1.2, [0.0, 0.0], (1, 2, 3)), (33, (-4, -8), 2.0, [0.0, 0.0], [255, 255, 255]), - ( - 85, - (10, -10), - 0.7, - [0.0, 0.0], - [ - 1, - ], - ), - ( - 0, - [0, 0], - 1.0, - [ - 35.0, - ], - (2.0,), - ), + (85, (10, -10), 0.7, [0.0, 0.0], [1]), + (0, [0, 0], 1.0, [35.0], (2.0,)), (-25, [0, 0], 1.2, [0.0, 15.0], None), (-45, [-10, 0], 0.7, [2.0, 5.0], None), (-45, [-10, -10], 1.2, [4.0, 5.0], None), @@ -346,7 +328,7 @@ class TestAffine: tol = 0.06 if device == "cuda" else 0.05 assert ratio_diff_pixels < tol - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", ALL_DTYPES) def test_batches(self, device, dt): if dt == torch.float16 and device == "cpu": @@ -359,21 +341,13 @@ class TestAffine: _test_fn_on_batch(batch_tensors, F.affine, angle=-43, translate=[-3, 4], scale=1.2, shear=[4.0, 5.0]) - @pytest.mark.parametrize("device", cpu_and_gpu()) - def test_warnings(self, device): + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_interpolation_type(self, device): tensor, pil_img = _create_data(26, 26, device=device) - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed 
in 0.15. " - "Please use InterpolationMode enum." - ), - ): - res1 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=2) - res2 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=BILINEAR) - assert_equal(res1, res2) + res1 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=PIL.Image.BILINEAR) + res2 = F.affine(tensor, 45, translate=[0, 0], scale=1.0, shear=[0.0, 0.0], interpolation=BILINEAR) + assert_equal(res1, res2) def _get_data_dims_and_points_for_perspective(): @@ -399,22 +373,10 @@ def _get_data_dims_and_points_for_perspective(): return dims_and_points -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dims_and_points", _get_data_dims_and_points_for_perspective()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize( - "fill", - ( - None, - [0, 0, 0], - [1, 2, 3], - [255, 255, 255], - [ - 1, - ], - (2.0,), - ), -) +@pytest.mark.parametrize("fill", (None, [0, 0, 0], [1, 2, 3], [255, 255, 255], [1], (2.0,))) @pytest.mark.parametrize("fn", [F.perspective, torch.jit.script(F.perspective)]) def test_perspective_pil_vs_tensor(device, dims_and_points, dt, fill, fn): @@ -445,7 +407,7 @@ def test_perspective_pil_vs_tensor(device, dims_and_points, dt, fill, fn): assert ratio_diff_pixels < 0.05 -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dims_and_points", _get_data_dims_and_points_for_perspective()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) def test_perspective_batch(device, dims_and_points, dt): @@ -473,40 +435,21 @@ def test_perspective_batch(device, dims_and_points, dt): ) -def test_perspective_interpolation_warning(): - # assert changed type warning +def test_perspective_interpolation_type(): spoints = [[0, 0], [33, 0], [33, 25], [0, 25]] epoints = [[3, 2], [32, 3], [30, 24], [2, 25]] tensor = torch.randint(0, 256, (3, 26, 26)) - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - res1 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=2) - res2 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=BILINEAR) - assert_equal(res1, res2) + + res1 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=PIL.Image.BILINEAR) + res2 = F.perspective(tensor, startpoints=spoints, endpoints=epoints, interpolation=BILINEAR) + assert_equal(res1, res2) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize( - "size", - [ - 32, - 26, - [ - 32, - ], - [32, 32], - (32, 32), - [26, 35], - ], -) +@pytest.mark.parametrize("size", [32, 26, [32], [32, 32], (32, 32), [26, 35]]) @pytest.mark.parametrize("max_size", [None, 34, 40, 1000]) -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST]) +@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST, NEAREST_EXACT]) def test_resize(device, dt, size, max_size, interpolation): if dt == torch.float16 and device == "cpu": @@ -526,14 +469,12 @@ def test_resize(device, dt, size, max_size, interpolation): tensor = tensor.to(dt) batch_tensors = batch_tensors.to(dt) - resized_tensor = F.resize(tensor, size=size, interpolation=interpolation, max_size=max_size) - resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation, max_size=max_size) + resized_tensor = F.resize(tensor, size=size, interpolation=interpolation, max_size=max_size, antialias=True) + resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation, max_size=max_size, antialias=True) assert resized_tensor.size()[1:] == resized_pil_img.size[::-1] - if interpolation not in [ - NEAREST, - ]: + if interpolation != NEAREST: # We can not check values if mode = NEAREST, as results are different # E.g. resized_tensor = [[a, a, b, c, d, d, e, ...]] # E.g. resized_pil_img = [[a, b, c, c, d, e, f, ...]] @@ -543,36 +484,27 @@ def test_resize(device, dt, size, max_size, interpolation): resized_tensor_f = resized_tensor_f.to(torch.float) # Pay attention to high tolerance for MAE - _assert_approx_equal_tensor_to_pil(resized_tensor_f, resized_pil_img, tol=8.0) + _assert_approx_equal_tensor_to_pil(resized_tensor_f, resized_pil_img, tol=3.0) if isinstance(size, int): - script_size = [ - size, - ] + script_size = [size] else: script_size = size - resize_result = script_fn(tensor, size=script_size, interpolation=interpolation, max_size=max_size) + resize_result = script_fn(tensor, size=script_size, interpolation=interpolation, max_size=max_size, antialias=True) assert_equal(resized_tensor, resize_result) - _test_fn_on_batch(batch_tensors, F.resize, size=script_size, interpolation=interpolation, max_size=max_size) + _test_fn_on_batch( + batch_tensors, F.resize, size=script_size, interpolation=interpolation, max_size=max_size, antialias=True + ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_resize_asserts(device): tensor, pil_img = _create_data(26, 36, device=device) - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - res1 = F.resize(tensor, size=32, interpolation=2) - + res1 = F.resize(tensor, size=32, interpolation=PIL.Image.BILINEAR) res2 = F.resize(tensor, size=32, interpolation=BILINEAR) assert_equal(res1, res2) @@ -584,7 +516,7 @@ def test_resize_asserts(device): F.resize(img, size=32, max_size=32) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("size", [[96, 72], [96, 420], [420, 72]]) @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC]) @@ -603,7 +535,7 @@ def test_resize_antialias(device, dt, size, interpolation): tensor = tensor.to(dt) resized_tensor = F.resize(tensor, size=size, interpolation=interpolation, antialias=True) - resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation) + resized_pil_img = F.resize(pil_img, size=size, interpolation=interpolation, antialias=True) assert resized_tensor.size()[1:] == resized_pil_img.size[::-1] @@ -637,38 +569,21 @@ def test_resize_antialias(device, dt, size, interpolation): assert_equal(resized_tensor, resize_result) -@needs_cuda -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC]) -def test_assert_resize_antialias(interpolation): - - # Checks implementation on very large scales - # and catch TORCH_CHECK inside PyTorch implementation - torch.manual_seed(12) - tensor, _ = _create_data(1000, 1000, device="cuda") - - # Error message is not yet updated in pytorch nightly - # with pytest.raises(RuntimeError, match=r"Provided interpolation parameters can not be handled"): - with pytest.raises(RuntimeError, match=r"Too much shared memory required"): - F.resize(tensor, size=(5, 5), interpolation=interpolation, antialias=True) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("dt", [torch.float32, torch.float64, torch.float16]) -@pytest.mark.parametrize("size", [[10, 7], [10, 42], [42, 7]]) -@pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC]) -def test_interpolate_antialias_backward(device, dt, size, interpolation): +def test_resize_antialias_default_warning(): - if dt == torch.float16 and device == "cpu": - # skip float16 on CPU case - return + img = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8) - torch.manual_seed(12) - x = (torch.rand(1, 32, 29, 3, dtype=torch.double, device=device).permute(0, 3, 1, 2).requires_grad_(True),) - resize = partial(F.resize, size=size, interpolation=interpolation, antialias=True) - assert torch.autograd.gradcheck(resize, x, eps=1e-8, atol=1e-6, rtol=1e-6, fast_mode=False) + match = "The default value of the antialias" + with pytest.warns(UserWarning, match=match): + F.resize(img, size=(20, 20)) + with pytest.warns(UserWarning, match=match): + F.resized_crop(img, 0, 0, 10, 10, size=(20, 20)) - x = (torch.rand(1, 3, 32, 29, dtype=torch.double, device=device, requires_grad=True),) - assert torch.autograd.gradcheck(resize, x, eps=1e-8, atol=1e-6, rtol=1e-6, fast_mode=False) + # For modes that aren't bicubic or bilinear, don't throw a warning + with warnings.catch_warnings(): + warnings.simplefilter("error") + F.resize(img, size=(20, 20), interpolation=NEAREST) + F.resized_crop(img, 0, 0, 10, 10, size=(20, 20), interpolation=NEAREST) def check_functional_vs_PIL_vs_scripted( @@ -708,7 +623,7 @@ def check_functional_vs_PIL_vs_scripted( _test_fn_on_batch(batch_tensors, fn, scripted_fn_atol=atol, **config) -@pytest.mark.parametrize("device", cpu_and_gpu()) 
+@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"brightness_factor": f} for f in (0.1, 0.5, 1.0, 1.34, 2.5)]) @pytest.mark.parametrize("channels", [1, 3]) @@ -724,7 +639,7 @@ def test_adjust_brightness(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("channels", [1, 3]) def test_invert(device, dtype, channels): @@ -733,7 +648,7 @@ def test_invert(device, dtype, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("config", [{"bits": bits} for bits in range(0, 8)]) @pytest.mark.parametrize("channels", [1, 3]) def test_posterize(device, config, channels): @@ -750,7 +665,7 @@ def test_posterize(device, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("config", [{"threshold": threshold} for threshold in [0, 64, 128, 192, 255]]) @pytest.mark.parametrize("channels", [1, 3]) def test_solarize1(device, config, channels): @@ -767,7 +682,7 @@ def test_solarize1(device, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"threshold": threshold} for threshold in [0.0, 0.25, 0.5, 0.75, 1.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -785,37 +700,45 @@ def test_solarize2(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [0.0, 0.25, 0.5, 0.75, 1.0]) -def test_solarize_threshold1_bound(threshold, device): - img = torch.rand((3, 12, 23)).to(device) - F_t.solarize(img, threshold) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [1.5]) -def test_solarize_threshold1_upper_bound(threshold, device): - img = torch.rand((3, 12, 23)).to(device) - with pytest.raises(TypeError, match="Threshold should be less than bound of img."): - F_t.solarize(img, threshold) - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [0, 64, 128, 192, 255]) -def test_solarize_threshold2_bound(threshold, device): - img = torch.randint(0, 256, (3, 12, 23)).to(device) +@pytest.mark.parametrize( + ("dtype", "threshold"), + [ + *[ + (dtype, threshold) + for dtype, threshold in itertools.product( + [torch.float32, torch.float16], + [0.0, 0.25, 0.5, 0.75, 1.0], + ) + ], + *[(torch.uint8, threshold) for threshold in [0, 64, 128, 192, 255]], + *[(torch.int64, threshold) for threshold in [0, 2**32, 2**63 - 1]], + ], +) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_solarize_threshold_within_bound(threshold, dtype, device): + make_img = torch.rand if dtype.is_floating_point else partial(torch.randint, 0, torch.iinfo(dtype).max) + img = make_img((3, 12, 23), dtype=dtype, device=device) F_t.solarize(img, threshold) -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("threshold", [260]) -def test_solarize_threshold2_upper_bound(threshold, device): - img = torch.randint(0, 256, (3, 12, 23)).to(device) +@pytest.mark.parametrize( + ("dtype", "threshold"), + [ + (torch.float32, 1.5), + 
(torch.float16, 1.5), + (torch.uint8, 260), + (torch.int64, 2**64), + ], +) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_solarize_threshold_above_bound(threshold, dtype, device): + make_img = torch.rand if dtype.is_floating_point else partial(torch.randint, 0, torch.iinfo(dtype).max) + img = make_img((3, 12, 23), dtype=dtype, device=device) with pytest.raises(TypeError, match="Threshold should be less than bound of img."): F_t.solarize(img, threshold) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"sharpness_factor": f} for f in [0.2, 0.5, 1.0, 1.5, 2.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -831,7 +754,7 @@ def test_adjust_sharpness(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("channels", [1, 3]) def test_autocontrast(device, dtype, channels): @@ -840,7 +763,7 @@ def test_autocontrast(device, dtype, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("channels", [1, 3]) def test_autocontrast_equal_minmax(device, dtype, channels): @@ -852,7 +775,7 @@ def test_autocontrast_equal_minmax(device, dtype, channels): assert (F.autocontrast(a)[0] == F.autocontrast(a[0])).all() -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("channels", [1, 3]) def test_equalize(device, channels): torch.use_deterministic_algorithms(False) @@ -869,7 +792,7 @@ def test_equalize(device, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"contrast_factor": f} for f in [0.2, 0.5, 1.0, 1.5, 2.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -879,7 +802,7 @@ def test_adjust_contrast(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"saturation_factor": f} for f in [0.5, 0.75, 1.0, 1.5, 2.0]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -889,7 +812,7 @@ def test_adjust_saturation(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"hue_factor": f} for f in [-0.45, -0.25, 0.0, 0.25, 0.45]]) @pytest.mark.parametrize("channels", [1, 3]) @@ -899,7 +822,7 @@ def test_adjust_hue(device, dtype, config, channels): ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", (None, torch.float32, torch.float64)) @pytest.mark.parametrize("config", [{"gamma": g1, "gain": g2} for g1, g2 in zip([0.8, 1.0, 1.2], [0.7, 1.0, 1.3])]) @pytest.mark.parametrize("channels", [1, 3]) @@ -915,7 +838,7 @@ def test_adjust_gamma(device, dtype, config, channels): ) 
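Most of the parametrization changes in this file swap the old `cpu_and_gpu()` helper for `cpu_and_cuda()`, which is imported from `common_utils` and is not itself part of this diff. A plausible sketch of what such a device-parametrization helper could look like (the skipif condition below is an assumption for illustration; the real helper may use a custom mark instead):

import pytest
import torch

def cpu_and_cuda():
    # "cpu" always runs; "cuda" is skipped on machines without a CUDA device.
    return (
        "cpu",
        pytest.param(
            "cuda",
            marks=pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA is not available"),
        ),
    )

# Typical use: @pytest.mark.parametrize("device", cpu_and_cuda())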
-@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("pad", [2, [3], [0, 3], (3, 3), [4, 2, 4, 3]]) @pytest.mark.parametrize( @@ -965,14 +888,16 @@ def test_pad(device, dt, pad, config): _test_fn_on_batch(batch_tensors, F.pad, padding=script_pad, **config) -@pytest.mark.parametrize("device", cpu_and_gpu()) -@pytest.mark.parametrize("mode", [NEAREST, BILINEAR, BICUBIC]) +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("mode", [NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC]) def test_resized_crop(device, mode): # test values of F.resized_crop in several cases: # 1) resize to the same size, crop to the same size => should be identity tensor, _ = _create_data(26, 36, device=device) - out_tensor = F.resized_crop(tensor, top=0, left=0, height=26, width=36, size=[26, 36], interpolation=mode) + out_tensor = F.resized_crop( + tensor, top=0, left=0, height=26, width=36, size=[26, 36], interpolation=mode, antialias=True + ) assert_equal(tensor, out_tensor, msg=f"{out_tensor[0, :5, :5]} vs {tensor[0, :5, :5]}") # 2) resize by half and crop a TL corner @@ -987,11 +912,18 @@ def test_resized_crop(device, mode): batch_tensors = _create_data_batch(26, 36, num_samples=4, device=device) _test_fn_on_batch( - batch_tensors, F.resized_crop, top=1, left=2, height=20, width=30, size=[10, 15], interpolation=NEAREST + batch_tensors, + F.resized_crop, + top=1, + left=2, + height=20, + width=30, + size=[10, 15], + interpolation=NEAREST, ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "func, args", [ @@ -1024,7 +956,7 @@ def test_assert_image_tensor(device, func, args): func(tensor, *args) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_vflip(device): script_vflip = torch.jit.script(F.vflip) @@ -1041,7 +973,7 @@ def test_vflip(device): _test_fn_on_batch(batch_tensors, F.vflip) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_hflip(device): script_hflip = torch.jit.script(F.hflip) @@ -1058,7 +990,7 @@ def test_hflip(device): _test_fn_on_batch(batch_tensors, F.hflip) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "top, left, height, width", [ @@ -1087,7 +1019,7 @@ def test_crop(device, top, left, height, width): _test_fn_on_batch(batch_tensors, F.crop, top=top, left=left, height=height, width=width) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("image_size", ("small", "large")) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) @@ -1141,7 +1073,7 @@ def test_gaussian_blur(device, image_size, dt, ksize, sigma, fn): torch.testing.assert_close(out, true_out, rtol=0.0, atol=1.0, msg=f"{ksize}, {sigma}") -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_hsv2rgb(device): scripted_fn = torch.jit.script(F_t._hsv2rgb) shape = (3, 100, 150) @@ -1172,7 +1104,7 @@ def test_hsv2rgb(device): _test_fn_on_batch(batch_tensors, F_t._hsv2rgb) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", 
cpu_and_cuda()) def test_rgb2hsv(device): scripted_fn = torch.jit.script(F_t._rgb2hsv) shape = (3, 150, 100) @@ -1211,7 +1143,7 @@ def test_rgb2hsv(device): _test_fn_on_batch(batch_tensors, F_t._rgb2hsv) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("num_output_channels", (3, 1)) def test_rgb_to_grayscale(device, num_output_channels): script_rgb_to_grayscale = torch.jit.script(F.rgb_to_grayscale) @@ -1230,7 +1162,7 @@ def test_rgb_to_grayscale(device, num_output_channels): _test_fn_on_batch(batch_tensors, F.rgb_to_grayscale, num_output_channels=num_output_channels) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_center_crop(device): script_center_crop = torch.jit.script(F.center_crop) @@ -1248,7 +1180,7 @@ def test_center_crop(device): _test_fn_on_batch(batch_tensors, F.center_crop, output_size=[10, 11]) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_five_crop(device): script_five_crop = torch.jit.script(F.five_crop) @@ -1282,7 +1214,7 @@ def test_five_crop(device): assert_equal(transformed_batch, s_transformed_batch) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_ten_crop(device): script_ten_crop = torch.jit.script(F.ten_crop) @@ -1328,7 +1260,7 @@ def test_elastic_transform_asserts(): _ = F.elastic_transform(img_tensor, displacement=torch.rand(1, 2)) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC]) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) @pytest.mark.parametrize( diff --git a/test/test_hub.py b/test/test_hub.py deleted file mode 100644 index d88c6fa2cd25b2a866748c21f45e5f4d6199d564..0000000000000000000000000000000000000000 --- a/test/test_hub.py +++ /dev/null @@ -1,46 +0,0 @@ -import os -import shutil -import sys -import tempfile - -import pytest -import torch.hub as hub - - -def sum_of_model_parameters(model): - s = 0 - for p in model.parameters(): - s += p.sum() - return s - - -SUM_OF_PRETRAINED_RESNET18_PARAMS = -12703.9931640625 - - -@pytest.mark.skipif("torchvision" in sys.modules, reason="TestHub must start without torchvision imported") -class TestHub: - # Only run this check ONCE before all tests start. - # - If torchvision is imported before all tests start, e.g. we might find _C.so - # which doesn't exist in downloaded zip but in the installed wheel. - # - After the first test is run, torchvision is already in sys.modules due to - # Python cache as we run all hub tests in the same python process. 
- - def test_load_from_github(self): - hub_model = hub.load("pytorch/vision", "resnet18", weights="DEFAULT", progress=False) - assert sum_of_model_parameters(hub_model).item() == pytest.approx(SUM_OF_PRETRAINED_RESNET18_PARAMS) - - def test_set_dir(self): - temp_dir = tempfile.gettempdir() - hub.set_dir(temp_dir) - hub_model = hub.load("pytorch/vision", "resnet18", weights="DEFAULT", progress=False) - assert sum_of_model_parameters(hub_model).item() == pytest.approx(SUM_OF_PRETRAINED_RESNET18_PARAMS) - assert os.path.exists(temp_dir + "/pytorch_vision_master") - shutil.rmtree(temp_dir + "/pytorch_vision_master") - - def test_list_entrypoints(self): - entry_lists = hub.list("pytorch/vision", force_reload=True) - assert "resnet18" in entry_lists - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/test/test_image.py b/test/test_image.py index 7fcd54c9c8f43277517605025c96a34df1f2f62b..a87f5fa2d1e3ee7b5a0e2c94aab4772dd39645da 100644 --- a/test/test_image.py +++ b/test/test_image.py @@ -32,6 +32,7 @@ DAMAGED_JPEG = os.path.join(IMAGE_ROOT, "damaged_jpeg") DAMAGED_PNG = os.path.join(IMAGE_ROOT, "damaged_png") ENCODE_JPEG = os.path.join(IMAGE_ROOT, "encode_jpeg") INTERLACED_PNG = os.path.join(IMAGE_ROOT, "interlaced_png") +TOOSMALL_PNG = os.path.join(IMAGE_ROOT, "toosmall_png") IS_WINDOWS = sys.platform in ("win32", "cygwin") PILLOW_VERSION = tuple(int(x) for x in PILLOW_VERSION.split(".")) @@ -82,12 +83,9 @@ def test_decode_jpeg(img_path, pil_mode, mode): with Image.open(img_path) as img: is_cmyk = img.mode == "CMYK" if pil_mode is not None: - if is_cmyk: - # libjpeg does not support the conversion - pytest.xfail("Decoding a CMYK jpeg isn't supported") img = img.convert(pil_mode) img_pil = torch.from_numpy(np.array(img)) - if is_cmyk: + if is_cmyk and mode == ImageReadMode.UNCHANGED: # flip the colors to match libjpeg img_pil = 255 - img_pil @@ -193,6 +191,8 @@ def test_decode_png_errors(): decode_png(torch.randint(3, 5, (300,), dtype=torch.uint8)) with pytest.raises(RuntimeError, match="Out of bound read in decode_png"): decode_png(read_file(os.path.join(DAMAGED_PNG, "sigsegv.png"))) + with pytest.raises(RuntimeError, match="Content is too small for png"): + decode_png(read_file(os.path.join(TOOSMALL_PNG, "heapbof.png"))) @pytest.mark.parametrize( @@ -369,6 +369,13 @@ def test_decode_jpeg_cuda(mode, img_path, scripted): assert (img.float() - img_nvjpeg.cpu().float()).abs().mean() < 2 +@needs_cuda +def test_decode_image_cuda_raises(): + data = torch.randint(0, 127, size=(255,), device="cuda", dtype=torch.uint8) + with pytest.raises(RuntimeError): + decode_image(data) + + @needs_cuda @pytest.mark.parametrize("cuda_device", ("cuda", "cuda:0", torch.device("cuda"))) def test_decode_jpeg_cuda_device_param(cuda_device): @@ -412,77 +419,6 @@ def test_encode_jpeg_errors(): encode_jpeg(torch.empty((100, 100), dtype=torch.uint8)) -def _collect_if(cond): - # TODO: remove this once test_encode_jpeg_reference and test_write_jpeg_reference - # are removed - def _inner(test_func): - if cond: - return test_func - else: - return pytest.mark.dont_collect(test_func) - - return _inner - - -@_collect_if(cond=False) -@pytest.mark.parametrize( - "img_path", - [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], -) -def test_encode_jpeg_reference(img_path): - # This test is *wrong*. 
- # It compares a torchvision-encoded jpeg with a PIL-encoded jpeg (the reference), but it - # starts encoding the torchvision version from an image that comes from - # decode_jpeg, which can yield different results from pil.decode (see - # test_decode... which uses a high tolerance). - # Instead, we should start encoding from the exact same decoded image, for a - # valid comparison. This is done in test_encode_jpeg, but unfortunately - # these more correct tests fail on windows (probably because of a difference - # in libjpeg) between torchvision and PIL. - # FIXME: make the correct tests pass on windows and remove this. - dirname = os.path.dirname(img_path) - filename, _ = os.path.splitext(os.path.basename(img_path)) - write_folder = os.path.join(dirname, "jpeg_write") - expected_file = os.path.join(write_folder, f"{filename}_pil.jpg") - img = decode_jpeg(read_file(img_path)) - - with open(expected_file, "rb") as f: - pil_bytes = f.read() - pil_bytes = torch.as_tensor(list(pil_bytes), dtype=torch.uint8) - for src_img in [img, img.contiguous()]: - # PIL sets jpeg quality to 75 by default - jpeg_bytes = encode_jpeg(src_img, quality=75) - assert_equal(jpeg_bytes, pil_bytes) - - -@_collect_if(cond=False) -@pytest.mark.parametrize( - "img_path", - [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], -) -def test_write_jpeg_reference(img_path, tmpdir): - # FIXME: Remove this eventually, see test_encode_jpeg_reference - data = read_file(img_path) - img = decode_jpeg(data) - - basedir = os.path.dirname(img_path) - filename, _ = os.path.splitext(os.path.basename(img_path)) - torch_jpeg = os.path.join(tmpdir, f"{filename}_torch.jpg") - pil_jpeg = os.path.join(basedir, "jpeg_write", f"{filename}_pil.jpg") - - write_jpeg(img, torch_jpeg, quality=75) - - with open(torch_jpeg, "rb") as f: - torch_bytes = f.read() - - with open(pil_jpeg, "rb") as f: - pil_bytes = f.read() - - assert_equal(torch_bytes, pil_bytes) - - -# TODO: Remove the skip. See https://github.com/pytorch/vision/issues/5162. -@pytest.mark.skip("this test fails because PIL uses libjpeg-turbo") @pytest.mark.parametrize( "img_path", [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], @@ -501,8 +437,6 @@ def test_encode_jpeg(img_path): assert_equal(encoded_jpeg_torch, encoded_jpeg_pil) -# TODO: Remove the skip. See https://github.com/pytorch/vision/issues/5162. 
-@pytest.mark.skip("this test fails because PIL uses libjpeg-turbo") @pytest.mark.parametrize( "img_path", [pytest.param(jpeg_path, id=_get_safe_image_name(jpeg_path)) for jpeg_path in get_images(ENCODE_JPEG, ".jpg")], diff --git a/test/test_models.py b/test/test_models.py index 5d2b5565a9e6bb913f275cb2ef653d7b693c8304..76bddebefe4ee479b1c218a128b5eb9631bbfa20 100644 --- a/test/test_models.py +++ b/test/test_models.py @@ -15,8 +15,9 @@ import torch import torch.fx import torch.nn as nn from _utils_internal import get_relative_path -from common_utils import cpu_and_gpu, freeze_rng_state, map_nested_tensor_object, needs_cuda, set_rng_seed -from torchvision import models +from common_utils import cpu_and_cuda, freeze_rng_state, map_nested_tensor_object, needs_cuda, set_rng_seed +from PIL import Image +from torchvision import models, transforms from torchvision.models import get_model_builder, list_models @@ -24,10 +25,57 @@ ACCEPT = os.getenv("EXPECTTEST_ACCEPT", "0") == "1" SKIP_BIG_MODEL = os.getenv("SKIP_BIG_MODEL", "1") == "1" +@contextlib.contextmanager +def disable_tf32(): + previous = torch.backends.cudnn.allow_tf32 + torch.backends.cudnn.allow_tf32 = False + try: + yield + finally: + torch.backends.cudnn.allow_tf32 = previous + + def list_model_fns(module): return [get_model_builder(name) for name in list_models(module)] +def _get_image(input_shape, real_image, device, dtype=None): + """This routine loads a real or random image based on `real_image` argument. + Currently, the real image is utilized for the following list of models: + - `retinanet_resnet50_fpn`, + - `retinanet_resnet50_fpn_v2`, + - `keypointrcnn_resnet50_fpn`, + - `fasterrcnn_resnet50_fpn`, + - `fasterrcnn_resnet50_fpn_v2`, + - `fcos_resnet50_fpn`, + - `maskrcnn_resnet50_fpn`, + - `maskrcnn_resnet50_fpn_v2`, + in `test_classification_model` and `test_detection_model`. 
+ To do so, a keyword argument `real_image` was added to the abovelisted models in `_model_params` + """ + if real_image: + # TODO: Maybe unify file discovery logic with test_image.py + GRACE_HOPPER = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "assets", "encode_jpeg", "grace_hopper_517x606.jpg" + ) + + img = Image.open(GRACE_HOPPER) + + original_width, original_height = img.size + + # make the image square + img = img.crop((0, 0, original_width, original_width)) + img = img.resize(input_shape[1:3]) + + convert_tensor = transforms.ToTensor() + image = convert_tensor(img) + assert tuple(image.size()) == input_shape + return image.to(device=device, dtype=dtype) + + # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests + return torch.rand(input_shape).to(device=device, dtype=dtype) + + @pytest.fixture def disable_weight_loading(mocker): """When testing models, the two slowest operations are the downloading of the weights to a file and loading them @@ -129,6 +177,7 @@ def _check_jit_scriptable(nn_module, args, unwrapper=None, eager_out=None): return imported sm = torch.jit.script(nn_module) + sm.eval() if eager_out is None: with torch.no_grad(), freeze_rng_state(): @@ -154,7 +203,8 @@ def _check_fx_compatible(model, inputs, eager_out=None): model_fx = torch.fx.symbolic_trace(model) if eager_out is None: eager_out = model(inputs) - fx_out = model_fx(inputs) + with torch.no_grad(), freeze_rng_state(): + fx_out = model_fx(inputs) torch.testing.assert_close(eager_out, fx_out) @@ -231,7 +281,6 @@ autocast_flaky_numerics = ( "maskrcnn_resnet50_fpn", "maskrcnn_resnet50_fpn_v2", "keypointrcnn_resnet50_fpn", - "fasterrcnn_resnet50_fpn", # See: https://github.com/pytorch/vision/issues/6655 ) # The tests for the following quantized models are flaky possibly due to inconsistent @@ -239,6 +288,11 @@ autocast_flaky_numerics = ( # tests under test_quantized_classification_model will be skipped for the following models. quantized_flaky_models = ("inception_v3", "resnet50") +# The tests for the following detection models are flaky. +# We run those tests on float64 to avoid floating point errors. +# FIXME: we shouldn't have to do that :'/ +detection_flaky_models = ("keypointrcnn_resnet50_fpn", "maskrcnn_resnet50_fpn", "maskrcnn_resnet50_fpn_v2") + # The following contains configuration parameters for all models which are used by # the _test_*_model methods. 
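Note on the helper added just above: disable_tf32() is built with contextlib.contextmanager, so the object it returns is also a ContextDecorator; that is why it can be applied as @disable_tf32() to test_classification_model further down. A minimal, self-contained sketch of the same save/flip/restore pattern (no_tf32 and flaky_matmul below are illustrative names, not part of the patch):

import contextlib

import torch


@contextlib.contextmanager
def no_tf32():
    # Save the global cuDNN flag, force convolutions to run in full float32
    # instead of TF32, and restore the previous value even if the body raises.
    previous = torch.backends.cudnn.allow_tf32
    torch.backends.cudnn.allow_tf32 = False
    try:
        yield
    finally:
        torch.backends.cudnn.allow_tf32 = previous


@no_tf32()          # usable as a decorator, like @disable_tf32() in this patch
def flaky_matmul(a, b):
    return a @ b


with no_tf32():     # or as an ordinary context manager
    _ = flaky_matmul(torch.rand(8, 8), torch.rand(8, 8))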
@@ -250,6 +304,7 @@ _model_params = { "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "retinanet_resnet50_fpn_v2": { "num_classes": 20, @@ -257,6 +312,7 @@ _model_params = { "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "keypointrcnn_resnet50_fpn": { "num_classes": 2, @@ -264,18 +320,21 @@ _model_params = { "max_size": 224, "box_score_thresh": 0.17, "input_shape": (3, 224, 224), + "real_image": True, }, "fasterrcnn_resnet50_fpn": { "num_classes": 20, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "fasterrcnn_resnet50_fpn_v2": { "num_classes": 20, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "fcos_resnet50_fpn": { "num_classes": 2, @@ -283,18 +342,21 @@ _model_params = { "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "maskrcnn_resnet50_fpn": { "num_classes": 10, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "maskrcnn_resnet50_fpn_v2": { "num_classes": 10, "min_size": 224, "max_size": 224, "input_shape": (3, 224, 224), + "real_image": True, }, "fasterrcnn_mobilenet_v3_large_fpn": { "box_score_thresh": 0.02076, @@ -614,13 +676,14 @@ def vitc_b_16(**kwargs: Any): @pytest.mark.parametrize("model_fn", [vitc_b_16]) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_vitc_models(model_fn, dev): test_classification_model(model_fn, dev) +@disable_tf32() # see: https://github.com/pytorch/vision/issues/7618 @pytest.mark.parametrize("model_fn", list_model_fns(models)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_classification_model(model_fn, dev): set_rng_seed(0) defaults = { @@ -633,13 +696,20 @@ def test_classification_model(model_fn, dev): kwargs = {**defaults, **_model_params.get(model_name, {})} num_classes = kwargs.get("num_classes") input_shape = kwargs.pop("input_shape") + real_image = kwargs.pop("real_image", False) model = model_fn(**kwargs) model.eval().to(device=dev) - # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests - x = torch.rand(input_shape).to(device=dev) + x = _get_image(input_shape=input_shape, real_image=real_image, device=dev) out = model(x) - _assert_expected(out.cpu(), model_name, prec=1e-3) + # FIXME: this if/else is nasty and only here to please our CI prior to the + # release. We rethink these tests altogether. + if model_name == "resnet101": + prec = 0.2 + else: + # FIXME: this is probably still way too high. 
+ prec = 0.1 + _assert_expected(out.cpu(), model_name, prec=prec) assert out.shape[-1] == num_classes _check_jit_scriptable(model, (x,), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) _check_fx_compatible(model, x, eager_out=out) @@ -656,7 +726,7 @@ def test_classification_model(model_fn, dev): @pytest.mark.parametrize("model_fn", list_model_fns(models.segmentation)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_segmentation_model(model_fn, dev): set_rng_seed(0) defaults = { @@ -672,7 +742,8 @@ def test_segmentation_model(model_fn, dev): model.eval().to(device=dev) # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests x = torch.rand(input_shape).to(device=dev) - out = model(x) + with torch.no_grad(), freeze_rng_state(): + out = model(x) def check_out(out): prec = 0.01 @@ -700,7 +771,7 @@ def test_segmentation_model(model_fn, dev): _check_fx_compatible(model, x, eager_out=out) if dev == "cuda": - with torch.cuda.amp.autocast(): + with torch.cuda.amp.autocast(), torch.no_grad(), freeze_rng_state(): out = model(x) # See autocast_flaky_numerics comment at top of file. if model_name not in autocast_flaky_numerics: @@ -720,7 +791,7 @@ def test_segmentation_model(model_fn, dev): @pytest.mark.parametrize("model_fn", list_model_fns(models.detection)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_detection_model(model_fn, dev): set_rng_seed(0) defaults = { @@ -729,15 +800,20 @@ def test_detection_model(model_fn, dev): "input_shape": (3, 300, 300), } model_name = model_fn.__name__ + if model_name in detection_flaky_models: + dtype = torch.float64 + else: + dtype = torch.get_default_dtype() kwargs = {**defaults, **_model_params.get(model_name, {})} input_shape = kwargs.pop("input_shape") + real_image = kwargs.pop("real_image", False) model = model_fn(**kwargs) - model.eval().to(device=dev) - # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests - x = torch.rand(input_shape).to(device=dev) + model.eval().to(device=dev, dtype=dtype) + x = _get_image(input_shape=input_shape, real_image=real_image, device=dev, dtype=dtype) model_input = [x] - out = model(model_input) + with torch.no_grad(), freeze_rng_state(): + out = model(model_input) assert model_input[0] is x def check_out(out): @@ -798,7 +874,7 @@ def test_detection_model(model_fn, dev): _check_jit_scriptable(model, ([x],), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) if dev == "cuda": - with torch.cuda.amp.autocast(): + with torch.cuda.amp.autocast(), torch.no_grad(), freeze_rng_state(): out = model(model_input) # See autocast_flaky_numerics comment at top of file. 
if model_name not in autocast_flaky_numerics: @@ -847,7 +923,7 @@ def test_detection_model_validation(model_fn): @pytest.mark.parametrize("model_fn", list_model_fns(models.video)) -@pytest.mark.parametrize("dev", cpu_and_gpu()) +@pytest.mark.parametrize("dev", cpu_and_cuda()) def test_video_model(model_fn, dev): set_rng_seed(0) # the default input shape is @@ -868,7 +944,7 @@ def test_video_model(model_fn, dev): # RNG always on CPU, to ensure x in cuda tests is bitwise identical to x in cpu tests x = torch.rand(input_shape).to(device=dev) out = model(x) - _assert_expected(out.cpu(), model_name, prec=1e-5) + _assert_expected(out.cpu(), model_name, prec=0.1) assert out.shape[-1] == num_classes _check_jit_scriptable(model, (x,), unwrapper=script_model_unwrapper.get(model_name, None), eager_out=out) _check_fx_compatible(model, x, eager_out=out) @@ -961,7 +1037,7 @@ def test_raft(model_fn, scripted): torch.manual_seed(0) # We need very small images, otherwise the pickle size would exceed the 50KB - # As a resut we need to override the correlation pyramid to not downsample + # As a result we need to override the correlation pyramid to not downsample # too much, otherwise we would get nan values (effective H and W would be # reduced to 1) corr_block = models.optical_flow.raft.CorrBlock(num_levels=2, radius=2) @@ -977,9 +1053,29 @@ def test_raft(model_fn, scripted): preds = model(img1, img2) flow_pred = preds[-1] # Tolerance is fairly high, but there are 2 * H * W outputs to check - # The .pkl were generated on the AWS cluter, on the CI it looks like the resuts are slightly different + # The .pkl were generated on the AWS cluter, on the CI it looks like the results are slightly different _assert_expected(flow_pred.cpu(), name=model_fn.__name__, atol=1e-2, rtol=1) +def test_presets_antialias(): + + img = torch.randint(0, 256, size=(1, 3, 224, 224), dtype=torch.uint8) + + match = "The default value of the antialias parameter" + with pytest.warns(UserWarning, match=match): + models.ResNet18_Weights.DEFAULT.transforms()(img) + with pytest.warns(UserWarning, match=match): + models.segmentation.DeepLabV3_ResNet50_Weights.DEFAULT.transforms()(img) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + models.ResNet18_Weights.DEFAULT.transforms(antialias=True)(img) + models.segmentation.DeepLabV3_ResNet50_Weights.DEFAULT.transforms(antialias=True)(img) + + models.detection.FasterRCNN_ResNet50_FPN_Weights.DEFAULT.transforms()(img) + models.video.R3D_18_Weights.DEFAULT.transforms()(img) + models.optical_flow.Raft_Small_Weights.DEFAULT.transforms()(img, img) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/test_models_detection_utils.py b/test/test_models_detection_utils.py index 09895057a9a0052b900287a40534986a7f5dc4a7..69703ab5817cd2e10067f3d4c4bdc5653874fe25 100644 --- a/test/test_models_detection_utils.py +++ b/test/test_models_detection_utils.py @@ -38,7 +38,7 @@ class TestModelsDetectionUtils: def test_resnet_fpn_backbone_frozen_layers(self, train_layers, exp_froz_params): # we know how many initial layers and parameters of the network should # be frozen for each trainable_backbone_layers parameter value - # i.e all 53 params are frozen if trainable_backbone_layers=0 + # i.e. 
all 53 params are frozen if trainable_backbone_layers=0 # ad first 24 params are frozen if trainable_backbone_layers=2 model = backbone_utils.resnet_fpn_backbone("resnet50", weights=None, trainable_layers=train_layers) # boolean list that is true if the param at that index is frozen diff --git a/test/test_onnx.py b/test/test_onnx.py index d5dae64b4d09aa4b5595057dc6dbda2fe25dae5b..19ed13b1a6d2b321e2db45d7f3ed4c2ee33e2504 100644 --- a/test/test_onnx.py +++ b/test/test_onnx.py @@ -1,6 +1,6 @@ import io from collections import OrderedDict -from typing import List, Tuple +from typing import List, Optional, Tuple import pytest import torch @@ -11,7 +11,7 @@ from torchvision.models.detection.image_list import ImageList from torchvision.models.detection.roi_heads import RoIHeads from torchvision.models.detection.rpn import AnchorGenerator, RegionProposalNetwork, RPNHead from torchvision.models.detection.transform import GeneralizedRCNNTransform -from torchvision.ops._register_onnx_ops import _onnx_opset_version +from torchvision.ops import _register_onnx_ops # In environments without onnxruntime we prefer to # invoke all tests in the repo and have this one skipped rather than fail. @@ -27,12 +27,15 @@ class TestONNXExporter: self, model, inputs_list, - tolerate_small_mismatch=False, do_constant_folding=True, dynamic_axes=None, output_names=None, input_names=None, + opset_version: Optional[int] = None, ): + if opset_version is None: + opset_version = _register_onnx_ops.BASE_ONNX_OPSET_VERSION + model.eval() onnx_io = io.BytesIO() @@ -46,10 +49,11 @@ class TestONNXExporter: torch_onnx_input, onnx_io, do_constant_folding=do_constant_folding, - opset_version=_onnx_opset_version, + opset_version=opset_version, dynamic_axes=dynamic_axes, input_names=input_names, output_names=output_names, + verbose=True, ) # validate the exported model with onnx runtime for test_inputs in inputs_list: @@ -59,9 +63,9 @@ class TestONNXExporter: test_ouputs = model(*test_inputs) if isinstance(test_ouputs, torch.Tensor): test_ouputs = (test_ouputs,) - self.ort_validate(onnx_io, test_inputs, test_ouputs, tolerate_small_mismatch) + self.ort_validate(onnx_io, test_inputs, test_ouputs) - def ort_validate(self, onnx_io, inputs, outputs, tolerate_small_mismatch=False): + def ort_validate(self, onnx_io, inputs, outputs): inputs, _ = torch.jit._flatten(inputs) outputs, _ = torch.jit._flatten(outputs) @@ -81,13 +85,7 @@ class TestONNXExporter: ort_outs = ort_session.run(None, ort_inputs) for i in range(0, len(outputs)): - try: - torch.testing.assert_allclose(outputs[i], ort_outs[i], rtol=1e-03, atol=1e-05) - except AssertionError as error: - if tolerate_small_mismatch: - assert "(0.00%)" in str(error), str(error) - else: - raise + torch.testing.assert_close(outputs[i], ort_outs[i], rtol=1e-03, atol=1e-05) def test_nms(self): num_boxes = 100 @@ -140,39 +138,39 @@ class TestONNXExporter: model = ops.RoIAlign((5, 5), 1, -1) self.run_model(model, [(x, single_roi)]) - @pytest.mark.skip(reason="ROIAlign with aligned=True is not supported in ONNX, but will be supported in opset 16.") def test_roi_align_aligned(self): + supported_onnx_version = _register_onnx_ops._ONNX_OPSET_VERSION_16 x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 1.5, 1.5, 3, 3]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1, 2, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = 
torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 0.5, 3, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1.8, 2, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((2, 2), 2.5, 0, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) x = torch.rand(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 0.2, 0.3, 4.5, 3.5]], dtype=torch.float32) model = ops.RoIAlign((2, 2), 2.5, -1, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) - @pytest.mark.skip(reason="Issue in exporting ROIAlign with aligned = True for malformed boxes") def test_roi_align_malformed_boxes(self): + supported_onnx_version = _register_onnx_ops._ONNX_OPSET_VERSION_16 x = torch.randn(1, 1, 10, 10, dtype=torch.float32) single_roi = torch.tensor([[0, 2, 0.3, 1.5, 1.5]], dtype=torch.float32) model = ops.RoIAlign((5, 5), 1, 1, aligned=True) - self.run_model(model, [(x, single_roi)]) + self.run_model(model, [(x, single_roi)], opset_version=supported_onnx_version) def test_roi_pool(self): x = torch.rand(1, 1, 10, 10, dtype=torch.float32) @@ -320,7 +318,6 @@ class TestONNXExporter: self.run_model( model, [(images, features), (images2, test_features)], - tolerate_small_mismatch=True, input_names=["input1", "input2", "input3", "input4", "input5", "input6"], dynamic_axes={ "input1": [0, 1, 2, 3], @@ -396,7 +393,6 @@ class TestONNXExporter: self.run_model( model, [(images, features), (images2, test_features)], - tolerate_small_mismatch=True, input_names=["input1", "input2", "input3", "input4", "input5", "input6"], dynamic_axes={ "input1": [0, 1, 2, 3], @@ -411,13 +407,12 @@ class TestONNXExporter: def get_image(self, rel_path: str, size: Tuple[int, int]) -> torch.Tensor: import os - import torchvision.transforms._pil_constants as _pil_constants from PIL import Image from torchvision.transforms import functional as F data_dir = os.path.join(os.path.dirname(__file__), "assets") path = os.path.join(data_dir, *rel_path.split("/")) - image = Image.open(path).convert("RGB").resize(size, _pil_constants.BILINEAR) + image = Image.open(path).convert("RGB").resize(size, Image.BILINEAR) return F.convert_image_dtype(F.pil_to_tensor(image)) @@ -442,7 +437,6 @@ class TestONNXExporter: input_names=["images_tensors"], output_names=["outputs"], dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, - tolerate_small_mismatch=True, ) # Test exported model for an image with no detections on other images self.run_model( @@ -451,7 +445,6 @@ class TestONNXExporter: input_names=["images_tensors"], output_names=["outputs"], dynamic_axes={"images_tensors": [0, 1, 2], "outputs": [0, 1, 2]}, - tolerate_small_mismatch=True, ) # Verify that paste_mask_in_image beahves the same in tracing. 
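The test_onnx.py changes above un-skip the aligned RoIAlign tests by threading an explicit opset_version through run_model: per the removed skip reason, aligned=True is only representable in ONNX from opset 16 onwards, which is what _register_onnx_ops._ONNX_OPSET_VERSION_16 refers to. A hedged, standalone sketch of the export those tests end up performing (tensor values and the buffer are made up for illustration; it assumes a torch build whose ONNX exporter supports opset 16):

import io

import torch
from torchvision import ops

model = ops.RoIAlign((5, 5), spatial_scale=1.0, sampling_ratio=2, aligned=True)
model.eval()

x = torch.rand(1, 1, 10, 10, dtype=torch.float32)
rois = torch.tensor([[0, 1.5, 1.5, 3.0, 3.0]], dtype=torch.float32)  # (batch_idx, x1, y1, x2, y2)

onnx_io = io.BytesIO()
# Exporting with the default (lower) opset would not preserve aligned=True,
# hence the explicit opset_version in the updated tests.
torch.onnx.export(model, (x, rois), onnx_io, opset_version=16)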
@@ -506,7 +499,6 @@ class TestONNXExporter: "scores": [0], "masks": [0, 1, 2], }, - tolerate_small_mismatch=True, ) # Test exported model for an image with no detections on other images self.run_model( @@ -521,7 +513,6 @@ class TestONNXExporter: "scores": [0], "masks": [0, 1, 2], }, - tolerate_small_mismatch=True, ) # Verify that heatmaps_to_keypoints behaves the same in tracing. @@ -563,7 +554,6 @@ class TestONNXExporter: input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], dynamic_axes={"images_tensors": [0, 1, 2]}, - tolerate_small_mismatch=True, ) self.run_model( @@ -572,7 +562,6 @@ class TestONNXExporter: input_names=["images_tensors"], output_names=["outputs1", "outputs2", "outputs3", "outputs4"], dynamic_axes={"images_tensors": [0, 1, 2]}, - tolerate_small_mismatch=True, ) def test_shufflenet_v2_dynamic_axes(self): @@ -586,7 +575,6 @@ class TestONNXExporter: input_names=["input_images"], output_names=["output"], dynamic_axes={"input_images": {0: "batch_size"}, "output": {0: "batch_size"}}, - tolerate_small_mismatch=True, ) diff --git a/test/test_ops.py b/test/test_ops.py index b34fbe7f2546e8da0dd7d11b6043b550ef85b82d..743fe159e37a895e8a140b0b5321e1e5a918c9e2 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -10,7 +10,7 @@ import pytest import torch import torch.fx import torch.nn.functional as F -from common_utils import assert_equal, cpu_and_gpu, needs_cuda +from common_utils import assert_equal, cpu_and_cuda, cpu_and_cuda_and_mps, needs_cuda, needs_mps from PIL import Image from torch import nn, Tensor from torch.autograd import gradcheck @@ -19,6 +19,22 @@ from torchvision import models, ops from torchvision.models.feature_extraction import get_graph_node_names +# Context manager for setting deterministic flag and automatically +# resetting it to its original value +class DeterministicGuard: + def __init__(self, deterministic, *, warn_only=False): + self.deterministic = deterministic + self.warn_only = warn_only + + def __enter__(self): + self.deterministic_restore = torch.are_deterministic_algorithms_enabled() + self.warn_only_restore = torch.is_deterministic_algorithms_warn_only_enabled() + torch.use_deterministic_algorithms(self.deterministic, warn_only=self.warn_only) + + def __exit__(self, exception_type, exception_value, traceback): + torch.use_deterministic_algorithms(self.deterministic_restore, warn_only=self.warn_only_restore) + + class RoIOpTesterModuleWrapper(nn.Module): def __init__(self, obj): super().__init__() @@ -80,14 +96,35 @@ class PoolWrapper(nn.Module): class RoIOpTester(ABC): dtype = torch.float64 + mps_dtype = torch.float32 + mps_backward_atol = 2e-2 - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @pytest.mark.parametrize("contiguous", (True, False)) - def test_forward(self, device, contiguous, x_dtype=None, rois_dtype=None, **kwargs): - x_dtype = self.dtype if x_dtype is None else x_dtype - rois_dtype = self.dtype if rois_dtype is None else rois_dtype + @pytest.mark.parametrize( + "x_dtype", + ( + torch.float16, + torch.float32, + torch.float64, + ), + ids=str, + ) + def test_forward(self, device, contiguous, x_dtype, rois_dtype=None, deterministic=False, **kwargs): + if device == "mps" and x_dtype is torch.float64: + pytest.skip("MPS does not support float64") + + rois_dtype = x_dtype if rois_dtype is None else rois_dtype + + tol = 1e-5 + if x_dtype is torch.half: + if device == "mps": + tol = 5e-3 + else: + tol = 4e-3 + pool_size = 5 - # 
n_channels % (pool_size ** 2) == 0 required for PS opeartions. + # n_channels % (pool_size ** 2) == 0 required for PS operations. n_channels = 2 * (pool_size**2) x = torch.rand(2, n_channels, 10, 10, dtype=x_dtype, device=device) if not contiguous: @@ -99,17 +136,17 @@ class RoIOpTester(ABC): ) pool_h, pool_w = pool_size, pool_size - y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs) + with DeterministicGuard(deterministic): + y = self.fn(x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs) # the following should be true whether we're running an autocast test or not. assert y.dtype == x.dtype gt_y = self.expected_fn( - x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, device=device, dtype=self.dtype, **kwargs + x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, device=device, dtype=x_dtype, **kwargs ) - tol = 1e-3 if (x_dtype is torch.half or rois_dtype is torch.half) else 1e-5 torch.testing.assert_close(gt_y.to(y), y, rtol=tol, atol=tol) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_is_leaf_node(self, device): op_obj = self.make_obj(wrap=True).to(device=device) graph_node_names = get_graph_node_names(op_obj) @@ -118,7 +155,7 @@ class RoIOpTester(ABC): assert len(graph_node_names[0]) == len(graph_node_names[1]) assert len(graph_node_names[0]) == 1 + op_obj.n_inputs - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_torch_fx_trace(self, device, x_dtype=torch.float, rois_dtype=torch.float): op_obj = self.make_obj().to(device=device) graph_module = torch.fx.symbolic_trace(op_obj) @@ -138,16 +175,19 @@ class RoIOpTester(ABC): torch.testing.assert_close(output_gt, output_fx, rtol=tol, atol=tol) @pytest.mark.parametrize("seed", range(10)) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) @pytest.mark.parametrize("contiguous", (True, False)) - def test_backward(self, seed, device, contiguous): + def test_backward(self, seed, device, contiguous, deterministic=False): + atol = self.mps_backward_atol if device == "mps" else 1e-05 + dtype = self.mps_dtype if device == "mps" else self.dtype + torch.random.manual_seed(seed) pool_size = 2 - x = torch.rand(1, 2 * (pool_size**2), 5, 5, dtype=self.dtype, device=device, requires_grad=True) + x = torch.rand(1, 2 * (pool_size**2), 5, 5, dtype=dtype, device=device, requires_grad=True) if not contiguous: x = x.permute(0, 1, 3, 2) rois = torch.tensor( - [[0, 0, 0, 4, 4], [0, 0, 2, 3, 4], [0, 2, 2, 4, 4]], dtype=self.dtype, device=device # format is (xyxy) + [[0, 0, 0, 4, 4], [0, 0, 2, 3, 4], [0, 2, 2, 4, 4]], dtype=dtype, device=device # format is (xyxy) ) def func(z): @@ -155,8 +195,26 @@ class RoIOpTester(ABC): script_func = self.get_script_fn(rois, pool_size) - gradcheck(func, (x,)) - gradcheck(script_func, (x,)) + with DeterministicGuard(deterministic): + gradcheck(func, (x,), atol=atol) + + gradcheck(script_func, (x,), atol=atol) + + @needs_mps + def test_mps_error_inputs(self): + pool_size = 2 + x = torch.rand(1, 2 * (pool_size**2), 5, 5, dtype=torch.float16, device="mps", requires_grad=True) + rois = torch.tensor( + [[0, 0, 0, 4, 4], [0, 0, 2, 3, 4], [0, 2, 2, 4, 4]], dtype=torch.float16, device="mps" # format is (xyxy) + ) + + def func(z): + return self.fn(z, rois, pool_size, pool_size, spatial_scale=1, sampling_ratio=1) + + with pytest.raises( + RuntimeError, match="MPS does not support 
(?:ps_)?roi_(?:align|pool)? backward with float16 inputs." + ): + gradcheck(func, (x,)) @needs_cuda @pytest.mark.parametrize("x_dtype", (torch.float, torch.half)) @@ -252,6 +310,8 @@ class TestRoiPool(RoIOpTester): class TestPSRoIPool(RoIOpTester): + mps_backward_atol = 5e-2 + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): return ops.PSRoIPool((pool_h, pool_w), 1)(x, rois) @@ -333,6 +393,8 @@ def bilinear_interpolate(data, y, x, snap_border=False): class TestRoIAlign(RoIOpTester): + mps_backward_atol = 6e-2 + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, aligned=False, **kwargs): return ops.RoIAlign( (pool_h, pool_w), spatial_scale=spatial_scale, sampling_ratio=sampling_ratio, aligned=aligned @@ -384,7 +446,6 @@ class TestRoIAlign(RoIOpTester): grid_w = sampling_ratio if sampling_ratio > 0 else int(np.ceil(bin_w)) for channel in range(0, n_channels): - val = 0 for iy in range(0, grid_h): y = start_h + (iy + 0.5) * bin_h / grid_h @@ -400,23 +461,47 @@ class TestRoIAlign(RoIOpTester): self._helper_boxes_shape(ops.roi_align) @pytest.mark.parametrize("aligned", (True, False)) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) + @pytest.mark.parametrize("x_dtype", (torch.float16, torch.float32, torch.float64), ids=str) @pytest.mark.parametrize("contiguous", (True, False)) - def test_forward(self, device, contiguous, aligned, x_dtype=None, rois_dtype=None): + @pytest.mark.parametrize("deterministic", (True, False)) + def test_forward(self, device, contiguous, deterministic, aligned, x_dtype, rois_dtype=None): + if deterministic and device == "cpu": + pytest.skip("cpu is always deterministic, don't retest") super().test_forward( - device=device, contiguous=contiguous, x_dtype=x_dtype, rois_dtype=rois_dtype, aligned=aligned + device=device, + contiguous=contiguous, + deterministic=deterministic, + x_dtype=x_dtype, + rois_dtype=rois_dtype, + aligned=aligned, ) @needs_cuda @pytest.mark.parametrize("aligned", (True, False)) + @pytest.mark.parametrize("deterministic", (True, False)) @pytest.mark.parametrize("x_dtype", (torch.float, torch.half)) @pytest.mark.parametrize("rois_dtype", (torch.float, torch.half)) - def test_autocast(self, aligned, x_dtype, rois_dtype): + def test_autocast(self, aligned, deterministic, x_dtype, rois_dtype): with torch.cuda.amp.autocast(): self.test_forward( - torch.device("cuda"), contiguous=False, aligned=aligned, x_dtype=x_dtype, rois_dtype=rois_dtype + torch.device("cuda"), + contiguous=False, + deterministic=deterministic, + aligned=aligned, + x_dtype=x_dtype, + rois_dtype=rois_dtype, ) + @pytest.mark.parametrize("seed", range(10)) + @pytest.mark.parametrize("device", cpu_and_cuda_and_mps()) + @pytest.mark.parametrize("contiguous", (True, False)) + @pytest.mark.parametrize("deterministic", (True, False)) + def test_backward(self, seed, device, contiguous, deterministic): + if deterministic and device == "cpu": + pytest.skip("cpu is always deterministic, don't retest") + super().test_backward(seed, device, contiguous, deterministic) + def _make_rois(self, img_size, num_imgs, dtype, num_rois=1000): rois = torch.randint(0, img_size // 2, size=(num_rois, 5)).to(dtype) rois[:, 0] = torch.randint(0, num_imgs, size=(num_rois,)) # set batch index @@ -496,6 +581,8 @@ class TestRoIAlign(RoIOpTester): class TestPSRoIAlign(RoIOpTester): + mps_backward_atol = 5e-2 + def fn(self, x, rois, pool_h, pool_w, spatial_scale=1, sampling_ratio=-1, **kwargs): return 
ops.PSRoIAlign((pool_h, pool_w), spatial_scale=spatial_scale, sampling_ratio=sampling_ratio)(x, rois) @@ -571,7 +658,7 @@ class TestMultiScaleRoIAlign: ) assert repr(t) == expected_string - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_is_leaf_node(self, device): op_obj = self.make_obj(wrap=True).to(device=device) graph_node_names = get_graph_node_names(op_obj) @@ -585,8 +672,9 @@ class TestNMS: def _reference_nms(self, boxes, scores, iou_threshold): """ Args: - box_scores (N, 5): boxes in corner-form and probabilities. - iou_threshold: intersection over union threshold. + boxes: boxes in corner-form + scores: probabilities + iou_threshold: intersection over union threshold Returns: picked: a list of indexes of the kept boxes """ @@ -630,7 +718,7 @@ class TestNMS: boxes, scores = self._create_tensors_with_iou(1000, iou) keep_ref = self._reference_nms(boxes, scores, iou) keep = ops.nms(boxes, scores, iou) - assert torch.allclose(keep, keep_ref), err_msg.format(iou) + torch.testing.assert_close(keep, keep_ref, msg=err_msg.format(iou)) def test_nms_input_errors(self): with pytest.raises(RuntimeError): @@ -646,11 +734,11 @@ class TestNMS: @pytest.mark.parametrize("scale, zero_point", ((1, 0), (2, 50), (3, 10))) def test_qnms(self, iou, scale, zero_point): # Note: we compare qnms vs nms instead of qnms vs reference implementation. - # This is because with the int convertion, the trick used in _create_tensors_with_iou + # This is because with the int conversion, the trick used in _create_tensors_with_iou # doesn't really work (in fact, nms vs reference implem will also fail with ints) err_msg = "NMS and QNMS give different results for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) - scores *= 100 # otherwise most scores would be 0 or 1 after int convertion + scores *= 100 # otherwise most scores would be 0 or 1 after int conversion qboxes = torch.quantize_per_tensor(boxes, scale=scale, zero_point=zero_point, dtype=torch.quint8) qscores = torch.quantize_per_tensor(scores, scale=scale, zero_point=zero_point, dtype=torch.quint8) @@ -661,23 +749,30 @@ class TestNMS: keep = ops.nms(boxes, scores, iou) qkeep = ops.nms(qboxes, qscores, iou) - assert torch.allclose(qkeep, keep), err_msg.format(iou) + torch.testing.assert_close(qkeep, keep, msg=err_msg.format(iou)) - @needs_cuda + @pytest.mark.parametrize( + "device", + ( + pytest.param("cuda", marks=pytest.mark.needs_cuda), + pytest.param("mps", marks=pytest.mark.needs_mps), + ), + ) @pytest.mark.parametrize("iou", (0.2, 0.5, 0.8)) - def test_nms_cuda(self, iou, dtype=torch.float64): + def test_nms_gpu(self, iou, device, dtype=torch.float64): + dtype = torch.float32 if device == "mps" else dtype tol = 1e-3 if dtype is torch.half else 1e-5 err_msg = "NMS incompatible between CPU and CUDA for IoU={}" boxes, scores = self._create_tensors_with_iou(1000, iou) r_cpu = ops.nms(boxes, scores, iou) - r_cuda = ops.nms(boxes.cuda(), scores.cuda(), iou) + r_gpu = ops.nms(boxes.to(device), scores.to(device), iou) - is_eq = torch.allclose(r_cpu, r_cuda.cpu()) + is_eq = torch.allclose(r_cpu, r_gpu.cpu()) if not is_eq: # if the indices are not the same, ensure that it's because the scores # are duplicate - is_eq = torch.allclose(scores[r_cpu], scores[r_cuda.cpu()], rtol=tol, atol=tol) + is_eq = torch.allclose(scores[r_cpu], scores[r_gpu.cpu()], rtol=tol, atol=tol) assert is_eq, err_msg.format(iou) @needs_cuda @@ -685,18 +780,24 @@ class TestNMS: @pytest.mark.parametrize("dtype", 
(torch.float, torch.half)) def test_autocast(self, iou, dtype): with torch.cuda.amp.autocast(): - self.test_nms_cuda(iou=iou, dtype=dtype) + self.test_nms_gpu(iou=iou, dtype=dtype, device="cuda") - @needs_cuda - def test_nms_cuda_float16(self): + @pytest.mark.parametrize( + "device", + ( + pytest.param("cuda", marks=pytest.mark.needs_cuda), + pytest.param("mps", marks=pytest.mark.needs_mps), + ), + ) + def test_nms_float16(self, device): boxes = torch.tensor( [ [285.3538, 185.5758, 1193.5110, 851.4551], [285.1472, 188.7374, 1192.4984, 851.0669], [279.2440, 197.9812, 1189.4746, 849.2019], ] - ).cuda() - scores = torch.tensor([0.6370, 0.7569, 0.3966]).cuda() + ).to(device) + scores = torch.tensor([0.6370, 0.7569, 0.3966]).to(device) iou_thres = 0.2 keep32 = ops.nms(boxes, scores, iou_thres) @@ -843,7 +944,7 @@ class TestDeformConv: ) return DeformConvModuleWrapper(obj) if wrap else obj - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_is_leaf_node(self, device): op_obj = self.make_obj(wrap=True).to(device=device) graph_node_names = get_graph_node_names(op_obj) @@ -852,7 +953,7 @@ class TestDeformConv: assert len(graph_node_names[0]) == len(graph_node_names[1]) assert len(graph_node_names[0]) == 1 + op_obj.n_inputs - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("contiguous", (True, False)) @pytest.mark.parametrize("batch_sz", (0, 33)) def test_forward(self, device, contiguous, batch_sz, dtype=None): @@ -904,7 +1005,7 @@ class TestDeformConv: wrong_mask = torch.rand_like(mask[:, :2]) layer(x, offset, wrong_mask) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("contiguous", (True, False)) @pytest.mark.parametrize("batch_sz", (0, 33)) def test_backward(self, device, contiguous, batch_sz): @@ -977,7 +1078,6 @@ class TestDeformConv: weight = init_weight for d in ["cpu", "cuda"]: - out = ops.deform_conv2d(img.to(d), offset.to(d), weight.to(d), padding=1, mask=mask.to(d)) out.mean().backward() if true_cpu_grads is None: @@ -1237,7 +1337,7 @@ class TestIouBase: boxes2 = gen_box(7) a = TestIouBase._cartesian_product(boxes1, boxes2, target_fn) b = target_fn(boxes1, boxes2) - assert torch.allclose(a, b) + torch.testing.assert_close(a, b) class TestBoxIou(TestIouBase): @@ -1370,10 +1470,9 @@ def assert_empty_loss(iou_fn, dtype, device): class TestGeneralizedBoxIouLoss: # We refer to original test: https://github.com/facebookresearch/fvcore/blob/main/tests/test_giou_loss.py - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_giou_loss(self, dtype, device): - box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device) # Identical boxes should have loss of 0 @@ -1394,7 +1493,12 @@ class TestGeneralizedBoxIouLoss: assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 2.5, device=device, reduction="sum") assert_iou_loss(ops.generalized_box_iou_loss, box1s, box2s, 1.25, device=device, reduction="mean") - @pytest.mark.parametrize("device", cpu_and_gpu()) + # Test reduction value + # reduction value other than ["none", "mean", "sum"] should raise a ValueError + with pytest.raises(ValueError, match="Invalid"): + ops.generalized_box_iou_loss(box1s, box2s, reduction="xyz") + + @pytest.mark.parametrize("device", cpu_and_cuda()) 
@pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_inputs(self, dtype, device): assert_empty_loss(ops.generalized_box_iou_loss, dtype, device) @@ -1402,7 +1506,7 @@ class TestGeneralizedBoxIouLoss: class TestCompleteBoxIouLoss: @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) def test_ciou_loss(self, dtype, device): box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device) @@ -1413,14 +1517,17 @@ class TestCompleteBoxIouLoss: assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 1.2250, device=device, reduction="mean") assert_iou_loss(ops.complete_box_iou_loss, box1s, box2s, 2.4500, device=device, reduction="sum") - @pytest.mark.parametrize("device", cpu_and_gpu()) + with pytest.raises(ValueError, match="Invalid"): + ops.complete_box_iou_loss(box1s, box2s, reduction="xyz") + + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_inputs(self, dtype, device): assert_empty_loss(ops.complete_box_iou_loss, dtype, device) class TestDistanceBoxIouLoss: - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_distance_iou_loss(self, dtype, device): box1, box2, box3, box4, box1s, box2s = get_boxes(dtype, device) @@ -1432,7 +1539,10 @@ class TestDistanceBoxIouLoss: assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 1.2250, device=device, reduction="mean") assert_iou_loss(ops.distance_box_iou_loss, box1s, box2s, 2.4500, device=device, reduction="sum") - @pytest.mark.parametrize("device", cpu_and_gpu()) + with pytest.raises(ValueError, match="Invalid"): + ops.distance_box_iou_loss(box1s, box2s, reduction="xyz") + + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) def test_empty_distance_iou_inputs(self, dtype, device): assert_empty_loss(ops.distance_box_iou_loss, dtype, device) @@ -1477,7 +1587,7 @@ class TestFocalLoss: @pytest.mark.parametrize("alpha", [-1.0, 0.0, 0.58, 1.0]) @pytest.mark.parametrize("gamma", [0, 2]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) @pytest.mark.parametrize("seed", [0, 1]) def test_correct_ratio(self, alpha, gamma, device, dtype, seed): @@ -1506,7 +1616,7 @@ class TestFocalLoss: torch.testing.assert_close(correct_ratio, loss_ratio, atol=tol, rtol=tol) @pytest.mark.parametrize("reduction", ["mean", "sum"]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) @pytest.mark.parametrize("seed", [2, 3]) def test_equal_ce_loss(self, reduction, device, dtype, seed): @@ -1533,7 +1643,7 @@ class TestFocalLoss: @pytest.mark.parametrize("alpha", [-1.0, 0.0, 0.58, 1.0]) @pytest.mark.parametrize("gamma", [0, 2]) @pytest.mark.parametrize("reduction", ["none", "mean", "sum"]) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) @pytest.mark.parametrize("seed", [4, 5]) def test_jit(self, alpha, gamma, reduction, device, dtype, seed): @@ -1543,17 +1653,22 @@ class TestFocalLoss: torch.random.manual_seed(seed) inputs, targets = 
self._generate_diverse_input_target_pair(dtype=dtype, device=device) focal_loss = ops.sigmoid_focal_loss(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) - if device == "cpu": - scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) - else: - with torch.jit.fuser("fuser2"): - # Use fuser2 to prevent a bug on fuser: https://github.com/pytorch/pytorch/issues/75476 - # We may remove this condition once the bug is resolved - scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) + scripted_focal_loss = script_fn(inputs, targets, gamma=gamma, alpha=alpha, reduction=reduction) tol = 1e-3 if dtype is torch.half else 1e-5 torch.testing.assert_close(focal_loss, scripted_focal_loss, rtol=tol, atol=tol) + # Raise ValueError for anonymous reduction mode + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("dtype", [torch.float32, torch.half]) + def test_reduction_mode(self, device, dtype, reduction="xyz"): + if device == "cpu" and dtype is torch.half: + pytest.skip("Currently torch.half is not fully supported on cpu") + torch.random.manual_seed(0) + inputs, targets = self._generate_diverse_input_target_pair(device=device, dtype=dtype) + with pytest.raises(ValueError, match="Invalid"): + ops.sigmoid_focal_loss(inputs, targets, 0.25, 2, reduction) + class TestMasksToBoxes: def test_masks_box(self): @@ -1623,7 +1738,7 @@ class TestStochasticDepth: counts += batch_size - non_zero_count num_samples += batch_size - p_value = stats.binom_test(counts, num_samples, p=p) + p_value = stats.binomtest(counts, num_samples, p=p).pvalue assert p_value > 0.01 @pytest.mark.parametrize("seed", range(10)) diff --git a/test/test_transforms.py b/test/test_transforms.py index e0f8d4a5927e8e23e0a701426a66c49c3ea4f608..7581bf33220db307b90f2435673ae0698128d452 100644 --- a/test/test_transforms.py +++ b/test/test_transforms.py @@ -2,15 +2,16 @@ import math import os import random import re +import textwrap +import warnings from functools import partial import numpy as np import pytest import torch import torchvision.transforms as transforms -import torchvision.transforms._pil_constants as _pil_constants +import torchvision.transforms._functional_tensor as F_t import torchvision.transforms.functional as F -import torchvision.transforms.functional_tensor as F_t from PIL import Image from torch._utils_internal import get_file_path_2 @@ -24,7 +25,7 @@ try: except ImportError: stats = None -from common_utils import assert_equal, cycle_over, float_dtypes, int_dtypes +from common_utils import assert_equal, assert_run_python_script, cycle_over, float_dtypes, int_dtypes GRACE_HOPPER = get_file_path_2( @@ -174,7 +175,7 @@ class TestAccImage: def test_accimage_resize(self): trans = transforms.Compose( [ - transforms.Resize(256, interpolation=_pil_constants.LINEAR), + transforms.Resize(256, interpolation=Image.LINEAR), transforms.PILToTensor(), transforms.ConvertImageDtype(dtype=torch.float), ] @@ -319,7 +320,7 @@ def test_randomresized_params(): scale_range = (scale_min, scale_min + round(random.random(), 2)) aspect_min = max(round(random.random(), 2), epsilon) aspect_ratio_range = (aspect_min, aspect_min + round(random.random(), 2)) - randresizecrop = transforms.RandomResizedCrop(size, scale_range, aspect_ratio_range) + randresizecrop = transforms.RandomResizedCrop(size, scale_range, aspect_ratio_range, antialias=True) i, j, h, w = randresizecrop.get_params(img, scale_range, aspect_ratio_range) aspect_ratio_obtained 
= w / h assert ( @@ -366,7 +367,7 @@ def test_randomresized_params(): def test_resize(height, width, osize, max_size): img = Image.new("RGB", size=(width, height), color=127) - t = transforms.Resize(osize, max_size=max_size) + t = transforms.Resize(osize, max_size=max_size, antialias=True) result = t(img) msg = f"{height}, {width} - {osize} - {max_size}" @@ -424,7 +425,7 @@ def test_resize_sequence_output(height, width, osize): img = Image.new("RGB", size=(width, height), color=127) oheight, owidth = osize - t = transforms.Resize(osize) + t = transforms.Resize(osize, antialias=True) result = t(img) assert (owidth, oheight) == result.size @@ -439,6 +440,16 @@ def test_resize_antialias_error(): t(img) +def test_resize_antialias_default_warning(): + + img = Image.new("RGB", size=(10, 10), color=127) + # We make sure we don't warn for PIL images since the default behaviour doesn't change + with warnings.catch_warnings(): + warnings.simplefilter("error") + transforms.Resize((20, 20))(img) + transforms.RandomResizedCrop((20, 20))(img) + + @pytest.mark.parametrize("height, width", ((32, 64), (64, 32))) def test_resize_size_equals_small_edge_size(height, width): # Non-regression test for https://github.com/pytorch/vision/issues/5405 @@ -447,11 +458,21 @@ def test_resize_size_equals_small_edge_size(height, width): img = Image.new("RGB", size=(width, height), color=127) small_edge = min(height, width) - t = transforms.Resize(small_edge, max_size=max_size) + t = transforms.Resize(small_edge, max_size=max_size, antialias=True) result = t(img) assert max(result.size) == max_size +def test_resize_equal_input_output_sizes(): + # Regression test for https://github.com/pytorch/vision/issues/7518 + height, width = 28, 27 + img = Image.new("RGB", size=(width, height)) + + t = transforms.Resize((height, width), antialias=True) + result = t(img) + assert result is img + + class TestPad: @pytest.mark.parametrize("fill", [85, 85.0]) def test_pad(self, fill): @@ -931,33 +952,6 @@ def test_adjust_contrast(): torch.testing.assert_close(y_np, y_ans) -@pytest.mark.skipif(Image.__version__ >= "7", reason="Temporarily disabled") -def test_adjust_saturation(): - x_shape = [2, 2, 3] - x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] - x_np = np.array(x_data, dtype=np.uint8).reshape(x_shape) - x_pil = Image.fromarray(x_np, mode="RGB") - - # test 0 - y_pil = F.adjust_saturation(x_pil, 1) - y_np = np.array(y_pil) - torch.testing.assert_close(y_np, x_np) - - # test 1 - y_pil = F.adjust_saturation(x_pil, 0.5) - y_np = np.array(y_pil) - y_ans = [2, 4, 8, 87, 128, 173, 39, 25, 138, 133, 215, 88] - y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape) - torch.testing.assert_close(y_np, y_ans) - - # test 2 - y_pil = F.adjust_saturation(x_pil, 2) - y_np = np.array(y_pil) - y_ans = [0, 6, 22, 0, 149, 255, 32, 0, 255, 4, 255, 0] - y_ans = np.array(y_ans, dtype=np.uint8).reshape(x_shape) - torch.testing.assert_close(y_np, y_ans) - - def test_adjust_hue(): x_shape = [2, 2, 3] x_data = [0, 5, 13, 54, 135, 226, 37, 8, 234, 90, 255, 1] @@ -1424,17 +1418,17 @@ def test_random_choice(proba_passthrough, seed): def test_random_order(): random_state = random.getstate() random.seed(42) - random_order_transform = transforms.RandomOrder([transforms.Resize(20), transforms.CenterCrop(10)]) + random_order_transform = transforms.RandomOrder([transforms.Resize(20, antialias=True), transforms.CenterCrop(10)]) img = transforms.ToPILImage()(torch.rand(3, 25, 25)) num_samples = 250 num_normal_order = 0 - resize_crop_out = 
transforms.CenterCrop(10)(transforms.Resize(20)(img)) + resize_crop_out = transforms.CenterCrop(10)(transforms.Resize(20, antialias=True)(img)) for _ in range(num_samples): out = random_order_transform(img) if out == resize_crop_out: num_normal_order += 1 - p_value = stats.binom_test(num_normal_order, num_samples, p=0.5) + p_value = stats.binomtest(num_normal_order, num_samples, p=0.5).pvalue random.setstate(random_state) assert p_value > 0.0001 @@ -1522,10 +1516,10 @@ def test_ten_crop(should_vflip, single_dim): five_crop.__repr__() if should_vflip: - vflipped_img = img.transpose(_pil_constants.FLIP_TOP_BOTTOM) + vflipped_img = img.transpose(Image.FLIP_TOP_BOTTOM) expected_output += five_crop(vflipped_img) else: - hflipped_img = img.transpose(_pil_constants.FLIP_LEFT_RIGHT) + hflipped_img = img.transpose(Image.FLIP_LEFT_RIGHT) expected_output += five_crop(hflipped_img) assert len(results) == 10 @@ -1798,6 +1792,12 @@ def test_color_jitter(): color_jitter.__repr__() +@pytest.mark.parametrize("hue", [1, (-1, 1)]) +def test_color_jitter_hue_out_of_bounds(hue): + with pytest.raises(ValueError, match=re.escape("hue values should be between (-0.5, 0.5)")): + transforms.ColorJitter(hue=hue) + + @pytest.mark.parametrize("seed", range(10)) @pytest.mark.skipif(stats is None, reason="scipy.stats not available") def test_random_erasing(seed): @@ -1818,7 +1818,7 @@ def test_random_erasing(seed): tol = 0.05 assert 1 / 3 - tol <= aspect_ratio <= 3 + tol - # Make sure that h > w and h < w are equaly likely (log-scale sampling) + # Make sure that h > w and h < w are equally likely (log-scale sampling) aspect_ratios = [] random.seed(42) trial = 1000 @@ -1834,7 +1834,7 @@ def test_random_erasing(seed): aspect_ratios.append(h / w) count_bigger_then_ones = len([1 for aspect_ratio in aspect_ratios if aspect_ratio > 1]) - p_value = stats.binom_test(count_bigger_then_ones, trial, p=0.5) + p_value = stats.binomtest(count_bigger_then_ones, trial, p=0.5).pvalue assert p_value > 0.0001 # Checking if RandomErasing can be printed as string @@ -1866,16 +1866,8 @@ def test_random_rotation(): # Checking if RandomRotation can be printed as string t.__repr__() - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." - ), - ): - t = transforms.RandomRotation((-10, 10), interpolation=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR + t = transforms.RandomRotation((-10, 10), interpolation=Image.BILINEAR) + assert t.interpolation == transforms.InterpolationMode.BILINEAR def test_random_rotation_error(): @@ -2067,7 +2059,7 @@ class TestAffine: # https://github.com/python-pillow/Pillow/blob/71f8ec6a0cfc1008076a023c0756542539d057ab/ # src/libImaging/Geometry.c#L1060 input_pt = np.array([x + 0.5, y + 0.5, 1.0]) - res = np.floor(np.dot(inv_true_matrix, input_pt)).astype(np.int) + res = np.floor(np.dot(inv_true_matrix, input_pt)).astype(int) _x, _y = res[:2] if 0 <= _x < input_img.shape[1] and 0 <= _y < input_img.shape[0]: true_result[y, x, :] = input_img[_y, _x, :] @@ -2206,16 +2198,8 @@ def test_random_affine(): t = transforms.RandomAffine(10, interpolation=transforms.InterpolationMode.BILINEAR) assert "bilinear" in t.__repr__() - # assert changed type warning - with pytest.warns( - UserWarning, - match=re.escape( - "Argument 'interpolation' of type int is deprecated since 0.13 and will be removed in 0.15. " - "Please use InterpolationMode enum." 
- ), - ): - t = transforms.RandomAffine(10, interpolation=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR + t = transforms.RandomAffine(10, interpolation=Image.BILINEAR) + assert t.interpolation == transforms.InterpolationMode.BILINEAR def test_elastic_transformation(): @@ -2233,9 +2217,8 @@ def test_elastic_transformation(): with pytest.raises(ValueError, match=r"sigma is a sequence its length should be 2"): transforms.ElasticTransform(alpha=2.0, sigma=[1.0, 0.0, 1.0]) - with pytest.warns(UserWarning, match=r"Argument interpolation should be of type InterpolationMode"): - t = transforms.transforms.ElasticTransform(alpha=2.0, sigma=2.0, interpolation=2) - assert t.interpolation == transforms.InterpolationMode.BILINEAR + t = transforms.transforms.ElasticTransform(alpha=2.0, sigma=2.0, interpolation=Image.BILINEAR) + assert t.interpolation == transforms.InterpolationMode.BILINEAR with pytest.raises(TypeError, match=r"fill should be int or float"): transforms.ElasticTransform(alpha=1.0, sigma=1.0, fill={}) @@ -2267,5 +2250,35 @@ def test_random_grayscale_with_grayscale_input(): torch.testing.assert_close(F.pil_to_tensor(output_pil), image_tensor) +# TODO: remove in 0.17 when we can delete functional_pil.py and functional_tensor.py +@pytest.mark.parametrize( + "import_statement", + ( + "from torchvision.transforms import functional_pil", + "from torchvision.transforms import functional_tensor", + "from torchvision.transforms.functional_tensor import resize", + "from torchvision.transforms.functional_pil import resize", + ), +) +@pytest.mark.parametrize("from_private", (True, False)) +def test_functional_deprecation_warning(import_statement, from_private): + if from_private: + import_statement = import_statement.replace("functional", "_functional") + source = f""" + import warnings + + with warnings.catch_warnings(): + warnings.simplefilter("error") + {import_statement} + """ + else: + source = f""" + import pytest + with pytest.warns(UserWarning, match="removed in 0.17"): + {import_statement} + """ + assert_run_python_script(textwrap.dedent(source)) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/test_transforms_tensor.py b/test/test_transforms_tensor.py index f4ca544deb844142629fc76ea33274821f2433a9..e2ab5673f1efe8aac2a2f041e1599a4c97cc9c83 100644 --- a/test/test_transforms_tensor.py +++ b/test/test_transforms_tensor.py @@ -1,17 +1,18 @@ import os import sys +import warnings import numpy as np +import PIL.Image import pytest import torch -import torchvision.transforms._pil_constants as _pil_constants from common_utils import ( _assert_approx_equal_tensor_to_pil, _assert_equal_tensor_to_pil, _create_data, _create_data_batch, assert_equal, - cpu_and_gpu, + cpu_and_cuda, float_dtypes, get_tmp_dir, int_dtypes, @@ -20,7 +21,12 @@ from torchvision import transforms as T from torchvision.transforms import functional as F, InterpolationMode from torchvision.transforms.autoaugment import _apply_op -NEAREST, BILINEAR, BICUBIC = InterpolationMode.NEAREST, InterpolationMode.BILINEAR, InterpolationMode.BICUBIC +NEAREST, NEAREST_EXACT, BILINEAR, BICUBIC = ( + InterpolationMode.NEAREST, + InterpolationMode.NEAREST_EXACT, + InterpolationMode.BILINEAR, + InterpolationMode.BICUBIC, +) def _test_transform_vs_scripted(transform, s_transform, tensor, msg=None): @@ -94,12 +100,12 @@ def _test_op(func, method, device, channels=3, fn_kwargs=None, meth_kwargs=None, def _test_fn_save_load(fn, tmpdir): scripted_fn = torch.jit.script(fn) - p = os.path.join(tmpdir, 
f"t_op_list_{fn.__name__ if hasattr(fn, '__name__') else fn.__class__.__name__}.pt") + p = os.path.join(tmpdir, f"t_op_list_{getattr(fn, '__name__', fn.__class__.__name__)}.pt") scripted_fn.save(p) _ = torch.jit.load(p) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "func,method,fn_kwargs,match_kwargs", [ @@ -124,7 +130,7 @@ def test_random(func, method, device, channels, fn_kwargs, match_kwargs): @pytest.mark.parametrize("seed", range(10)) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("channels", [1, 3]) class TestColorJitter: @pytest.fixture(autouse=True) @@ -200,7 +206,7 @@ class TestColorJitter: ) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("m", ["constant", "edge", "reflect", "symmetric"]) @pytest.mark.parametrize("mul", [1, -1]) def test_pad(m, mul, device): @@ -223,7 +229,7 @@ def test_pad(m, mul, device): _test_op(F.pad, T.Pad, device=device, fn_kwargs=fn_kwargs, meth_kwargs=meth_kwargs) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_crop(device): fn_kwargs = {"top": 2, "left": 3, "height": 4, "width": 5} # Test transforms.RandomCrop with size and padding as tuple @@ -251,7 +257,7 @@ def test_crop(device): _test_functional_op(F.crop, fn_kwargs=fn_kwargs, device=device) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "padding_config", [ @@ -277,7 +283,7 @@ def test_random_crop_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_center_crop(device, tmpdir): fn_kwargs = {"output_size": (4, 5)} meth_kwargs = {"size": (4, 5)} @@ -307,7 +313,7 @@ def test_center_crop_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "fn, method, out_length", [ @@ -366,7 +372,7 @@ class TestResize: def test_resize_int(self, size): # TODO: Minimal check for bug-fix, improve this later x = torch.rand(3, 32, 46) - t = T.Resize(size=size) + t = T.Resize(size=size, antialias=True) y = t(x) # If size is an int, smaller edge of the image will be matched to this number. # i.e, if height > width, then image will be rescaled to (size * height / width, size). 
@@ -374,11 +380,11 @@ class TestResize: assert y.shape[1] == size assert y.shape[2] == int(size * 46 / 32) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("dt", [None, torch.float32, torch.float64]) @pytest.mark.parametrize("size", [[32], [32, 32], (32, 32), [34, 35]]) @pytest.mark.parametrize("max_size", [None, 35, 1000]) - @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST]) + @pytest.mark.parametrize("interpolation", [BILINEAR, BICUBIC, NEAREST, NEAREST_EXACT]) def test_resize_scripted(self, dt, size, max_size, interpolation, device): tensor, _ = _create_data(height=34, width=36, device=device) batch_tensors = torch.randint(0, 256, size=(4, 3, 44, 56), dtype=torch.uint8, device=device) @@ -389,25 +395,25 @@ class TestResize: if max_size is not None and len(size) != 1: pytest.skip("Size should be an int or a sequence of length 1 if max_size is specified") - transform = T.Resize(size=size, interpolation=interpolation, max_size=max_size) + transform = T.Resize(size=size, interpolation=interpolation, max_size=max_size, antialias=True) s_transform = torch.jit.script(transform) _test_transform_vs_scripted(transform, s_transform, tensor) _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) def test_resize_save_load(self, tmpdir): - fn = T.Resize(size=[32]) + fn = T.Resize(size=[32], antialias=True) _test_fn_save_load(fn, tmpdir) - @pytest.mark.parametrize("device", cpu_and_gpu()) + @pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("scale", [(0.7, 1.2), [0.7, 1.2]]) @pytest.mark.parametrize("ratio", [(0.75, 1.333), [0.75, 1.333]]) @pytest.mark.parametrize("size", [(32,), [44], [32], [32, 32], (32, 32), [44, 55]]) - @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC]) + @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR, BICUBIC, NEAREST_EXACT]) @pytest.mark.parametrize("antialias", [None, True, False]) def test_resized_crop(self, scale, ratio, size, interpolation, antialias, device): - if antialias and interpolation == NEAREST: - pytest.skip("Can not resize if interpolation mode is NEAREST and antialias=True") + if antialias and interpolation in {NEAREST, NEAREST_EXACT}: + pytest.skip(f"Can not resize if interpolation mode is {interpolation} and antialias=True") tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) batch_tensors = torch.randint(0, 256, size=(4, 3, 44, 56), dtype=torch.uint8, device=device) @@ -419,9 +425,25 @@ class TestResize: _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) def test_resized_crop_save_load(self, tmpdir): - fn = T.RandomResizedCrop(size=[32]) + fn = T.RandomResizedCrop(size=[32], antialias=True) _test_fn_save_load(fn, tmpdir) + def test_antialias_default_warning(self): + + img = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8) + + match = "The default value of the antialias" + with pytest.warns(UserWarning, match=match): + T.Resize((20, 20))(img) + with pytest.warns(UserWarning, match=match): + T.RandomResizedCrop((20, 20))(img) + + # For modes that aren't bicubic or bilinear, don't throw a warning + with warnings.catch_warnings(): + warnings.simplefilter("error") + T.Resize((20, 20), interpolation=NEAREST)(img) + T.RandomResizedCrop((20, 20), interpolation=NEAREST)(img) + def _test_random_affine_helper(device, **kwargs): tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) @@ -438,42 
+460,42 @@ def test_random_affine_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("shear", [15, 10.0, (5.0, 10.0), [-15, 15], [-10.0, 10.0, -11.0, 11.0]]) def test_random_affine_shear(device, interpolation, shear): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, shear=shear) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("scale", [(0.7, 1.2), [0.7, 1.2]]) def test_random_affine_scale(device, interpolation, scale): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, scale=scale) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("translate", [(0.1, 0.2), [0.2, 0.1]]) def test_random_affine_translate(device, interpolation, translate): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, translate=translate) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("degrees", [45, 35.0, (-45, 45), [-90.0, 90.0]]) def test_random_affine_degrees(device, interpolation, degrees): _test_random_affine_helper(device, degrees=degrees, interpolation=interpolation) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("fill", [85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_random_affine_fill(device, interpolation, fill): _test_random_affine_helper(device, degrees=0.0, interpolation=interpolation, fill=fill) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("center", [(0, 0), [10, 10], None, (56, 44)]) @pytest.mark.parametrize("expand", [True, False]) @pytest.mark.parametrize("degrees", [45, 35.0, (-45, 45), [-90.0, 90.0]]) @@ -495,7 +517,7 @@ def test_random_rotate_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("distortion_scale", np.linspace(0.1, 1.0, num=20)) @pytest.mark.parametrize("interpolation", [NEAREST, BILINEAR]) @pytest.mark.parametrize("fill", [85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) @@ -515,7 +537,7 @@ def test_random_perspective_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "Klass, meth_kwargs", [(T.Grayscale, {"num_output_channels": 1}), (T.Grayscale, {"num_output_channels": 3}), (T.RandomGrayscale, {})], @@ -525,7 +547,7 @@ def test_to_grayscale(device, Klass, meth_kwargs): _test_class_op(Klass, meth_kwargs=meth_kwargs, test_exact_match=False, device=device, tol=tol, agg_method="max") -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("in_dtype", int_dtypes() + float_dtypes()) @pytest.mark.parametrize("out_dtype", int_dtypes() + float_dtypes()) 
def test_convert_image_dtype(device, in_dtype, out_dtype): @@ -556,7 +578,7 @@ def test_convert_image_dtype_save_load(tmpdir): _test_fn_save_load(fn, tmpdir) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("policy", [policy for policy in T.AutoAugmentPolicy]) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_autoaugment(device, policy, fill): @@ -570,7 +592,7 @@ def test_autoaugment(device, policy, fill): _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("num_ops", [1, 2, 3]) @pytest.mark.parametrize("magnitude", [7, 9, 11]) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) @@ -585,7 +607,7 @@ def test_randaugment(device, num_ops, magnitude, fill): _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_trivialaugmentwide(device, fill): tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) @@ -598,7 +620,7 @@ def test_trivialaugmentwide(device, fill): _test_transform_vs_scripted_on_batch(transform, s_transform, batch_tensors) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) def test_augmix(device, fill): tensor = torch.randint(0, 256, size=(3, 44, 56), dtype=torch.uint8, device=device) @@ -635,13 +657,13 @@ def test_autoaugment__op_apply_shear(interpolation, mode): matrix = (1, level, 0, 0, 1, 0) elif mode == "Y": matrix = (1, 0, 0, level, 1, 0) - return pil_img.transform((image_size, image_size), _pil_constants.AFFINE, matrix, resample=resample) + return pil_img.transform((image_size, image_size), PIL.Image.AFFINE, matrix, resample=resample) t_img, pil_img = _create_data(image_size, image_size) resample_pil = { - F.InterpolationMode.NEAREST: _pil_constants.NEAREST, - F.InterpolationMode.BILINEAR: _pil_constants.BILINEAR, + F.InterpolationMode.NEAREST: PIL.Image.NEAREST, + F.InterpolationMode.BILINEAR: PIL.Image.BILINEAR, }[interpolation] level = 0.3 @@ -664,10 +686,20 @@ def test_autoaugment__op_apply_shear(interpolation, mode): _assert_approx_equal_tensor_to_pil(out, expected_out) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "config", - [{"value": 0.2}, {"value": "random"}, {"value": (0.2, 0.2, 0.2)}, {"value": "random", "ratio": (0.1, 0.2)}], + [ + {}, + {"value": 1}, + {"value": 0.2}, + {"value": "random"}, + {"value": (1, 1, 1)}, + {"value": (0.2, 0.2, 0.2)}, + {"value": [1, 1, 1]}, + {"value": [0.2, 0.2, 0.2]}, + {"value": "random", "ratio": (0.1, 0.2)}, + ], ) def test_random_erasing(device, config): tensor, _ = _create_data(24, 32, channels=3, device=device) @@ -692,7 +724,7 @@ def test_random_erasing_with_invalid_data(): random_erasing(img) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_normalize(device, tmpdir): fn = T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)) tensor, _ = _create_data(26, 34, device=device) @@ -711,7 +743,7 @@ 
def test_normalize(device, tmpdir): scripted_fn.save(os.path.join(tmpdir, "t_norm.pt")) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_linear_transformation(device, tmpdir): c, h, w = 3, 24, 32 @@ -737,7 +769,7 @@ def test_linear_transformation(device, tmpdir): scripted_fn.save(os.path.join(tmpdir, "t_norm.pt")) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_compose(device): tensor, _ = _create_data(26, 34, device=device) tensor = tensor.to(dtype=torch.float32) / 255.0 @@ -765,7 +797,7 @@ def test_compose(device): torch.jit.script(t) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) def test_random_apply(device): tensor, _ = _create_data(26, 34, device=device) tensor = tensor.to(dtype=torch.float32) / 255.0 @@ -807,7 +839,7 @@ def test_random_apply(device): torch.jit.script(transforms) -@pytest.mark.parametrize("device", cpu_and_gpu()) +@pytest.mark.parametrize("device", cpu_and_cuda()) @pytest.mark.parametrize( "meth_kwargs", [ @@ -843,3 +875,35 @@ def test_gaussian_blur(device, channels, meth_kwargs): agg_method="max", tol=tol, ) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize( + "fill", + [ + 1, + 1.0, + [1], + [1.0], + (1,), + (1.0,), + [1, 2, 3], + [1.0, 2.0, 3.0], + (1, 2, 3), + (1.0, 2.0, 3.0), + ], +) +@pytest.mark.parametrize("channels", [1, 3]) +def test_elastic_transform(device, channels, fill): + if isinstance(fill, (list, tuple)) and len(fill) > 1 and channels == 1: + # For this the test would correctly fail, since the number of channels in the image does not match `fill`. + # Thus, this is not an issue in the transform, but rather a problem of parametrization that just gives the + # product of `fill` and `channels`. 
+ return + + _test_class_op( + T.ElasticTransform, + meth_kwargs=dict(fill=fill), + channels=channels, + device=device, + ) diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..3f0056e96ab67352dce71d80a72a6c47290d7315 --- /dev/null +++ b/test/test_transforms_v2.py @@ -0,0 +1,1185 @@ +import itertools +import pathlib +import pickle +import random +import warnings + +import numpy as np + +import PIL.Image +import pytest +import torch +import torchvision.transforms.v2 as transforms + +from common_utils import assert_equal, cpu_and_cuda +from torch.utils._pytree import tree_flatten, tree_unflatten +from torchvision import tv_tensors +from torchvision.ops.boxes import box_iou +from torchvision.transforms.functional import to_pil_image +from torchvision.transforms.v2 import functional as F +from torchvision.transforms.v2._utils import check_type, is_pure_tensor, query_chw +from transforms_v2_legacy_utils import ( + make_bounding_boxes, + make_detection_mask, + make_image, + make_images, + make_multiple_bounding_boxes, + make_segmentation_mask, + make_video, + make_videos, +) + + +def make_vanilla_tensor_images(*args, **kwargs): + for image in make_images(*args, **kwargs): + if image.ndim > 3: + continue + yield image.data + + +def make_pil_images(*args, **kwargs): + for image in make_vanilla_tensor_images(*args, **kwargs): + yield to_pil_image(image) + + +def make_vanilla_tensor_bounding_boxes(*args, **kwargs): + for bounding_boxes in make_multiple_bounding_boxes(*args, **kwargs): + yield bounding_boxes.data + + +def parametrize(transforms_with_inputs): + return pytest.mark.parametrize( + ("transform", "input"), + [ + pytest.param( + transform, + input, + id=f"{type(transform).__name__}-{type(input).__module__}.{type(input).__name__}-{idx}", + ) + for transform, inputs in transforms_with_inputs + for idx, input in enumerate(inputs) + ], + ) + + +def auto_augment_adapter(transform, input, device): + adapted_input = {} + image_or_video_found = False + for key, value in input.items(): + if isinstance(value, (tv_tensors.BoundingBoxes, tv_tensors.Mask)): + # AA transforms don't support bounding boxes or masks + continue + elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor, PIL.Image.Image)): + if image_or_video_found: + # AA transforms only support a single image or video + continue + image_or_video_found = True + adapted_input[key] = value + return adapted_input + + +def linear_transformation_adapter(transform, input, device): + flat_inputs = list(input.values()) + c, h, w = query_chw( + [ + item + for item, needs_transform in zip(flat_inputs, transforms.Transform()._needs_transform_list(flat_inputs)) + if needs_transform + ] + ) + num_elements = c * h * w + transform.transformation_matrix = torch.randn((num_elements, num_elements), device=device) + transform.mean_vector = torch.randn((num_elements,), device=device) + return {key: value for key, value in input.items() if not isinstance(value, PIL.Image.Image)} + + +def normalize_adapter(transform, input, device): + adapted_input = {} + for key, value in input.items(): + if isinstance(value, PIL.Image.Image): + # normalize doesn't support PIL images + continue + elif check_type(value, (tv_tensors.Image, tv_tensors.Video, is_pure_tensor)): + # normalize doesn't support integer images + value = F.to_dtype(value, torch.float32, scale=True) + adapted_input[key] = value + return adapted_input + + +class TestSmoke: + @pytest.mark.parametrize( + 
("transform", "adapter"), + [ + (transforms.RandomErasing(p=1.0), None), + (transforms.AugMix(), auto_augment_adapter), + (transforms.AutoAugment(), auto_augment_adapter), + (transforms.RandAugment(), auto_augment_adapter), + (transforms.TrivialAugmentWide(), auto_augment_adapter), + (transforms.ColorJitter(brightness=0.1, contrast=0.2, saturation=0.3, hue=0.15), None), + (transforms.Grayscale(), None), + (transforms.RandomAdjustSharpness(sharpness_factor=0.5, p=1.0), None), + (transforms.RandomAutocontrast(p=1.0), None), + (transforms.RandomEqualize(p=1.0), None), + (transforms.RandomGrayscale(p=1.0), None), + (transforms.RandomInvert(p=1.0), None), + (transforms.RandomChannelPermutation(), None), + (transforms.RandomPhotometricDistort(p=1.0), None), + (transforms.RandomPosterize(bits=4, p=1.0), None), + (transforms.RandomSolarize(threshold=0.5, p=1.0), None), + (transforms.CenterCrop([16, 16]), None), + (transforms.ElasticTransform(sigma=1.0), None), + (transforms.Pad(4), None), + (transforms.RandomAffine(degrees=30.0), None), + (transforms.RandomCrop([16, 16], pad_if_needed=True), None), + (transforms.RandomHorizontalFlip(p=1.0), None), + (transforms.RandomPerspective(p=1.0), None), + (transforms.RandomResize(min_size=10, max_size=20, antialias=True), None), + (transforms.RandomResizedCrop([16, 16], antialias=True), None), + (transforms.RandomRotation(degrees=30), None), + (transforms.RandomShortestSize(min_size=10, antialias=True), None), + (transforms.RandomVerticalFlip(p=1.0), None), + (transforms.RandomZoomOut(p=1.0), None), + (transforms.Resize([16, 16], antialias=True), None), + (transforms.ScaleJitter((16, 16), scale_range=(0.8, 1.2), antialias=True), None), + (transforms.ClampBoundingBoxes(), None), + (transforms.ConvertBoundingBoxFormat(tv_tensors.BoundingBoxFormat.CXCYWH), None), + (transforms.ConvertImageDtype(), None), + (transforms.GaussianBlur(kernel_size=3), None), + ( + transforms.LinearTransformation( + # These are just dummy values that will be filled by the adapter. 
We can't define them upfront, + # because for we neither know the spatial size nor the device at this point + transformation_matrix=torch.empty((1, 1)), + mean_vector=torch.empty((1,)), + ), + linear_transformation_adapter, + ), + (transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), normalize_adapter), + (transforms.ToDtype(torch.float64), None), + (transforms.UniformTemporalSubsample(num_samples=2), None), + ], + ids=lambda transform: type(transform).__name__, + ) + @pytest.mark.parametrize("container_type", [dict, list, tuple]) + @pytest.mark.parametrize( + "image_or_video", + [ + make_image(), + make_video(), + next(make_pil_images(color_spaces=["RGB"])), + next(make_vanilla_tensor_images()), + ], + ) + @pytest.mark.parametrize("de_serialize", [lambda t: t, lambda t: pickle.loads(pickle.dumps(t))]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_common(self, transform, adapter, container_type, image_or_video, de_serialize, device): + transform = de_serialize(transform) + + canvas_size = F.get_size(image_or_video) + input = dict( + image_or_video=image_or_video, + image_tv_tensor=make_image(size=canvas_size), + video_tv_tensor=make_video(size=canvas_size), + image_pil=next(make_pil_images(sizes=[canvas_size], color_spaces=["RGB"])), + bounding_boxes_xyxy=make_bounding_boxes( + format=tv_tensors.BoundingBoxFormat.XYXY, canvas_size=canvas_size, batch_dims=(3,) + ), + bounding_boxes_xywh=make_bounding_boxes( + format=tv_tensors.BoundingBoxFormat.XYWH, canvas_size=canvas_size, batch_dims=(4,) + ), + bounding_boxes_cxcywh=make_bounding_boxes( + format=tv_tensors.BoundingBoxFormat.CXCYWH, canvas_size=canvas_size, batch_dims=(5,) + ), + bounding_boxes_degenerate_xyxy=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [2, 0, 1, 1], # x1 > x2, y1 < y2 + [0, 2, 1, 1], # x1 < x2, y1 > y2 + [2, 2, 1, 1], # x1 > x2, y1 > y2 + ], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=canvas_size, + ), + bounding_boxes_degenerate_xywh=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [0, 0, 1, -1], # negative height + [0, 0, -1, 1], # negative width + [0, 0, -1, -1], # negative height and width + ], + format=tv_tensors.BoundingBoxFormat.XYWH, + canvas_size=canvas_size, + ), + bounding_boxes_degenerate_cxcywh=tv_tensors.BoundingBoxes( + [ + [0, 0, 0, 0], # no height or width + [0, 0, 0, 1], # no height + [0, 0, 1, 0], # no width + [0, 0, 1, -1], # negative height + [0, 0, -1, 1], # negative width + [0, 0, -1, -1], # negative height and width + ], + format=tv_tensors.BoundingBoxFormat.CXCYWH, + canvas_size=canvas_size, + ), + detection_mask=make_detection_mask(size=canvas_size), + segmentation_mask=make_segmentation_mask(size=canvas_size), + int=0, + float=0.0, + bool=True, + none=None, + str="str", + path=pathlib.Path.cwd(), + object=object(), + tensor=torch.empty(5), + array=np.empty(5), + ) + if adapter is not None: + input = adapter(transform, input, device) + + if container_type in {tuple, list}: + input = container_type(input.values()) + + input_flat, input_spec = tree_flatten(input) + input_flat = [item.to(device) if isinstance(item, torch.Tensor) else item for item in input_flat] + input = tree_unflatten(input_flat, input_spec) + + torch.manual_seed(0) + output = transform(input) + output_flat, output_spec = tree_flatten(output) + + assert output_spec == input_spec + + for output_item, input_item, 
should_be_transformed in zip( + output_flat, input_flat, transforms.Transform()._needs_transform_list(input_flat) + ): + if should_be_transformed: + assert type(output_item) is type(input_item) + else: + assert output_item is input_item + + if isinstance(input_item, tv_tensors.BoundingBoxes) and not isinstance( + transform, transforms.ConvertBoundingBoxFormat + ): + assert output_item.format == input_item.format + + # Enforce that the transform does not turn a degenerate box marked by RandomIoUCrop (or any other future + # transform that does this), back into a valid one. + # TODO: we should test that against all degenerate boxes above + for format in list(tv_tensors.BoundingBoxFormat): + sample = dict( + boxes=tv_tensors.BoundingBoxes([[0, 0, 0, 0]], format=format, canvas_size=(224, 244)), + labels=torch.tensor([3]), + ) + assert transforms.SanitizeBoundingBoxes()(sample)["boxes"].shape == (0, 4) + + @parametrize( + [ + ( + transform, + itertools.chain.from_iterable( + fn( + color_spaces=[ + "GRAY", + "RGB", + ], + dtypes=[torch.uint8], + extra_dims=[(), (4,)], + **(dict(num_frames=[3]) if fn is make_videos else dict()), + ) + for fn in [ + make_images, + make_vanilla_tensor_images, + make_pil_images, + make_videos, + ] + ), + ) + for transform in ( + transforms.RandAugment(), + transforms.TrivialAugmentWide(), + transforms.AutoAugment(), + transforms.AugMix(), + ) + ] + ) + def test_auto_augment(self, transform, input): + transform(input) + + @parametrize( + [ + ( + transforms.Normalize(mean=[0.0, 0.0, 0.0], std=[1.0, 1.0, 1.0]), + itertools.chain.from_iterable( + fn(color_spaces=["RGB"], dtypes=[torch.float32]) + for fn in [ + make_images, + make_vanilla_tensor_images, + make_videos, + ] + ), + ), + ] + ) + def test_normalize(self, transform, input): + transform(input) + + @parametrize( + [ + ( + transforms.RandomResizedCrop([16, 16], antialias=True), + itertools.chain( + make_images(extra_dims=[(4,)]), + make_vanilla_tensor_images(), + make_pil_images(), + make_videos(extra_dims=[()]), + ), + ) + ] + ) + def test_random_resized_crop(self, transform, input): + transform(input) + + +@pytest.mark.parametrize( + "flat_inputs", + itertools.permutations( + [ + next(make_vanilla_tensor_images()), + next(make_vanilla_tensor_images()), + next(make_pil_images()), + make_image(), + next(make_videos()), + ], + 3, + ), +) +def test_pure_tensor_heuristic(flat_inputs): + def split_on_pure_tensor(to_split): + # This takes a sequence that is structurally aligned with `flat_inputs` and splits its items into three parts: + # 1. The first pure tensor. If none is present, this will be `None` + # 2. A list of the remaining pure tensors + # 3. A list of all other items + pure_tensors = [] + others = [] + # Splitting always happens on the original `flat_inputs` to avoid any erroneous type changes by the transform to + # affect the splitting. 
+ for item, inpt in zip(to_split, flat_inputs): + (pure_tensors if is_pure_tensor(inpt) else others).append(item) + return pure_tensors[0] if pure_tensors else None, pure_tensors[1:], others + + class CopyCloneTransform(transforms.Transform): + def _transform(self, inpt, params): + return inpt.clone() if isinstance(inpt, torch.Tensor) else inpt.copy() + + @staticmethod + def was_applied(output, inpt): + identity = output is inpt + if identity: + return False + + # Make sure nothing fishy is going on + assert_equal(output, inpt) + return True + + first_pure_tensor_input, other_pure_tensor_inputs, other_inputs = split_on_pure_tensor(flat_inputs) + + transform = CopyCloneTransform() + transformed_sample = transform(flat_inputs) + + first_pure_tensor_output, other_pure_tensor_outputs, other_outputs = split_on_pure_tensor(transformed_sample) + + if first_pure_tensor_input is not None: + if other_inputs: + assert not transform.was_applied(first_pure_tensor_output, first_pure_tensor_input) + else: + assert transform.was_applied(first_pure_tensor_output, first_pure_tensor_input) + + for output, inpt in zip(other_pure_tensor_outputs, other_pure_tensor_inputs): + assert not transform.was_applied(output, inpt) + + for input, output in zip(other_inputs, other_outputs): + assert transform.was_applied(output, input) + + +class TestPad: + def test_assertions(self): + with pytest.raises(TypeError, match="Got inappropriate padding arg"): + transforms.Pad("abc") + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.Pad([-0.7, 0, 0.7]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.Pad(12, fill="abc") + + with pytest.raises(ValueError, match="Padding mode should be either"): + transforms.Pad(12, padding_mode="abc") + + +class TestRandomZoomOut: + def test_assertions(self): + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomZoomOut(fill="abc") + + with pytest.raises(TypeError, match="should be a sequence of length"): + transforms.RandomZoomOut(0, side_range=0) + + with pytest.raises(ValueError, match="Invalid canvas side range"): + transforms.RandomZoomOut(0, side_range=[4.0, 1.0]) + + @pytest.mark.parametrize("fill", [0, [1, 2, 3], (2, 3, 4)]) + @pytest.mark.parametrize("side_range", [(1.0, 4.0), [2.0, 5.0]]) + def test__get_params(self, fill, side_range): + transform = transforms.RandomZoomOut(fill=fill, side_range=side_range) + + h, w = size = (24, 32) + image = make_image(size) + + params = transform._get_params([image]) + + assert len(params["padding"]) == 4 + assert 0 <= params["padding"][0] <= (side_range[1] - 1) * w + assert 0 <= params["padding"][1] <= (side_range[1] - 1) * h + assert 0 <= params["padding"][2] <= (side_range[1] - 1) * w + assert 0 <= params["padding"][3] <= (side_range[1] - 1) * h + + +class TestRandomPerspective: + def test_assertions(self): + with pytest.raises(ValueError, match="Argument distortion_scale value should be between 0 and 1"): + transforms.RandomPerspective(distortion_scale=-1.0) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomPerspective(0.5, fill="abc") + + def test__get_params(self): + dscale = 0.5 + transform = transforms.RandomPerspective(dscale) + + image = make_image((24, 32)) + + params = transform._get_params([image]) + + assert "coefficients" in params + assert len(params["coefficients"]) == 8 + + +class TestElasticTransform: + def test_assertions(self): + + with pytest.raises(TypeError, match="alpha 
should be a number or a sequence of numbers"): + transforms.ElasticTransform({}) + + with pytest.raises(ValueError, match="alpha is a sequence its length should be 1 or 2"): + transforms.ElasticTransform([1.0, 2.0, 3.0]) + + with pytest.raises(TypeError, match="sigma should be a number or a sequence of numbers"): + transforms.ElasticTransform(1.0, {}) + + with pytest.raises(ValueError, match="sigma is a sequence its length should be 1 or 2"): + transforms.ElasticTransform(1.0, [1.0, 2.0, 3.0]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.ElasticTransform(1.0, 2.0, fill="abc") + + def test__get_params(self): + alpha = 2.0 + sigma = 3.0 + transform = transforms.ElasticTransform(alpha, sigma) + + h, w = size = (24, 32) + image = make_image(size) + + params = transform._get_params([image]) + + displacement = params["displacement"] + assert displacement.shape == (1, h, w, 2) + assert (-alpha / w <= displacement[0, ..., 0]).all() and (displacement[0, ..., 0] <= alpha / w).all() + assert (-alpha / h <= displacement[0, ..., 1]).all() and (displacement[0, ..., 1] <= alpha / h).all() + + +class TestTransform: + @pytest.mark.parametrize( + "inpt_type", + [torch.Tensor, PIL.Image.Image, tv_tensors.Image, np.ndarray, tv_tensors.BoundingBoxes, str, int], + ) + def test_check_transformed_types(self, inpt_type, mocker): + # This test ensures that we correctly handle which types to transform and which to bypass + t = transforms.Transform() + inpt = mocker.MagicMock(spec=inpt_type) + + if inpt_type in (np.ndarray, str, int): + output = t(inpt) + assert output is inpt + else: + with pytest.raises(NotImplementedError): + t(inpt) + + +class TestToImage: + @pytest.mark.parametrize( + "inpt_type", + [torch.Tensor, PIL.Image.Image, tv_tensors.Image, np.ndarray, tv_tensors.BoundingBoxes, str, int], + ) + def test__transform(self, inpt_type, mocker): + fn = mocker.patch( + "torchvision.transforms.v2.functional.to_image", + return_value=torch.rand(1, 3, 8, 8), + ) + + inpt = mocker.MagicMock(spec=inpt_type) + transform = transforms.ToImage() + transform(inpt) + if inpt_type in (tv_tensors.BoundingBoxes, tv_tensors.Image, str, int): + assert fn.call_count == 0 + else: + fn.assert_called_once_with(inpt) + + +class TestToPILImage: + @pytest.mark.parametrize( + "inpt_type", + [torch.Tensor, PIL.Image.Image, tv_tensors.Image, np.ndarray, tv_tensors.BoundingBoxes, str, int], + ) + def test__transform(self, inpt_type, mocker): + fn = mocker.patch("torchvision.transforms.v2.functional.to_pil_image") + + inpt = mocker.MagicMock(spec=inpt_type) + transform = transforms.ToPILImage() + transform(inpt) + if inpt_type in (PIL.Image.Image, tv_tensors.BoundingBoxes, str, int): + assert fn.call_count == 0 + else: + fn.assert_called_once_with(inpt, mode=transform.mode) + + +class TestToTensor: + @pytest.mark.parametrize( + "inpt_type", + [torch.Tensor, PIL.Image.Image, tv_tensors.Image, np.ndarray, tv_tensors.BoundingBoxes, str, int], + ) + def test__transform(self, inpt_type, mocker): + fn = mocker.patch("torchvision.transforms.functional.to_tensor") + + inpt = mocker.MagicMock(spec=inpt_type) + with pytest.warns(UserWarning, match="deprecated and will be removed"): + transform = transforms.ToTensor() + transform(inpt) + if inpt_type in (tv_tensors.Image, torch.Tensor, tv_tensors.BoundingBoxes, str, int): + assert fn.call_count == 0 + else: + fn.assert_called_once_with(inpt) + + +class TestContainers: + @pytest.mark.parametrize("transform_cls", [transforms.Compose, transforms.RandomChoice, 
transforms.RandomOrder]) + def test_assertions(self, transform_cls): + with pytest.raises(TypeError, match="Argument transforms should be a sequence of callables"): + transform_cls(transforms.RandomCrop(28)) + + @pytest.mark.parametrize("transform_cls", [transforms.Compose, transforms.RandomChoice, transforms.RandomOrder]) + @pytest.mark.parametrize( + "trfms", + [ + [transforms.Pad(2), transforms.RandomCrop(28)], + [lambda x: 2.0 * x, transforms.Pad(2), transforms.RandomCrop(28)], + [transforms.Pad(2), lambda x: 2.0 * x, transforms.RandomCrop(28)], + ], + ) + def test_ctor(self, transform_cls, trfms): + c = transform_cls(trfms) + inpt = torch.rand(1, 3, 32, 32) + output = c(inpt) + assert isinstance(output, torch.Tensor) + assert output.ndim == 4 + + +class TestRandomChoice: + def test_assertions(self): + with pytest.raises(ValueError, match="Length of p doesn't match the number of transforms"): + transforms.RandomChoice([transforms.Pad(2), transforms.RandomCrop(28)], p=[1]) + + +class TestRandomIoUCrop: + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("options", [[0.5, 0.9], [2.0]]) + def test__get_params(self, device, options): + orig_h, orig_w = size = (24, 32) + image = make_image(size) + bboxes = tv_tensors.BoundingBoxes( + torch.tensor([[1, 1, 10, 10], [20, 20, 23, 23], [1, 20, 10, 23], [20, 1, 23, 10]]), + format="XYXY", + canvas_size=size, + device=device, + ) + sample = [image, bboxes] + + transform = transforms.RandomIoUCrop(sampler_options=options) + + n_samples = 5 + for _ in range(n_samples): + + params = transform._get_params(sample) + + if options == [2.0]: + assert len(params) == 0 + return + + assert len(params["is_within_crop_area"]) > 0 + assert params["is_within_crop_area"].dtype == torch.bool + + assert int(transform.min_scale * orig_h) <= params["height"] <= int(transform.max_scale * orig_h) + assert int(transform.min_scale * orig_w) <= params["width"] <= int(transform.max_scale * orig_w) + + left, top = params["left"], params["top"] + new_h, new_w = params["height"], params["width"] + ious = box_iou( + bboxes, + torch.tensor([[left, top, left + new_w, top + new_h]], dtype=bboxes.dtype, device=bboxes.device), + ) + assert ious.max() >= options[0] or ious.max() >= options[1], f"{ious} vs {options}" + + def test__transform_empty_params(self, mocker): + transform = transforms.RandomIoUCrop(sampler_options=[2.0]) + image = tv_tensors.Image(torch.rand(1, 3, 4, 4)) + bboxes = tv_tensors.BoundingBoxes(torch.tensor([[1, 1, 2, 2]]), format="XYXY", canvas_size=(4, 4)) + label = torch.tensor([1]) + sample = [image, bboxes, label] + # Let's mock transform._get_params to control the output: + transform._get_params = mocker.MagicMock(return_value={}) + output = transform(sample) + torch.testing.assert_close(output, sample) + + def test_forward_assertion(self): + transform = transforms.RandomIoUCrop() + with pytest.raises( + TypeError, + match="requires input sample to contain tensor or PIL images and bounding boxes", + ): + transform(torch.tensor(0)) + + def test__transform(self, mocker): + transform = transforms.RandomIoUCrop() + + size = (32, 24) + image = make_image(size) + bboxes = make_bounding_boxes(format="XYXY", canvas_size=size, batch_dims=(6,)) + masks = make_detection_mask(size, num_objects=6) + + sample = [image, bboxes, masks] + + is_within_crop_area = torch.tensor([0, 1, 0, 1, 0, 1], dtype=torch.bool) + + params = dict(top=1, left=2, height=12, width=12, is_within_crop_area=is_within_crop_area) + transform._get_params = 
mocker.MagicMock(return_value=params) + output = transform(sample) + + # check number of bboxes vs number of labels: + output_bboxes = output[1] + assert isinstance(output_bboxes, tv_tensors.BoundingBoxes) + assert (output_bboxes[~is_within_crop_area] == 0).all() + + output_masks = output[2] + assert isinstance(output_masks, tv_tensors.Mask) + + +class TestScaleJitter: + def test__get_params(self): + canvas_size = (24, 32) + target_size = (16, 12) + scale_range = (0.5, 1.5) + + transform = transforms.ScaleJitter(target_size=target_size, scale_range=scale_range) + + sample = make_image(canvas_size) + + n_samples = 5 + for _ in range(n_samples): + + params = transform._get_params([sample]) + + assert "size" in params + size = params["size"] + + assert isinstance(size, tuple) and len(size) == 2 + height, width = size + + r_min = min(target_size[1] / canvas_size[0], target_size[0] / canvas_size[1]) * scale_range[0] + r_max = min(target_size[1] / canvas_size[0], target_size[0] / canvas_size[1]) * scale_range[1] + + assert int(canvas_size[0] * r_min) <= height <= int(canvas_size[0] * r_max) + assert int(canvas_size[1] * r_min) <= width <= int(canvas_size[1] * r_max) + + +class TestRandomShortestSize: + @pytest.mark.parametrize("min_size,max_size", [([5, 9], 20), ([5, 9], None)]) + def test__get_params(self, min_size, max_size): + canvas_size = (3, 10) + + transform = transforms.RandomShortestSize(min_size=min_size, max_size=max_size, antialias=True) + + sample = make_image(canvas_size) + params = transform._get_params([sample]) + + assert "size" in params + size = params["size"] + + assert isinstance(size, tuple) and len(size) == 2 + + longer = max(size) + shorter = min(size) + if max_size is not None: + assert longer <= max_size + assert shorter <= max_size + else: + assert shorter in min_size + + +class TestLinearTransformation: + def test_assertions(self): + with pytest.raises(ValueError, match="transformation_matrix should be square"): + transforms.LinearTransformation(torch.rand(2, 3), torch.rand(5)) + + with pytest.raises(ValueError, match="mean_vector should have the same length"): + transforms.LinearTransformation(torch.rand(3, 3), torch.rand(5)) + + @pytest.mark.parametrize( + "inpt", + [ + 122 * torch.ones(1, 3, 8, 8), + 122.0 * torch.ones(1, 3, 8, 8), + tv_tensors.Image(122 * torch.ones(1, 3, 8, 8)), + PIL.Image.new("RGB", (8, 8), (122, 122, 122)), + ], + ) + def test__transform(self, inpt): + + v = 121 * torch.ones(3 * 8 * 8) + m = torch.ones(3 * 8 * 8, 3 * 8 * 8) + transform = transforms.LinearTransformation(m, v) + + if isinstance(inpt, PIL.Image.Image): + with pytest.raises(TypeError, match="does not support PIL images"): + transform(inpt) + else: + output = transform(inpt) + assert isinstance(output, torch.Tensor) + assert output.unique() == 3 * 8 * 8 + assert output.dtype == inpt.dtype + + +class TestRandomResize: + def test__get_params(self): + min_size = 3 + max_size = 6 + + transform = transforms.RandomResize(min_size=min_size, max_size=max_size, antialias=True) + + for _ in range(10): + params = transform._get_params([]) + + assert isinstance(params["size"], list) and len(params["size"]) == 1 + size = params["size"][0] + + assert min_size <= size < max_size + + +class TestUniformTemporalSubsample: + @pytest.mark.parametrize( + "inpt", + [ + torch.zeros(10, 3, 8, 8), + torch.zeros(1, 10, 3, 8, 8), + tv_tensors.Video(torch.zeros(1, 10, 3, 8, 8)), + ], + ) + def test__transform(self, inpt): + num_samples = 5 + transform = transforms.UniformTemporalSubsample(num_samples) + + 
output = transform(inpt) + assert type(output) is type(inpt) + assert output.shape[-4] == num_samples + assert output.dtype == inpt.dtype + + +# TODO: remove this test in 0.17 when the default of antialias changes to True +def test_antialias_warning(): + pil_img = PIL.Image.new("RGB", size=(10, 10), color=127) + tensor_img = torch.randint(0, 256, size=(3, 10, 10), dtype=torch.uint8) + tensor_video = torch.randint(0, 256, size=(2, 3, 10, 10), dtype=torch.uint8) + + match = "The default value of the antialias parameter" + with pytest.warns(UserWarning, match=match): + transforms.RandomResizedCrop((20, 20))(tensor_img) + with pytest.warns(UserWarning, match=match): + transforms.ScaleJitter((20, 20))(tensor_img) + with pytest.warns(UserWarning, match=match): + transforms.RandomShortestSize((20, 20))(tensor_img) + with pytest.warns(UserWarning, match=match): + transforms.RandomResize(10, 20)(tensor_img) + + with pytest.warns(UserWarning, match=match): + F.resized_crop(tv_tensors.Image(tensor_img), 0, 0, 10, 10, (20, 20)) + + with pytest.warns(UserWarning, match=match): + F.resize(tv_tensors.Video(tensor_video), (20, 20)) + with pytest.warns(UserWarning, match=match): + F.resized_crop(tv_tensors.Video(tensor_video), 0, 0, 10, 10, (20, 20)) + + with warnings.catch_warnings(): + warnings.simplefilter("error") + transforms.RandomResizedCrop((20, 20))(pil_img) + transforms.ScaleJitter((20, 20))(pil_img) + transforms.RandomShortestSize((20, 20))(pil_img) + transforms.RandomResize(10, 20)(pil_img) + + transforms.RandomResizedCrop((20, 20), antialias=True)(tensor_img) + transforms.ScaleJitter((20, 20), antialias=True)(tensor_img) + transforms.RandomShortestSize((20, 20), antialias=True)(tensor_img) + transforms.RandomResize(10, 20, antialias=True)(tensor_img) + + F.resized_crop(tv_tensors.Image(tensor_img), 0, 0, 10, 10, (20, 20), antialias=True) + F.resized_crop(tv_tensors.Video(tensor_video), 0, 0, 10, 10, (20, 20), antialias=True) + + +@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, tv_tensors.Image)) +@pytest.mark.parametrize("label_type", (torch.Tensor, int)) +@pytest.mark.parametrize("dataset_return_type", (dict, tuple)) +@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImage)) +def test_classif_preset(image_type, label_type, dataset_return_type, to_tensor): + + image = tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 250, 250), dtype=torch.uint8)) + if image_type is PIL.Image: + image = to_pil_image(image[0]) + elif image_type is torch.Tensor: + image = image.as_subclass(torch.Tensor) + assert is_pure_tensor(image) + + label = 1 if label_type is int else torch.tensor([1]) + + if dataset_return_type is dict: + sample = { + "image": image, + "label": label, + } + else: + sample = image, label + + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + + t = transforms.Compose( + [ + transforms.RandomResizedCrop((224, 224), antialias=True), + transforms.RandomHorizontalFlip(p=1), + transforms.RandAugment(), + transforms.TrivialAugmentWide(), + transforms.AugMix(), + transforms.AutoAugment(), + to_tensor, + # TODO: ConvertImageDtype is a pass-through on PIL images, is that + # intended? This results in a failure if we convert to tensor after + # it, because the image would still be uint8 which make Normalize + # fail. 
+ transforms.ConvertImageDtype(torch.float), + transforms.Normalize(mean=[0, 0, 0], std=[1, 1, 1]), + transforms.RandomErasing(p=1), + ] + ) + + out = t(sample) + + assert type(out) == type(sample) + + if dataset_return_type is tuple: + out_image, out_label = out + else: + assert out.keys() == sample.keys() + out_image, out_label = out.values() + + assert out_image.shape[-2:] == (224, 224) + assert out_label == label + + +@pytest.mark.parametrize("image_type", (PIL.Image, torch.Tensor, tv_tensors.Image)) +@pytest.mark.parametrize("data_augmentation", ("hflip", "lsj", "multiscale", "ssd", "ssdlite")) +@pytest.mark.parametrize("to_tensor", (transforms.ToTensor, transforms.ToImage)) +@pytest.mark.parametrize("sanitize", (True, False)) +def test_detection_preset(image_type, data_augmentation, to_tensor, sanitize): + torch.manual_seed(0) + + if to_tensor is transforms.ToTensor: + with pytest.warns(UserWarning, match="deprecated and will be removed"): + to_tensor = to_tensor() + else: + to_tensor = to_tensor() + + if data_augmentation == "hflip": + t = [ + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "lsj": + t = [ + transforms.ScaleJitter(target_size=(1024, 1024), antialias=True), + # Note: replaced FixedSizeCrop with RandomCrop, because we're + # leaving FixedSizeCrop in prototype for now, and it expects Label + # classes which we won't release yet. + # transforms.FixedSizeCrop( + # size=(1024, 1024), fill=defaultdict(lambda: (123.0, 117.0, 104.0), {tv_tensors.Mask: 0}) + # ), + transforms.RandomCrop((1024, 1024), pad_if_needed=True), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "multiscale": + t = [ + transforms.RandomShortestSize( + min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333, antialias=True + ), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "ssd": + t = [ + transforms.RandomPhotometricDistort(p=1), + transforms.RandomZoomOut(fill={"others": (123.0, 117.0, 104.0), tv_tensors.Mask: 0}, p=1), + transforms.RandomIoUCrop(), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + elif data_augmentation == "ssdlite": + t = [ + transforms.RandomIoUCrop(), + transforms.RandomHorizontalFlip(p=1), + to_tensor, + transforms.ConvertImageDtype(torch.float), + ] + if sanitize: + t += [transforms.SanitizeBoundingBoxes()] + t = transforms.Compose(t) + + num_boxes = 5 + H = W = 250 + + image = tv_tensors.Image(torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8)) + if image_type is PIL.Image: + image = to_pil_image(image[0]) + elif image_type is torch.Tensor: + image = image.as_subclass(torch.Tensor) + assert is_pure_tensor(image) + + label = torch.randint(0, 10, size=(num_boxes,)) + + boxes = torch.randint(0, min(H, W) // 2, size=(num_boxes, 4)) + boxes[:, 2:] += boxes[:, :2] + boxes = boxes.clamp(min=0, max=min(H, W)) + boxes = tv_tensors.BoundingBoxes(boxes, format="XYXY", canvas_size=(H, W)) + + masks = tv_tensors.Mask(torch.randint(0, 2, size=(num_boxes, H, W), dtype=torch.uint8)) + + sample = { + "image": image, + "label": label, + "boxes": boxes, + "masks": masks, + } + + out = t(sample) + + if isinstance(to_tensor, transforms.ToTensor) and image_type is not tv_tensors.Image: + assert is_pure_tensor(out["image"]) + else: + assert isinstance(out["image"],
tv_tensors.Image) + assert isinstance(out["label"], type(sample["label"])) + + num_boxes_expected = { + # ssd and ssdlite contain RandomIoUCrop which may "remove" some bbox. It + # doesn't remove them strictly speaking, it just marks some boxes as + # degenerate and those boxes will be later removed by + # SanitizeBoundingBoxes(), which we add to the pipelines if the sanitize + # param is True. + # Note that the values below are probably specific to the random seed + # set above (which is fine). + (True, "ssd"): 5, + (True, "ssdlite"): 4, + }.get((sanitize, data_augmentation), num_boxes) + + assert out["boxes"].shape[0] == out["masks"].shape[0] == out["label"].shape[0] == num_boxes_expected + + +@pytest.mark.parametrize("min_size", (1, 10)) +@pytest.mark.parametrize("labels_getter", ("default", lambda inputs: inputs["labels"], None, lambda inputs: None)) +@pytest.mark.parametrize("sample_type", (tuple, dict)) +def test_sanitize_bounding_boxes(min_size, labels_getter, sample_type): + + if sample_type is tuple and not isinstance(labels_getter, str): + # The "lambda inputs: inputs["labels"]" labels_getter used in this test + # doesn't work if the input is a tuple. + return + + H, W = 256, 128 + + boxes_and_validity = [ + ([0, 1, 10, 1], False), # Y1 == Y2 + ([0, 1, 0, 20], False), # X1 == X2 + ([0, 0, min_size - 1, 10], False), # H < min_size + ([0, 0, 10, min_size - 1], False), # W < min_size + ([0, 0, 10, H + 1], False), # Y2 > H + ([0, 0, W + 1, 10], False), # X2 > W + ([-1, 1, 10, 20], False), # any < 0 + ([0, 0, -1, 20], False), # any < 0 + ([0, 0, -10, -1], False), # any < 0 + ([0, 0, min_size, 10], True), # H < min_size + ([0, 0, 10, min_size], True), # W < min_size + ([0, 0, W, H], True), # TODO: Is that actually OK?? Should it be -1? + ([1, 1, 30, 20], True), + ([0, 0, 10, 10], True), + ([1, 1, 30, 20], True), + ] + + random.shuffle(boxes_and_validity) # For test robustness: mix order of wrong and correct cases + boxes, is_valid_mask = zip(*boxes_and_validity) + valid_indices = [i for (i, is_valid) in enumerate(is_valid_mask) if is_valid] + + boxes = torch.tensor(boxes) + labels = torch.arange(boxes.shape[0]) + + boxes = tv_tensors.BoundingBoxes( + boxes, + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=(H, W), + ) + + masks = tv_tensors.Mask(torch.randint(0, 2, size=(boxes.shape[0], H, W))) + whatever = torch.rand(10) + input_img = torch.randint(0, 256, size=(1, 3, H, W), dtype=torch.uint8) + sample = { + "image": input_img, + "labels": labels, + "boxes": boxes, + "whatever": whatever, + "None": None, + "masks": masks, + } + + if sample_type is tuple: + img = sample.pop("image") + sample = (img, sample) + + out = transforms.SanitizeBoundingBoxes(min_size=min_size, labels_getter=labels_getter)(sample) + + if sample_type is tuple: + out_image = out[0] + out_labels = out[1]["labels"] + out_boxes = out[1]["boxes"] + out_masks = out[1]["masks"] + out_whatever = out[1]["whatever"] + else: + out_image = out["image"] + out_labels = out["labels"] + out_boxes = out["boxes"] + out_masks = out["masks"] + out_whatever = out["whatever"] + + assert out_image is input_img + assert out_whatever is whatever + + assert isinstance(out_boxes, tv_tensors.BoundingBoxes) + assert isinstance(out_masks, tv_tensors.Mask) + + if labels_getter is None or (callable(labels_getter) and labels_getter({"labels": "blah"}) is None): + assert out_labels is labels + else: + assert isinstance(out_labels, torch.Tensor) + assert out_boxes.shape[0] == out_labels.shape[0] == out_masks.shape[0] + # This works because 
we conveniently set labels to arange(num_boxes) + assert out_labels.tolist() == valid_indices + + +def test_sanitize_bounding_boxes_no_label(): + # Non-regression test for https://github.com/pytorch/vision/issues/7878 + + img = make_image() + boxes = make_bounding_boxes() + + with pytest.raises(ValueError, match="or a two-tuple whose second item is a dict"): + transforms.SanitizeBoundingBoxes()(img, boxes) + + out_img, out_boxes = transforms.SanitizeBoundingBoxes(labels_getter=None)(img, boxes) + assert isinstance(out_img, tv_tensors.Image) + assert isinstance(out_boxes, tv_tensors.BoundingBoxes) + + +def test_sanitize_bounding_boxes_errors(): + + good_bbox = tv_tensors.BoundingBoxes( + [[0, 0, 10, 10]], + format=tv_tensors.BoundingBoxFormat.XYXY, + canvas_size=(20, 20), + ) + + with pytest.raises(ValueError, match="min_size must be >= 1"): + transforms.SanitizeBoundingBoxes(min_size=0) + with pytest.raises(ValueError, match="labels_getter should either be 'default'"): + transforms.SanitizeBoundingBoxes(labels_getter=12) + + with pytest.raises(ValueError, match="Could not infer where the labels are"): + bad_labels_key = {"bbox": good_bbox, "BAD_KEY": torch.arange(good_bbox.shape[0])} + transforms.SanitizeBoundingBoxes()(bad_labels_key) + + with pytest.raises(ValueError, match="must be a tensor"): + not_a_tensor = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0]).tolist()} + transforms.SanitizeBoundingBoxes()(not_a_tensor) + + with pytest.raises(ValueError, match="Number of boxes"): + different_sizes = {"bbox": good_bbox, "labels": torch.arange(good_bbox.shape[0] + 3)} + transforms.SanitizeBoundingBoxes()(different_sizes) + + +class TestLambda: + inputs = pytest.mark.parametrize("input", [object(), torch.empty(()), np.empty(()), "string", 1, 0.0]) + + @inputs + def test_default(self, input): + was_applied = False + + def was_applied_fn(input): + nonlocal was_applied + was_applied = True + return input + + transform = transforms.Lambda(was_applied_fn) + + transform(input) + + assert was_applied + + @inputs + def test_with_types(self, input): + was_applied = False + + def was_applied_fn(input): + nonlocal was_applied + was_applied = True + return input + + types = (torch.Tensor, np.ndarray) + transform = transforms.Lambda(was_applied_fn, *types) + + transform(input) + + assert was_applied is isinstance(input, types) diff --git a/test/test_transforms_v2_consistency.py b/test/test_transforms_v2_consistency.py new file mode 100644 index 0000000000000000000000000000000000000000..1f47eb2117f7fef887d179ae08663d82c739eddd --- /dev/null +++ b/test/test_transforms_v2_consistency.py @@ -0,0 +1,1254 @@ +import importlib.machinery +import importlib.util +import inspect +import random +import re +from pathlib import Path + +import numpy as np +import PIL.Image +import pytest + +import torch +import torchvision.transforms.v2 as v2_transforms +from common_utils import assert_close, assert_equal, set_rng_seed +from torch import nn +from torchvision import transforms as legacy_transforms, tv_tensors +from torchvision._utils import sequence_to_str + +from torchvision.transforms import functional as legacy_F +from torchvision.transforms.v2 import functional as prototype_F +from torchvision.transforms.v2._utils import _get_fill, query_size +from torchvision.transforms.v2.functional import to_pil_image +from transforms_v2_legacy_utils import ( + ArgsKwargs, + make_bounding_boxes, + make_detection_mask, + make_image, + make_images, + make_segmentation_mask, +) + +DEFAULT_MAKE_IMAGES_KWARGS = 
dict(color_spaces=["RGB"], extra_dims=[(4,)]) + + +@pytest.fixture(autouse=True) +def fix_rng_seed(): + set_rng_seed(0) + yield + + +class NotScriptableArgsKwargs(ArgsKwargs): + """ + This class is used to mark parameters that render the transform non-scriptable. They still work in eager mode and + thus will be tested there, but will be skipped by the JIT tests. + """ + + pass + + +class ConsistencyConfig: + def __init__( + self, + prototype_cls, + legacy_cls, + # If no args_kwargs is passed, only the signature will be checked + args_kwargs=(), + make_images_kwargs=None, + supports_pil=True, + removed_params=(), + closeness_kwargs=None, + ): + self.prototype_cls = prototype_cls + self.legacy_cls = legacy_cls + self.args_kwargs = args_kwargs + self.make_images_kwargs = make_images_kwargs or DEFAULT_MAKE_IMAGES_KWARGS + self.supports_pil = supports_pil + self.removed_params = removed_params + self.closeness_kwargs = closeness_kwargs or dict(rtol=0, atol=0) + + +# These are here since both the prototype and legacy transform need to be constructed with the same random parameters +LINEAR_TRANSFORMATION_MEAN = torch.rand(36) +LINEAR_TRANSFORMATION_MATRIX = torch.rand([LINEAR_TRANSFORMATION_MEAN.numel()] * 2) + +CONSISTENCY_CONFIGS = [ + ConsistencyConfig( + v2_transforms.Normalize, + legacy_transforms.Normalize, + [ + ArgsKwargs(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), + ], + supports_pil=False, + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[torch.float]), + ), + ConsistencyConfig( + v2_transforms.CenterCrop, + legacy_transforms.CenterCrop, + [ + ArgsKwargs(18), + ArgsKwargs((18, 13)), + ], + ), + ConsistencyConfig( + v2_transforms.FiveCrop, + legacy_transforms.FiveCrop, + [ + ArgsKwargs(18), + ArgsKwargs((18, 13)), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(20, 19)]), + ), + ConsistencyConfig( + v2_transforms.TenCrop, + legacy_transforms.TenCrop, + [ + ArgsKwargs(18), + ArgsKwargs((18, 13)), + ArgsKwargs(18, vertical_flip=True), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(20, 19)]), + ), + ConsistencyConfig( + v2_transforms.Pad, + legacy_transforms.Pad, + [ + NotScriptableArgsKwargs(3), + ArgsKwargs([3]), + ArgsKwargs([2, 3]), + ArgsKwargs([3, 2, 1, 4]), + NotScriptableArgsKwargs(5, fill=1, padding_mode="constant"), + ArgsKwargs([5], fill=1, padding_mode="constant"), + NotScriptableArgsKwargs(5, padding_mode="edge"), + NotScriptableArgsKwargs(5, padding_mode="reflect"), + NotScriptableArgsKwargs(5, padding_mode="symmetric"), + ], + ), + *[ + ConsistencyConfig( + v2_transforms.LinearTransformation, + legacy_transforms.LinearTransformation, + [ + ArgsKwargs(LINEAR_TRANSFORMATION_MATRIX.to(matrix_dtype), LINEAR_TRANSFORMATION_MEAN.to(matrix_dtype)), + ], + # Make sure that the product of the height, width and number of channels matches the number of elements in + # `LINEAR_TRANSFORMATION_MEAN`. For example 2 * 6 * 3 == 4 * 3 * 3 == 36. 
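+            # LinearTransformation flattens every image to a vector of C * H * W elements before
+            # multiplying it with the (36, 36) matrix, which is why the sizes are constrained here.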
+ make_images_kwargs=dict( + DEFAULT_MAKE_IMAGES_KWARGS, sizes=[(2, 6), (4, 3)], color_spaces=["RGB"], dtypes=[image_dtype] + ), + supports_pil=False, + ) + for matrix_dtype, image_dtype in [ + (torch.float32, torch.float32), + (torch.float64, torch.float64), + (torch.float32, torch.uint8), + (torch.float64, torch.float32), + (torch.float32, torch.float64), + ] + ], + ConsistencyConfig( + v2_transforms.Grayscale, + legacy_transforms.Grayscale, + [ + ArgsKwargs(num_output_channels=1), + ArgsKwargs(num_output_channels=3), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, color_spaces=["RGB", "GRAY"]), + # Use default tolerances of `torch.testing.assert_close` + closeness_kwargs=dict(rtol=None, atol=None), + ), + ConsistencyConfig( + v2_transforms.ToPILImage, + legacy_transforms.ToPILImage, + [NotScriptableArgsKwargs()], + make_images_kwargs=dict( + color_spaces=[ + "GRAY", + "GRAY_ALPHA", + "RGB", + "RGBA", + ], + extra_dims=[()], + ), + supports_pil=False, + ), + ConsistencyConfig( + v2_transforms.Lambda, + legacy_transforms.Lambda, + [ + NotScriptableArgsKwargs(lambda image: image / 2), + ], + # Technically, this also supports PIL, but it is overkill to write a function here that supports tensor and PIL + # images given that the transform does nothing but call it anyway. + supports_pil=False, + ), + ConsistencyConfig( + v2_transforms.RandomEqualize, + legacy_transforms.RandomEqualize, + [ + ArgsKwargs(p=0), + ArgsKwargs(p=1), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[torch.uint8]), + ), + ConsistencyConfig( + v2_transforms.RandomInvert, + legacy_transforms.RandomInvert, + [ + ArgsKwargs(p=0), + ArgsKwargs(p=1), + ], + ), + ConsistencyConfig( + v2_transforms.RandomPosterize, + legacy_transforms.RandomPosterize, + [ + ArgsKwargs(p=0, bits=5), + ArgsKwargs(p=1, bits=1), + ArgsKwargs(p=1, bits=3), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[torch.uint8]), + ), + ConsistencyConfig( + v2_transforms.RandomSolarize, + legacy_transforms.RandomSolarize, + [ + ArgsKwargs(p=0, threshold=0.5), + ArgsKwargs(p=1, threshold=0.3), + ArgsKwargs(p=1, threshold=0.99), + ], + ), + *[ + ConsistencyConfig( + v2_transforms.RandomAutocontrast, + legacy_transforms.RandomAutocontrast, + [ + ArgsKwargs(p=0), + ArgsKwargs(p=1), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, dtypes=[dt]), + closeness_kwargs=ckw, + ) + for dt, ckw in [(torch.uint8, dict(atol=1, rtol=0)), (torch.float32, dict(rtol=None, atol=None))] + ], + ConsistencyConfig( + v2_transforms.RandomAdjustSharpness, + legacy_transforms.RandomAdjustSharpness, + [ + ArgsKwargs(p=0, sharpness_factor=0.5), + ArgsKwargs(p=1, sharpness_factor=0.2), + ArgsKwargs(p=1, sharpness_factor=0.99), + ], + closeness_kwargs={"atol": 1e-6, "rtol": 1e-6}, + ), + ConsistencyConfig( + v2_transforms.RandomGrayscale, + legacy_transforms.RandomGrayscale, + [ + ArgsKwargs(p=0), + ArgsKwargs(p=1), + ], + make_images_kwargs=dict(DEFAULT_MAKE_IMAGES_KWARGS, color_spaces=["RGB", "GRAY"]), + # Use default tolerances of `torch.testing.assert_close` + closeness_kwargs=dict(rtol=None, atol=None), + ), + ConsistencyConfig( + v2_transforms.RandomResizedCrop, + legacy_transforms.RandomResizedCrop, + [ + ArgsKwargs(16), + ArgsKwargs(17, scale=(0.3, 0.7)), + ArgsKwargs(25, ratio=(0.5, 1.5)), + ArgsKwargs((31, 28), interpolation=v2_transforms.InterpolationMode.NEAREST), + ArgsKwargs((31, 28), interpolation=PIL.Image.NEAREST), + ArgsKwargs((29, 32), antialias=False), + ArgsKwargs((28, 31), antialias=True), + ], + # atol=1 due to 
Resize v2 is using native uint8 interpolate path for bilinear and nearest modes + closeness_kwargs=dict(rtol=0, atol=1), + ), + ConsistencyConfig( + v2_transforms.RandomResizedCrop, + legacy_transforms.RandomResizedCrop, + [ + ArgsKwargs((33, 26), interpolation=v2_transforms.InterpolationMode.BICUBIC, antialias=True), + ArgsKwargs((33, 26), interpolation=PIL.Image.BICUBIC, antialias=True), + ], + closeness_kwargs=dict(rtol=0, atol=21), + ), + ConsistencyConfig( + v2_transforms.ColorJitter, + legacy_transforms.ColorJitter, + [ + ArgsKwargs(), + ArgsKwargs(brightness=0.1), + ArgsKwargs(brightness=(0.2, 0.3)), + ArgsKwargs(contrast=0.4), + ArgsKwargs(contrast=(0.5, 0.6)), + ArgsKwargs(saturation=0.7), + ArgsKwargs(saturation=(0.8, 0.9)), + ArgsKwargs(hue=0.3), + ArgsKwargs(hue=(-0.1, 0.2)), + ArgsKwargs(brightness=0.1, contrast=0.4, saturation=0.5, hue=0.3), + ], + closeness_kwargs={"atol": 1e-5, "rtol": 1e-5}, + ), + ConsistencyConfig( + v2_transforms.GaussianBlur, + legacy_transforms.GaussianBlur, + [ + ArgsKwargs(kernel_size=3), + ArgsKwargs(kernel_size=(1, 5)), + ArgsKwargs(kernel_size=3, sigma=0.7), + ArgsKwargs(kernel_size=5, sigma=(0.3, 1.4)), + ], + closeness_kwargs={"rtol": 1e-5, "atol": 1e-5}, + ), + ConsistencyConfig( + v2_transforms.RandomPerspective, + legacy_transforms.RandomPerspective, + [ + ArgsKwargs(p=0), + ArgsKwargs(p=1), + ArgsKwargs(p=1, distortion_scale=0.3), + ArgsKwargs(p=1, distortion_scale=0.2, interpolation=v2_transforms.InterpolationMode.NEAREST), + ArgsKwargs(p=1, distortion_scale=0.2, interpolation=PIL.Image.NEAREST), + ArgsKwargs(p=1, distortion_scale=0.1, fill=1), + ArgsKwargs(p=1, distortion_scale=0.4, fill=(1, 2, 3)), + ], + closeness_kwargs={"atol": None, "rtol": None}, + ), + ConsistencyConfig( + v2_transforms.PILToTensor, + legacy_transforms.PILToTensor, + ), + ConsistencyConfig( + v2_transforms.ToTensor, + legacy_transforms.ToTensor, + ), + ConsistencyConfig( + v2_transforms.Compose, + legacy_transforms.Compose, + ), + ConsistencyConfig( + v2_transforms.RandomApply, + legacy_transforms.RandomApply, + ), + ConsistencyConfig( + v2_transforms.RandomChoice, + legacy_transforms.RandomChoice, + ), + ConsistencyConfig( + v2_transforms.RandomOrder, + legacy_transforms.RandomOrder, + ), + ConsistencyConfig( + v2_transforms.AugMix, + legacy_transforms.AugMix, + ), + ConsistencyConfig( + v2_transforms.AutoAugment, + legacy_transforms.AutoAugment, + ), + ConsistencyConfig( + v2_transforms.RandAugment, + legacy_transforms.RandAugment, + ), + ConsistencyConfig( + v2_transforms.TrivialAugmentWide, + legacy_transforms.TrivialAugmentWide, + ), +] + + +@pytest.mark.parametrize("config", CONSISTENCY_CONFIGS, ids=lambda config: config.legacy_cls.__name__) +def test_signature_consistency(config): + legacy_params = dict(inspect.signature(config.legacy_cls).parameters) + prototype_params = dict(inspect.signature(config.prototype_cls).parameters) + + for param in config.removed_params: + legacy_params.pop(param, None) + + missing = legacy_params.keys() - prototype_params.keys() + if missing: + raise AssertionError( + f"The prototype transform does not support the parameters " + f"{sequence_to_str(sorted(missing), separate_last='and ')}, but the legacy transform does. " + f"If that is intentional, e.g. pending deprecation, please add the parameters to the `removed_params` on " + f"the `ConsistencyConfig`." 
+ ) + + extra = prototype_params.keys() - legacy_params.keys() + extra_without_default = { + param + for param in extra + if prototype_params[param].default is inspect.Parameter.empty + and prototype_params[param].kind not in {inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD} + } + if extra_without_default: + raise AssertionError( + f"The prototype transform requires the parameters " + f"{sequence_to_str(sorted(extra_without_default), separate_last='and ')}, but the legacy transform does " + f"not. Please add a default value." + ) + + legacy_signature = list(legacy_params.keys()) + # Since we made sure that we don't have any extra parameters without default above, we clamp the prototype signature + # to the same number of parameters as the legacy one + prototype_signature = list(prototype_params.keys())[: len(legacy_signature)] + + assert prototype_signature == legacy_signature + + +def check_call_consistency( + prototype_transform, legacy_transform, images=None, supports_pil=True, closeness_kwargs=None +): + if images is None: + images = make_images(**DEFAULT_MAKE_IMAGES_KWARGS) + + closeness_kwargs = closeness_kwargs or dict() + + for image in images: + image_repr = f"[{tuple(image.shape)}, {str(image.dtype).rsplit('.')[-1]}]" + + image_tensor = torch.Tensor(image) + try: + torch.manual_seed(0) + output_legacy_tensor = legacy_transform(image_tensor) + except Exception as exc: + raise pytest.UsageError( + f"Transforming a tensor image {image_repr} failed in the legacy transform with the " + f"error above. This means that you need to specify the parameters passed to `make_images` through the " + "`make_images_kwargs` of the `ConsistencyConfig`." + ) from exc + + try: + torch.manual_seed(0) + output_prototype_tensor = prototype_transform(image_tensor) + except Exception as exc: + raise AssertionError( + f"Transforming a tensor image with shape {image_repr} failed in the prototype transform with " + f"the error above. This means there is a consistency bug either in `_get_params` or in the " + f"`is_pure_tensor` path in `_transform`." + ) from exc + + assert_close( + output_prototype_tensor, + output_legacy_tensor, + msg=lambda msg: f"Tensor image consistency check failed with: \n\n{msg}", + **closeness_kwargs, + ) + + try: + torch.manual_seed(0) + output_prototype_image = prototype_transform(image) + except Exception as exc: + raise AssertionError( + f"Transforming a image tv_tensor with shape {image_repr} failed in the prototype transform with " + f"the error above. This means there is a consistency bug either in `_get_params` or in the " + f"`tv_tensors.Image` path in `_transform`." + ) from exc + + assert_close( + output_prototype_image, + output_prototype_tensor, + msg=lambda msg: f"Output for tv_tensor and tensor images is not equal: \n\n{msg}", + **closeness_kwargs, + ) + + if image.ndim == 3 and supports_pil: + image_pil = to_pil_image(image) + + try: + torch.manual_seed(0) + output_legacy_pil = legacy_transform(image_pil) + except Exception as exc: + raise pytest.UsageError( + f"Transforming a PIL image with shape {image_repr} failed in the legacy transform with the " + f"error above. If this transform does not support PIL images, set `supports_pil=False` on the " + "`ConsistencyConfig`. " + ) from exc + + try: + torch.manual_seed(0) + output_prototype_pil = prototype_transform(image_pil) + except Exception as exc: + raise AssertionError( + f"Transforming a PIL image with shape {image_repr} failed in the prototype transform with " + f"the error above. 
This means there is a consistency bug either in `_get_params` or in the " + f"`PIL.Image.Image` path in `_transform`." + ) from exc + + assert_close( + output_prototype_pil, + output_legacy_pil, + msg=lambda msg: f"PIL image consistency check failed with: \n\n{msg}", + **closeness_kwargs, + ) + + +@pytest.mark.parametrize( + ("config", "args_kwargs"), + [ + pytest.param( + config, args_kwargs, id=f"{config.legacy_cls.__name__}-{idx:0{len(str(len(config.args_kwargs)))}d}" + ) + for config in CONSISTENCY_CONFIGS + for idx, args_kwargs in enumerate(config.args_kwargs) + ], +) +@pytest.mark.filterwarnings("ignore") +def test_call_consistency(config, args_kwargs): + args, kwargs = args_kwargs + + try: + legacy_transform = config.legacy_cls(*args, **kwargs) + except Exception as exc: + raise pytest.UsageError( + f"Initializing the legacy transform failed with the error above. " + f"Please correct the `ArgsKwargs({args_kwargs})` in the `ConsistencyConfig`." + ) from exc + + try: + prototype_transform = config.prototype_cls(*args, **kwargs) + except Exception as exc: + raise AssertionError( + "Initializing the prototype transform failed with the error above. " + "This means there is a consistency bug in the constructor." + ) from exc + + check_call_consistency( + prototype_transform, + legacy_transform, + images=make_images(**config.make_images_kwargs), + supports_pil=config.supports_pil, + closeness_kwargs=config.closeness_kwargs, + ) + + +get_params_parametrization = pytest.mark.parametrize( + ("config", "get_params_args_kwargs"), + [ + pytest.param( + next(config for config in CONSISTENCY_CONFIGS if config.prototype_cls is transform_cls), + get_params_args_kwargs, + id=transform_cls.__name__, + ) + for transform_cls, get_params_args_kwargs in [ + (v2_transforms.RandomResizedCrop, ArgsKwargs(make_image(), scale=[0.3, 0.7], ratio=[0.5, 1.5])), + (v2_transforms.ColorJitter, ArgsKwargs(brightness=None, contrast=None, saturation=None, hue=None)), + (v2_transforms.GaussianBlur, ArgsKwargs(0.3, 1.4)), + (v2_transforms.RandomPerspective, ArgsKwargs(23, 17, 0.5)), + (v2_transforms.AutoAugment, ArgsKwargs(5)), + ] + ], +) + + +@get_params_parametrization +def test_get_params_alias(config, get_params_args_kwargs): + assert config.prototype_cls.get_params is config.legacy_cls.get_params + + if not config.args_kwargs: + return + args, kwargs = config.args_kwargs[0] + legacy_transform = config.legacy_cls(*args, **kwargs) + prototype_transform = config.prototype_cls(*args, **kwargs) + + assert prototype_transform.get_params is legacy_transform.get_params + + +@get_params_parametrization +def test_get_params_jit(config, get_params_args_kwargs): + get_params_args, get_params_kwargs = get_params_args_kwargs + + torch.jit.script(config.prototype_cls.get_params)(*get_params_args, **get_params_kwargs) + + if not config.args_kwargs: + return + args, kwargs = config.args_kwargs[0] + transform = config.prototype_cls(*args, **kwargs) + + torch.jit.script(transform.get_params)(*get_params_args, **get_params_kwargs) + + +@pytest.mark.parametrize( + ("config", "args_kwargs"), + [ + pytest.param( + config, args_kwargs, id=f"{config.legacy_cls.__name__}-{idx:0{len(str(len(config.args_kwargs)))}d}" + ) + for config in CONSISTENCY_CONFIGS + for idx, args_kwargs in enumerate(config.args_kwargs) + if not isinstance(args_kwargs, NotScriptableArgsKwargs) + ], +) +def test_jit_consistency(config, args_kwargs): + args, kwargs = args_kwargs + + prototype_transform_eager = config.prototype_cls(*args, **kwargs) + 
legacy_transform_eager = config.legacy_cls(*args, **kwargs) + + legacy_transform_scripted = torch.jit.script(legacy_transform_eager) + prototype_transform_scripted = torch.jit.script(prototype_transform_eager) + + for image in make_images(**config.make_images_kwargs): + image = image.as_subclass(torch.Tensor) + + torch.manual_seed(0) + output_legacy_scripted = legacy_transform_scripted(image) + + torch.manual_seed(0) + output_prototype_scripted = prototype_transform_scripted(image) + + assert_close(output_prototype_scripted, output_legacy_scripted, **config.closeness_kwargs) + + +class TestContainerTransforms: + """ + Since we are testing containers here, we also need some transforms to wrap. Thus, testing a container transform for + consistency automatically tests the wrapped transforms consistency. + + Instead of complicated mocking or creating custom transforms just for these tests, here we use deterministic ones + that were already tested for consistency above. + """ + + def test_compose(self): + prototype_transform = v2_transforms.Compose( + [ + v2_transforms.Resize(256), + v2_transforms.CenterCrop(224), + ] + ) + legacy_transform = legacy_transforms.Compose( + [ + legacy_transforms.Resize(256), + legacy_transforms.CenterCrop(224), + ] + ) + + # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes + check_call_consistency(prototype_transform, legacy_transform, closeness_kwargs=dict(rtol=0, atol=1)) + + @pytest.mark.parametrize("p", [0, 0.1, 0.5, 0.9, 1]) + @pytest.mark.parametrize("sequence_type", [list, nn.ModuleList]) + def test_random_apply(self, p, sequence_type): + prototype_transform = v2_transforms.RandomApply( + sequence_type( + [ + v2_transforms.Resize(256), + v2_transforms.CenterCrop(224), + ] + ), + p=p, + ) + legacy_transform = legacy_transforms.RandomApply( + sequence_type( + [ + legacy_transforms.Resize(256), + legacy_transforms.CenterCrop(224), + ] + ), + p=p, + ) + + # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes + check_call_consistency(prototype_transform, legacy_transform, closeness_kwargs=dict(rtol=0, atol=1)) + + if sequence_type is nn.ModuleList: + # quick and dirty test that it is jit-scriptable + scripted = torch.jit.script(prototype_transform) + scripted(torch.rand(1, 3, 300, 300)) + + # We can't test other values for `p` since the random parameter generation is different + @pytest.mark.parametrize("probabilities", [(0, 1), (1, 0)]) + def test_random_choice(self, probabilities): + prototype_transform = v2_transforms.RandomChoice( + [ + v2_transforms.Resize(256), + legacy_transforms.CenterCrop(224), + ], + p=probabilities, + ) + legacy_transform = legacy_transforms.RandomChoice( + [ + legacy_transforms.Resize(256), + legacy_transforms.CenterCrop(224), + ], + p=probabilities, + ) + + # atol=1 due to Resize v2 is using native uint8 interpolate path for bilinear and nearest modes + check_call_consistency(prototype_transform, legacy_transform, closeness_kwargs=dict(rtol=0, atol=1)) + + +class TestToTensorTransforms: + def test_pil_to_tensor(self): + prototype_transform = v2_transforms.PILToTensor() + legacy_transform = legacy_transforms.PILToTensor() + + for image in make_images(extra_dims=[()]): + image_pil = to_pil_image(image) + + assert_equal(prototype_transform(image_pil), legacy_transform(image_pil)) + + def test_to_tensor(self): + with pytest.warns(UserWarning, match=re.escape("The transform `ToTensor()` is deprecated")): + prototype_transform = v2_transforms.ToTensor() 
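+        # Only the v2 ToTensor emits the deprecation warning, so just its construction is wrapped
+        # in pytest.warns; the legacy transform is constructed outside that context below.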
+ legacy_transform = legacy_transforms.ToTensor() + + for image in make_images(extra_dims=[()]): + image_pil = to_pil_image(image) + image_numpy = np.array(image_pil) + + assert_equal(prototype_transform(image_pil), legacy_transform(image_pil)) + assert_equal(prototype_transform(image_numpy), legacy_transform(image_numpy)) + + +class TestAATransforms: + @pytest.mark.parametrize( + "inpt", + [ + torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), + PIL.Image.new("RGB", (256, 256), 123), + tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), + ], + ) + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + PIL.Image.NEAREST, + ], + ) + def test_randaug(self, inpt, interpolation, mocker): + t_ref = legacy_transforms.RandAugment(interpolation=interpolation, num_ops=1) + t = v2_transforms.RandAugment(interpolation=interpolation, num_ops=1) + + le = len(t._AUGMENTATION_SPACE) + keys = list(t._AUGMENTATION_SPACE.keys()) + randint_values = [] + for i in range(le): + # Stable API, op_index random call + randint_values.append(i) + # Stable API, if signed there is another random call + if t._AUGMENTATION_SPACE[keys[i]][1]: + randint_values.append(0) + # New API, _get_random_item + randint_values.append(i) + randint_values = iter(randint_values) + + mocker.patch("torch.randint", side_effect=lambda *arg, **kwargs: torch.tensor(next(randint_values))) + mocker.patch("torch.rand", return_value=1.0) + + for i in range(le): + expected_output = t_ref(inpt) + output = t(inpt) + + assert_close(expected_output, output, atol=1, rtol=0.1) + + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + ], + ) + @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) + def test_randaug_jit(self, interpolation, fill): + inpt = torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8) + t_ref = legacy_transforms.RandAugment(interpolation=interpolation, num_ops=1, fill=fill) + t = v2_transforms.RandAugment(interpolation=interpolation, num_ops=1, fill=fill) + + tt_ref = torch.jit.script(t_ref) + tt = torch.jit.script(t) + + torch.manual_seed(12) + expected_output = tt_ref(inpt) + + torch.manual_seed(12) + scripted_output = tt(inpt) + + assert_equal(scripted_output, expected_output) + + @pytest.mark.parametrize( + "inpt", + [ + torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), + PIL.Image.new("RGB", (256, 256), 123), + tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), + ], + ) + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + PIL.Image.NEAREST, + ], + ) + def test_trivial_aug(self, inpt, interpolation, mocker): + t_ref = legacy_transforms.TrivialAugmentWide(interpolation=interpolation) + t = v2_transforms.TrivialAugmentWide(interpolation=interpolation) + + le = len(t._AUGMENTATION_SPACE) + keys = list(t._AUGMENTATION_SPACE.keys()) + randint_values = [] + for i in range(le): + # Stable API, op_index random call + randint_values.append(i) + key = keys[i] + # Stable API, random magnitude + aug_op = t._AUGMENTATION_SPACE[key] + magnitudes = aug_op[0](2, 0, 0) + if magnitudes is not None: + randint_values.append(5) + # Stable API, if signed there is another random call + if aug_op[1]: + randint_values.append(0) + # New API, _get_random_item + 
randint_values.append(i) + # New API, random magnitude + if magnitudes is not None: + randint_values.append(5) + + randint_values = iter(randint_values) + + mocker.patch("torch.randint", side_effect=lambda *arg, **kwargs: torch.tensor(next(randint_values))) + mocker.patch("torch.rand", return_value=1.0) + + for _ in range(le): + expected_output = t_ref(inpt) + output = t(inpt) + + assert_close(expected_output, output, atol=1, rtol=0.1) + + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + ], + ) + @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) + def test_trivial_aug_jit(self, interpolation, fill): + inpt = torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8) + t_ref = legacy_transforms.TrivialAugmentWide(interpolation=interpolation, fill=fill) + t = v2_transforms.TrivialAugmentWide(interpolation=interpolation, fill=fill) + + tt_ref = torch.jit.script(t_ref) + tt = torch.jit.script(t) + + torch.manual_seed(12) + expected_output = tt_ref(inpt) + + torch.manual_seed(12) + scripted_output = tt(inpt) + + assert_equal(scripted_output, expected_output) + + @pytest.mark.parametrize( + "inpt", + [ + torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), + PIL.Image.new("RGB", (256, 256), 123), + tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), + ], + ) + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + PIL.Image.NEAREST, + ], + ) + def test_augmix(self, inpt, interpolation, mocker): + t_ref = legacy_transforms.AugMix(interpolation=interpolation, mixture_width=1, chain_depth=1) + t_ref._sample_dirichlet = lambda t: t.softmax(dim=-1) + t = v2_transforms.AugMix(interpolation=interpolation, mixture_width=1, chain_depth=1) + t._sample_dirichlet = lambda t: t.softmax(dim=-1) + + le = len(t._AUGMENTATION_SPACE) + keys = list(t._AUGMENTATION_SPACE.keys()) + randint_values = [] + for i in range(le): + # Stable API, op_index random call + randint_values.append(i) + key = keys[i] + # Stable API, random magnitude + aug_op = t._AUGMENTATION_SPACE[key] + magnitudes = aug_op[0](2, 0, 0) + if magnitudes is not None: + randint_values.append(5) + # Stable API, if signed there is another random call + if aug_op[1]: + randint_values.append(0) + # New API, _get_random_item + randint_values.append(i) + # New API, random magnitude + if magnitudes is not None: + randint_values.append(5) + + randint_values = iter(randint_values) + + mocker.patch("torch.randint", side_effect=lambda *arg, **kwargs: torch.tensor(next(randint_values))) + mocker.patch("torch.rand", return_value=1.0) + + expected_output = t_ref(inpt) + output = t(inpt) + + assert_equal(expected_output, output) + + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + ], + ) + @pytest.mark.parametrize("fill", [None, 85, (10, -10, 10), 0.7, [0.0, 0.0, 0.0], [1], 1]) + def test_augmix_jit(self, interpolation, fill): + inpt = torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8) + + t_ref = legacy_transforms.AugMix(interpolation=interpolation, mixture_width=1, chain_depth=1, fill=fill) + t = v2_transforms.AugMix(interpolation=interpolation, mixture_width=1, chain_depth=1, fill=fill) + + tt_ref = torch.jit.script(t_ref) + tt = torch.jit.script(t) + + torch.manual_seed(12) + expected_output = tt_ref(inpt) 
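+        # Re-seeding before the second call makes both scripted transforms draw the same random
+        # parameters, so their outputs can be compared exactly afterwards.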
+ + torch.manual_seed(12) + scripted_output = tt(inpt) + + assert_equal(scripted_output, expected_output) + + @pytest.mark.parametrize( + "inpt", + [ + torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8), + PIL.Image.new("RGB", (256, 256), 123), + tv_tensors.Image(torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8)), + ], + ) + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + PIL.Image.NEAREST, + ], + ) + def test_aa(self, inpt, interpolation): + aa_policy = legacy_transforms.AutoAugmentPolicy("imagenet") + t_ref = legacy_transforms.AutoAugment(aa_policy, interpolation=interpolation) + t = v2_transforms.AutoAugment(aa_policy, interpolation=interpolation) + + torch.manual_seed(12) + expected_output = t_ref(inpt) + + torch.manual_seed(12) + output = t(inpt) + + assert_equal(expected_output, output) + + @pytest.mark.parametrize( + "interpolation", + [ + v2_transforms.InterpolationMode.NEAREST, + v2_transforms.InterpolationMode.BILINEAR, + ], + ) + def test_aa_jit(self, interpolation): + inpt = torch.randint(0, 256, size=(1, 3, 256, 256), dtype=torch.uint8) + aa_policy = legacy_transforms.AutoAugmentPolicy("imagenet") + t_ref = legacy_transforms.AutoAugment(aa_policy, interpolation=interpolation) + t = v2_transforms.AutoAugment(aa_policy, interpolation=interpolation) + + tt_ref = torch.jit.script(t_ref) + tt = torch.jit.script(t) + + torch.manual_seed(12) + expected_output = tt_ref(inpt) + + torch.manual_seed(12) + scripted_output = tt(inpt) + + assert_equal(scripted_output, expected_output) + + +def import_transforms_from_references(reference): + HERE = Path(__file__).parent + PROJECT_ROOT = HERE.parent + + loader = importlib.machinery.SourceFileLoader( + "transforms", str(PROJECT_ROOT / "references" / reference / "transforms.py") + ) + spec = importlib.util.spec_from_loader("transforms", loader) + module = importlib.util.module_from_spec(spec) + loader.exec_module(module) + return module + + +det_transforms = import_transforms_from_references("detection") + + +class TestRefDetTransforms: + def make_tv_tensors(self, with_mask=True): + size = (600, 800) + num_objects = 22 + + def make_label(extra_dims, categories): + return torch.randint(categories, extra_dims, dtype=torch.int64) + + pil_image = to_pil_image(make_image(size=size, color_space="RGB")) + target = { + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "labels": make_label(extra_dims=(num_objects,), categories=80), + } + if with_mask: + target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) + + yield (pil_image, target) + + tensor_image = torch.Tensor(make_image(size=size, color_space="RGB", dtype=torch.float32)) + target = { + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "labels": make_label(extra_dims=(num_objects,), categories=80), + } + if with_mask: + target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) + + yield (tensor_image, target) + + tv_tensor_image = make_image(size=size, color_space="RGB", dtype=torch.float32) + target = { + "boxes": make_bounding_boxes(canvas_size=size, format="XYXY", batch_dims=(num_objects,), dtype=torch.float), + "labels": make_label(extra_dims=(num_objects,), categories=80), + } + if with_mask: + target["masks"] = make_detection_mask(size=size, num_objects=num_objects, dtype=torch.long) + 
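+        # Together with the two yields above, the final yield below covers PIL, pure tensor, and
+        # tv_tensors.Image inputs, so each reference/v2 transform pair in `test_transform` is
+        # exercised with every supported image type.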
+ yield (tv_tensor_image, target) + + @pytest.mark.parametrize( + "t_ref, t, data_kwargs", + [ + (det_transforms.RandomHorizontalFlip(p=1.0), v2_transforms.RandomHorizontalFlip(p=1.0), {}), + ( + det_transforms.RandomIoUCrop(), + v2_transforms.Compose( + [ + v2_transforms.RandomIoUCrop(), + v2_transforms.SanitizeBoundingBoxes(labels_getter=lambda sample: sample[1]["labels"]), + ] + ), + {"with_mask": False}, + ), + (det_transforms.RandomZoomOut(), v2_transforms.RandomZoomOut(), {"with_mask": False}), + (det_transforms.ScaleJitter((1024, 1024)), v2_transforms.ScaleJitter((1024, 1024), antialias=True), {}), + ( + det_transforms.RandomShortestSize( + min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 + ), + v2_transforms.RandomShortestSize( + min_size=(480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800), max_size=1333 + ), + {}, + ), + ], + ) + def test_transform(self, t_ref, t, data_kwargs): + for dp in self.make_tv_tensors(**data_kwargs): + + # We should use prototype transform first as reference transform performs inplace target update + torch.manual_seed(12) + output = t(dp) + + torch.manual_seed(12) + expected_output = t_ref(*dp) + + assert_equal(expected_output, output) + + +seg_transforms = import_transforms_from_references("segmentation") + + +# We need this transform for two reasons: +# 1. transforms.RandomCrop uses a different scheme to pad images and masks of insufficient size than its name +# counterpart in the detection references. Thus, we cannot use it with `pad_if_needed=True` +# 2. transforms.Pad only supports a fixed padding, but the segmentation datasets don't have a fixed image size. +class PadIfSmaller(v2_transforms.Transform): + def __init__(self, size, fill=0): + super().__init__() + self.size = size + self.fill = v2_transforms._geometry._setup_fill_arg(fill) + + def _get_params(self, sample): + height, width = query_size(sample) + padding = [0, 0, max(self.size - width, 0), max(self.size - height, 0)] + needs_padding = any(padding) + return dict(padding=padding, needs_padding=needs_padding) + + def _transform(self, inpt, params): + if not params["needs_padding"]: + return inpt + + fill = _get_fill(self.fill, type(inpt)) + return prototype_F.pad(inpt, padding=params["padding"], fill=fill) + + +class TestRefSegTransforms: + def make_tv_tensors(self, supports_pil=True, image_dtype=torch.uint8): + size = (256, 460) + num_categories = 21 + + conv_fns = [] + if supports_pil: + conv_fns.append(to_pil_image) + conv_fns.extend([torch.Tensor, lambda x: x]) + + for conv_fn in conv_fns: + tv_tensor_image = make_image(size=size, color_space="RGB", dtype=image_dtype) + tv_tensor_mask = make_segmentation_mask(size=size, num_categories=num_categories, dtype=torch.uint8) + + dp = (conv_fn(tv_tensor_image), tv_tensor_mask) + dp_ref = ( + to_pil_image(tv_tensor_image) if supports_pil else tv_tensor_image.as_subclass(torch.Tensor), + to_pil_image(tv_tensor_mask), + ) + + yield dp, dp_ref + + def set_seed(self, seed=12): + torch.manual_seed(seed) + random.seed(seed) + + def check(self, t, t_ref, data_kwargs=None): + for dp, dp_ref in self.make_tv_tensors(**data_kwargs or dict()): + + self.set_seed() + actual = actual_image, actual_mask = t(dp) + + self.set_seed() + expected_image, expected_mask = t_ref(*dp_ref) + if isinstance(actual_image, torch.Tensor) and not isinstance(expected_image, torch.Tensor): + expected_image = legacy_F.pil_to_tensor(expected_image) + expected_mask = legacy_F.pil_to_tensor(expected_mask).squeeze(0) + expected = (expected_image, 
expected_mask) + + assert_equal(actual, expected) + + @pytest.mark.parametrize( + ("t_ref", "t", "data_kwargs"), + [ + ( + seg_transforms.RandomHorizontalFlip(flip_prob=1.0), + v2_transforms.RandomHorizontalFlip(p=1.0), + dict(), + ), + ( + seg_transforms.RandomHorizontalFlip(flip_prob=0.0), + v2_transforms.RandomHorizontalFlip(p=0.0), + dict(), + ), + ( + seg_transforms.RandomCrop(size=480), + v2_transforms.Compose( + [ + PadIfSmaller(size=480, fill={tv_tensors.Mask: 255, "others": 0}), + v2_transforms.RandomCrop(size=480), + ] + ), + dict(), + ), + ( + seg_transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), + v2_transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)), + dict(supports_pil=False, image_dtype=torch.float), + ), + ], + ) + def test_common(self, t_ref, t, data_kwargs): + self.check(t, t_ref, data_kwargs) + + +@pytest.mark.parametrize( + ("legacy_dispatcher", "name_only_params"), + [ + (legacy_F.get_dimensions, {}), + (legacy_F.get_image_size, {}), + (legacy_F.get_image_num_channels, {}), + (legacy_F.to_tensor, {}), + (legacy_F.pil_to_tensor, {}), + (legacy_F.convert_image_dtype, {}), + (legacy_F.to_pil_image, {}), + (legacy_F.normalize, {}), + (legacy_F.resize, {"interpolation"}), + (legacy_F.pad, {"padding", "fill"}), + (legacy_F.crop, {}), + (legacy_F.center_crop, {}), + (legacy_F.resized_crop, {"interpolation"}), + (legacy_F.hflip, {}), + (legacy_F.perspective, {"startpoints", "endpoints", "fill", "interpolation"}), + (legacy_F.vflip, {}), + (legacy_F.five_crop, {}), + (legacy_F.ten_crop, {}), + (legacy_F.adjust_brightness, {}), + (legacy_F.adjust_contrast, {}), + (legacy_F.adjust_saturation, {}), + (legacy_F.adjust_hue, {}), + (legacy_F.adjust_gamma, {}), + (legacy_F.rotate, {"center", "fill", "interpolation"}), + (legacy_F.affine, {"angle", "translate", "center", "fill", "interpolation"}), + (legacy_F.to_grayscale, {}), + (legacy_F.rgb_to_grayscale, {}), + (legacy_F.to_tensor, {}), + (legacy_F.erase, {}), + (legacy_F.gaussian_blur, {}), + (legacy_F.invert, {}), + (legacy_F.posterize, {}), + (legacy_F.solarize, {}), + (legacy_F.adjust_sharpness, {}), + (legacy_F.autocontrast, {}), + (legacy_F.equalize, {}), + (legacy_F.elastic_transform, {"fill", "interpolation"}), + ], +) +def test_dispatcher_signature_consistency(legacy_dispatcher, name_only_params): + legacy_signature = inspect.signature(legacy_dispatcher) + legacy_params = list(legacy_signature.parameters.values())[1:] + + try: + prototype_dispatcher = getattr(prototype_F, legacy_dispatcher.__name__) + except AttributeError: + raise AssertionError( + f"Legacy dispatcher `F.{legacy_dispatcher.__name__}` has no prototype equivalent" + ) from None + + prototype_signature = inspect.signature(prototype_dispatcher) + prototype_params = list(prototype_signature.parameters.values())[1:] + + # Some dispatchers got extra parameters. This makes sure they have a default argument and thus are BC. We don't + # need to check if parameters were added in the middle rather than at the end, since that will be caught by the + # regular check below. + prototype_params, new_prototype_params = ( + prototype_params[: len(legacy_params)], + prototype_params[len(legacy_params) :], + ) + for param in new_prototype_params: + assert param.default is not param.empty + + # Some annotations were changed mostly to supersets of what was there before. Plus, some legacy dispatchers had no + # annotations. 
In these cases we simply drop the annotation and default argument from the comparison + for prototype_param, legacy_param in zip(prototype_params, legacy_params): + if legacy_param.name in name_only_params: + prototype_param._annotation = prototype_param._default = inspect.Parameter.empty + legacy_param._annotation = legacy_param._default = inspect.Parameter.empty + elif legacy_param.annotation is inspect.Parameter.empty: + prototype_param._annotation = inspect.Parameter.empty + + assert prototype_params == legacy_params diff --git a/test/test_transforms_v2_functional.py b/test/test_transforms_v2_functional.py new file mode 100644 index 0000000000000000000000000000000000000000..23f06475cf144bb73858155bdf9aef7021e4a5f4 --- /dev/null +++ b/test/test_transforms_v2_functional.py @@ -0,0 +1,958 @@ +import inspect +import math +import os +import re + +import numpy as np +import PIL.Image +import pytest +import torch + +from common_utils import assert_close, cache, cpu_and_cuda, needs_cuda, set_rng_seed +from torch.utils._pytree import tree_map +from torchvision import tv_tensors +from torchvision.transforms.functional import _get_perspective_coeffs +from torchvision.transforms.v2 import functional as F +from torchvision.transforms.v2._utils import is_pure_tensor +from torchvision.transforms.v2.functional._geometry import _center_crop_compute_padding +from torchvision.transforms.v2.functional._meta import clamp_bounding_boxes, convert_bounding_box_format +from transforms_v2_dispatcher_infos import DISPATCHER_INFOS +from transforms_v2_kernel_infos import KERNEL_INFOS +from transforms_v2_legacy_utils import ( + DEFAULT_SQUARE_SPATIAL_SIZE, + make_multiple_bounding_boxes, + parametrized_error_message, +) + + +KERNEL_INFOS_MAP = {info.kernel: info for info in KERNEL_INFOS} +DISPATCHER_INFOS_MAP = {info.dispatcher: info for info in DISPATCHER_INFOS} + + +@cache +def script(fn): + try: + return torch.jit.script(fn) + except Exception as error: + raise AssertionError(f"Trying to `torch.jit.script` '{fn.__name__}' raised the error above.") from error + + +# Scripting a function often triggers a warning like +# `UserWarning: operator() profile_node %$INT1 : int[] = prim::profile_ivalue($INT2) does not have profile information` +# with varying `INT1` and `INT2`. Since these are uninteresting for us and only clutter the test summary, we ignore +# them. 
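+# The mark below uses pytest's "action:message:category" filter syntax. It is roughly equivalent
+# to running each decorated test under a filter like (sketch, assuming `warnings` is imported):
+#
+#   warnings.filterwarnings(
+#       "ignore", message=re.escape("operator() profile_node %"), category=UserWarning
+#   )
+#
+# The message part is interpreted as a regular expression, hence the `re.escape` around the
+# parentheses in the warning text.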
+ignore_jit_warning_no_profile = pytest.mark.filterwarnings( + f"ignore:{re.escape('operator() profile_node %')}:UserWarning" +) + + +def make_info_args_kwargs_params(info, *, args_kwargs_fn, test_id=None): + args_kwargs = list(args_kwargs_fn(info)) + if not args_kwargs: + raise pytest.UsageError( + f"Couldn't collect a single `ArgsKwargs` for `{info.id}`{f' in {test_id}' if test_id else ''}" + ) + idx_field_len = len(str(len(args_kwargs))) + return [ + pytest.param( + info, + args_kwargs_, + marks=info.get_marks(test_id, args_kwargs_) if test_id else [], + id=f"{info.id}-{idx:0{idx_field_len}}", + ) + for idx, args_kwargs_ in enumerate(args_kwargs) + ] + + +def make_info_args_kwargs_parametrization(infos, *, args_kwargs_fn): + def decorator(test_fn): + parts = test_fn.__qualname__.split(".") + if len(parts) == 1: + test_class_name = None + test_function_name = parts[0] + elif len(parts) == 2: + test_class_name, test_function_name = parts + else: + raise pytest.UsageError("Unable to parse the test class name and test function name from test function") + test_id = (test_class_name, test_function_name) + + argnames = ("info", "args_kwargs") + argvalues = [] + for info in infos: + argvalues.extend(make_info_args_kwargs_params(info, args_kwargs_fn=args_kwargs_fn, test_id=test_id)) + + return pytest.mark.parametrize(argnames, argvalues)(test_fn) + + return decorator + + +@pytest.fixture(autouse=True) +def fix_rng_seed(): + set_rng_seed(0) + yield + + +@pytest.fixture() +def test_id(request): + test_class_name = request.cls.__name__ if request.cls is not None else None + test_function_name = request.node.originalname + return test_class_name, test_function_name + + +class TestKernels: + sample_inputs = make_info_args_kwargs_parametrization( + KERNEL_INFOS, + args_kwargs_fn=lambda kernel_info: kernel_info.sample_inputs_fn(), + ) + reference_inputs = make_info_args_kwargs_parametrization( + [info for info in KERNEL_INFOS if info.reference_fn is not None], + args_kwargs_fn=lambda info: info.reference_inputs_fn(), + ) + + @make_info_args_kwargs_parametrization( + [info for info in KERNEL_INFOS if info.logs_usage], + args_kwargs_fn=lambda info: info.sample_inputs_fn(), + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_logging(self, spy_on, info, args_kwargs, device): + spy = spy_on(torch._C._log_api_usage_once) + + (input, *other_args), kwargs = args_kwargs.load(device) + info.kernel(input.as_subclass(torch.Tensor), *other_args, **kwargs) + + spy.assert_any_call(f"{info.kernel.__module__}.{info.id}") + + @ignore_jit_warning_no_profile + @sample_inputs + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_scripted_vs_eager(self, test_id, info, args_kwargs, device): + kernel_eager = info.kernel + kernel_scripted = script(kernel_eager) + + (input, *other_args), kwargs = args_kwargs.load(device) + input = input.as_subclass(torch.Tensor) + + actual = kernel_scripted(input, *other_args, **kwargs) + expected = kernel_eager(input, *other_args, **kwargs) + + assert_close( + actual, + expected, + **info.get_closeness_kwargs(test_id, dtype=input.dtype, device=input.device), + msg=parametrized_error_message(input, other_args, **kwargs), + ) + + def _unbatch(self, batch, *, data_dims): + if isinstance(batch, torch.Tensor): + batched_tensor = batch + metadata = () + else: + batched_tensor, *metadata = batch + + if batched_tensor.ndim == data_dims: + return batch + + return [ + self._unbatch(unbatched, data_dims=data_dims) + for unbatched in ( + batched_tensor.unbind(0) if not metadata 
else [(t, *metadata) for t in batched_tensor.unbind(0)] + ) + ] + + @sample_inputs + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_batched_vs_single(self, test_id, info, args_kwargs, device): + (batched_input, *other_args), kwargs = args_kwargs.load(device) + + tv_tensor_type = tv_tensors.Image if is_pure_tensor(batched_input) else type(batched_input) + # This dictionary contains the number of rightmost dimensions that contain the actual data. + # Everything to the left is considered a batch dimension. + data_dims = { + tv_tensors.Image: 3, + tv_tensors.BoundingBoxes: 1, + # `Mask`'s are special in the sense that the data dimensions depend on the type of mask. For detection masks + # it is 3 `(*, N, H, W)`, but for segmentation masks it is 2 `(*, H, W)`. Since both a grouped under one + # type all kernels should also work without differentiating between the two. Thus, we go with 2 here as + # common ground. + tv_tensors.Mask: 2, + tv_tensors.Video: 4, + }.get(tv_tensor_type) + if data_dims is None: + raise pytest.UsageError( + f"The number of data dimensions cannot be determined for input of type {tv_tensor_type.__name__}." + ) from None + elif batched_input.ndim <= data_dims: + pytest.skip("Input is not batched.") + elif not all(batched_input.shape[:-data_dims]): + pytest.skip("Input has a degenerate batch shape.") + + batched_input = batched_input.as_subclass(torch.Tensor) + batched_output = info.kernel(batched_input, *other_args, **kwargs) + actual = self._unbatch(batched_output, data_dims=data_dims) + + single_inputs = self._unbatch(batched_input, data_dims=data_dims) + expected = tree_map(lambda single_input: info.kernel(single_input, *other_args, **kwargs), single_inputs) + + assert_close( + actual, + expected, + **info.get_closeness_kwargs(test_id, dtype=batched_input.dtype, device=batched_input.device), + msg=parametrized_error_message(batched_input, *other_args, **kwargs), + ) + + @sample_inputs + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_no_inplace(self, info, args_kwargs, device): + (input, *other_args), kwargs = args_kwargs.load(device) + input = input.as_subclass(torch.Tensor) + + if input.numel() == 0: + pytest.skip("The input has a degenerate shape.") + + input_version = input._version + info.kernel(input, *other_args, **kwargs) + + assert input._version == input_version + + @sample_inputs + @needs_cuda + def test_cuda_vs_cpu(self, test_id, info, args_kwargs): + (input_cpu, *other_args), kwargs = args_kwargs.load("cpu") + input_cpu = input_cpu.as_subclass(torch.Tensor) + input_cuda = input_cpu.to("cuda") + + output_cpu = info.kernel(input_cpu, *other_args, **kwargs) + output_cuda = info.kernel(input_cuda, *other_args, **kwargs) + + assert_close( + output_cuda, + output_cpu, + check_device=False, + **info.get_closeness_kwargs(test_id, dtype=input_cuda.dtype, device=input_cuda.device), + msg=parametrized_error_message(input_cpu, *other_args, **kwargs), + ) + + @sample_inputs + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_dtype_and_device_consistency(self, info, args_kwargs, device): + (input, *other_args), kwargs = args_kwargs.load(device) + input = input.as_subclass(torch.Tensor) + + output = info.kernel(input, *other_args, **kwargs) + # Most kernels just return a tensor, but some also return some additional metadata + if not isinstance(output, torch.Tensor): + output, *_ = output + + assert output.dtype == input.dtype + assert output.device == input.device + + @reference_inputs + def test_against_reference(self, test_id, 
info, args_kwargs): + (input, *other_args), kwargs = args_kwargs.load("cpu") + + actual = info.kernel(input.as_subclass(torch.Tensor), *other_args, **kwargs) + # We intnetionally don't unwrap the input of the reference function in order for it to have access to all + # metadata regardless of whether the kernel takes it explicitly or not + expected = info.reference_fn(input, *other_args, **kwargs) + + assert_close( + actual, + expected, + **info.get_closeness_kwargs(test_id, dtype=input.dtype, device=input.device), + msg=parametrized_error_message(input, *other_args, **kwargs), + ) + + @make_info_args_kwargs_parametrization( + [info for info in KERNEL_INFOS if info.float32_vs_uint8], + args_kwargs_fn=lambda info: info.reference_inputs_fn(), + ) + def test_float32_vs_uint8(self, test_id, info, args_kwargs): + (input, *other_args), kwargs = args_kwargs.load("cpu") + input = input.as_subclass(torch.Tensor) + + if input.dtype != torch.uint8: + pytest.skip(f"Input dtype is {input.dtype}.") + + adapted_other_args, adapted_kwargs = info.float32_vs_uint8(other_args, kwargs) + + actual = info.kernel( + F.to_dtype_image(input, dtype=torch.float32, scale=True), + *adapted_other_args, + **adapted_kwargs, + ) + + expected = F.to_dtype_image(info.kernel(input, *other_args, **kwargs), dtype=torch.float32, scale=True) + + assert_close( + actual, + expected, + **info.get_closeness_kwargs(test_id, dtype=torch.float32, device=input.device), + msg=parametrized_error_message(input, *other_args, **kwargs), + ) + + +@pytest.fixture +def spy_on(mocker): + def make_spy(fn, *, module=None, name=None): + # TODO: we can probably get rid of the non-default modules and names if we eliminate aliasing + module = module or fn.__module__ + name = name or fn.__name__ + spy = mocker.patch(f"{module}.{name}", wraps=fn) + return spy + + return make_spy + + +class TestDispatchers: + image_sample_inputs = make_info_args_kwargs_parametrization( + [info for info in DISPATCHER_INFOS if tv_tensors.Image in info.kernels], + args_kwargs_fn=lambda info: info.sample_inputs(tv_tensors.Image), + ) + + @make_info_args_kwargs_parametrization( + DISPATCHER_INFOS, + args_kwargs_fn=lambda info: info.sample_inputs(), + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_logging(self, spy_on, info, args_kwargs, device): + spy = spy_on(torch._C._log_api_usage_once) + + args, kwargs = args_kwargs.load(device) + info.dispatcher(*args, **kwargs) + + spy.assert_any_call(f"{info.dispatcher.__module__}.{info.id}") + + @ignore_jit_warning_no_profile + @image_sample_inputs + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_scripted_smoke(self, info, args_kwargs, device): + dispatcher = script(info.dispatcher) + + (image_tv_tensor, *other_args), kwargs = args_kwargs.load(device) + image_pure_tensor = torch.Tensor(image_tv_tensor) + + dispatcher(image_pure_tensor, *other_args, **kwargs) + + # TODO: We need this until the dispatchers below also have `DispatcherInfo`'s. If they do, `test_scripted_smoke` + # replaces this test for them. 
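+    # Until then, the test below only checks that the listed dispatchers can be scripted at all;
+    # unlike `test_scripted_smoke` above, it does not invoke the scripted functions on any inputs.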
+ @ignore_jit_warning_no_profile + @pytest.mark.parametrize( + "dispatcher", + [ + F.get_dimensions, + F.get_image_num_channels, + F.get_image_size, + F.get_num_channels, + F.get_num_frames, + F.get_size, + F.rgb_to_grayscale, + F.uniform_temporal_subsample, + ], + ids=lambda dispatcher: dispatcher.__name__, + ) + def test_scriptable(self, dispatcher): + script(dispatcher) + + @image_sample_inputs + def test_pure_tensor_output_type(self, info, args_kwargs): + (image_tv_tensor, *other_args), kwargs = args_kwargs.load() + image_pure_tensor = image_tv_tensor.as_subclass(torch.Tensor) + + output = info.dispatcher(image_pure_tensor, *other_args, **kwargs) + + # We cannot use `isinstance` here since all tv_tensors are instances of `torch.Tensor` as well + assert type(output) is torch.Tensor + + @make_info_args_kwargs_parametrization( + [info for info in DISPATCHER_INFOS if info.pil_kernel_info is not None], + args_kwargs_fn=lambda info: info.sample_inputs(tv_tensors.Image), + ) + def test_pil_output_type(self, info, args_kwargs): + (image_tv_tensor, *other_args), kwargs = args_kwargs.load() + + if image_tv_tensor.ndim > 3: + pytest.skip("Input is batched") + + image_pil = F.to_pil_image(image_tv_tensor) + + output = info.dispatcher(image_pil, *other_args, **kwargs) + + assert isinstance(output, PIL.Image.Image) + + @make_info_args_kwargs_parametrization( + DISPATCHER_INFOS, + args_kwargs_fn=lambda info: info.sample_inputs(), + ) + def test_tv_tensor_output_type(self, info, args_kwargs): + (tv_tensor, *other_args), kwargs = args_kwargs.load() + + output = info.dispatcher(tv_tensor, *other_args, **kwargs) + + assert isinstance(output, type(tv_tensor)) + + if isinstance(tv_tensor, tv_tensors.BoundingBoxes) and info.dispatcher is not F.convert_bounding_box_format: + assert output.format == tv_tensor.format + + @pytest.mark.parametrize( + ("dispatcher_info", "tv_tensor_type", "kernel_info"), + [ + pytest.param( + dispatcher_info, tv_tensor_type, kernel_info, id=f"{dispatcher_info.id}-{tv_tensor_type.__name__}" + ) + for dispatcher_info in DISPATCHER_INFOS + for tv_tensor_type, kernel_info in dispatcher_info.kernel_infos.items() + ], + ) + def test_dispatcher_kernel_signatures_consistency(self, dispatcher_info, tv_tensor_type, kernel_info): + dispatcher_signature = inspect.signature(dispatcher_info.dispatcher) + dispatcher_params = list(dispatcher_signature.parameters.values())[1:] + + kernel_signature = inspect.signature(kernel_info.kernel) + kernel_params = list(kernel_signature.parameters.values())[1:] + + # We filter out metadata that is implicitly passed to the dispatcher through the input tv_tensor, but has to be + # explicitly passed to the kernel. + input_type = {v: k for k, v in dispatcher_info.kernels.items()}.get(kernel_info.kernel) + explicit_metadata = { + tv_tensors.BoundingBoxes: {"format", "canvas_size"}, + } + kernel_params = [param for param in kernel_params if param.name not in explicit_metadata.get(input_type, set())] + + dispatcher_params = iter(dispatcher_params) + for dispatcher_param, kernel_param in zip(dispatcher_params, kernel_params): + try: + # In general, the dispatcher parameters are a superset of the kernel parameters. Thus, we filter out + # dispatcher parameters that have no kernel equivalent while keeping the order intact. 
+ while dispatcher_param.name != kernel_param.name: + dispatcher_param = next(dispatcher_params) + except StopIteration: + raise AssertionError( + f"Parameter `{kernel_param.name}` of kernel `{kernel_info.id}` " + f"has no corresponding parameter on the dispatcher `{dispatcher_info.id}`." + ) from None + + assert dispatcher_param == kernel_param + + @pytest.mark.parametrize("info", DISPATCHER_INFOS, ids=lambda info: info.id) + def test_unkown_type(self, info): + unkown_input = object() + (_, *other_args), kwargs = next(iter(info.sample_inputs())).load("cpu") + + with pytest.raises(TypeError, match=re.escape(str(type(unkown_input)))): + info.dispatcher(unkown_input, *other_args, **kwargs) + + @make_info_args_kwargs_parametrization( + [ + info + for info in DISPATCHER_INFOS + if tv_tensors.BoundingBoxes in info.kernels and info.dispatcher is not F.convert_bounding_box_format + ], + args_kwargs_fn=lambda info: info.sample_inputs(tv_tensors.BoundingBoxes), + ) + def test_bounding_boxes_format_consistency(self, info, args_kwargs): + (bounding_boxes, *other_args), kwargs = args_kwargs.load() + format = bounding_boxes.format + + output = info.dispatcher(bounding_boxes, *other_args, **kwargs) + + assert output.format == format + + +@pytest.mark.parametrize( + ("alias", "target"), + [ + pytest.param(alias, target, id=alias.__name__) + for alias, target in [ + (F.hflip, F.horizontal_flip), + (F.vflip, F.vertical_flip), + (F.get_image_num_channels, F.get_num_channels), + (F.to_pil_image, F.to_pil_image), + (F.elastic_transform, F.elastic), + (F.to_grayscale, F.rgb_to_grayscale), + ] + ], +) +def test_alias(alias, target): + assert alias is target + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("num_channels", [1, 3]) +def test_normalize_image_tensor_stats(device, num_channels): + stats = pytest.importorskip("scipy.stats", reason="SciPy is not available") + + def assert_samples_from_standard_normal(t): + p_value = stats.kstest(t.flatten(), cdf="norm", args=(0, 1)).pvalue + return p_value > 1e-4 + + image = torch.rand(num_channels, DEFAULT_SQUARE_SPATIAL_SIZE, DEFAULT_SQUARE_SPATIAL_SIZE) + mean = image.mean(dim=(1, 2)).tolist() + std = image.std(dim=(1, 2)).tolist() + + assert_samples_from_standard_normal(F.normalize_image(image, mean, std)) + + +class TestClampBoundingBoxes: + @pytest.mark.parametrize( + "metadata", + [ + dict(), + dict(format=tv_tensors.BoundingBoxFormat.XYXY), + dict(canvas_size=(1, 1)), + ], + ) + def test_pure_tensor_insufficient_metadata(self, metadata): + pure_tensor = next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor) + + with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` has to be passed")): + F.clamp_bounding_boxes(pure_tensor, **metadata) + + @pytest.mark.parametrize( + "metadata", + [ + dict(format=tv_tensors.BoundingBoxFormat.XYXY), + dict(canvas_size=(1, 1)), + dict(format=tv_tensors.BoundingBoxFormat.XYXY, canvas_size=(1, 1)), + ], + ) + def test_tv_tensor_explicit_metadata(self, metadata): + tv_tensor = next(make_multiple_bounding_boxes()) + + with pytest.raises(ValueError, match=re.escape("`format` and `canvas_size` must not be passed")): + F.clamp_bounding_boxes(tv_tensor, **metadata) + + +class TestConvertFormatBoundingBoxes: + @pytest.mark.parametrize( + ("inpt", "old_format"), + [ + (next(make_multiple_bounding_boxes()), None), + (next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor), tv_tensors.BoundingBoxFormat.XYXY), + ], + ) + def test_missing_new_format(self, inpt, old_format): + 
with pytest.raises(TypeError, match=re.escape("missing 1 required argument: 'new_format'")): + F.convert_bounding_box_format(inpt, old_format) + + def test_pure_tensor_insufficient_metadata(self): + pure_tensor = next(make_multiple_bounding_boxes()).as_subclass(torch.Tensor) + + with pytest.raises(ValueError, match=re.escape("`old_format` has to be passed")): + F.convert_bounding_box_format(pure_tensor, new_format=tv_tensors.BoundingBoxFormat.CXCYWH) + + def test_tv_tensor_explicit_metadata(self): + tv_tensor = next(make_multiple_bounding_boxes()) + + with pytest.raises(ValueError, match=re.escape("`old_format` must not be passed")): + F.convert_bounding_box_format( + tv_tensor, old_format=tv_tensor.format, new_format=tv_tensors.BoundingBoxFormat.CXCYWH + ) + + +# TODO: All correctness checks below this line should be ported to be references on a `KernelInfo` in +# `transforms_v2_kernel_infos.py` + + +def _compute_affine_matrix(angle_, translate_, scale_, shear_, center_): + rot = math.radians(angle_) + cx, cy = center_ + tx, ty = translate_ + sx, sy = [math.radians(sh_) for sh_ in shear_] + + c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]]) + t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]]) + c_matrix_inv = np.linalg.inv(c_matrix) + rs_matrix = np.array( + [ + [scale_ * math.cos(rot), -scale_ * math.sin(rot), 0], + [scale_ * math.sin(rot), scale_ * math.cos(rot), 0], + [0, 0, 1], + ] + ) + shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]]) + shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]]) + rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix)) + true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv))) + return true_matrix + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_correctness_vertical_flip_segmentation_mask_on_fixed_input(device): + mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) + mask[:, 0, :] = 1 + + out_mask = F.vertical_flip_mask(mask) + + expected_mask = torch.zeros((3, 3, 3), dtype=torch.long, device=device) + expected_mask[:, -1, :] = 1 + torch.testing.assert_close(out_mask, expected_mask) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize( + "format", + [tv_tensors.BoundingBoxFormat.XYXY, tv_tensors.BoundingBoxFormat.XYWH, tv_tensors.BoundingBoxFormat.CXCYWH], +) +@pytest.mark.parametrize( + "top, left, height, width, size", + [ + [0, 0, 30, 30, (60, 60)], + [-5, 5, 35, 45, (32, 34)], + ], +) +def test_correctness_resized_crop_bounding_boxes(device, format, top, left, height, width, size): + def _compute_expected_bbox(bbox, top_, left_, height_, width_, size_): + # bbox should be xyxy + bbox[0] = (bbox[0] - left_) * size_[1] / width_ + bbox[1] = (bbox[1] - top_) * size_[0] / height_ + bbox[2] = (bbox[2] - left_) * size_[1] / width_ + bbox[3] = (bbox[3] - top_) * size_[0] / height_ + return bbox + + format = tv_tensors.BoundingBoxFormat.XYXY + canvas_size = (100, 100) + in_boxes = [ + [10.0, 10.0, 20.0, 20.0], + [5.0, 10.0, 15.0, 20.0], + ] + expected_bboxes = [] + for in_box in in_boxes: + expected_bboxes.append(_compute_expected_bbox(list(in_box), top, left, height, width, size)) + expected_bboxes = torch.tensor(expected_bboxes, device=device) + + in_boxes = tv_tensors.BoundingBoxes( + in_boxes, format=tv_tensors.BoundingBoxFormat.XYXY, canvas_size=canvas_size, device=device + ) + if format != tv_tensors.BoundingBoxFormat.XYXY: + in_boxes = convert_bounding_box_format(in_boxes, 
tv_tensors.BoundingBoxFormat.XYXY, format) + + output_boxes, output_canvas_size = F.resized_crop_bounding_boxes(in_boxes, format, top, left, height, width, size) + + if format != tv_tensors.BoundingBoxFormat.XYXY: + output_boxes = convert_bounding_box_format(output_boxes, format, tv_tensors.BoundingBoxFormat.XYXY) + + torch.testing.assert_close(output_boxes, expected_bboxes) + torch.testing.assert_close(output_canvas_size, size) + + +def _parse_padding(padding): + if isinstance(padding, int): + return [padding] * 4 + if isinstance(padding, list): + if len(padding) == 1: + return padding * 4 + if len(padding) == 2: + return padding * 2 # [left, up, right, down] + + return padding + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("padding", [[1], [1, 1], [1, 1, 2, 2]]) +def test_correctness_pad_bounding_boxes(device, padding): + def _compute_expected_bbox(bbox, format, padding_): + pad_left, pad_up, _, _ = _parse_padding(padding_) + + dtype = bbox.dtype + bbox = ( + bbox.clone() + if format == tv_tensors.BoundingBoxFormat.XYXY + else convert_bounding_box_format(bbox, old_format=format, new_format=tv_tensors.BoundingBoxFormat.XYXY) + ) + + bbox[0::2] += pad_left + bbox[1::2] += pad_up + + bbox = convert_bounding_box_format(bbox, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format) + if bbox.dtype != dtype: + # Temporary cast to original dtype + # e.g. float32 -> int + bbox = bbox.to(dtype) + return bbox + + def _compute_expected_canvas_size(bbox, padding_): + pad_left, pad_up, pad_right, pad_down = _parse_padding(padding_) + height, width = bbox.canvas_size + return height + pad_up + pad_down, width + pad_left + pad_right + + for bboxes in make_multiple_bounding_boxes(extra_dims=((4,),)): + bboxes = bboxes.to(device) + bboxes_format = bboxes.format + bboxes_canvas_size = bboxes.canvas_size + + output_boxes, output_canvas_size = F.pad_bounding_boxes( + bboxes, format=bboxes_format, canvas_size=bboxes_canvas_size, padding=padding + ) + + torch.testing.assert_close(output_canvas_size, _compute_expected_canvas_size(bboxes, padding)) + + expected_bboxes = torch.stack( + [_compute_expected_bbox(b, bboxes_format, padding) for b in bboxes.reshape(-1, 4).unbind()] + ).reshape(bboxes.shape) + + torch.testing.assert_close(output_boxes, expected_bboxes, atol=1, rtol=0) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_correctness_pad_segmentation_mask_on_fixed_input(device): + mask = torch.ones((1, 3, 3), dtype=torch.long, device=device) + + out_mask = F.pad_mask(mask, padding=[1, 1, 1, 1]) + + expected_mask = torch.zeros((1, 5, 5), dtype=torch.long, device=device) + expected_mask[:, 1:-1, 1:-1] = 1 + torch.testing.assert_close(out_mask, expected_mask) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize( + "startpoints, endpoints", + [ + [[[0, 0], [33, 0], [33, 25], [0, 25]], [[3, 2], [32, 3], [30, 24], [2, 25]]], + [[[3, 2], [32, 3], [30, 24], [2, 25]], [[0, 0], [33, 0], [33, 25], [0, 25]]], + [[[3, 2], [32, 3], [30, 24], [2, 25]], [[5, 5], [30, 3], [33, 19], [4, 25]]], + ], +) +def test_correctness_perspective_bounding_boxes(device, startpoints, endpoints): + def _compute_expected_bbox(bbox, format_, canvas_size_, pcoeffs_): + m1 = np.array( + [ + [pcoeffs_[0], pcoeffs_[1], pcoeffs_[2]], + [pcoeffs_[3], pcoeffs_[4], pcoeffs_[5]], + ] + ) + m2 = np.array( + [ + [pcoeffs_[6], pcoeffs_[7], 1.0], + [pcoeffs_[6], pcoeffs_[7], 1.0], + ] + ) + + bbox_xyxy = convert_bounding_box_format(bbox, old_format=format_, 
new_format=tv_tensors.BoundingBoxFormat.XYXY) + points = np.array( + [ + [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], + [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], + [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], + [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], + ] + ) + numer = np.matmul(points, m1.T) + denom = np.matmul(points, m2.T) + transformed_points = numer / denom + out_bbox = np.array( + [ + np.min(transformed_points[:, 0]), + np.min(transformed_points[:, 1]), + np.max(transformed_points[:, 0]), + np.max(transformed_points[:, 1]), + ] + ) + out_bbox = torch.from_numpy(out_bbox) + out_bbox = convert_bounding_box_format( + out_bbox, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format_ + ) + return clamp_bounding_boxes(out_bbox, format=format_, canvas_size=canvas_size_).to(bbox) + + canvas_size = (32, 38) + + pcoeffs = _get_perspective_coeffs(startpoints, endpoints) + inv_pcoeffs = _get_perspective_coeffs(endpoints, startpoints) + + for bboxes in make_multiple_bounding_boxes(spatial_size=canvas_size, extra_dims=((4,),)): + bboxes = bboxes.to(device) + + output_bboxes = F.perspective_bounding_boxes( + bboxes.as_subclass(torch.Tensor), + format=bboxes.format, + canvas_size=bboxes.canvas_size, + startpoints=None, + endpoints=None, + coefficients=pcoeffs, + ) + + expected_bboxes = torch.stack( + [ + _compute_expected_bbox(b, bboxes.format, bboxes.canvas_size, inv_pcoeffs) + for b in bboxes.reshape(-1, 4).unbind() + ] + ).reshape(bboxes.shape) + + torch.testing.assert_close(output_bboxes, expected_bboxes, rtol=0, atol=1) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize( + "output_size", + [(18, 18), [18, 15], (16, 19), [12], [46, 48]], +) +def test_correctness_center_crop_bounding_boxes(device, output_size): + def _compute_expected_bbox(bbox, format_, canvas_size_, output_size_): + dtype = bbox.dtype + bbox = convert_bounding_box_format(bbox.float(), format_, tv_tensors.BoundingBoxFormat.XYWH) + + if len(output_size_) == 1: + output_size_.append(output_size_[-1]) + + cy = int(round((canvas_size_[0] - output_size_[0]) * 0.5)) + cx = int(round((canvas_size_[1] - output_size_[1]) * 0.5)) + out_bbox = [ + bbox[0].item() - cx, + bbox[1].item() - cy, + bbox[2].item(), + bbox[3].item(), + ] + out_bbox = torch.tensor(out_bbox) + out_bbox = convert_bounding_box_format(out_bbox, tv_tensors.BoundingBoxFormat.XYWH, format_) + out_bbox = clamp_bounding_boxes(out_bbox, format=format_, canvas_size=output_size) + return out_bbox.to(dtype=dtype, device=bbox.device) + + for bboxes in make_multiple_bounding_boxes(extra_dims=((4,),)): + bboxes = bboxes.to(device) + bboxes_format = bboxes.format + bboxes_canvas_size = bboxes.canvas_size + + output_boxes, output_canvas_size = F.center_crop_bounding_boxes( + bboxes, bboxes_format, bboxes_canvas_size, output_size + ) + + expected_bboxes = torch.stack( + [ + _compute_expected_bbox(b, bboxes_format, bboxes_canvas_size, output_size) + for b in bboxes.reshape(-1, 4).unbind() + ] + ).reshape(bboxes.shape) + + torch.testing.assert_close(output_boxes, expected_bboxes, atol=1, rtol=0) + torch.testing.assert_close(output_canvas_size, output_size) + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("output_size", [[4, 2], [4], [7, 6]]) +def test_correctness_center_crop_mask(device, output_size): + def _compute_expected_mask(mask, output_size): + crop_height, crop_width = output_size if len(output_size) > 1 else [output_size[0], output_size[0]] + + _, image_height, image_width = 
mask.shape + if crop_width > image_height or crop_height > image_width: + padding = _center_crop_compute_padding(crop_height, crop_width, image_height, image_width) + mask = F.pad_image(mask, padding, fill=0) + + left = round((image_width - crop_width) * 0.5) + top = round((image_height - crop_height) * 0.5) + + return mask[:, top : top + crop_height, left : left + crop_width] + + mask = torch.randint(0, 2, size=(1, 6, 6), dtype=torch.long, device=device) + actual = F.center_crop_mask(mask, output_size) + + expected = _compute_expected_mask(mask, output_size) + torch.testing.assert_close(expected, actual) + + +# Copied from test/test_functional_tensor.py +@pytest.mark.parametrize("device", cpu_and_cuda()) +@pytest.mark.parametrize("canvas_size", ("small", "large")) +@pytest.mark.parametrize("dt", [None, torch.float32, torch.float64, torch.float16]) +@pytest.mark.parametrize("ksize", [(3, 3), [3, 5], (23, 23)]) +@pytest.mark.parametrize("sigma", [[0.5, 0.5], (0.5, 0.5), (0.8, 0.8), (1.7, 1.7)]) +def test_correctness_gaussian_blur_image_tensor(device, canvas_size, dt, ksize, sigma): + fn = F.gaussian_blur_image + + # true_cv2_results = { + # # np_img = np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3)) + # # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.8) + # "3_3_0.8": ... + # # cv2.GaussianBlur(np_img, ksize=(3, 3), sigmaX=0.5) + # "3_3_0.5": ... + # # cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.8) + # "3_5_0.8": ... + # # cv2.GaussianBlur(np_img, ksize=(3, 5), sigmaX=0.5) + # "3_5_0.5": ... + # # np_img2 = np.arange(26 * 28, dtype="uint8").reshape((26, 28)) + # # cv2.GaussianBlur(np_img2, ksize=(23, 23), sigmaX=1.7) + # "23_23_1.7": ... + # } + p = os.path.join(os.path.dirname(os.path.abspath(__file__)), "assets", "gaussian_blur_opencv_results.pt") + true_cv2_results = torch.load(p) + + if canvas_size == "small": + tensor = ( + torch.from_numpy(np.arange(3 * 10 * 12, dtype="uint8").reshape((10, 12, 3))).permute(2, 0, 1).to(device) + ) + else: + tensor = torch.from_numpy(np.arange(26 * 28, dtype="uint8").reshape((1, 26, 28))).to(device) + + if dt == torch.float16 and device == "cpu": + # skip float16 on CPU case + return + + if dt is not None: + tensor = tensor.to(dtype=dt) + + _ksize = (ksize, ksize) if isinstance(ksize, int) else ksize + _sigma = sigma[0] if sigma is not None else None + shape = tensor.shape + gt_key = f"{shape[-2]}_{shape[-1]}_{shape[-3]}__{_ksize[0]}_{_ksize[1]}_{_sigma}" + if gt_key not in true_cv2_results: + return + + true_out = ( + torch.tensor(true_cv2_results[gt_key]).reshape(shape[-2], shape[-1], shape[-3]).permute(2, 0, 1).to(tensor) + ) + + image = tv_tensors.Image(tensor) + + out = fn(image, kernel_size=ksize, sigma=sigma) + torch.testing.assert_close(out, true_out, rtol=0.0, atol=1.0, msg=f"{ksize}, {sigma}") + + +@pytest.mark.parametrize( + "inpt", + [ + 127 * np.ones((32, 32, 3), dtype="uint8"), + PIL.Image.new("RGB", (32, 32), 122), + ], +) +def test_to_image(inpt): + output = F.to_image(inpt) + assert isinstance(output, torch.Tensor) + assert output.shape == (3, 32, 32) + + assert np.asarray(inpt).sum() == output.sum().item() + + +@pytest.mark.parametrize( + "inpt", + [ + torch.randint(0, 256, size=(3, 32, 32), dtype=torch.uint8), + 127 * np.ones((32, 32, 3), dtype="uint8"), + ], +) +@pytest.mark.parametrize("mode", [None, "RGB"]) +def test_to_pil_image(inpt, mode): + output = F.to_pil_image(inpt, mode=mode) + assert isinstance(output, PIL.Image.Image) + + assert np.asarray(inpt).sum() == np.asarray(output).sum() + + +def 
test_equalize_image_tensor_edge_cases(): + inpt = torch.zeros(3, 200, 200, dtype=torch.uint8) + output = F.equalize_image(inpt) + torch.testing.assert_close(inpt, output) + + inpt = torch.zeros(5, 3, 200, 200, dtype=torch.uint8) + inpt[..., 100:, 100:] = 1 + output = F.equalize_image(inpt) + assert output.unique().tolist() == [0, 255] + + +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_correctness_uniform_temporal_subsample(device): + video = torch.arange(10, device=device)[:, None, None, None].expand(-1, 3, 8, 8) + out_video = F.uniform_temporal_subsample(video, 5) + assert out_video.unique().tolist() == [0, 2, 4, 6, 9] + + out_video = F.uniform_temporal_subsample(video, 8) + assert out_video.unique().tolist() == [0, 1, 2, 3, 5, 6, 7, 9] diff --git a/test/test_transforms_v2_refactored.py b/test/test_transforms_v2_refactored.py new file mode 100644 index 0000000000000000000000000000000000000000..e18beb35a484353af52771b1ed7e458b6a5ceeeb --- /dev/null +++ b/test/test_transforms_v2_refactored.py @@ -0,0 +1,2909 @@ +import contextlib +import decimal +import inspect +import math +import pickle +import re +from pathlib import Path +from unittest import mock + +import numpy as np +import PIL.Image +import pytest + +import torch +import torchvision.transforms.v2 as transforms +from common_utils import ( + assert_equal, + assert_no_warnings, + cache, + cpu_and_cuda, + freeze_rng_state, + ignore_jit_no_profile_information_warning, + make_bounding_boxes, + make_detection_mask, + make_image, + make_image_pil, + make_image_tensor, + make_segmentation_mask, + make_video, + make_video_tensor, + needs_cuda, + set_rng_seed, +) + +from torch import nn +from torch.testing import assert_close +from torch.utils._pytree import tree_map +from torch.utils.data import DataLoader, default_collate +from torchvision import tv_tensors + +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.functional import pil_modes_mapping +from torchvision.transforms.v2 import functional as F +from torchvision.transforms.v2.functional._utils import _get_kernel, _register_kernel_internal + + +@pytest.fixture(autouse=True) +def fix_rng_seed(): + set_rng_seed(0) + yield + + +def _to_tolerances(maybe_tolerance_dict): + if not isinstance(maybe_tolerance_dict, dict): + return dict(rtol=None, atol=None) + + tolerances = dict(rtol=0, atol=0) + tolerances.update(maybe_tolerance_dict) + return tolerances + + +def _check_kernel_cuda_vs_cpu(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel produces closes results for inputs on GPU and CPU.""" + if input.device.type != "cuda": + return + + input_cuda = input.as_subclass(torch.Tensor) + input_cpu = input_cuda.to("cpu") + + with freeze_rng_state(): + actual = kernel(input_cuda, *args, **kwargs) + with freeze_rng_state(): + expected = kernel(input_cpu, *args, **kwargs) + + assert_close(actual, expected, check_device=False, rtol=rtol, atol=atol) + + +@cache +def _script(obj): + try: + return torch.jit.script(obj) + except Exception as error: + name = getattr(obj, "__name__", obj.__class__.__name__) + raise AssertionError(f"Trying to `torch.jit.script` '{name}' raised the error above.") from error + + +def _check_kernel_scripted_vs_eager(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel is scriptable and if the scripted output is close to the eager one.""" + if input.device.type != "cpu": + return + + kernel_scripted = _script(kernel) + + input = input.as_subclass(torch.Tensor) + with 
ignore_jit_no_profile_information_warning(): + actual = kernel_scripted(input, *args, **kwargs) + expected = kernel(input, *args, **kwargs) + + assert_close(actual, expected, rtol=rtol, atol=atol) + + +def _check_kernel_batched_vs_unbatched(kernel, input, *args, rtol, atol, **kwargs): + """Checks if the kernel produces close results for batched and unbatched inputs.""" + unbatched_input = input.as_subclass(torch.Tensor) + + for batch_dims in [(2,), (2, 1)]: + repeats = [*batch_dims, *[1] * input.ndim] + + actual = kernel(unbatched_input.repeat(repeats), *args, **kwargs) + + expected = kernel(unbatched_input, *args, **kwargs) + # We can't directly call `.repeat()` on the output, since some kernel also return some additional metadata + if isinstance(expected, torch.Tensor): + expected = expected.repeat(repeats) + else: + tensor, *metadata = expected + expected = (tensor.repeat(repeats), *metadata) + + assert_close(actual, expected, rtol=rtol, atol=atol) + + for degenerate_batch_dims in [(0,), (5, 0), (0, 5)]: + degenerate_batched_input = torch.empty( + degenerate_batch_dims + input.shape, dtype=input.dtype, device=input.device + ) + + output = kernel(degenerate_batched_input, *args, **kwargs) + # Most kernels just return a tensor, but some also return some additional metadata + if not isinstance(output, torch.Tensor): + output, *_ = output + + assert output.shape[: -input.ndim] == degenerate_batch_dims + + +def check_kernel( + kernel, + input, + *args, + check_cuda_vs_cpu=True, + check_scripted_vs_eager=True, + check_batched_vs_unbatched=True, + expect_same_dtype=True, + **kwargs, +): + initial_input_version = input._version + + output = kernel(input.as_subclass(torch.Tensor), *args, **kwargs) + # Most kernels just return a tensor, but some also return some additional metadata + if not isinstance(output, torch.Tensor): + output, *_ = output + + # check that no inplace operation happened + assert input._version == initial_input_version + + if expect_same_dtype: + assert output.dtype == input.dtype + assert output.device == input.device + + if check_cuda_vs_cpu: + _check_kernel_cuda_vs_cpu(kernel, input, *args, **kwargs, **_to_tolerances(check_cuda_vs_cpu)) + + if check_scripted_vs_eager: + _check_kernel_scripted_vs_eager(kernel, input, *args, **kwargs, **_to_tolerances(check_scripted_vs_eager)) + + if check_batched_vs_unbatched: + _check_kernel_batched_vs_unbatched(kernel, input, *args, **kwargs, **_to_tolerances(check_batched_vs_unbatched)) + + +def _check_functional_scripted_smoke(functional, input, *args, **kwargs): + """Checks if the functional can be scripted and the scripted version can be called without error.""" + if not isinstance(input, tv_tensors.Image): + return + + functional_scripted = _script(functional) + with ignore_jit_no_profile_information_warning(): + functional_scripted(input.as_subclass(torch.Tensor), *args, **kwargs) + + +def check_functional(functional, input, *args, check_scripted_smoke=True, **kwargs): + unknown_input = object() + with pytest.raises(TypeError, match=re.escape(str(type(unknown_input)))): + functional(unknown_input, *args, **kwargs) + + with mock.patch("torch._C._log_api_usage_once", wraps=torch._C._log_api_usage_once) as spy: + output = functional(input, *args, **kwargs) + + spy.assert_any_call(f"{functional.__module__}.{functional.__name__}") + + assert isinstance(output, type(input)) + + if isinstance(input, tv_tensors.BoundingBoxes): + assert output.format == input.format + + if check_scripted_smoke: + _check_functional_scripted_smoke(functional, 
input, *args, **kwargs) + + +def check_functional_kernel_signature_match(functional, *, kernel, input_type): + """Checks if the signature of the functional matches the kernel signature.""" + functional_params = list(inspect.signature(functional).parameters.values())[1:] + kernel_params = list(inspect.signature(kernel).parameters.values())[1:] + + if issubclass(input_type, tv_tensors.TVTensor): + # We filter out metadata that is implicitly passed to the functional through the input tv_tensor, but has to be + # explicitly passed to the kernel. + explicit_metadata = { + tv_tensors.BoundingBoxes: {"format", "canvas_size"}, + } + kernel_params = [param for param in kernel_params if param.name not in explicit_metadata.get(input_type, set())] + + functional_params = iter(functional_params) + for functional_param, kernel_param in zip(functional_params, kernel_params): + try: + # In general, the functional parameters are a superset of the kernel parameters. Thus, we filter out + # functional parameters that have no kernel equivalent while keeping the order intact. + while functional_param.name != kernel_param.name: + functional_param = next(functional_params) + except StopIteration: + raise AssertionError( + f"Parameter `{kernel_param.name}` of kernel `{kernel.__name__}` " + f"has no corresponding parameter on the functional `{functional.__name__}`." + ) from None + + if issubclass(input_type, PIL.Image.Image): + # PIL kernels often have more correct annotations, since they are not limited by JIT. Thus, we don't check + # them in the first place. + functional_param._annotation = kernel_param._annotation = inspect.Parameter.empty + + assert functional_param == kernel_param + + +def _check_transform_v1_compatibility(transform, input, *, rtol, atol): + """If the transform defines the ``_v1_transform_cls`` attribute, checks if the transform has a public, static + ``get_params`` method that is the v1 equivalent, the output is close to v1, is scriptable, and the scripted version + can be called without error.""" + if type(input) is not torch.Tensor or isinstance(input, PIL.Image.Image): + return + + v1_transform_cls = transform._v1_transform_cls + if v1_transform_cls is None: + return + + if hasattr(v1_transform_cls, "get_params"): + assert type(transform).get_params is v1_transform_cls.get_params + + v1_transform = v1_transform_cls(**transform._extract_params_for_v1_transform()) + + with freeze_rng_state(): + output_v2 = transform(input) + + with freeze_rng_state(): + output_v1 = v1_transform(input) + + assert_close(output_v2, output_v1, rtol=rtol, atol=atol) + + if isinstance(input, PIL.Image.Image): + return + + _script(v1_transform)(input) + + +def check_transform(transform, input, check_v1_compatibility=True): + pickle.loads(pickle.dumps(transform)) + + output = transform(input) + assert isinstance(output, type(input)) + + if isinstance(input, tv_tensors.BoundingBoxes): + assert output.format == input.format + + if check_v1_compatibility: + _check_transform_v1_compatibility(transform, input, **_to_tolerances(check_v1_compatibility)) + + +def transform_cls_to_functional(transform_cls, **transform_specific_kwargs): + def wrapper(input, *args, **kwargs): + transform = transform_cls(*args, **transform_specific_kwargs, **kwargs) + return transform(input) + + wrapper.__name__ = transform_cls.__name__ + + return wrapper + + +def param_value_parametrization(**kwargs): + """Helper function to turn + + @pytest.mark.parametrize( + ("param", "value"), + ("a", 1), + ("a", 2), + ("a", 3), + ("b", -1.0) + ("b", 
1.0) + ) + + into + + @param_value_parametrization(a=[1, 2, 3], b=[-1.0, 1.0]) + """ + return pytest.mark.parametrize( + ("param", "value"), + [(param, value) for param, values in kwargs.items() for value in values], + ) + + +def adapt_fill(value, *, dtype): + """Adapt fill values in the range [0.0, 1.0] to the value range of the dtype""" + if value is None: + return value + + max_value = get_max_value(dtype) + value_type = float if dtype.is_floating_point else int + + if isinstance(value, (int, float)): + return value_type(value * max_value) + elif isinstance(value, (list, tuple)): + return type(value)(value_type(v * max_value) for v in value) + else: + raise ValueError(f"fill should be an int or float, or a list or tuple of the former, but got '{value}'.") + + +EXHAUSTIVE_TYPE_FILLS = [ + None, + 1, + 0.5, + [1], + [0.2], + (0,), + (0.7,), + [1, 0, 1], + [0.1, 0.2, 0.3], + (0, 1, 0), + (0.9, 0.234, 0.314), +] +CORRECTNESS_FILLS = [ + v for v in EXHAUSTIVE_TYPE_FILLS if v is None or isinstance(v, float) or (isinstance(v, list) and len(v) > 1) +] + + +# We cannot use `list(transforms.InterpolationMode)` here, since it includes some PIL-only ones as well +INTERPOLATION_MODES = [ + transforms.InterpolationMode.NEAREST, + transforms.InterpolationMode.NEAREST_EXACT, + transforms.InterpolationMode.BILINEAR, + transforms.InterpolationMode.BICUBIC, +] + + +@contextlib.contextmanager +def assert_warns_antialias_default_value(): + with pytest.warns(UserWarning, match="The default value of the antialias parameter of all the resizing transforms"): + yield + + +def reference_affine_bounding_boxes_helper(bounding_boxes, *, affine_matrix, new_canvas_size=None, clamp=True): + format = bounding_boxes.format + canvas_size = new_canvas_size or bounding_boxes.canvas_size + + def affine_bounding_boxes(bounding_boxes): + dtype = bounding_boxes.dtype + device = bounding_boxes.device + + # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 + input_xyxy = F.convert_bounding_box_format( + bounding_boxes.to(dtype=torch.float64, device="cpu", copy=True), + old_format=format, + new_format=tv_tensors.BoundingBoxFormat.XYXY, + inplace=True, + ) + x1, y1, x2, y2 = input_xyxy.squeeze(0).tolist() + + points = np.array( + [ + [x1, y1, 1.0], + [x2, y1, 1.0], + [x1, y2, 1.0], + [x2, y2, 1.0], + ] + ) + transformed_points = np.matmul(points, affine_matrix.astype(points.dtype).T) + + output_xyxy = torch.Tensor( + [ + float(np.min(transformed_points[:, 0])), + float(np.min(transformed_points[:, 1])), + float(np.max(transformed_points[:, 0])), + float(np.max(transformed_points[:, 1])), + ] + ) + + output = F.convert_bounding_box_format( + output_xyxy, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format + ) + + if clamp: + # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 + output = F.clamp_bounding_boxes( + output, + format=format, + canvas_size=canvas_size, + ) + else: + # We leave the bounding box as float64 so the caller gets the full precision to perform any additional + # operation + dtype = output.dtype + + return output.to(dtype=dtype, device=device) + + return tv_tensors.BoundingBoxes( + torch.cat([affine_bounding_boxes(b) for b in bounding_boxes.reshape(-1, 4).unbind()], dim=0).reshape( + bounding_boxes.shape + ), + format=format, + canvas_size=canvas_size, + ) + + +# turns all warnings into errors for this module +pytestmark = pytest.mark.filterwarnings("error") + + +class TestResize: + INPUT_SIZE = (17, 11) + OUTPUT_SIZES = 
[17, [17], (17,), [12, 13], (12, 13)] + + def _make_max_size_kwarg(self, *, use_max_size, size): + if use_max_size: + if not (isinstance(size, int) or len(size) == 1): + # This would result in an `ValueError` + return None + + max_size = (size if isinstance(size, int) else size[0]) + 1 + else: + max_size = None + + return dict(max_size=max_size) + + def _compute_output_size(self, *, input_size, size, max_size): + if not (isinstance(size, int) or len(size) == 1): + return tuple(size) + + if not isinstance(size, int): + size = size[0] + + old_height, old_width = input_size + ratio = old_width / old_height + if ratio > 1: + new_height = size + new_width = int(ratio * new_height) + else: + new_width = size + new_height = int(new_width / ratio) + + if max_size is not None and max(new_height, new_width) > max_size: + # Need to recompute the aspect ratio, since it might have changed due to rounding + ratio = new_width / new_height + if ratio > 1: + new_width = max_size + new_height = int(new_width / ratio) + else: + new_height = max_size + new_width = int(new_height * ratio) + + return new_height, new_width + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("antialias", [True, False]) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, size, interpolation, use_max_size, antialias, dtype, device): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + # In contrast to CPU, there is no native `InterpolationMode.BICUBIC` implementation for uint8 images on CUDA. + # Internally, it uses the float path. Thus, we need to test with an enormous tolerance here to account for that. 
+ atol = 30 if interpolation is transforms.InterpolationMode.BICUBIC and dtype is torch.uint8 else 1
+ check_cuda_vs_cpu_tolerances = dict(rtol=0, atol=atol / 255 if dtype.is_floating_point else atol)
+
+ check_kernel(
+ F.resize_image,
+ make_image(self.INPUT_SIZE, dtype=dtype, device=device),
+ size=size,
+ interpolation=interpolation,
+ **max_size_kwarg,
+ antialias=antialias,
+ check_cuda_vs_cpu=check_cuda_vs_cpu_tolerances,
+ check_scripted_vs_eager=not isinstance(size, int),
+ )
+
+ @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+ @pytest.mark.parametrize("size", OUTPUT_SIZES)
+ @pytest.mark.parametrize("use_max_size", [True, False])
+ @pytest.mark.parametrize("dtype", [torch.float32, torch.int64])
+ @pytest.mark.parametrize("device", cpu_and_cuda())
+ def test_kernel_bounding_boxes(self, format, size, use_max_size, dtype, device):
+ if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)):
+ return
+
+ bounding_boxes = make_bounding_boxes(
+ format=format,
+ canvas_size=self.INPUT_SIZE,
+ dtype=dtype,
+ device=device,
+ )
+ check_kernel(
+ F.resize_bounding_boxes,
+ bounding_boxes,
+ canvas_size=bounding_boxes.canvas_size,
+ size=size,
+ **max_size_kwarg,
+ check_scripted_vs_eager=not isinstance(size, int),
+ )
+
+ @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask])
+ def test_kernel_mask(self, make_mask):
+ check_kernel(F.resize_mask, make_mask(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1])
+
+ def test_kernel_video(self):
+ check_kernel(F.resize_video, make_video(self.INPUT_SIZE), size=self.OUTPUT_SIZES[-1], antialias=True)
+
+ @pytest.mark.parametrize("size", OUTPUT_SIZES)
+ @pytest.mark.parametrize(
+ "make_input",
+ [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video],
+ )
+ def test_functional(self, size, make_input):
+ check_functional(
+ F.resize,
+ make_input(self.INPUT_SIZE),
+ size=size,
+ antialias=True,
+ check_scripted_smoke=not isinstance(size, int),
+ )
+
+ @pytest.mark.parametrize(
+ ("kernel", "input_type"),
+ [
+ (F.resize_image, torch.Tensor),
+ (F._resize_image_pil, PIL.Image.Image),
+ (F.resize_image, tv_tensors.Image),
+ (F.resize_bounding_boxes, tv_tensors.BoundingBoxes),
+ (F.resize_mask, tv_tensors.Mask),
+ (F.resize_video, tv_tensors.Video),
+ ],
+ )
+ def test_functional_signature(self, kernel, input_type):
+ check_functional_kernel_signature_match(F.resize, kernel=kernel, input_type=input_type)
+
+ @pytest.mark.parametrize("size", OUTPUT_SIZES)
+ @pytest.mark.parametrize("device", cpu_and_cuda())
+ @pytest.mark.parametrize(
+ "make_input",
+ [
+ make_image_tensor,
+ make_image_pil,
+ make_image,
+ make_bounding_boxes,
+ make_segmentation_mask,
+ make_detection_mask,
+ make_video,
+ ],
+ )
+ def test_transform(self, size, device, make_input):
+ check_transform(
+ transforms.Resize(size=size, antialias=True),
+ make_input(self.INPUT_SIZE, device=device),
+ # atol=1 because Resize v2 uses the native uint8 interpolate path for bilinear and nearest modes
+ check_v1_compatibility=dict(rtol=0, atol=1),
+ )
+
+ def _check_output_size(self, input, output, *, size, max_size):
+ assert tuple(F.get_size(output)) == self._compute_output_size(
+ input_size=F.get_size(input), size=size, max_size=max_size
+ )
+
+ @pytest.mark.parametrize("size", OUTPUT_SIZES)
+ # `InterpolationMode.NEAREST` is modeled after the buggy `INTER_NEAREST` interpolation of CV2.
+ # The PIL equivalent of `InterpolationMode.NEAREST` is `InterpolationMode.NEAREST_EXACT` + @pytest.mark.parametrize("interpolation", set(INTERPOLATION_MODES) - {transforms.InterpolationMode.NEAREST}) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) + def test_image_correctness(self, size, interpolation, use_max_size, fn): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + image = make_image(self.INPUT_SIZE, dtype=torch.uint8) + + actual = fn(image, size=size, interpolation=interpolation, **max_size_kwarg, antialias=True) + expected = F.to_image(F.resize(F.to_pil_image(image), size=size, interpolation=interpolation, **max_size_kwarg)) + + self._check_output_size(image, actual, size=size, **max_size_kwarg) + torch.testing.assert_close(actual, expected, atol=1, rtol=0) + + def _reference_resize_bounding_boxes(self, bounding_boxes, *, size, max_size=None): + old_height, old_width = bounding_boxes.canvas_size + new_height, new_width = self._compute_output_size( + input_size=bounding_boxes.canvas_size, size=size, max_size=max_size + ) + + if (old_height, old_width) == (new_height, new_width): + return bounding_boxes + + affine_matrix = np.array( + [ + [new_width / old_width, 0, 0], + [0, new_height / old_height, 0], + ], + ) + + return reference_affine_bounding_boxes_helper( + bounding_boxes, + affine_matrix=affine_matrix, + new_canvas_size=(new_height, new_width), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize("use_max_size", [True, False]) + @pytest.mark.parametrize("fn", [F.resize, transform_cls_to_functional(transforms.Resize)]) + def test_bounding_boxes_correctness(self, format, size, use_max_size, fn): + if not (max_size_kwarg := self._make_max_size_kwarg(use_max_size=use_max_size, size=size)): + return + + bounding_boxes = make_bounding_boxes(format=format, canvas_size=self.INPUT_SIZE) + + actual = fn(bounding_boxes, size=size, **max_size_kwarg) + expected = self._reference_resize_bounding_boxes(bounding_boxes, size=size, **max_size_kwarg) + + self._check_output_size(bounding_boxes, actual, size=size, **max_size_kwarg) + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize("interpolation", set(transforms.InterpolationMode) - set(INTERPOLATION_MODES)) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_pil_interpolation_compat_smoke(self, interpolation, make_input): + input = make_input(self.INPUT_SIZE) + + with ( + contextlib.nullcontext() + if isinstance(input, PIL.Image.Image) + # This error is triggered in PyTorch core + else pytest.raises(NotImplementedError, match=f"got {interpolation.value.lower()}") + ): + F.resize( + input, + size=self.OUTPUT_SIZES[0], + interpolation=interpolation, + ) + + def test_functional_pil_antialias_warning(self): + with pytest.warns(UserWarning, match="Anti-alias option is always applied for PIL Image input"): + F.resize(make_image_pil(self.INPUT_SIZE), size=self.OUTPUT_SIZES[0], antialias=False) + + @pytest.mark.parametrize("size", OUTPUT_SIZES) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_mask, + make_video, + ], + ) + def test_max_size_error(self, size, make_input): + if 
isinstance(size, int) or len(size) == 1: + max_size = (size if isinstance(size, int) else size[0]) - 1 + match = "must be strictly greater than the requested size" + else: + # value can be anything other than None + max_size = -1 + match = "size should be an int or a sequence of length 1" + + with pytest.raises(ValueError, match=match): + F.resize(make_input(self.INPUT_SIZE), size=size, max_size=max_size, antialias=True) + + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_video], + ) + def test_antialias_warning(self, interpolation, make_input): + with ( + assert_warns_antialias_default_value() + if interpolation in {transforms.InterpolationMode.BILINEAR, transforms.InterpolationMode.BICUBIC} + else assert_no_warnings() + ): + F.resize( + make_input(self.INPUT_SIZE), + size=self.OUTPUT_SIZES[0], + interpolation=interpolation, + ) + + @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_interpolation_int(self, interpolation, make_input): + input = make_input(self.INPUT_SIZE) + + # `InterpolationMode.NEAREST_EXACT` has no proper corresponding integer equivalent. Internally, we map it to + # `0` to be the same as `InterpolationMode.NEAREST` for PIL. However, for the tensor backend there is a + # difference and thus we don't test it here. + if isinstance(input, torch.Tensor) and interpolation is transforms.InterpolationMode.NEAREST_EXACT: + return + + expected = F.resize(input, size=self.OUTPUT_SIZES[0], interpolation=interpolation, antialias=True) + actual = F.resize( + input, size=self.OUTPUT_SIZES[0], interpolation=pil_modes_mapping[interpolation], antialias=True + ) + + assert_equal(actual, expected) + + def test_transform_unknown_size_error(self): + with pytest.raises(ValueError, match="size can either be an integer or a list or tuple of one or two integers"): + transforms.Resize(size=object()) + + @pytest.mark.parametrize( + "size", [min(INPUT_SIZE), [min(INPUT_SIZE)], (min(INPUT_SIZE),), list(INPUT_SIZE), tuple(INPUT_SIZE)] + ) + @pytest.mark.parametrize( + "make_input", + [ + make_image_tensor, + make_image_pil, + make_image, + make_bounding_boxes, + make_segmentation_mask, + make_detection_mask, + make_video, + ], + ) + def test_noop(self, size, make_input): + input = make_input(self.INPUT_SIZE) + + output = F.resize(input, size=F.get_size(input), antialias=True) + + # This identity check is not a requirement. It is here to avoid breaking the behavior by accident. If there + # is a good reason to break this, feel free to downgrade to an equality check. + if isinstance(input, tv_tensors.TVTensor): + # We can't test identity directly, since that checks for the identity of the Python object. Since all + # tv_tensors unwrap before a kernel and wrap again afterwards, the Python object changes. 
Thus, we check
+ # that the underlying storage is the same
+ assert output.data_ptr() == input.data_ptr()
+ else:
+ assert output is input
+
+ @pytest.mark.parametrize(
+ "make_input",
+ [
+ make_image_tensor,
+ make_image_pil,
+ make_image,
+ make_bounding_boxes,
+ make_segmentation_mask,
+ make_detection_mask,
+ make_video,
+ ],
+ )
+ def test_no_regression_5405(self, make_input):
+ # Checks that `max_size` is not ignored if `size == small_edge_size`
+ # See https://github.com/pytorch/vision/issues/5405
+
+ input = make_input(self.INPUT_SIZE)
+
+ size = min(F.get_size(input))
+ max_size = size + 1
+ output = F.resize(input, size=size, max_size=max_size, antialias=True)
+
+ assert max(F.get_size(output)) == max_size
+
+ def _make_image(self, *args, batch_dims=(), memory_format=torch.contiguous_format, **kwargs):
+ # torch.channels_last memory_format is only available for 4D tensors, i.e. (B, C, H, W). However, images coming
+ # from PIL or our own I/O functions do not have a batch dimension and are thus 3D, i.e. (C, H, W). Still, the
+ # layout of the data in memory is channels last. To emulate this when a 3D input is requested here, we create
+ # the image as 4D and create a view with the right shape afterwards. With this the layout in memory is channels
+ # last, although PyTorch doesn't recognize it as such.
+ emulate_channels_last = memory_format is torch.channels_last and len(batch_dims) != 1
+
+ image = make_image(
+ *args,
+ batch_dims=(math.prod(batch_dims),) if emulate_channels_last else batch_dims,
+ memory_format=memory_format,
+ **kwargs,
+ )
+
+ if emulate_channels_last:
+ image = tv_tensors.wrap(image.view(*batch_dims, *image.shape[-3:]), like=image)
+
+ return image
+
+ def _check_stride(self, image, *, memory_format):
+ C, H, W = F.get_dimensions(image)
+ if memory_format is torch.contiguous_format:
+ expected_stride = (H * W, W, 1)
+ elif memory_format is torch.channels_last:
+ expected_stride = (1, W * C, C)
+ else:
+ raise ValueError(f"Unknown memory_format: {memory_format}")
+
+ assert image.stride() == expected_stride
+
+ # TODO: We can remove this test and the related torchvision workaround
+ # once we fix the related pytorch issue: https://github.com/pytorch/pytorch/issues/68430
+ @pytest.mark.parametrize("interpolation", INTERPOLATION_MODES)
+ @pytest.mark.parametrize("antialias", [True, False])
+ @pytest.mark.parametrize("memory_format", [torch.contiguous_format, torch.channels_last])
+ @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32])
+ @pytest.mark.parametrize("device", cpu_and_cuda())
+ def test_kernel_image_memory_format_consistency(self, interpolation, antialias, memory_format, dtype, device):
+ size = self.OUTPUT_SIZES[0]
+
+ input = self._make_image(self.INPUT_SIZE, dtype=dtype, device=device, memory_format=memory_format)
+
+ # Smoke test to make sure we aren't starting with wrong assumptions
+ self._check_stride(input, memory_format=memory_format)
+
+ output = F.resize_image(input, size=size, interpolation=interpolation, antialias=antialias)
+
+ self._check_stride(output, memory_format=memory_format)
+
+ def test_float16_no_rounding(self):
+ # Make sure Resize() doesn't round float16 images
+ # Non-regression test for https://github.com/pytorch/vision/issues/7667
+
+ input = make_image_tensor(self.INPUT_SIZE, dtype=torch.float16)
+ output = F.resize_image(input, size=self.OUTPUT_SIZES[0], antialias=True)
+
+ assert output.dtype is torch.float16
+ assert (output.round() - output).abs().sum() > 0
+
+
+class TestHorizontalFlip:
+
@pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.horizontal_flip_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + check_kernel( + F.horizontal_flip_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + check_kernel(F.horizontal_flip_mask, make_mask()) + + def test_kernel_video(self): + check_kernel(F.horizontal_flip_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.horizontal_flip, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.horizontal_flip_image, torch.Tensor), + (F._horizontal_flip_image_pil, PIL.Image.Image), + (F.horizontal_flip_image, tv_tensors.Image), + (F.horizontal_flip_bounding_boxes, tv_tensors.BoundingBoxes), + (F.horizontal_flip_mask, tv_tensors.Mask), + (F.horizontal_flip_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.horizontal_flip, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform(transforms.RandomHorizontalFlip(p=1), make_input(device=device)) + + @pytest.mark.parametrize( + "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] + ) + def test_image_correctness(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.horizontal_flip(F.to_pil_image(image))) + + torch.testing.assert_close(actual, expected) + + def _reference_horizontal_flip_bounding_boxes(self, bounding_boxes): + affine_matrix = np.array( + [ + [-1, 0, bounding_boxes.canvas_size[1]], + [0, 1, 0], + ], + ) + + return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize( + "fn", [F.horizontal_flip, transform_cls_to_functional(transforms.RandomHorizontalFlip, p=1)] + ) + def test_bounding_boxes_correctness(self, format, fn): + bounding_boxes = make_bounding_boxes(format=format) + + actual = fn(bounding_boxes) + expected = self._reference_horizontal_flip_bounding_boxes(bounding_boxes) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_noop(self, make_input, device): + input = make_input(device=device) + + transform = transforms.RandomHorizontalFlip(p=0) + + 
output = transform(input) + + assert_equal(output, input) + + +class TestAffine: + _EXHAUSTIVE_TYPE_AFFINE_KWARGS = dict( + # float, int + angle=[-10.9, 18], + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + translate=[[6.3, -0.6], [1, -3], (16.6, -6.6), (-2, 4)], + # float + scale=[0.5], + # float, int, + # one-list of float, one-list of int, one-tuple of float, one-tuple of int + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + shear=[35.6, 38, [-37.7], [-23], (5.3,), (-52,), [5.4, 21.8], [-47, 51], (-11.2, 36.7), (8, -53)], + # None + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + center=[None, [1.2, 4.9], [-3, 1], (2.5, -4.7), (3, 2)], + ) + # The special case for shear makes sure we pick a value that is supported while JIT scripting + _MINIMAL_AFFINE_KWARGS = { + k: vs[0] if k != "shear" else next(v for v in vs if isinstance(v, list)) + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + _CORRECTNESS_AFFINE_KWARGS = { + k: [v for v in vs if v is None or isinstance(v, float) or (isinstance(v, list) and len(v) > 1)] + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + + _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES = dict( + degrees=[30, (-15, 20)], + translate=[None, (0.5, 0.5)], + scale=[None, (0.75, 1.25)], + shear=[None, (12, 30, -17, 5), 10, (-5, 12)], + ) + _CORRECTNESS_TRANSFORM_AFFINE_RANGES = { + k: next(v for v in vs if v is not None) for k, vs in _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES.items() + } + + def _check_kernel(self, kernel, input, *args, **kwargs): + kwargs_ = self._MINIMAL_AFFINE_KWARGS.copy() + kwargs_.update(kwargs) + check_kernel(kernel, input, *args, **kwargs_) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], + shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + if param == "fill": + value = adapt_fill(value, dtype=dtype) + self._check_kernel( + F.affine_image, + make_image(dtype=dtype, device=device), + **{param: value}, + check_scripted_vs_eager=not (param in {"shear", "fill"} and isinstance(value, (int, float))), + check_cuda_vs_cpu=dict(atol=1, rtol=0) + if dtype is torch.uint8 and param == "interpolation" and value is transforms.InterpolationMode.BILINEAR + else True, + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + translate=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["translate"], + shear=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["shear"], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, param, value, format, dtype, device): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + self._check_kernel( + F.affine_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + **{param: value}, + check_scripted_vs_eager=not (param == "shear" and isinstance(value, (int, float))), + ) + + 
+    @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask])
+    def test_kernel_mask(self, make_mask):
+        self._check_kernel(F.affine_mask, make_mask())
+
+    def test_kernel_video(self):
+        self._check_kernel(F.affine_video, make_video())
+
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video],
+    )
+    def test_functional(self, make_input):
+        check_functional(F.affine, make_input(), **self._MINIMAL_AFFINE_KWARGS)
+
+    @pytest.mark.parametrize(
+        ("kernel", "input_type"),
+        [
+            (F.affine_image, torch.Tensor),
+            (F._affine_image_pil, PIL.Image.Image),
+            (F.affine_image, tv_tensors.Image),
+            (F.affine_bounding_boxes, tv_tensors.BoundingBoxes),
+            (F.affine_mask, tv_tensors.Mask),
+            (F.affine_video, tv_tensors.Video),
+        ],
+    )
+    def test_functional_signature(self, kernel, input_type):
+        check_functional_kernel_signature_match(F.affine, kernel=kernel, input_type=input_type)
+
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video],
+    )
+    @pytest.mark.parametrize("device", cpu_and_cuda())
+    def test_transform(self, make_input, device):
+        input = make_input(device=device)
+
+        check_transform(transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES), input)
+
+    @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"])
+    @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"])
+    @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"])
+    @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"])
+    @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"])
+    @pytest.mark.parametrize(
+        "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR]
+    )
+    @pytest.mark.parametrize("fill", CORRECTNESS_FILLS)
+    def test_functional_image_correctness(self, angle, translate, scale, shear, center, interpolation, fill):
+        image = make_image(dtype=torch.uint8, device="cpu")
+
+        fill = adapt_fill(fill, dtype=torch.uint8)
+
+        actual = F.affine(
+            image,
+            angle=angle,
+            translate=translate,
+            scale=scale,
+            shear=shear,
+            center=center,
+            interpolation=interpolation,
+            fill=fill,
+        )
+        expected = F.to_image(
+            F.affine(
+                F.to_pil_image(image),
+                angle=angle,
+                translate=translate,
+                scale=scale,
+                shear=shear,
+                center=center,
+                interpolation=interpolation,
+                fill=fill,
+            )
+        )
+
+        mae = (actual.float() - expected.float()).abs().mean()
+        # The tolerance is parenthesized so the assert compares mae against it for both
+        # interpolation modes instead of degenerating to a trivially true expression.
+        assert mae < (2 if interpolation is transforms.InterpolationMode.NEAREST else 8)
+
+    @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"])
+    @pytest.mark.parametrize(
+        "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR]
+    )
+    @pytest.mark.parametrize("fill", CORRECTNESS_FILLS)
+    @pytest.mark.parametrize("seed", list(range(5)))
+    def test_transform_image_correctness(self, center, interpolation, fill, seed):
+        image = make_image(dtype=torch.uint8, device="cpu")
+
+        fill = adapt_fill(fill, dtype=torch.uint8)
+
+        transform = transforms.RandomAffine(
+            **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center, interpolation=interpolation, fill=fill
+        )
+
+        torch.manual_seed(seed)
+        actual = transform(image)
+
+        torch.manual_seed(seed)
+        expected = F.to_image(transform(F.to_pil_image(image)))
+
+        mae = (actual.float() - expected.float()).abs().mean()
+        assert mae < (2 if interpolation is transforms.InterpolationMode.NEAREST else 8)
+
+    def _compute_affine_matrix(self, *, angle, translate, scale, shear, center):
+        rot = math.radians(angle)
+        cx, cy = center
+        tx, ty = translate
+        sx, sy = [math.radians(s) for s in ([shear, 0.0] if isinstance(shear, (int, float)) else shear)]
+
+        c_matrix = np.array([[1, 0, cx], [0, 1, cy], [0, 0, 1]])
+        t_matrix = np.array([[1, 0, tx], [0, 1, ty], [0, 0, 1]])
+        c_matrix_inv = np.linalg.inv(c_matrix)
+        rs_matrix = np.array(
+            [
+                [scale * math.cos(rot), -scale * math.sin(rot), 0],
+                [scale * math.sin(rot), scale * math.cos(rot), 0],
+                [0, 0, 1],
+            ]
+        )
+        shear_x_matrix = np.array([[1, -math.tan(sx), 0], [0, 1, 0], [0, 0, 1]])
+        shear_y_matrix = np.array([[1, 0, 0], [-math.tan(sy), 1, 0], [0, 0, 1]])
+        rss_matrix = np.matmul(rs_matrix, np.matmul(shear_y_matrix, shear_x_matrix))
+        true_matrix = np.matmul(t_matrix, np.matmul(c_matrix, np.matmul(rss_matrix, c_matrix_inv)))
+        return true_matrix[:2, :]
+
+    def _reference_affine_bounding_boxes(self, bounding_boxes, *, angle, translate, scale, shear, center):
+        if center is None:
+            center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]]
+
+        return reference_affine_bounding_boxes_helper(
+            bounding_boxes,
+            affine_matrix=self._compute_affine_matrix(
+                angle=angle, translate=translate, scale=scale, shear=shear, center=center
+            ),
+        )
+
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"])
+    @pytest.mark.parametrize("translate", _CORRECTNESS_AFFINE_KWARGS["translate"])
+    @pytest.mark.parametrize("scale", _CORRECTNESS_AFFINE_KWARGS["scale"])
+    @pytest.mark.parametrize("shear", _CORRECTNESS_AFFINE_KWARGS["shear"])
+    @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"])
+    def test_functional_bounding_boxes_correctness(self, format, angle, translate, scale, shear, center):
+        bounding_boxes = make_bounding_boxes(format=format)
+
+        actual = F.affine(
+            bounding_boxes,
+            angle=angle,
+            translate=translate,
+            scale=scale,
+            shear=shear,
+            center=center,
+        )
+        expected = self._reference_affine_bounding_boxes(
+            bounding_boxes,
+            angle=angle,
+            translate=translate,
+            scale=scale,
+            shear=shear,
+            center=center,
+        )
+
+        torch.testing.assert_close(actual, expected)
+
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"])
+    @pytest.mark.parametrize("seed", list(range(5)))
+    def test_transform_bounding_boxes_correctness(self, format, center, seed):
+        bounding_boxes = make_bounding_boxes(format=format)
+
+        transform = transforms.RandomAffine(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, center=center)
+
+        torch.manual_seed(seed)
+        params = transform._get_params([bounding_boxes])
+
+        torch.manual_seed(seed)
+        actual = transform(bounding_boxes)
+
+        expected = self._reference_affine_bounding_boxes(bounding_boxes, **params, center=center)
+
+        torch.testing.assert_close(actual, expected)
+
+    @pytest.mark.parametrize("degrees", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["degrees"])
+    @pytest.mark.parametrize("translate", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["translate"])
+    @pytest.mark.parametrize("scale", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["scale"])
+    @pytest.mark.parametrize("shear", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["shear"])
+    @pytest.mark.parametrize("seed", list(range(10)))
+    def test_transform_get_params_bounds(self, degrees, translate, scale, shear, seed):
+        image = make_image()
+        height, width = 
F.get_size(image) + + transform = transforms.RandomAffine(degrees=degrees, translate=translate, scale=scale, shear=shear) + + torch.manual_seed(seed) + params = transform._get_params([image]) + + if isinstance(degrees, (int, float)): + assert -degrees <= params["angle"] <= degrees + else: + assert degrees[0] <= params["angle"] <= degrees[1] + + if translate is not None: + width_max = int(round(translate[0] * width)) + height_max = int(round(translate[1] * height)) + assert -width_max <= params["translate"][0] <= width_max + assert -height_max <= params["translate"][1] <= height_max + else: + assert params["translate"] == (0, 0) + + if scale is not None: + assert scale[0] <= params["scale"] <= scale[1] + else: + assert params["scale"] == 1.0 + + if shear is not None: + if isinstance(shear, (int, float)): + assert -shear <= params["shear"][0] <= shear + assert params["shear"][1] == 0.0 + elif len(shear) == 2: + assert shear[0] <= params["shear"][0] <= shear[1] + assert params["shear"][1] == 0.0 + elif len(shear) == 4: + assert shear[0] <= params["shear"][0] <= shear[1] + assert shear[2] <= params["shear"][1] <= shear[3] + else: + assert params["shear"] == (0, 0) + + @pytest.mark.parametrize("param", ["degrees", "translate", "scale", "shear", "center"]) + @pytest.mark.parametrize("value", [0, [0], [0, 0, 0]]) + def test_transform_sequence_len_errors(self, param, value): + if param in {"degrees", "shear"} and not isinstance(value, list): + return + + kwargs = {param: value} + if param != "degrees": + kwargs["degrees"] = 0 + + with pytest.raises( + ValueError if isinstance(value, list) else TypeError, match=f"{param} should be a sequence of length 2" + ): + transforms.RandomAffine(**kwargs) + + def test_transform_negative_degrees_error(self): + with pytest.raises(ValueError, match="If degrees is a single number, it must be positive"): + transforms.RandomAffine(degrees=-1) + + @pytest.mark.parametrize("translate", [[-1, 0], [2, 0], [-1, 2]]) + def test_transform_translate_range_error(self, translate): + with pytest.raises(ValueError, match="translation values should be between 0 and 1"): + transforms.RandomAffine(degrees=0, translate=translate) + + @pytest.mark.parametrize("scale", [[-1, 0], [0, -1], [-1, -1]]) + def test_transform_scale_range_error(self, scale): + with pytest.raises(ValueError, match="scale values should be positive"): + transforms.RandomAffine(degrees=0, scale=scale) + + def test_transform_negative_shear_error(self): + with pytest.raises(ValueError, match="If shear is a single number, it must be positive"): + transforms.RandomAffine(degrees=0, shear=-1) + + def test_transform_unknown_fill_error(self): + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomAffine(degrees=0, fill="fill") + + +class TestVerticalFlip: + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): + check_kernel(F.vertical_flip_image, make_image(dtype=dtype, device=device)) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + check_kernel( + F.vertical_flip_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + ) + + 
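+    # Editor's note (hedged): _reference_vertical_flip_bounding_boxes below uses the affine matrix
+    #     [[1, 0, 0], [0, -1, H]]   with H = canvas height,
+    # i.e. a vertical flip maps a point (x, y) to (x, H - y). It mirrors the horizontal flip
+    # reference earlier in this file, whose matrix [[-1, 0, W], [0, 1, 0]] maps x to W - x.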
@pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + check_kernel(F.vertical_flip_mask, make_mask()) + + def test_kernel_video(self): + check_kernel(F.vertical_flip_video, make_video()) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.vertical_flip, make_input()) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.vertical_flip_image, torch.Tensor), + (F._vertical_flip_image_pil, PIL.Image.Image), + (F.vertical_flip_image, tv_tensors.Image), + (F.vertical_flip_bounding_boxes, tv_tensors.BoundingBoxes), + (F.vertical_flip_mask, tv_tensors.Mask), + (F.vertical_flip_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.vertical_flip, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform(transforms.RandomVerticalFlip(p=1), make_input(device=device)) + + @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) + def test_image_correctness(self, fn): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = fn(image) + expected = F.to_image(F.vertical_flip(F.to_pil_image(image))) + + torch.testing.assert_close(actual, expected) + + def _reference_vertical_flip_bounding_boxes(self, bounding_boxes): + affine_matrix = np.array( + [ + [1, 0, 0], + [0, -1, bounding_boxes.canvas_size[0]], + ], + ) + + return reference_affine_bounding_boxes_helper(bounding_boxes, affine_matrix=affine_matrix) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("fn", [F.vertical_flip, transform_cls_to_functional(transforms.RandomVerticalFlip, p=1)]) + def test_bounding_boxes_correctness(self, format, fn): + bounding_boxes = make_bounding_boxes(format=format) + + actual = fn(bounding_boxes) + expected = self._reference_vertical_flip_bounding_boxes(bounding_boxes) + + torch.testing.assert_close(actual, expected) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform_noop(self, make_input, device): + input = make_input(device=device) + + transform = transforms.RandomVerticalFlip(p=0) + + output = transform(input) + + assert_equal(output, input) + + +class TestRotate: + _EXHAUSTIVE_TYPE_AFFINE_KWARGS = dict( + # float, int + angle=[-10.9, 18], + # None + # two-list of float, two-list of int, two-tuple of float, two-tuple of int + center=[None, [1.2, 4.9], [-3, 1], (2.5, -4.7), (3, 2)], + ) + _MINIMAL_AFFINE_KWARGS = {k: vs[0] for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items()} + _CORRECTNESS_AFFINE_KWARGS = { + k: [v for v in vs if v is None or isinstance(v, float) or isinstance(v, list)] + for k, vs in _EXHAUSTIVE_TYPE_AFFINE_KWARGS.items() + } + + _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES = dict( + degrees=[30, (-15, 20)], + ) + _CORRECTNESS_TRANSFORM_AFFINE_RANGES = {k: vs[0] for k, vs in _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES.items()} + 
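+    # Editor's note (hedged reading of the reference helpers below): with a = cos(angle) and
+    # b = sin(angle), _reference_rotate_bounding_boxes applies
+    #     x' = cx + a * (x - cx) + b * (y - cy)
+    #     y' = cy - b * (x - cx) + a * (y - cy)
+    # i.e. a rotation about the given (default: canvas) center; with expand=True the result is
+    # then shifted into the enlarged canvas before clamping.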
+ @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + expand=[False, True], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + kwargs = {param: value} + if param != "angle": + kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] + check_kernel( + F.rotate_image, + make_image(dtype=dtype, device=device), + **kwargs, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + ) + + @param_value_parametrization( + angle=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["angle"], + expand=[False, True], + center=_EXHAUSTIVE_TYPE_AFFINE_KWARGS["center"], + ) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, param, value, format, dtype, device): + kwargs = {param: value} + if param != "angle": + kwargs["angle"] = self._MINIMAL_AFFINE_KWARGS["angle"] + + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + check_kernel( + F.rotate_bounding_boxes, + bounding_boxes, + format=format, + canvas_size=bounding_boxes.canvas_size, + **kwargs, + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + check_kernel(F.rotate_mask, make_mask(), **self._MINIMAL_AFFINE_KWARGS) + + def test_kernel_video(self): + check_kernel(F.rotate_video, make_video(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + check_functional(F.rotate, make_input(), **self._MINIMAL_AFFINE_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.rotate_image, torch.Tensor), + (F._rotate_image_pil, PIL.Image.Image), + (F.rotate_image, tv_tensors.Image), + (F.rotate_bounding_boxes, tv_tensors.BoundingBoxes), + (F.rotate_mask, tv_tensors.Mask), + (F.rotate_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.rotate, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform( + transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES), make_input(device=device) + ) + + @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize( + "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR] + ) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("fill", CORRECTNESS_FILLS) + def test_functional_image_correctness(self, angle, center, interpolation, expand, fill): + image = make_image(dtype=torch.uint8, device="cpu") + + fill = adapt_fill(fill, dtype=torch.uint8) + + actual = F.rotate(image, 
angle=angle, center=center, interpolation=interpolation, expand=expand, fill=fill)
+        expected = F.to_image(
+            F.rotate(
+                F.to_pil_image(image), angle=angle, center=center, interpolation=interpolation, expand=expand, fill=fill
+            )
+        )
+
+        mae = (actual.float() - expected.float()).abs().mean()
+        # Parenthesized so the assert compares mae against the tolerance for both interpolation modes.
+        assert mae < (1 if interpolation is transforms.InterpolationMode.NEAREST else 6)
+
+    @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"])
+    @pytest.mark.parametrize(
+        "interpolation", [transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR]
+    )
+    @pytest.mark.parametrize("expand", [False, True])
+    @pytest.mark.parametrize("fill", CORRECTNESS_FILLS)
+    @pytest.mark.parametrize("seed", list(range(5)))
+    def test_transform_image_correctness(self, center, interpolation, expand, fill, seed):
+        image = make_image(dtype=torch.uint8, device="cpu")
+
+        fill = adapt_fill(fill, dtype=torch.uint8)
+
+        transform = transforms.RandomRotation(
+            **self._CORRECTNESS_TRANSFORM_AFFINE_RANGES,
+            center=center,
+            interpolation=interpolation,
+            expand=expand,
+            fill=fill,
+        )
+
+        torch.manual_seed(seed)
+        actual = transform(image)
+
+        torch.manual_seed(seed)
+        expected = F.to_image(transform(F.to_pil_image(image)))
+
+        mae = (actual.float() - expected.float()).abs().mean()
+        assert mae < (1 if interpolation is transforms.InterpolationMode.NEAREST else 6)
+
+    def _compute_output_canvas_size(self, *, expand, canvas_size, affine_matrix):
+        if not expand:
+            return canvas_size, (0.0, 0.0)
+
+        input_height, input_width = canvas_size
+
+        input_image_frame = np.array(
+            [
+                [0.0, 0.0, 1.0],
+                [0.0, input_height, 1.0],
+                [input_width, input_height, 1.0],
+                [input_width, 0.0, 1.0],
+            ],
+            dtype=np.float64,
+        )
+        output_image_frame = np.matmul(input_image_frame, affine_matrix.astype(input_image_frame.dtype).T)
+
+        recenter_x = float(np.min(output_image_frame[:, 0]))
+        recenter_y = float(np.min(output_image_frame[:, 1]))
+
+        output_width = int(np.max(output_image_frame[:, 0]) - recenter_x)
+        output_height = int(np.max(output_image_frame[:, 1]) - recenter_y)
+
+        return (output_height, output_width), (recenter_x, recenter_y)
+
+    def _recenter_bounding_boxes_after_expand(self, bounding_boxes, *, recenter_xy):
+        x, y = recenter_xy
+        if bounding_boxes.format is tv_tensors.BoundingBoxFormat.XYXY:
+            translate = [x, y, x, y]
+        else:
+            translate = [x, y, 0.0, 0.0]
+        return tv_tensors.wrap(
+            (bounding_boxes.to(torch.float64) - torch.tensor(translate)).to(bounding_boxes.dtype), like=bounding_boxes
+        )
+
+    def _reference_rotate_bounding_boxes(self, bounding_boxes, *, angle, expand, center):
+        if center is None:
+            center = [s * 0.5 for s in bounding_boxes.canvas_size[::-1]]
+        cx, cy = center
+
+        a = np.cos(angle * np.pi / 180.0)
+        b = np.sin(angle * np.pi / 180.0)
+        affine_matrix = np.array(
+            [
+                [a, b, cx - cx * a - b * cy],
+                [-b, a, cy + cx * b - a * cy],
+            ],
+        )
+
+        new_canvas_size, recenter_xy = self._compute_output_canvas_size(
+            expand=expand, canvas_size=bounding_boxes.canvas_size, affine_matrix=affine_matrix
+        )
+
+        output = reference_affine_bounding_boxes_helper(
+            bounding_boxes,
+            affine_matrix=affine_matrix,
+            new_canvas_size=new_canvas_size,
+            clamp=False,
+        )
+
+        return F.clamp_bounding_boxes(self._recenter_bounding_boxes_after_expand(output, recenter_xy=recenter_xy)).to(
+            bounding_boxes
+        )
+
+    @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat))
+    @pytest.mark.parametrize("angle", _CORRECTNESS_AFFINE_KWARGS["angle"])
+    @pytest.mark.parametrize("expand", [False, True])
@pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + def test_functional_bounding_boxes_correctness(self, format, angle, expand, center): + bounding_boxes = make_bounding_boxes(format=format) + + actual = F.rotate(bounding_boxes, angle=angle, expand=expand, center=center) + expected = self._reference_rotate_bounding_boxes(bounding_boxes, angle=angle, expand=expand, center=center) + + torch.testing.assert_close(actual, expected) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("expand", [False, True]) + @pytest.mark.parametrize("center", _CORRECTNESS_AFFINE_KWARGS["center"]) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, format, expand, center, seed): + bounding_boxes = make_bounding_boxes(format=format) + + transform = transforms.RandomRotation(**self._CORRECTNESS_TRANSFORM_AFFINE_RANGES, expand=expand, center=center) + + torch.manual_seed(seed) + params = transform._get_params([bounding_boxes]) + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_rotate_bounding_boxes(bounding_boxes, **params, expand=expand, center=center) + + torch.testing.assert_close(actual, expected) + torch.testing.assert_close(F.get_size(actual), F.get_size(expected), atol=2 if expand else 0, rtol=0) + + @pytest.mark.parametrize("degrees", _EXHAUSTIVE_TYPE_TRANSFORM_AFFINE_RANGES["degrees"]) + @pytest.mark.parametrize("seed", list(range(10))) + def test_transform_get_params_bounds(self, degrees, seed): + transform = transforms.RandomRotation(degrees=degrees) + + torch.manual_seed(seed) + params = transform._get_params([]) + + if isinstance(degrees, (int, float)): + assert -degrees <= params["angle"] <= degrees + else: + assert degrees[0] <= params["angle"] <= degrees[1] + + @pytest.mark.parametrize("param", ["degrees", "center"]) + @pytest.mark.parametrize("value", [0, [0], [0, 0, 0]]) + def test_transform_sequence_len_errors(self, param, value): + if param == "degrees" and not isinstance(value, list): + return + + kwargs = {param: value} + if param != "degrees": + kwargs["degrees"] = 0 + + with pytest.raises( + ValueError if isinstance(value, list) else TypeError, match=f"{param} should be a sequence of length 2" + ): + transforms.RandomRotation(**kwargs) + + def test_transform_negative_degrees_error(self): + with pytest.raises(ValueError, match="If degrees is a single number, it must be positive"): + transforms.RandomAffine(degrees=-1) + + def test_transform_unknown_fill_error(self): + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomAffine(degrees=0, fill="fill") + + +class TestCompose: + class BuiltinTransform(transforms.Transform): + def _transform(self, inpt, params): + return inpt + + class PackedInputTransform(nn.Module): + def forward(self, sample): + assert len(sample) == 2 + return sample + + class UnpackedInputTransform(nn.Module): + def forward(self, image, label): + return image, label + + @pytest.mark.parametrize( + "transform_clss", + [ + [BuiltinTransform], + [PackedInputTransform], + [UnpackedInputTransform], + [BuiltinTransform, BuiltinTransform], + [PackedInputTransform, PackedInputTransform], + [UnpackedInputTransform, UnpackedInputTransform], + [BuiltinTransform, PackedInputTransform, BuiltinTransform], + [BuiltinTransform, UnpackedInputTransform, BuiltinTransform], + 
[PackedInputTransform, BuiltinTransform, PackedInputTransform], + [UnpackedInputTransform, BuiltinTransform, UnpackedInputTransform], + ], + ) + @pytest.mark.parametrize("unpack", [True, False]) + def test_packed_unpacked(self, transform_clss, unpack): + needs_packed_inputs = any(issubclass(cls, self.PackedInputTransform) for cls in transform_clss) + needs_unpacked_inputs = any(issubclass(cls, self.UnpackedInputTransform) for cls in transform_clss) + assert not (needs_packed_inputs and needs_unpacked_inputs) + + transform = transforms.Compose([cls() for cls in transform_clss]) + + image = make_image() + label = 3 + packed_input = (image, label) + + def call_transform(): + if unpack: + return transform(*packed_input) + else: + return transform(packed_input) + + if needs_unpacked_inputs and not unpack: + with pytest.raises(TypeError, match="missing 1 required positional argument"): + call_transform() + elif needs_packed_inputs and unpack: + with pytest.raises(TypeError, match="takes 2 positional arguments but 3 were given"): + call_transform() + else: + output = call_transform() + + assert isinstance(output, tuple) and len(output) == 2 + assert output[0] is image + assert output[1] is label + + +class TestToDtype: + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.to_dtype_image, make_image_tensor), + (F.to_dtype_image, make_image), + (F.to_dtype_video, make_video), + ], + ) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_kernel(self, kernel, make_input, input_dtype, output_dtype, device, scale): + check_kernel( + kernel, + make_input(dtype=input_dtype, device=device), + expect_same_dtype=input_dtype is output_dtype, + dtype=output_dtype, + scale=scale, + ) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image, make_video]) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_functional(self, make_input, input_dtype, output_dtype, device, scale): + check_functional( + F.to_dtype, + make_input(dtype=input_dtype, device=device), + dtype=output_dtype, + scale=scale, + ) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + @pytest.mark.parametrize("as_dict", (True, False)) + def test_transform(self, make_input, input_dtype, output_dtype, device, scale, as_dict): + input = make_input(dtype=input_dtype, device=device) + if as_dict: + output_dtype = {type(input): output_dtype} + check_transform(transforms.ToDtype(dtype=output_dtype, scale=scale), input) + + def reference_convert_dtype_image_tensor(self, image, dtype=torch.float, scale=False): + input_dtype = image.dtype + output_dtype = dtype + + if not scale: + return image.to(dtype) + + if output_dtype == input_dtype: + return image + + def fn(value): + if 
input_dtype.is_floating_point: + if output_dtype.is_floating_point: + return value + else: + return round(decimal.Decimal(value) * torch.iinfo(output_dtype).max) + else: + input_max_value = torch.iinfo(input_dtype).max + + if output_dtype.is_floating_point: + return float(decimal.Decimal(value) / input_max_value) + else: + output_max_value = torch.iinfo(output_dtype).max + + if input_max_value > output_max_value: + factor = (input_max_value + 1) // (output_max_value + 1) + return value / factor + else: + factor = (output_max_value + 1) // (input_max_value + 1) + return value * factor + + return torch.tensor(tree_map(fn, image.tolist()), dtype=dtype, device=image.device) + + @pytest.mark.parametrize("input_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("output_dtype", [torch.float32, torch.float64, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("scale", (True, False)) + def test_image_correctness(self, input_dtype, output_dtype, device, scale): + if input_dtype.is_floating_point and output_dtype == torch.int64: + pytest.xfail("float to int64 conversion is not supported") + + input = make_image(dtype=input_dtype, device=device) + + out = F.to_dtype(input, dtype=output_dtype, scale=scale) + expected = self.reference_convert_dtype_image_tensor(input, dtype=output_dtype, scale=scale) + + if input_dtype.is_floating_point and not output_dtype.is_floating_point and scale: + torch.testing.assert_close(out, expected, atol=1, rtol=0) + else: + torch.testing.assert_close(out, expected) + + def was_scaled(self, inpt): + # this assumes the target dtype is float + return inpt.max() <= 1 + + def make_inpt_with_bbox_and_mask(self, make_input): + H, W = 10, 10 + inpt_dtype = torch.uint8 + bbox_dtype = torch.float32 + mask_dtype = torch.bool + sample = { + "inpt": make_input(size=(H, W), dtype=inpt_dtype), + "bbox": make_bounding_boxes(canvas_size=(H, W), dtype=bbox_dtype), + "mask": make_detection_mask(size=(H, W), dtype=mask_dtype), + } + + return sample, inpt_dtype, bbox_dtype, mask_dtype + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + @pytest.mark.parametrize("scale", (True, False)) + def test_dtype_not_a_dict(self, make_input, scale): + # assert only inpt gets transformed when dtype isn't a dict + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype(dtype=torch.float32, scale=scale)(sample) + + assert out["inpt"].dtype != inpt_dtype + assert out["inpt"].dtype == torch.float32 + if scale: + assert self.was_scaled(out["inpt"]) + else: + assert not self.was_scaled(out["inpt"]) + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype == mask_dtype + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_others_catch_all_and_none(self, make_input): + # make sure "others" works as a catch-all and that None means no conversion + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype(dtype={tv_tensors.Mask: torch.int64, "others": None})(sample) + assert out["inpt"].dtype == inpt_dtype + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype != mask_dtype + assert out["mask"].dtype == torch.int64 + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_typical_use_case(self, make_input): + # Typical use-case: want to convert dtype and scale for inpt and just 
dtype for masks. + # This just makes sure we now have a decent API for this + + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + out = transforms.ToDtype( + dtype={type(sample["inpt"]): torch.float32, tv_tensors.Mask: torch.int64, "others": None}, scale=True + )(sample) + assert out["inpt"].dtype != inpt_dtype + assert out["inpt"].dtype == torch.float32 + assert self.was_scaled(out["inpt"]) + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype != mask_dtype + assert out["mask"].dtype == torch.int64 + + @pytest.mark.parametrize("make_input", (make_image_tensor, make_image, make_video)) + def test_errors_warnings(self, make_input): + sample, inpt_dtype, bbox_dtype, mask_dtype = self.make_inpt_with_bbox_and_mask(make_input) + + with pytest.raises(ValueError, match="No dtype was specified for"): + out = transforms.ToDtype(dtype={tv_tensors.Mask: torch.float32})(sample) + with pytest.warns(UserWarning, match=re.escape("plain `torch.Tensor` will *not* be transformed")): + transforms.ToDtype(dtype={torch.Tensor: torch.float32, tv_tensors.Image: torch.float32}) + with pytest.warns(UserWarning, match="no scaling will be done"): + out = transforms.ToDtype(dtype={"others": None}, scale=True)(sample) + assert out["inpt"].dtype == inpt_dtype + assert out["bbox"].dtype == bbox_dtype + assert out["mask"].dtype == mask_dtype + + +class TestAdjustBrightness: + _CORRECTNESS_BRIGHTNESS_FACTORS = [0.5, 0.0, 1.0, 5.0] + _DEFAULT_BRIGHTNESS_FACTOR = _CORRECTNESS_BRIGHTNESS_FACTORS[0] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.adjust_brightness_image, make_image), + (F.adjust_brightness_video, make_video), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, kernel, make_input, dtype, device): + check_kernel(kernel, make_input(dtype=dtype, device=device), brightness_factor=self._DEFAULT_BRIGHTNESS_FACTOR) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.adjust_brightness, make_input(), brightness_factor=self._DEFAULT_BRIGHTNESS_FACTOR) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.adjust_brightness_image, torch.Tensor), + (F._adjust_brightness_image_pil, PIL.Image.Image), + (F.adjust_brightness_image, tv_tensors.Image), + (F.adjust_brightness_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.adjust_brightness, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("brightness_factor", _CORRECTNESS_BRIGHTNESS_FACTORS) + def test_image_correctness(self, brightness_factor): + image = make_image(dtype=torch.uint8, device="cpu") + + actual = F.adjust_brightness(image, brightness_factor=brightness_factor) + expected = F.to_image(F.adjust_brightness(F.to_pil_image(image), brightness_factor=brightness_factor)) + + torch.testing.assert_close(actual, expected) + + +class TestCutMixMixUp: + class DummyDataset: + def __init__(self, size, num_classes): + self.size = size + self.num_classes = num_classes + assert size < num_classes + + def __getitem__(self, idx): + img = torch.rand(3, 100, 100) + label = idx # This ensures all labels in a batch are unique and makes testing easier + return img, label + + def __len__(self): + return self.size + + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + 
def test_supported_input_structure(self, T): + + batch_size = 32 + num_classes = 100 + + dataset = self.DummyDataset(size=batch_size, num_classes=num_classes) + + cutmix_mixup = T(num_classes=num_classes) + + dl = DataLoader(dataset, batch_size=batch_size) + + # Input sanity checks + img, target = next(iter(dl)) + input_img_size = img.shape[-3:] + assert isinstance(img, torch.Tensor) and isinstance(target, torch.Tensor) + assert target.shape == (batch_size,) + + def check_output(img, target): + assert img.shape == (batch_size, *input_img_size) + assert target.shape == (batch_size, num_classes) + torch.testing.assert_close(target.sum(axis=-1), torch.ones(batch_size)) + num_non_zero_labels = (target != 0).sum(axis=-1) + assert (num_non_zero_labels == 2).all() + + # After Dataloader, as unpacked input + img, target = next(iter(dl)) + assert target.shape == (batch_size,) + img, target = cutmix_mixup(img, target) + check_output(img, target) + + # After Dataloader, as packed input + packed_from_dl = next(iter(dl)) + assert isinstance(packed_from_dl, list) + img, target = cutmix_mixup(packed_from_dl) + check_output(img, target) + + # As collation function. We expect default_collate to be used by users. + def collate_fn_1(batch): + return cutmix_mixup(default_collate(batch)) + + def collate_fn_2(batch): + return cutmix_mixup(*default_collate(batch)) + + for collate_fn in (collate_fn_1, collate_fn_2): + dl = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn) + img, target = next(iter(dl)) + check_output(img, target) + + @needs_cuda + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + def test_cpu_vs_gpu(self, T): + num_classes = 10 + batch_size = 3 + H, W = 12, 12 + + imgs = torch.rand(batch_size, 3, H, W) + labels = torch.randint(0, num_classes, (batch_size,)) + cutmix_mixup = T(alpha=0.5, num_classes=num_classes) + + _check_kernel_cuda_vs_cpu(cutmix_mixup, imgs, labels, rtol=None, atol=None) + + @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp]) + def test_error(self, T): + + num_classes = 10 + batch_size = 9 + + imgs = torch.rand(batch_size, 3, 12, 12) + cutmix_mixup = T(alpha=0.5, num_classes=num_classes) + + for input_with_bad_type in ( + F.to_pil_image(imgs[0]), + tv_tensors.Mask(torch.rand(12, 12)), + tv_tensors.BoundingBoxes(torch.rand(2, 4), format="XYXY", canvas_size=12), + ): + with pytest.raises(ValueError, match="does not support PIL images, "): + cutmix_mixup(input_with_bad_type) + + with pytest.raises(ValueError, match="Could not infer where the labels are"): + cutmix_mixup({"img": imgs, "Nothing_else": 3}) + + with pytest.raises(ValueError, match="labels tensor should be of shape"): + # Note: the error message isn't ideal, but that's because the label heuristic found the img as the label + # It's OK, it's an edge-case. 
The important thing is that this fails loudly instead of passing silently + cutmix_mixup(imgs) + + with pytest.raises(ValueError, match="When using the default labels_getter"): + cutmix_mixup(imgs, "not_a_tensor") + + with pytest.raises(ValueError, match="labels tensor should be of shape"): + cutmix_mixup(imgs, torch.randint(0, 2, size=(2, 3))) + + with pytest.raises(ValueError, match="Expected a batched input with 4 dims"): + cutmix_mixup(imgs[None, None], torch.randint(0, num_classes, size=(batch_size,))) + + with pytest.raises(ValueError, match="does not match the batch size of the labels"): + cutmix_mixup(imgs, torch.randint(0, num_classes, size=(batch_size + 1,))) + + with pytest.raises(ValueError, match="labels tensor should be of shape"): + # The purpose of this check is more about documenting the current + # behaviour of what happens on a Compose(), rather than actually + # asserting the expected behaviour. We may support Compose() in the + # future, e.g. for 2 consecutive CutMix? + labels = torch.randint(0, num_classes, size=(batch_size,)) + transforms.Compose([cutmix_mixup, cutmix_mixup])(imgs, labels) + + +@pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT")) +@pytest.mark.parametrize("sample_type", (tuple, list, dict)) +def test_labels_getter_default_heuristic(key, sample_type): + labels = torch.arange(10) + sample = {key: labels, "another_key": "whatever"} + if sample_type is not dict: + sample = sample_type((None, sample, "whatever_again")) + assert transforms._utils._find_labels_default_heuristic(sample) is labels + + if key.lower() != "labels": + # If "labels" is in the dict (case-insensitive), + # it takes precedence over other keys which would otherwise be a match + d = {key: "something_else", "labels": labels} + assert transforms._utils._find_labels_default_heuristic(d) is labels + + +class TestShapeGetters: + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_dimensions_image, make_image_tensor), + (F._get_dimensions_image_pil, make_image_pil), + (F.get_dimensions_image, make_image), + (F.get_dimensions_video, make_video), + ], + ) + def test_get_dimensions(self, kernel, make_input): + size = (10, 10) + color_space, num_channels = "RGB", 3 + + input = make_input(size, color_space=color_space) + + assert kernel(input) == F.get_dimensions(input) == [num_channels, *size] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_num_channels_image, make_image_tensor), + (F._get_num_channels_image_pil, make_image_pil), + (F.get_num_channels_image, make_image), + (F.get_num_channels_video, make_video), + ], + ) + def test_get_num_channels(self, kernel, make_input): + color_space, num_channels = "RGB", 3 + + input = make_input(color_space=color_space) + + assert kernel(input) == F.get_num_channels(input) == num_channels + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_size_image, make_image_tensor), + (F._get_size_image_pil, make_image_pil), + (F.get_size_image, make_image), + (F.get_size_bounding_boxes, make_bounding_boxes), + (F.get_size_mask, make_detection_mask), + (F.get_size_mask, make_segmentation_mask), + (F.get_size_video, make_video), + ], + ) + def test_get_size(self, kernel, make_input): + size = (10, 10) + + input = make_input(size) + + assert kernel(input) == F.get_size(input) == list(size) + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.get_num_frames_video, make_video_tensor), + (F.get_num_frames_video, make_video), + ], + ) + def 
test_get_num_frames(self, kernel, make_input):
+        num_frames = 4
+
+        input = make_input(num_frames=num_frames)
+
+        assert kernel(input) == F.get_num_frames(input) == num_frames
+
+    @pytest.mark.parametrize(
+        ("functional", "make_input"),
+        [
+            (F.get_dimensions, make_bounding_boxes),
+            (F.get_dimensions, make_detection_mask),
+            (F.get_dimensions, make_segmentation_mask),
+            (F.get_num_channels, make_bounding_boxes),
+            (F.get_num_channels, make_detection_mask),
+            (F.get_num_channels, make_segmentation_mask),
+            (F.get_num_frames, make_image_pil),
+            (F.get_num_frames, make_image),
+            (F.get_num_frames, make_bounding_boxes),
+            (F.get_num_frames, make_detection_mask),
+            (F.get_num_frames, make_segmentation_mask),
+        ],
+    )
+    def test_unsupported_types(self, functional, make_input):
+        input = make_input()
+
+        with pytest.raises(TypeError, match=re.escape(str(type(input)))):
+            functional(input)
+
+
+class TestRegisterKernel:
+    @pytest.mark.parametrize("functional", (F.resize, "resize"))
+    def test_register_kernel(self, functional):
+        class CustomTVTensor(tv_tensors.TVTensor):
+            pass
+
+        kernel_was_called = False
+
+        @F.register_kernel(functional, CustomTVTensor)
+        def new_resize(dp, *args, **kwargs):
+            nonlocal kernel_was_called
+            kernel_was_called = True
+            return dp
+
+        t = transforms.Resize(size=(224, 224), antialias=True)
+
+        my_dp = CustomTVTensor(torch.rand(3, 10, 10))
+        out = t(my_dp)
+        assert out is my_dp
+        assert kernel_was_called
+
+        # Sanity check to make sure we didn't override the kernel of other types
+        assert t(torch.rand(3, 10, 10)).shape == (3, 224, 224)
+        assert t(tv_tensors.Image(torch.rand(3, 10, 10))).shape == (3, 224, 224)
+
+    def test_errors(self):
+        with pytest.raises(ValueError, match="Could not find functional with name"):
+            F.register_kernel("bad_name", tv_tensors.Image)
+
+        with pytest.raises(ValueError, match="Kernels can only be registered on functionals"):
+            F.register_kernel(tv_tensors.Image, F.resize)
+
+        with pytest.raises(ValueError, match="Kernels can only be registered for subclasses"):
+            F.register_kernel(F.resize, object)
+
+        with pytest.raises(ValueError, match="cannot be registered for the builtin tv_tensor classes"):
+            F.register_kernel(F.resize, tv_tensors.Image)(F.resize_image)
+
+        class CustomTVTensor(tv_tensors.TVTensor):
+            pass
+
+        def resize_custom_tv_tensor():
+            pass
+
+        F.register_kernel(F.resize, CustomTVTensor)(resize_custom_tv_tensor)
+
+        with pytest.raises(ValueError, match="already has a kernel registered for type"):
+            F.register_kernel(F.resize, CustomTVTensor)(resize_custom_tv_tensor)
+
+
+class TestGetKernel:
+    # We are using F.resize as functional and the kernels below as proxy. Any other functional / kernels combination
+    # would also be fine
+    KERNELS = {
+        torch.Tensor: F.resize_image,
+        PIL.Image.Image: F._resize_image_pil,
+        tv_tensors.Image: F.resize_image,
+        tv_tensors.BoundingBoxes: F.resize_bounding_boxes,
+        tv_tensors.Mask: F.resize_mask,
+        tv_tensors.Video: F.resize_video,
+    }
+
+    @pytest.mark.parametrize("input_type", [str, int, object])
+    def test_unsupported_types(self, input_type):
+        with pytest.raises(TypeError, match="supports inputs of type"):
+            _get_kernel(F.resize, input_type)
+
+    def test_exact_match(self):
+        # We cannot use F.resize together with the self.KERNELS mapping directly here, since this is only the
+        # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional
+        # here, register the kernels without wrapper, and check the exact matching afterwards.
+ def resize_with_pure_kernels(): + pass + + for input_type, kernel in self.KERNELS.items(): + _register_kernel_internal(resize_with_pure_kernels, input_type, tv_tensor_wrapper=False)(kernel) + + assert _get_kernel(resize_with_pure_kernels, input_type) is kernel + + def test_builtin_tv_tensor_subclass(self): + # We cannot use F.resize together with self.KERNELS mapping here directly here, since this is only the + # ideal wrapping. Practically, we have an intermediate wrapper layer. Thus, we create a new resize functional + # here, register the kernels without wrapper, and check if subclasses of our builtin tv_tensors get dispatched + # to the kernel of the corresponding superclass + def resize_with_pure_kernels(): + pass + + class MyImage(tv_tensors.Image): + pass + + class MyBoundingBoxes(tv_tensors.BoundingBoxes): + pass + + class MyMask(tv_tensors.Mask): + pass + + class MyVideo(tv_tensors.Video): + pass + + for custom_tv_tensor_subclass in [ + MyImage, + MyBoundingBoxes, + MyMask, + MyVideo, + ]: + builtin_tv_tensor_class = custom_tv_tensor_subclass.__mro__[1] + builtin_tv_tensor_kernel = self.KERNELS[builtin_tv_tensor_class] + _register_kernel_internal(resize_with_pure_kernels, builtin_tv_tensor_class, tv_tensor_wrapper=False)( + builtin_tv_tensor_kernel + ) + + assert _get_kernel(resize_with_pure_kernels, custom_tv_tensor_subclass) is builtin_tv_tensor_kernel + + def test_tv_tensor_subclass(self): + class MyTVTensor(tv_tensors.TVTensor): + pass + + with pytest.raises(TypeError, match="supports inputs of type"): + _get_kernel(F.resize, MyTVTensor) + + def resize_my_tv_tensor(): + pass + + _register_kernel_internal(F.resize, MyTVTensor, tv_tensor_wrapper=False)(resize_my_tv_tensor) + + assert _get_kernel(F.resize, MyTVTensor) is resize_my_tv_tensor + + def test_pil_image_subclass(self): + opened_image = PIL.Image.open(Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") + loaded_image = opened_image.convert("RGB") + + # check the assumptions + assert isinstance(opened_image, PIL.Image.Image) + assert type(opened_image) is not PIL.Image.Image + + assert type(loaded_image) is PIL.Image.Image + + size = [17, 11] + for image in [opened_image, loaded_image]: + kernel = _get_kernel(F.resize, type(image)) + + output = kernel(image, size=size) + + assert F.get_size(output) == size + + +class TestPermuteChannels: + _DEFAULT_PERMUTATION = [2, 0, 1] + + @pytest.mark.parametrize( + ("kernel", "make_input"), + [ + (F.permute_channels_image, make_image_tensor), + # FIXME + # check_kernel does not support PIL kernel, but it should + (F.permute_channels_image, make_image), + (F.permute_channels_video, make_video), + ], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel(self, kernel, make_input, dtype, device): + check_kernel(kernel, make_input(dtype=dtype, device=device), permutation=self._DEFAULT_PERMUTATION) + + @pytest.mark.parametrize("make_input", [make_image_tensor, make_image_pil, make_image, make_video]) + def test_functional(self, make_input): + check_functional(F.permute_channels, make_input(), permutation=self._DEFAULT_PERMUTATION) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.permute_channels_image, torch.Tensor), + (F._permute_channels_image_pil, PIL.Image.Image), + (F.permute_channels_image, tv_tensors.Image), + (F.permute_channels_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + 
check_functional_kernel_signature_match(F.permute_channels, kernel=kernel, input_type=input_type) + + def reference_image_correctness(self, image, permutation): + channel_images = image.split(1, dim=-3) + permuted_channel_images = [channel_images[channel_idx] for channel_idx in permutation] + return tv_tensors.Image(torch.concat(permuted_channel_images, dim=-3)) + + @pytest.mark.parametrize("permutation", [[2, 0, 1], [1, 2, 0], [2, 0, 1], [0, 1, 2]]) + @pytest.mark.parametrize("batch_dims", [(), (2,), (2, 1)]) + def test_image_correctness(self, permutation, batch_dims): + image = make_image(batch_dims=batch_dims) + + actual = F.permute_channels(image, permutation=permutation) + expected = self.reference_image_correctness(image, permutation=permutation) + + torch.testing.assert_close(actual, expected) + + +class TestElastic: + def _make_displacement(self, inpt): + return torch.rand( + 1, + *F.get_size(inpt), + 2, + dtype=torch.float32, + device=inpt.device if isinstance(inpt, torch.Tensor) else "cpu", + ) + + @param_value_parametrization( + interpolation=[transforms.InterpolationMode.NEAREST, transforms.InterpolationMode.BILINEAR], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, param, value, dtype, device): + image = make_image_tensor(dtype=dtype, device=device) + + check_kernel( + F.elastic_image, + image, + displacement=self._make_displacement(image), + **{param: value}, + check_scripted_vs_eager=not (param == "fill" and isinstance(value, (int, float))), + ) + + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_boxes(self, format, dtype, device): + bounding_boxes = make_bounding_boxes(format=format, dtype=dtype, device=device) + + check_kernel( + F.elastic_bounding_boxes, + bounding_boxes, + format=bounding_boxes.format, + canvas_size=bounding_boxes.canvas_size, + displacement=self._make_displacement(bounding_boxes), + ) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + mask = make_mask() + check_kernel(F.elastic_mask, mask, displacement=self._make_displacement(mask)) + + def test_kernel_video(self): + video = make_video() + check_kernel(F.elastic_video, video, displacement=self._make_displacement(video)) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_functional(self, make_input): + input = make_input() + check_functional(F.elastic, input, displacement=self._make_displacement(input)) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.elastic_image, torch.Tensor), + (F._elastic_image_pil, PIL.Image.Image), + (F.elastic_image, tv_tensors.Image), + (F.elastic_bounding_boxes, tv_tensors.BoundingBoxes), + (F.elastic_mask, tv_tensors.Mask), + (F.elastic_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.elastic, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_displacement_error(self, make_input): + input = make_input() + + with 
pytest.raises(TypeError, match="displacement should be a Tensor"): + F.elastic(input, displacement=None) + + with pytest.raises(ValueError, match="displacement shape should be"): + F.elastic(input, displacement=torch.rand(F.get_size(input))) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + # ElasticTransform needs larger images to avoid the needed internal padding being larger than the actual image + @pytest.mark.parametrize("size", [(163, 163), (72, 333), (313, 95)]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, size, device): + check_transform( + transforms.ElasticTransform(), + make_input(size, device=device), + # We updated gaussian blur kernel generation with a faster and numerically more stable version + check_v1_compatibility=dict(rtol=0, atol=1), + ) + + +class TestToPureTensor: + def test_correctness(self): + input = { + "img": make_image(), + "img_tensor": make_image_tensor(), + "img_pil": make_image_pil(), + "mask": make_detection_mask(), + "video": make_video(), + "bbox": make_bounding_boxes(), + "str": "str", + } + + out = transforms.ToPureTensor()(input) + + for input_value, out_value in zip(input.values(), out.values()): + if isinstance(input_value, tv_tensors.TVTensor): + assert isinstance(out_value, torch.Tensor) and not isinstance(out_value, tv_tensors.TVTensor) + else: + assert isinstance(out_value, type(input_value)) + + +class TestCrop: + INPUT_SIZE = (21, 11) + + CORRECTNESS_CROP_KWARGS = [ + # center + dict(top=5, left=5, height=10, width=5), + # larger than input, i.e. pad + dict(top=-5, left=-5, height=30, width=20), + # sides: left, right, top, bottom + dict(top=-5, left=-5, height=30, width=10), + dict(top=-5, left=5, height=30, width=10), + dict(top=-5, left=-5, height=20, width=20), + dict(top=5, left=-5, height=20, width=20), + # corners: top-left, top-right, bottom-left, bottom-right + dict(top=-5, left=-5, height=20, width=10), + dict(top=-5, left=5, height=20, width=10), + dict(top=5, left=-5, height=20, width=10), + dict(top=5, left=5, height=20, width=10), + ] + MINIMAL_CROP_KWARGS = CORRECTNESS_CROP_KWARGS[0] + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("dtype", [torch.uint8, torch.float32]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, kwargs, dtype, device): + check_kernel(F.crop_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **kwargs) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_bounding_box(self, kwargs, format, dtype, device): + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + check_kernel(F.crop_bounding_boxes, bounding_boxes, format=format, **kwargs) + + @pytest.mark.parametrize("make_mask", [make_segmentation_mask, make_detection_mask]) + def test_kernel_mask(self, make_mask): + check_kernel(F.crop_mask, make_mask(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + def test_kernel_video(self): + check_kernel(F.crop_video, make_video(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, 
make_video], + ) + def test_functional(self, make_input): + check_functional(F.crop, make_input(self.INPUT_SIZE), **self.MINIMAL_CROP_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.crop_image, torch.Tensor), + (F._crop_image_pil, PIL.Image.Image), + (F.crop_image, tv_tensors.Image), + (F.crop_bounding_boxes, tv_tensors.BoundingBoxes), + (F.crop_mask, tv_tensors.Mask), + (F.crop_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.crop, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + def test_functional_image_correctness(self, kwargs): + image = make_image(self.INPUT_SIZE, dtype=torch.uint8, device="cpu") + + actual = F.crop(image, **kwargs) + expected = F.to_image(F.crop(F.to_pil_image(image), **kwargs)) + + assert_equal(actual, expected) + + @param_value_parametrization( + size=[(10, 5), (25, 15), (25, 5), (10, 15)], + fill=EXHAUSTIVE_TYPE_FILLS, + ) + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + def test_transform(self, param, value, make_input): + input = make_input(self.INPUT_SIZE) + + if param == "fill": + if isinstance(input, tv_tensors.Mask) and isinstance(value, (tuple, list)): + pytest.skip("F.pad_mask doesn't support non-scalar fill.") + + kwargs = dict( + # 1. size is required + # 2. the fill parameter only has an affect if we need padding + size=[s + 4 for s in self.INPUT_SIZE], + fill=adapt_fill(value, dtype=input.dtype if isinstance(input, torch.Tensor) else torch.uint8), + ) + else: + kwargs = {param: value} + + check_transform( + transforms.RandomCrop(**kwargs, pad_if_needed=True), + input, + check_v1_compatibility=param != "fill" or isinstance(value, (int, float)), + ) + + @pytest.mark.parametrize("padding", [1, (1, 1), (1, 1, 1, 1)]) + def test_transform_padding(self, padding): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s + 2 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, padding=padding) + + output = transform(inpt) + + assert F.get_size(output) == output_size + + @pytest.mark.parametrize("padding", [None, 1, (1, 1), (1, 1, 1, 1)]) + def test_transform_insufficient_padding(self, padding): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s + 3 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, padding=padding) + + with pytest.raises(ValueError, match="larger than (padded )?input image size"): + transform(inpt) + + def test_transform_pad_if_needed(self): + inpt = make_image(self.INPUT_SIZE) + + output_size = [s * 2 for s in F.get_size(inpt)] + transform = transforms.RandomCrop(output_size, pad_if_needed=True) + + output = transform(inpt) + + assert F.get_size(output) == output_size + + @param_value_parametrization( + size=[(10, 5), (25, 15), (25, 5), (10, 15)], + fill=CORRECTNESS_FILLS, + padding_mode=["constant", "edge", "reflect", "symmetric"], + ) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, param, value, seed): + kwargs = {param: value} + if param != "size": + # 1. size is required + # 2. 
the fill / padding_mode parameters only have an affect if we need padding + kwargs["size"] = [s + 4 for s in self.INPUT_SIZE] + if param == "fill": + kwargs["fill"] = adapt_fill(kwargs["fill"], dtype=torch.uint8) + + transform = transforms.RandomCrop(pad_if_needed=True, **kwargs) + + image = make_image(self.INPUT_SIZE) + + with freeze_rng_state(): + torch.manual_seed(seed) + actual = transform(image) + + torch.manual_seed(seed) + expected = F.to_image(transform(F.to_pil_image(image))) + + assert_equal(actual, expected) + + def _reference_crop_bounding_boxes(self, bounding_boxes, *, top, left, height, width): + affine_matrix = np.array( + [ + [1, 0, -left], + [0, 1, -top], + ], + ) + return reference_affine_bounding_boxes_helper( + bounding_boxes, affine_matrix=affine_matrix, new_canvas_size=(height, width) + ) + + @pytest.mark.parametrize("kwargs", CORRECTNESS_CROP_KWARGS) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_bounding_box_correctness(self, kwargs, format, dtype, device): + bounding_boxes = make_bounding_boxes(self.INPUT_SIZE, format=format, dtype=dtype, device=device) + + actual = F.crop(bounding_boxes, **kwargs) + expected = self._reference_crop_bounding_boxes(bounding_boxes, **kwargs) + + assert_equal(actual, expected, atol=1, rtol=0) + assert_equal(F.get_size(actual), F.get_size(expected)) + + @pytest.mark.parametrize("output_size", [(17, 11), (11, 17), (11, 11)]) + @pytest.mark.parametrize("format", list(tv_tensors.BoundingBoxFormat)) + @pytest.mark.parametrize("dtype", [torch.float32, torch.int64]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_bounding_boxes_correctness(self, output_size, format, dtype, device, seed): + input_size = [s * 2 for s in output_size] + bounding_boxes = make_bounding_boxes(input_size, format=format, dtype=dtype, device=device) + + transform = transforms.RandomCrop(output_size) + + with freeze_rng_state(): + torch.manual_seed(seed) + params = transform._get_params([bounding_boxes]) + assert not params.pop("needs_pad") + del params["padding"] + assert params.pop("needs_crop") + + torch.manual_seed(seed) + actual = transform(bounding_boxes) + + expected = self._reference_crop_bounding_boxes(bounding_boxes, **params) + + assert_equal(actual, expected) + assert_equal(F.get_size(actual), F.get_size(expected)) + + def test_errors(self): + with pytest.raises(ValueError, match="Please provide only two dimensions"): + transforms.RandomCrop([10, 12, 14]) + + with pytest.raises(TypeError, match="Got inappropriate padding arg"): + transforms.RandomCrop([10, 12], padding="abc") + + with pytest.raises(ValueError, match="Padding must be an int or a 1, 2, or 4"): + transforms.RandomCrop([10, 12], padding=[-0.7, 0, 0.7]) + + with pytest.raises(TypeError, match="Got inappropriate fill arg"): + transforms.RandomCrop([10, 12], padding=1, fill="abc") + + with pytest.raises(ValueError, match="Padding mode should be either"): + transforms.RandomCrop([10, 12], padding=1, padding_mode="abc") + + +class TestErase: + INPUT_SIZE = (17, 11) + FUNCTIONAL_KWARGS = dict( + zip("ijhwv", [2, 2, 10, 8, torch.tensor(0.0, dtype=torch.float32, device="cpu").reshape(-1, 1, 1)]) + ) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image(self, dtype, device): 
+ check_kernel(F.erase_image, make_image(self.INPUT_SIZE, dtype=dtype, device=device), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_kernel_image_inplace(self, dtype, device): + input = make_image(self.INPUT_SIZE, dtype=dtype, device=device) + input_version = input._version + + output_out_of_place = F.erase_image(input, **self.FUNCTIONAL_KWARGS) + assert output_out_of_place.data_ptr() != input.data_ptr() + assert output_out_of_place is not input + + output_inplace = F.erase_image(input, **self.FUNCTIONAL_KWARGS, inplace=True) + assert output_inplace.data_ptr() == input.data_ptr() + assert output_inplace._version > input_version + assert output_inplace is input + + assert_equal(output_inplace, output_out_of_place) + + def test_kernel_video(self): + check_kernel(F.erase_video, make_video(self.INPUT_SIZE), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + def test_functional(self, make_input): + check_functional(F.erase, make_input(), **self.FUNCTIONAL_KWARGS) + + @pytest.mark.parametrize( + ("kernel", "input_type"), + [ + (F.erase_image, torch.Tensor), + (F._erase_image_pil, PIL.Image.Image), + (F.erase_image, tv_tensors.Image), + (F.erase_video, tv_tensors.Video), + ], + ) + def test_functional_signature(self, kernel, input_type): + check_functional_kernel_signature_match(F.erase, kernel=kernel, input_type=input_type) + + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_transform(self, make_input, device): + check_transform(transforms.RandomErasing(p=1), make_input(device=device)) + + def _reference_erase_image(self, image, *, i, j, h, w, v): + mask = torch.zeros_like(image, dtype=torch.bool) + mask[..., i : i + h, j : j + w] = True + + # The broadcasting and type casting logic is handled automagically in the kernel through indexing + value = torch.broadcast_to(v, (*image.shape[:-2], h, w)).to(image) + + erased_image = torch.empty_like(image) + erased_image[mask] = value.flatten() + erased_image[~mask] = image[~mask] + + return erased_image + + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + def test_functional_image_correctness(self, dtype, device): + image = make_image(dtype=dtype, device=device) + + actual = F.erase(image, **self.FUNCTIONAL_KWARGS) + expected = self._reference_erase_image(image, **self.FUNCTIONAL_KWARGS) + + assert_equal(actual, expected) + + @param_value_parametrization( + scale=[(0.1, 0.2), [0.0, 1.0]], + ratio=[(0.3, 0.7), [0.1, 5.0]], + value=[0, 0.5, (0, 1, 0), [-0.2, 0.0, 1.3], "random"], + ) + @pytest.mark.parametrize("dtype", [torch.float32, torch.uint8]) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("seed", list(range(5))) + def test_transform_image_correctness(self, param, value, dtype, device, seed): + transform = transforms.RandomErasing(**{param: value}, p=1) + + image = make_image(dtype=dtype, device=device) + + with freeze_rng_state(): + torch.manual_seed(seed) + # This emulates the random apply check that happens before _get_params is called + torch.rand(1) + params = transform._get_params([image]) + + torch.manual_seed(seed) + actual = transform(image) + + expected = self._reference_erase_image(image, **params) + + 
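+ # Since the transform was re-seeded with the same seed, it samples the same erase box as
+ # `params` above, so the reference computed from those params should match the output exactly.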
assert_equal(actual, expected) + + def test_transform_errors(self): + with pytest.raises(TypeError, match="Argument value should be either a number or str or a sequence"): + transforms.RandomErasing(value={}) + + with pytest.raises(ValueError, match="If value is str, it should be 'random'"): + transforms.RandomErasing(value="abc") + + with pytest.raises(TypeError, match="Scale should be a sequence"): + transforms.RandomErasing(scale=123) + + with pytest.raises(TypeError, match="Ratio should be a sequence"): + transforms.RandomErasing(ratio=123) + + with pytest.raises(ValueError, match="Scale should be between 0 and 1"): + transforms.RandomErasing(scale=[-1, 2]) + + transform = transforms.RandomErasing(value=[1, 2, 3, 4]) + + with pytest.raises(ValueError, match="If value is a sequence, it should have either a single value"): + transform._get_params([make_image()]) + + @pytest.mark.parametrize("make_input", [make_bounding_boxes, make_detection_mask]) + def test_transform_passthrough(self, make_input): + transform = transforms.RandomErasing(p=1) + + input = make_input(self.INPUT_SIZE) + + with pytest.warns(UserWarning, match="currently passing through inputs of type"): + # RandomErasing requires an image or video to be present + _, output = transform(make_image(self.INPUT_SIZE), input) + + assert output is input + + +class TestGaussianBlur: + @pytest.mark.parametrize( + "make_input", + [make_image_tensor, make_image_pil, make_image, make_bounding_boxes, make_segmentation_mask, make_video], + ) + @pytest.mark.parametrize("device", cpu_and_cuda()) + @pytest.mark.parametrize("sigma", [5, (0.5, 2)]) + def test_transform(self, make_input, device, sigma): + check_transform(transforms.GaussianBlur(kernel_size=3, sigma=sigma), make_input(device=device)) + + def test_assertions(self): + with pytest.raises(ValueError, match="Kernel size should be a tuple/list of two integers"): + transforms.GaussianBlur([10, 12, 14]) + + with pytest.raises(ValueError, match="Kernel size value should be an odd and positive number"): + transforms.GaussianBlur(4) + + with pytest.raises(ValueError, match="If sigma is a sequence its length should be 1 or 2. 
Got 3"): + transforms.GaussianBlur(3, sigma=[1, 2, 3]) + + with pytest.raises(ValueError, match="sigma values should be positive and of the form"): + transforms.GaussianBlur(3, sigma=-1.0) + + with pytest.raises(ValueError, match="sigma values should be positive and of the form"): + transforms.GaussianBlur(3, sigma=[2.0, 1.0]) + + with pytest.raises(TypeError, match="sigma should be a number or a sequence of numbers"): + transforms.GaussianBlur(3, sigma={}) + + @pytest.mark.parametrize("sigma", [10.0, [10.0, 12.0], (10, 12.0), [10]]) + def test__get_params(self, sigma): + transform = transforms.GaussianBlur(3, sigma=sigma) + params = transform._get_params([]) + + if isinstance(sigma, float): + assert params["sigma"][0] == params["sigma"][1] == sigma + elif isinstance(sigma, list) and len(sigma) == 1: + assert params["sigma"][0] == params["sigma"][1] == sigma[0] + else: + assert sigma[0] <= params["sigma"][0] <= sigma[1] + assert sigma[0] <= params["sigma"][1] <= sigma[1] diff --git a/test/test_transforms_v2_utils.py b/test/test_transforms_v2_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..246e31485f3c1ba29b193a565af9c7327dc08754 --- /dev/null +++ b/test/test_transforms_v2_utils.py @@ -0,0 +1,92 @@ +import PIL.Image +import pytest + +import torch + +import torchvision.transforms.v2._utils +from common_utils import DEFAULT_SIZE, make_bounding_boxes, make_detection_mask, make_image + +from torchvision import tv_tensors +from torchvision.transforms.v2._utils import has_all, has_any +from torchvision.transforms.v2.functional import to_pil_image + + +IMAGE = make_image(DEFAULT_SIZE, color_space="RGB") +BOUNDING_BOX = make_bounding_boxes(DEFAULT_SIZE, format=tv_tensors.BoundingBoxFormat.XYXY) +MASK = make_detection_mask(DEFAULT_SIZE) + + +@pytest.mark.parametrize( + ("sample", "types", "expected"), + [ + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Image,), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.BoundingBoxes,), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Mask,), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.BoundingBoxes, tv_tensors.Mask), True), + ((MASK,), (tv_tensors.Image, tv_tensors.BoundingBoxes), False), + ((BOUNDING_BOX,), (tv_tensors.Image, tv_tensors.Mask), False), + ((IMAGE,), (tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ( + (IMAGE, BOUNDING_BOX, MASK), + (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), + True, + ), + ((), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ((IMAGE, BOUNDING_BOX, MASK), (lambda obj: isinstance(obj, tv_tensors.Image),), True), + ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), + ((IMAGE, BOUNDING_BOX, MASK), (lambda _: True,), True), + ((IMAGE,), (tv_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), True), + ( + (torch.Tensor(IMAGE),), + (tv_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), + True, + ), + ( + (to_pil_image(IMAGE),), + (tv_tensors.Image, PIL.Image.Image, torchvision.transforms.v2._utils.is_pure_tensor), + True, + ), + ], +) +def test_has_any(sample, types, expected): + assert has_any(sample, *types) is expected + + +@pytest.mark.parametrize( + ("sample", "types", "expected"), + [ + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Image,), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.BoundingBoxes,), True), + 
((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Mask,), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.Mask), True), + ((IMAGE, BOUNDING_BOX, MASK), (tv_tensors.BoundingBoxes, tv_tensors.Mask), True), + ( + (IMAGE, BOUNDING_BOX, MASK), + (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), + True, + ), + ((BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes), False), + ((BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.Mask), False), + ((IMAGE, MASK), (tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ( + (IMAGE, BOUNDING_BOX, MASK), + (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), + True, + ), + ((BOUNDING_BOX, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ((IMAGE, MASK), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ((IMAGE, BOUNDING_BOX), (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask), False), + ( + (IMAGE, BOUNDING_BOX, MASK), + (lambda obj: isinstance(obj, (tv_tensors.Image, tv_tensors.BoundingBoxes, tv_tensors.Mask)),), + True, + ), + ((IMAGE, BOUNDING_BOX, MASK), (lambda _: False,), False), + ((IMAGE, BOUNDING_BOX, MASK), (lambda _: True,), True), + ], +) +def test_has_all(sample, types, expected): + assert has_all(sample, *types) is expected diff --git a/test/test_tv_tensors.py b/test/test_tv_tensors.py new file mode 100644 index 0000000000000000000000000000000000000000..ed75ae35ecd1cbf63ecf3a1b3b67725108d219c6 --- /dev/null +++ b/test/test_tv_tensors.py @@ -0,0 +1,320 @@ +from copy import deepcopy + +import pytest +import torch +from common_utils import assert_equal, make_bounding_boxes, make_image, make_segmentation_mask, make_video +from PIL import Image + +from torchvision import tv_tensors + + +@pytest.fixture(autouse=True) +def restore_tensor_return_type(): + # This is for security, as we should already be restoring the default manually in each test anyway + # (at least at the time of writing...) 
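+ # Being autouse, this fixture wraps every test in the module; the teardown after the yield below
+ # resets the global return type so a test that forgets to restore it cannot leak "TVTensor" mode
+ # into later tests.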
+ yield + tv_tensors.set_return_type("Tensor") + + +@pytest.mark.parametrize("data", [torch.rand(3, 32, 32), Image.new("RGB", (32, 32), color=123)]) +def test_image_instance(data): + image = tv_tensors.Image(data) + assert isinstance(image, torch.Tensor) + assert image.ndim == 3 and image.shape[0] == 3 + + +@pytest.mark.parametrize("data", [torch.randint(0, 10, size=(1, 32, 32)), Image.new("L", (32, 32), color=2)]) +def test_mask_instance(data): + mask = tv_tensors.Mask(data) + assert isinstance(mask, torch.Tensor) + assert mask.ndim == 3 and mask.shape[0] == 1 + + +@pytest.mark.parametrize("data", [torch.randint(0, 32, size=(5, 4)), [[0, 0, 5, 5], [2, 2, 7, 7]], [1, 2, 3, 4]]) +@pytest.mark.parametrize( + "format", ["XYXY", "CXCYWH", tv_tensors.BoundingBoxFormat.XYXY, tv_tensors.BoundingBoxFormat.XYWH] +) +def test_bbox_instance(data, format): + bboxes = tv_tensors.BoundingBoxes(data, format=format, canvas_size=(32, 32)) + assert isinstance(bboxes, torch.Tensor) + assert bboxes.ndim == 2 and bboxes.shape[1] == 4 + if isinstance(format, str): + format = tv_tensors.BoundingBoxFormat[(format.upper())] + assert bboxes.format == format + + +def test_bbox_dim_error(): + data_3d = [[[1, 2, 3, 4]]] + with pytest.raises(ValueError, match="Expected a 1D or 2D tensor, got 3D"): + tv_tensors.BoundingBoxes(data_3d, format="XYXY", canvas_size=(32, 32)) + + +@pytest.mark.parametrize( + ("data", "input_requires_grad", "expected_requires_grad"), + [ + ([[[0.0, 1.0], [0.0, 1.0]]], None, False), + ([[[0.0, 1.0], [0.0, 1.0]]], False, False), + ([[[0.0, 1.0], [0.0, 1.0]]], True, True), + (torch.rand(3, 16, 16, requires_grad=False), None, False), + (torch.rand(3, 16, 16, requires_grad=False), False, False), + (torch.rand(3, 16, 16, requires_grad=False), True, True), + (torch.rand(3, 16, 16, requires_grad=True), None, True), + (torch.rand(3, 16, 16, requires_grad=True), False, False), + (torch.rand(3, 16, 16, requires_grad=True), True, True), + ], +) +def test_new_requires_grad(data, input_requires_grad, expected_requires_grad): + tv_tensor = tv_tensors.Image(data, requires_grad=input_requires_grad) + assert tv_tensor.requires_grad is expected_requires_grad + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +def test_isinstance(make_input): + assert isinstance(make_input(), torch.Tensor) + + +def test_wrapping_no_copy(): + tensor = torch.rand(3, 16, 16) + image = tv_tensors.Image(tensor) + + assert image.data_ptr() == tensor.data_ptr() + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +def test_to_wrapping(make_input): + dp = make_input() + + dp_to = dp.to(torch.float64) + + assert type(dp_to) is type(dp) + assert dp_to.dtype is torch.float64 + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_to_tv_tensor_reference(make_input, return_type): + tensor = torch.rand((3, 16, 16), dtype=torch.float64) + dp = make_input() + + with tv_tensors.set_return_type(return_type): + tensor_to = tensor.to(dp) + + assert type(tensor_to) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + assert tensor_to.dtype is dp.dtype + assert type(tensor) is torch.Tensor + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def 
test_clone_wrapping(make_input, return_type): + dp = make_input() + + with tv_tensors.set_return_type(return_type): + dp_clone = dp.clone() + + assert type(dp_clone) is type(dp) + assert dp_clone.data_ptr() != dp.data_ptr() + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_requires_grad__wrapping(make_input, return_type): + dp = make_input(dtype=torch.float) + + assert not dp.requires_grad + + with tv_tensors.set_return_type(return_type): + dp_requires_grad = dp.requires_grad_(True) + + assert type(dp_requires_grad) is type(dp) + assert dp.requires_grad + assert dp_requires_grad.requires_grad + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_detach_wrapping(make_input, return_type): + dp = make_input(dtype=torch.float).requires_grad_(True) + + with tv_tensors.set_return_type(return_type): + dp_detached = dp.detach() + + assert type(dp_detached) is type(dp) + + +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_force_subclass_with_metadata(return_type): + # Sanity checks for the ops in _FORCE_TORCHFUNCTION_SUBCLASS and tv_tensors with metadata + # Largely the same as above, we additionally check that the metadata is preserved + format, canvas_size = "XYXY", (32, 32) + bbox = tv_tensors.BoundingBoxes([[0, 0, 5, 5], [2, 2, 7, 7]], format=format, canvas_size=canvas_size) + + tv_tensors.set_return_type(return_type) + bbox = bbox.clone() + if return_type == "TVTensor": + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + bbox = bbox.to(torch.float64) + if return_type == "TVTensor": + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + bbox = bbox.detach() + if return_type == "TVTensor": + assert bbox.format, bbox.canvas_size == (format, canvas_size) + + assert not bbox.requires_grad + bbox.requires_grad_(True) + if return_type == "TVTensor": + assert bbox.format, bbox.canvas_size == (format, canvas_size) + assert bbox.requires_grad + tv_tensors.set_return_type("tensor") + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_other_op_no_wrapping(make_input, return_type): + dp = make_input() + + with tv_tensors.set_return_type(return_type): + # any operation besides the ones listed in _FORCE_TORCHFUNCTION_SUBCLASS will do here + output = dp * 2 + + assert type(output) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize( + "op", + [ + lambda t: t.numpy(), + lambda t: t.tolist(), + lambda t: t.max(dim=-1), + ], +) +def test_no_tensor_output_op_no_wrapping(make_input, op): + dp = make_input() + + output = op(dp) + + assert type(output) is not type(dp) + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +def test_inplace_op_no_wrapping(make_input, return_type): + dp = make_input() + original_type = type(dp) + + with tv_tensors.set_return_type(return_type): + output = dp.add_(0) + + assert type(output) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + assert 
type(dp) is original_type + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +def test_wrap(make_input): + dp = make_input() + + # any operation besides the ones listed in _FORCE_TORCHFUNCTION_SUBCLASS will do here + output = dp * 2 + + dp_new = tv_tensors.wrap(output, like=dp) + + assert type(dp_new) is type(dp) + assert dp_new.data_ptr() == output.data_ptr() + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("requires_grad", [False, True]) +def test_deepcopy(make_input, requires_grad): + dp = make_input(dtype=torch.float) + + dp.requires_grad_(requires_grad) + + dp_deepcopied = deepcopy(dp) + + assert dp_deepcopied is not dp + assert dp_deepcopied.data_ptr() != dp.data_ptr() + assert_equal(dp_deepcopied, dp) + + assert type(dp_deepcopied) is type(dp) + assert dp_deepcopied.requires_grad is requires_grad + + +@pytest.mark.parametrize("make_input", [make_image, make_bounding_boxes, make_segmentation_mask, make_video]) +@pytest.mark.parametrize("return_type", ["Tensor", "TVTensor"]) +@pytest.mark.parametrize( + "op", + ( + lambda dp: dp + torch.rand(*dp.shape), + lambda dp: torch.rand(*dp.shape) + dp, + lambda dp: dp * torch.rand(*dp.shape), + lambda dp: torch.rand(*dp.shape) * dp, + lambda dp: dp + 3, + lambda dp: 3 + dp, + lambda dp: dp + dp, + lambda dp: dp.sum(), + lambda dp: dp.reshape(-1), + lambda dp: dp.int(), + lambda dp: torch.stack([dp, dp]), + lambda dp: torch.chunk(dp, 2)[0], + lambda dp: torch.unbind(dp)[0], + ), +) +def test_usual_operations(make_input, return_type, op): + + dp = make_input() + with tv_tensors.set_return_type(return_type): + out = op(dp) + assert type(out) is (type(dp) if return_type == "TVTensor" else torch.Tensor) + if isinstance(dp, tv_tensors.BoundingBoxes) and return_type == "TVTensor": + assert hasattr(out, "format") + assert hasattr(out, "canvas_size") + + +def test_subclasses(): + img = make_image() + masks = make_segmentation_mask() + + with pytest.raises(TypeError, match="unsupported operand"): + img + masks + + +def test_set_return_type(): + img = make_image() + + assert type(img + 3) is torch.Tensor + + with tv_tensors.set_return_type("TVTensor"): + assert type(img + 3) is tv_tensors.Image + assert type(img + 3) is torch.Tensor + + tv_tensors.set_return_type("TVTensor") + assert type(img + 3) is tv_tensors.Image + + with tv_tensors.set_return_type("tensor"): + assert type(img + 3) is torch.Tensor + with tv_tensors.set_return_type("TVTensor"): + assert type(img + 3) is tv_tensors.Image + tv_tensors.set_return_type("tensor") + assert type(img + 3) is torch.Tensor + assert type(img + 3) is torch.Tensor + # Exiting a context manager will restore the return type as it was prior to entering it, + # regardless of whether the "global" tv_tensors.set_return_type() was called within the context manager. 
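+ # Here the outer "tensor" context has just been exited, so the return type is restored to the
+ # global "TVTensor" set at the top, even though set_return_type("tensor") was called inside the
+ # nested block.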
+ assert type(img + 3) is tv_tensors.Image + + tv_tensors.set_return_type("tensor") + + +def test_return_type_input(): + img = make_image() + + # Case-insensitive + with tv_tensors.set_return_type("tvtensor"): + assert type(img + 3) is tv_tensors.Image + + with pytest.raises(ValueError, match="return_type must be"): + tv_tensors.set_return_type("typo") + + tv_tensors.set_return_type("tensor") diff --git a/test/test_utils.py b/test/test_utils.py index dde3ee90dc306f8e2a7c1be74c2a85b8855dd1a6..b13bd0f0f5bbfffbd296ccf25f4ad22f990a01de 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -9,7 +9,7 @@ import pytest import torch import torchvision.transforms.functional as F import torchvision.utils as utils -from common_utils import assert_equal +from common_utils import assert_equal, cpu_and_cuda from PIL import __version__ as PILLOW_VERSION, Image, ImageColor @@ -120,6 +120,9 @@ def test_draw_boxes_colors(colors): img = torch.full((3, 100, 100), 0, dtype=torch.uint8) utils.draw_bounding_boxes(img, boxes, fill=False, width=7, colors=colors) + with pytest.raises(ValueError, match="Number of colors must be equal or larger than the number of objects"): + utils.draw_bounding_boxes(image=img, boxes=boxes, colors=[]) + def test_draw_boxes_vanilla(): img = torch.full((3, 100, 100), 0, dtype=torch.uint8) @@ -184,7 +187,7 @@ def test_draw_no_boxes(): boxes = torch.full((0, 4), 0, dtype=torch.float) with pytest.warns(UserWarning, match=re.escape("boxes doesn't contain any box. No box was drawn")): res = utils.draw_bounding_boxes(img, boxes) - # Check that the function didnt change the image + # Check that the function didn't change the image assert res.eq(img).all() @@ -200,16 +203,17 @@ def test_draw_no_boxes(): ], ) @pytest.mark.parametrize("alpha", (0, 0.5, 0.7, 1)) -def test_draw_segmentation_masks(colors, alpha): +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_draw_segmentation_masks(colors, alpha, device): """This test makes sure that masks draw their corresponding color where they should""" num_masks, h, w = 2, 100, 100 dtype = torch.uint8 - img = torch.randint(0, 256, size=(3, h, w), dtype=dtype) - masks = torch.randint(0, 2, (num_masks, h, w), dtype=torch.bool) + img = torch.randint(0, 256, size=(3, h, w), dtype=dtype, device=device) + masks = torch.randint(0, 2, (num_masks, h, w), dtype=torch.bool, device=device) # For testing we enforce that there's no overlap between the masks. 
The # current behaviour is that the last mask's color will take priority when - # masks overlap, but this makes testing slightly harder so we don't really + # masks overlap, but this makes testing slightly harder, so we don't really # care overlap = masks[0] & masks[1] masks[:, overlap] = False @@ -231,7 +235,7 @@ def test_draw_segmentation_masks(colors, alpha): for mask, color in zip(masks, colors): if isinstance(color, str): color = ImageColor.getrgb(color) - color = torch.tensor(color, dtype=dtype) + color = torch.tensor(color, dtype=dtype, device=device) if alpha == 1: assert (out[:, mask] == color[:, None]).all() @@ -242,11 +246,12 @@ def test_draw_segmentation_masks(colors, alpha): torch.testing.assert_close(out[:, mask], interpolated_color, rtol=0.0, atol=1.0) -def test_draw_segmentation_masks_errors(): +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_draw_segmentation_masks_errors(device): h, w = 10, 10 - masks = torch.randint(0, 2, size=(h, w), dtype=torch.bool) - img = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8) + masks = torch.randint(0, 2, size=(h, w), dtype=torch.bool, device=device) + img = torch.randint(0, 256, size=(3, h, w), dtype=torch.uint8, device=device) with pytest.raises(TypeError, match="The image must be a tensor"): utils.draw_segmentation_masks(image="Not A Tensor Image", masks=masks) @@ -268,22 +273,23 @@ def test_draw_segmentation_masks_errors(): with pytest.raises(ValueError, match="must have the same height and width"): masks_bad_shape = torch.randint(0, 2, size=(h + 4, w), dtype=torch.bool) utils.draw_segmentation_masks(image=img, masks=masks_bad_shape) - with pytest.raises(ValueError, match="There are more masks"): + with pytest.raises(ValueError, match="Number of colors must be equal or larger than the number of objects"): utils.draw_segmentation_masks(image=img, masks=masks, colors=[]) - with pytest.raises(ValueError, match="colors must be a tuple or a string, or a list thereof"): + with pytest.raises(ValueError, match="`colors` must be a tuple or a string, or a list thereof"): bad_colors = np.array(["red", "blue"]) # should be a list utils.draw_segmentation_masks(image=img, masks=masks, colors=bad_colors) - with pytest.raises(ValueError, match="It seems that you passed a tuple of colors instead of"): + with pytest.raises(ValueError, match="If passed as tuple, colors should be an RGB triplet"): bad_colors = ("red", "blue") # should be a list utils.draw_segmentation_masks(image=img, masks=masks, colors=bad_colors) -def test_draw_no_segmention_mask(): - img = torch.full((3, 100, 100), 0, dtype=torch.uint8) - masks = torch.full((0, 100, 100), 0, dtype=torch.bool) +@pytest.mark.parametrize("device", cpu_and_cuda()) +def test_draw_no_segmention_mask(device): + img = torch.full((3, 100, 100), 0, dtype=torch.uint8, device=device) + masks = torch.full((0, 100, 100), 0, dtype=torch.bool, device=device) with pytest.warns(UserWarning, match=re.escape("masks doesn't contain any mask. 
No mask was drawn")): res = utils.draw_segmentation_masks(img, masks) - # Check that the function didnt change the image + # Check that the function didn't change the image assert res.eq(img).all() diff --git a/test/test_video_gpu_decoder.py b/test/test_video_gpu_decoder.py index d987db6ddebdb756c98c7d5ed788a28eaa7fb984..aa6d0aee9e04afe5d36227922e2bc589d16d3ee6 100644 --- a/test/test_video_gpu_decoder.py +++ b/test/test_video_gpu_decoder.py @@ -3,6 +3,7 @@ import os import pytest import torch +import torchvision from torchvision.io import _HAS_GPU_VIDEO_DECODER, VideoReader try: @@ -29,8 +30,9 @@ class TestVideoGPUDecoder: ], ) def test_frame_reading(self, video_file): + torchvision.set_video_backend("cuda") full_path = os.path.join(VIDEO_DIR, video_file) - decoder = VideoReader(full_path, device="cuda") + decoder = VideoReader(full_path) with av.open(full_path) as container: for av_frame in container.decode(container.streams.video[0]): av_frames = torch.tensor(av_frame.to_rgb(src_colorspace="ITU709").to_ndarray()) @@ -54,7 +56,8 @@ class TestVideoGPUDecoder: ], ) def test_seek_reading(self, keyframes, full_path, duration): - decoder = VideoReader(full_path, device="cuda") + torchvision.set_video_backend("cuda") + decoder = VideoReader(full_path) time = duration / 2 decoder.seek(time, keyframes_only=keyframes) with av.open(full_path) as container: @@ -79,8 +82,9 @@ class TestVideoGPUDecoder: ], ) def test_metadata(self, video_file): + torchvision.set_video_backend("cuda") full_path = os.path.join(VIDEO_DIR, video_file) - decoder = VideoReader(full_path, device="cuda") + decoder = VideoReader(full_path) video_metadata = decoder.get_metadata()["video"] with av.open(full_path) as container: video = container.streams.video[0] diff --git a/test/test_video_reader.py b/test/test_video_reader.py index 867923d10d0732a3f144839732250575cfaac722..243aa12fc120ed0102f66d064cf834ec3a26cafb 100644 --- a/test/test_video_reader.py +++ b/test/test_video_reader.py @@ -127,7 +127,7 @@ def _read_from_stream(container, start_pts, end_pts, stream, stream_name, buffer ascending order. We need to decode more frames even when we meet end pts """ - # seeking in the stream is imprecise. Thus, seek to an ealier PTS by a margin + # seeking in the stream is imprecise. Thus, seek to an earlier PTS by a margin margin = 1 seek_offset = max(start_pts - margin, 0) diff --git a/test/test_videoapi.py b/test/test_videoapi.py index 895b9b83555dbc28801d4903bf0f3b996d1e99e8..05fbcbdbff29b1e961226eeea48e3380aac9d761 100644 --- a/test/test_videoapi.py +++ b/test/test_videoapi.py @@ -10,6 +10,12 @@ from torchvision.datasets.utils import download_url from torchvision.io import _HAS_VIDEO_OPT, VideoReader +# WARNING: these tests have been skipped forever on the CI because the video ops +# are never properly available. This is bad, but things have been in a terrible +# state for a long time already as we write this comment, and we'll hopefully be +# able to get rid of this all soon. + + try: import av @@ -25,6 +31,13 @@ CheckerConfig = ["duration", "video_fps", "audio_sample_rate"] GroundTruth = collections.namedtuple("GroundTruth", " ".join(CheckerConfig)) +def backends(): + backends_ = ["video_reader"] + if av is not None: + backends_.append("pyav") + return backends_ + + def fate(name, path="."): """Download and return a path to a sample from the FFmpeg test suite. 
See the `FFmpeg Automated Test Environment `_ @@ -53,7 +66,9 @@ test_videos = { class TestVideoApi: @pytest.mark.skipif(av is None, reason="PyAV unavailable") @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_frame_reading(self, test_video): + @pytest.mark.parametrize("backend", backends()) + def test_frame_reading(self, test_video, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) with av.open(full_path) as av_reader: if av_reader.streams.video: @@ -77,6 +92,7 @@ class TestVideoApi: # compare the frames and ptss for i in range(len(vr_frames)): assert float(av_pts[i]) == approx(vr_pts[i], abs=0.1) + mean_delta = torch.mean(torch.abs(av_frames[i].float() - vr_frames[i].float())) # on average the difference is very small and caused # by decoding (around 1%) @@ -114,12 +130,62 @@ class TestVideoApi: # we assure that there is never more than 1% difference in signal assert max_delta.item() < 0.001 + @pytest.mark.parametrize("stream", ["video", "audio"]) + @pytest.mark.parametrize("test_video", test_videos.keys()) + @pytest.mark.parametrize("backend", backends()) + def test_frame_reading_mem_vs_file(self, test_video, stream, backend): + torchvision.set_video_backend(backend) + full_path = os.path.join(VIDEO_DIR, test_video) + + reader = VideoReader(full_path) + reader_md = reader.get_metadata() + + if stream in reader_md: + # Test video reading from file vs from memory + vr_frames, vr_frames_mem = [], [] + vr_pts, vr_pts_mem = [], [] + # get vr frames + video_reader = VideoReader(full_path, stream) + for vr_frame in video_reader: + vr_frames.append(vr_frame["data"]) + vr_pts.append(vr_frame["pts"]) + + # get vr frames = read from memory + f = open(full_path, "rb") + fbytes = f.read() + f.close() + video_reader_from_mem = VideoReader(fbytes, stream) + + for vr_frame_from_mem in video_reader_from_mem: + vr_frames_mem.append(vr_frame_from_mem["data"]) + vr_pts_mem.append(vr_frame_from_mem["pts"]) + + # same number of frames + assert len(vr_frames) == len(vr_frames_mem) + assert len(vr_pts) == len(vr_pts_mem) + + # compare the frames and ptss + for i in range(len(vr_frames)): + assert vr_pts[i] == vr_pts_mem[i] + mean_delta = torch.mean(torch.abs(vr_frames[i].float() - vr_frames_mem[i].float())) + # on average the difference is very small and caused + # by decoding (around 1%) + # TODO: asses empirically how to set this? 
atm it's 1% + # averaged over all frames + assert mean_delta.item() < 2.55 + + del vr_frames, vr_pts, vr_frames_mem, vr_pts_mem + else: + del reader, reader_md + @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_metadata(self, test_video, config): + @pytest.mark.parametrize("backend", backends()) + def test_metadata(self, test_video, config, backend): """ Test that the metadata returned via pyav corresponds to the one returned by the new video decoder API """ + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) reader = VideoReader(full_path, "video") reader_md = reader.get_metadata() @@ -127,7 +193,9 @@ class TestVideoApi: assert config.duration == approx(reader_md["video"]["duration"][0], abs=0.5) @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_seek_start(self, test_video): + @pytest.mark.parametrize("backend", backends()) + def test_seek_start(self, test_video, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) video_reader = VideoReader(full_path, "video") num_frames = 0 @@ -153,7 +221,9 @@ class TestVideoApi: assert start_num_frames == num_frames @pytest.mark.parametrize("test_video", test_videos.keys()) - def test_accurateseek_middle(self, test_video): + @pytest.mark.parametrize("backend", ["video_reader"]) + def test_accurateseek_middle(self, test_video, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) stream = "video" video_reader = VideoReader(full_path, stream) @@ -192,7 +262,9 @@ class TestVideoApi: @pytest.mark.skipif(av is None, reason="PyAV unavailable") @pytest.mark.parametrize("test_video,config", test_videos.items()) - def test_keyframe_reading(self, test_video, config): + @pytest.mark.parametrize("backend", backends()) + def test_keyframe_reading(self, test_video, config, backend): + torchvision.set_video_backend(backend) full_path = os.path.join(VIDEO_DIR, test_video) av_reader = av.open(full_path) diff --git a/test/tracing/frcnn/CMakeLists.txt b/test/tracing/frcnn/CMakeLists.txt index c79382470bd528e17e38fb01ad3078d77eccf24b..8ede462e34b7b87884a2e6f929d8480930ecd9f8 100644 --- a/test/tracing/frcnn/CMakeLists.txt +++ b/test/tracing/frcnn/CMakeLists.txt @@ -10,4 +10,4 @@ find_package(Python3 COMPONENTS Development) add_executable(test_frcnn_tracing test_frcnn_tracing.cpp) target_compile_features(test_frcnn_tracing PUBLIC cxx_range_for) target_link_libraries(test_frcnn_tracing ${TORCH_LIBRARIES} TorchVision::TorchVision Python3::Python) -set_property(TARGET test_frcnn_tracing PROPERTY CXX_STANDARD 14) +set_property(TARGET test_frcnn_tracing PROPERTY CXX_STANDARD 17) diff --git a/test/transforms_v2_dispatcher_infos.py b/test/transforms_v2_dispatcher_infos.py new file mode 100644 index 0000000000000000000000000000000000000000..b84d87eb7aec6aac904e46357674687db531bbc0 --- /dev/null +++ b/test/transforms_v2_dispatcher_infos.py @@ -0,0 +1,325 @@ +import pytest +import torchvision.transforms.v2.functional as F +from torchvision import tv_tensors +from transforms_v2_kernel_infos import KERNEL_INFOS, pad_xfail_jit_fill_condition +from transforms_v2_legacy_utils import InfoBase, TestMark + +__all__ = ["DispatcherInfo", "DISPATCHER_INFOS"] + + +class PILKernelInfo(InfoBase): + def __init__( + self, + kernel, + *, + # Defaults to `kernel.__name__`. 
Should be set if the function is exposed under a different name + # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then + kernel_name=None, + ): + super().__init__(id=kernel_name or kernel.__name__) + self.kernel = kernel + + +class DispatcherInfo(InfoBase): + _KERNEL_INFO_MAP = {info.kernel: info for info in KERNEL_INFOS} + + def __init__( + self, + dispatcher, + *, + # Dictionary of types that map to the kernel the dispatcher dispatches to. + kernels, + # If omitted, no PIL dispatch test will be performed. + pil_kernel_info=None, + # See InfoBase + test_marks=None, + # See InfoBase + closeness_kwargs=None, + ): + super().__init__(id=dispatcher.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) + self.dispatcher = dispatcher + self.kernels = kernels + self.pil_kernel_info = pil_kernel_info + + kernel_infos = {} + for tv_tensor_type, kernel in self.kernels.items(): + kernel_info = self._KERNEL_INFO_MAP.get(kernel) + if not kernel_info: + raise pytest.UsageError( + f"Can't register {kernel.__name__} for type {tv_tensor_type} since there is no `KernelInfo` for it. " + f"Please add a `KernelInfo` for it in `transforms_v2_kernel_infos.py`." + ) + kernel_infos[tv_tensor_type] = kernel_info + self.kernel_infos = kernel_infos + + def sample_inputs(self, *tv_tensor_types, filter_metadata=True): + for tv_tensor_type in tv_tensor_types or self.kernel_infos.keys(): + kernel_info = self.kernel_infos.get(tv_tensor_type) + if not kernel_info: + raise pytest.UsageError(f"There is no kernel registered for type {type.__name__}") + + sample_inputs = kernel_info.sample_inputs_fn() + + if not filter_metadata: + yield from sample_inputs + return + + import itertools + + for args_kwargs in sample_inputs: + if hasattr(tv_tensor_type, "__annotations__"): + for name in itertools.chain( + tv_tensor_type.__annotations__.keys(), + # FIXME: this seems ok for conversion dispatchers, but we should probably handle this on a + # per-dispatcher level. However, so far there is no option for that. 
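+ # The `old_<name>` variants (e.g. `old_format` for the bounding box conversion kernel) are
+ # presumably dropped for the same reason: the dispatcher reads that metadata from the tv_tensor
+ # itself instead of taking it as a kwarg.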
+ (f"old_{name}" for name in tv_tensor_type.__annotations__.keys()), + ): + if name in args_kwargs.kwargs: + del args_kwargs.kwargs[name] + + yield args_kwargs + + +def xfail_jit(reason, *, condition=None): + return TestMark( + ("TestDispatchers", "test_scripted_smoke"), + pytest.mark.xfail(reason=reason), + condition=condition, + ) + + +def xfail_jit_python_scalar_arg(name, *, reason=None): + return xfail_jit( + reason or f"Python scalar int or float for `{name}` is not supported when scripting", + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), (int, float)), + ) + + +skip_dispatch_tv_tensor = TestMark( + ("TestDispatchers", "test_dispatch_tv_tensor"), + pytest.mark.skip(reason="Dispatcher doesn't support arbitrary tv_tensor dispatch."), +) + +multi_crop_skips = [ + TestMark( + ("TestDispatchers", test_name), + pytest.mark.skip(reason="Multi-crop dispatchers return a sequence of items rather than a single one."), + ) + for test_name in ["test_pure_tensor_output_type", "test_pil_output_type", "test_tv_tensor_output_type"] +] +multi_crop_skips.append(skip_dispatch_tv_tensor) + + +DISPATCHER_INFOS = [ + DispatcherInfo( + F.resized_crop, + kernels={ + tv_tensors.Image: F.resized_crop_image, + tv_tensors.Video: F.resized_crop_video, + tv_tensors.BoundingBoxes: F.resized_crop_bounding_boxes, + tv_tensors.Mask: F.resized_crop_mask, + }, + pil_kernel_info=PILKernelInfo(F._resized_crop_image_pil), + ), + DispatcherInfo( + F.pad, + kernels={ + tv_tensors.Image: F.pad_image, + tv_tensors.Video: F.pad_video, + tv_tensors.BoundingBoxes: F.pad_bounding_boxes, + tv_tensors.Mask: F.pad_mask, + }, + pil_kernel_info=PILKernelInfo(F._pad_image_pil, kernel_name="pad_image_pil"), + test_marks=[ + xfail_jit("F.pad only supports vector fills for list of floats", condition=pad_xfail_jit_fill_condition), + xfail_jit_python_scalar_arg("padding"), + ], + ), + DispatcherInfo( + F.perspective, + kernels={ + tv_tensors.Image: F.perspective_image, + tv_tensors.Video: F.perspective_video, + tv_tensors.BoundingBoxes: F.perspective_bounding_boxes, + tv_tensors.Mask: F.perspective_mask, + }, + pil_kernel_info=PILKernelInfo(F._perspective_image_pil), + test_marks=[ + xfail_jit_python_scalar_arg("fill"), + ], + ), + DispatcherInfo( + F.elastic, + kernels={ + tv_tensors.Image: F.elastic_image, + tv_tensors.Video: F.elastic_video, + tv_tensors.BoundingBoxes: F.elastic_bounding_boxes, + tv_tensors.Mask: F.elastic_mask, + }, + pil_kernel_info=PILKernelInfo(F._elastic_image_pil), + test_marks=[xfail_jit_python_scalar_arg("fill")], + ), + DispatcherInfo( + F.center_crop, + kernels={ + tv_tensors.Image: F.center_crop_image, + tv_tensors.Video: F.center_crop_video, + tv_tensors.BoundingBoxes: F.center_crop_bounding_boxes, + tv_tensors.Mask: F.center_crop_mask, + }, + pil_kernel_info=PILKernelInfo(F._center_crop_image_pil), + test_marks=[ + xfail_jit_python_scalar_arg("output_size"), + ], + ), + DispatcherInfo( + F.gaussian_blur, + kernels={ + tv_tensors.Image: F.gaussian_blur_image, + tv_tensors.Video: F.gaussian_blur_video, + }, + pil_kernel_info=PILKernelInfo(F._gaussian_blur_image_pil), + test_marks=[ + xfail_jit_python_scalar_arg("kernel_size"), + xfail_jit_python_scalar_arg("sigma"), + ], + ), + DispatcherInfo( + F.equalize, + kernels={ + tv_tensors.Image: F.equalize_image, + tv_tensors.Video: F.equalize_video, + }, + pil_kernel_info=PILKernelInfo(F._equalize_image_pil, kernel_name="equalize_image_pil"), + ), + DispatcherInfo( + F.invert, + kernels={ + tv_tensors.Image: F.invert_image, + 
tv_tensors.Video: F.invert_video, + }, + pil_kernel_info=PILKernelInfo(F._invert_image_pil, kernel_name="invert_image_pil"), + ), + DispatcherInfo( + F.posterize, + kernels={ + tv_tensors.Image: F.posterize_image, + tv_tensors.Video: F.posterize_video, + }, + pil_kernel_info=PILKernelInfo(F._posterize_image_pil, kernel_name="posterize_image_pil"), + ), + DispatcherInfo( + F.solarize, + kernels={ + tv_tensors.Image: F.solarize_image, + tv_tensors.Video: F.solarize_video, + }, + pil_kernel_info=PILKernelInfo(F._solarize_image_pil, kernel_name="solarize_image_pil"), + ), + DispatcherInfo( + F.autocontrast, + kernels={ + tv_tensors.Image: F.autocontrast_image, + tv_tensors.Video: F.autocontrast_video, + }, + pil_kernel_info=PILKernelInfo(F._autocontrast_image_pil, kernel_name="autocontrast_image_pil"), + ), + DispatcherInfo( + F.adjust_sharpness, + kernels={ + tv_tensors.Image: F.adjust_sharpness_image, + tv_tensors.Video: F.adjust_sharpness_video, + }, + pil_kernel_info=PILKernelInfo(F._adjust_sharpness_image_pil, kernel_name="adjust_sharpness_image_pil"), + ), + DispatcherInfo( + F.adjust_contrast, + kernels={ + tv_tensors.Image: F.adjust_contrast_image, + tv_tensors.Video: F.adjust_contrast_video, + }, + pil_kernel_info=PILKernelInfo(F._adjust_contrast_image_pil, kernel_name="adjust_contrast_image_pil"), + ), + DispatcherInfo( + F.adjust_gamma, + kernels={ + tv_tensors.Image: F.adjust_gamma_image, + tv_tensors.Video: F.adjust_gamma_video, + }, + pil_kernel_info=PILKernelInfo(F._adjust_gamma_image_pil, kernel_name="adjust_gamma_image_pil"), + ), + DispatcherInfo( + F.adjust_hue, + kernels={ + tv_tensors.Image: F.adjust_hue_image, + tv_tensors.Video: F.adjust_hue_video, + }, + pil_kernel_info=PILKernelInfo(F._adjust_hue_image_pil, kernel_name="adjust_hue_image_pil"), + ), + DispatcherInfo( + F.adjust_saturation, + kernels={ + tv_tensors.Image: F.adjust_saturation_image, + tv_tensors.Video: F.adjust_saturation_video, + }, + pil_kernel_info=PILKernelInfo(F._adjust_saturation_image_pil, kernel_name="adjust_saturation_image_pil"), + ), + DispatcherInfo( + F.five_crop, + kernels={ + tv_tensors.Image: F.five_crop_image, + tv_tensors.Video: F.five_crop_video, + }, + pil_kernel_info=PILKernelInfo(F._five_crop_image_pil), + test_marks=[ + xfail_jit_python_scalar_arg("size"), + *multi_crop_skips, + ], + ), + DispatcherInfo( + F.ten_crop, + kernels={ + tv_tensors.Image: F.ten_crop_image, + tv_tensors.Video: F.ten_crop_video, + }, + test_marks=[ + xfail_jit_python_scalar_arg("size"), + *multi_crop_skips, + ], + pil_kernel_info=PILKernelInfo(F._ten_crop_image_pil), + ), + DispatcherInfo( + F.normalize, + kernels={ + tv_tensors.Image: F.normalize_image, + tv_tensors.Video: F.normalize_video, + }, + test_marks=[ + xfail_jit_python_scalar_arg("mean"), + xfail_jit_python_scalar_arg("std"), + ], + ), + DispatcherInfo( + F.uniform_temporal_subsample, + kernels={ + tv_tensors.Video: F.uniform_temporal_subsample_video, + }, + test_marks=[ + skip_dispatch_tv_tensor, + ], + ), + DispatcherInfo( + F.clamp_bounding_boxes, + kernels={tv_tensors.BoundingBoxes: F.clamp_bounding_boxes}, + test_marks=[ + skip_dispatch_tv_tensor, + ], + ), + DispatcherInfo( + F.convert_bounding_box_format, + kernels={tv_tensors.BoundingBoxes: F.convert_bounding_box_format}, + test_marks=[ + skip_dispatch_tv_tensor, + ], + ), +] diff --git a/test/transforms_v2_kernel_infos.py b/test/transforms_v2_kernel_infos.py new file mode 100644 index 0000000000000000000000000000000000000000..a549bfe72ddfc950d4e3faf4827dbdeb78e128d0 --- /dev/null +++ 
b/test/transforms_v2_kernel_infos.py @@ -0,0 +1,1522 @@ +import functools +import itertools + +import numpy as np +import PIL.Image +import pytest +import torch.testing +import torchvision.ops +import torchvision.transforms.v2.functional as F +from torchvision import tv_tensors +from torchvision.transforms._functional_tensor import _max_value as get_max_value, _parse_pad_padding +from transforms_v2_legacy_utils import ( + ArgsKwargs, + combinations_grid, + DEFAULT_PORTRAIT_SPATIAL_SIZE, + get_num_channels, + ImageLoader, + InfoBase, + make_bounding_box_loader, + make_bounding_box_loaders, + make_detection_mask_loader, + make_image_loader, + make_image_loaders, + make_image_loaders_for_interpolation, + make_mask_loaders, + make_video_loader, + make_video_loaders, + mark_framework_limitation, + TestMark, +) + +__all__ = ["KernelInfo", "KERNEL_INFOS"] + + +class KernelInfo(InfoBase): + def __init__( + self, + kernel, + *, + # Defaults to `kernel.__name__`. Should be set if the function is exposed under a different name + # TODO: This can probably be removed after roll-out since we shouldn't have any aliasing then + kernel_name=None, + # Most common tests use these inputs to check the kernel. As such it should cover all valid code paths, but + # should not include extensive parameter combinations to keep to overall test count moderate. + sample_inputs_fn, + # This function should mirror the kernel. It should have the same signature as the `kernel` and as such also + # take tensors as inputs. Any conversion into another object type, e.g. PIL images or numpy arrays, should + # happen inside the function. It should return a tensor or to be more precise an object that can be compared to + # a tensor by `assert_close`. If omitted, no reference test will be performed. + reference_fn=None, + # These inputs are only used for the reference tests and thus can be comprehensive with regard to the parameter + # values to be tested. If not specified, `sample_inputs_fn` will be used. + reference_inputs_fn=None, + # If true-ish, triggers a test that checks the kernel for consistency between uint8 and float32 inputs with the + # reference inputs. This is usually used whenever we use a PIL kernel as reference. + # Can be a callable in which case it will be called with `other_args, kwargs`. It should return the same + # structure, but with adapted parameters. This is useful in case a parameter value is closely tied to the input + # dtype. + float32_vs_uint8=False, + # Some kernels don't have dispatchers that would handle logging the usage. Thus, the kernel has to do it + # manually. If set, triggers a test that makes sure this happens. 
+ logs_usage=False, + # See InfoBase + test_marks=None, + # See InfoBase + closeness_kwargs=None, + ): + super().__init__(id=kernel_name or kernel.__name__, test_marks=test_marks, closeness_kwargs=closeness_kwargs) + self.kernel = kernel + self.sample_inputs_fn = sample_inputs_fn + self.reference_fn = reference_fn + self.reference_inputs_fn = reference_inputs_fn + + if float32_vs_uint8 and not callable(float32_vs_uint8): + float32_vs_uint8 = lambda other_args, kwargs: (other_args, kwargs) # noqa: E731 + self.float32_vs_uint8 = float32_vs_uint8 + self.logs_usage = logs_usage + + +def pixel_difference_closeness_kwargs(uint8_atol, *, dtype=torch.uint8, mae=False): + return dict(atol=uint8_atol / 255 * get_max_value(dtype), rtol=0, mae=mae) + + +def cuda_vs_cpu_pixel_difference(atol=1): + return { + (("TestKernels", "test_cuda_vs_cpu"), dtype, "cuda"): pixel_difference_closeness_kwargs(atol, dtype=dtype) + for dtype in [torch.uint8, torch.float32] + } + + +def pil_reference_pixel_difference(atol=1, mae=False): + return { + (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): pixel_difference_closeness_kwargs( + atol, mae=mae + ) + } + + +def float32_vs_uint8_pixel_difference(atol=1, mae=False): + return { + ( + ("TestKernels", "test_float32_vs_uint8"), + torch.float32, + "cpu", + ): pixel_difference_closeness_kwargs(atol, dtype=torch.float32, mae=mae) + } + + +def scripted_vs_eager_float64_tolerances(device, atol=1e-6, rtol=1e-6): + return { + (("TestKernels", "test_scripted_vs_eager"), torch.float64, device): {"atol": atol, "rtol": rtol, "mae": False}, + } + + +def pil_reference_wrapper(pil_kernel): + @functools.wraps(pil_kernel) + def wrapper(input_tensor, *other_args, **kwargs): + if input_tensor.dtype != torch.uint8: + raise pytest.UsageError(f"Can only test uint8 tensor images against PIL, but input is {input_tensor.dtype}") + if input_tensor.ndim > 3: + raise pytest.UsageError( + f"Can only test single tensor images against PIL, but input has shape {input_tensor.shape}" + ) + + input_pil = F.to_pil_image(input_tensor) + output_pil = pil_kernel(input_pil, *other_args, **kwargs) + if not isinstance(output_pil, PIL.Image.Image): + return output_pil + + output_tensor = F.to_image(output_pil) + + # 2D mask shenanigans + if output_tensor.ndim == 2 and input_tensor.ndim == 3: + output_tensor = output_tensor.unsqueeze(0) + elif output_tensor.ndim == 3 and input_tensor.ndim == 2: + output_tensor = output_tensor.squeeze(0) + + return output_tensor + + return wrapper + + +def xfail_jit(reason, *, condition=None): + return TestMark(("TestKernels", "test_scripted_vs_eager"), pytest.mark.xfail(reason=reason), condition=condition) + + +def xfail_jit_python_scalar_arg(name, *, reason=None): + return xfail_jit( + reason or f"Python scalar int or float for `{name}` is not supported when scripting", + condition=lambda args_kwargs: isinstance(args_kwargs.kwargs.get(name), (int, float)), + ) + + +KERNEL_INFOS = [] + + +def get_fills(*, num_channels, dtype): + yield None + + int_value = get_max_value(dtype) + float_value = int_value / 2 + yield int_value + yield float_value + + for vector_type in [list, tuple]: + yield vector_type([int_value]) + yield vector_type([float_value]) + + if num_channels > 1: + yield vector_type(float_value * c / 10 for c in range(num_channels)) + yield vector_type(int_value if c % 2 == 0 else 0 for c in range(num_channels)) + + +def float32_vs_uint8_fill_adapter(other_args, kwargs): + fill = kwargs.get("fill") + if fill is None: + return other_args, kwargs + + if 
isinstance(fill, (int, float)): + fill /= 255 + else: + fill = type(fill)(fill_ / 255 for fill_ in fill) + + return other_args, dict(kwargs, fill=fill) + + +def reference_affine_bounding_boxes_helper(bounding_boxes, *, format, canvas_size, affine_matrix): + def transform(bbox, affine_matrix_, format_, canvas_size_): + # Go to float before converting to prevent precision loss in case of CXCYWH -> XYXY and W or H is 1 + in_dtype = bbox.dtype + if not torch.is_floating_point(bbox): + bbox = bbox.float() + bbox_xyxy = F.convert_bounding_box_format( + bbox.as_subclass(torch.Tensor), + old_format=format_, + new_format=tv_tensors.BoundingBoxFormat.XYXY, + inplace=True, + ) + points = np.array( + [ + [bbox_xyxy[0].item(), bbox_xyxy[1].item(), 1.0], + [bbox_xyxy[2].item(), bbox_xyxy[1].item(), 1.0], + [bbox_xyxy[0].item(), bbox_xyxy[3].item(), 1.0], + [bbox_xyxy[2].item(), bbox_xyxy[3].item(), 1.0], + ] + ) + transformed_points = np.matmul(points, affine_matrix_.T) + out_bbox = torch.tensor( + [ + np.min(transformed_points[:, 0]).item(), + np.min(transformed_points[:, 1]).item(), + np.max(transformed_points[:, 0]).item(), + np.max(transformed_points[:, 1]).item(), + ], + dtype=bbox_xyxy.dtype, + ) + out_bbox = F.convert_bounding_box_format( + out_bbox, old_format=tv_tensors.BoundingBoxFormat.XYXY, new_format=format_, inplace=True + ) + # It is important to clamp before casting, especially for CXCYWH format, dtype=int64 + out_bbox = F.clamp_bounding_boxes(out_bbox, format=format_, canvas_size=canvas_size_) + out_bbox = out_bbox.to(dtype=in_dtype) + return out_bbox + + return torch.stack( + [transform(b, affine_matrix, format, canvas_size) for b in bounding_boxes.reshape(-1, 4).unbind()] + ).reshape(bounding_boxes.shape) + + +def sample_inputs_convert_bounding_box_format(): + formats = list(tv_tensors.BoundingBoxFormat) + for bounding_boxes_loader, new_format in itertools.product(make_bounding_box_loaders(formats=formats), formats): + yield ArgsKwargs(bounding_boxes_loader, old_format=bounding_boxes_loader.format, new_format=new_format) + + +def reference_convert_bounding_box_format(bounding_boxes, old_format, new_format): + return torchvision.ops.box_convert( + bounding_boxes, in_fmt=old_format.name.lower(), out_fmt=new_format.name.lower() + ).to(bounding_boxes.dtype) + + +def reference_inputs_convert_bounding_box_format(): + for args_kwargs in sample_inputs_convert_bounding_box_format(): + if len(args_kwargs.args[0].shape) == 2: + yield args_kwargs + + +KERNEL_INFOS.append( + KernelInfo( + F.convert_bounding_box_format, + sample_inputs_fn=sample_inputs_convert_bounding_box_format, + reference_fn=reference_convert_bounding_box_format, + reference_inputs_fn=reference_inputs_convert_bounding_box_format, + logs_usage=True, + closeness_kwargs={ + (("TestKernels", "test_against_reference"), torch.int64, "cpu"): dict(atol=1, rtol=0), + }, + ), +) + + +_RESIZED_CROP_PARAMS = combinations_grid(top=[-8, 9], left=[-8, 9], height=[12], width=[12], size=[(16, 18)]) + + +def sample_inputs_resized_crop_image_tensor(): + for image_loader in make_image_loaders(): + yield ArgsKwargs(image_loader, **_RESIZED_CROP_PARAMS[0]) + + +@pil_reference_wrapper +def reference_resized_crop_image_tensor(*args, **kwargs): + if not kwargs.pop("antialias", False) and kwargs.get("interpolation", F.InterpolationMode.BILINEAR) in { + F.InterpolationMode.BILINEAR, + F.InterpolationMode.BICUBIC, + }: + raise pytest.UsageError("Anti-aliasing is always active in PIL") + return F._resized_crop_image_pil(*args, **kwargs) + + +def 
reference_inputs_resized_crop_image_tensor(): + for image_loader, interpolation, params in itertools.product( + make_image_loaders_for_interpolation(), + [ + F.InterpolationMode.NEAREST, + F.InterpolationMode.NEAREST_EXACT, + F.InterpolationMode.BILINEAR, + F.InterpolationMode.BICUBIC, + ], + _RESIZED_CROP_PARAMS, + ): + yield ArgsKwargs( + image_loader, + interpolation=interpolation, + antialias=interpolation + in { + F.InterpolationMode.BILINEAR, + F.InterpolationMode.BICUBIC, + }, + **params, + ) + + +def sample_inputs_resized_crop_bounding_boxes(): + for bounding_boxes_loader in make_bounding_box_loaders(): + yield ArgsKwargs(bounding_boxes_loader, format=bounding_boxes_loader.format, **_RESIZED_CROP_PARAMS[0]) + + +def sample_inputs_resized_crop_mask(): + for mask_loader in make_mask_loaders(): + yield ArgsKwargs(mask_loader, **_RESIZED_CROP_PARAMS[0]) + + +def sample_inputs_resized_crop_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, **_RESIZED_CROP_PARAMS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.resized_crop_image, + sample_inputs_fn=sample_inputs_resized_crop_image_tensor, + reference_fn=reference_resized_crop_image_tensor, + reference_inputs_fn=reference_inputs_resized_crop_image_tensor, + float32_vs_uint8=True, + closeness_kwargs={ + **cuda_vs_cpu_pixel_difference(), + **pil_reference_pixel_difference(3, mae=True), + **float32_vs_uint8_pixel_difference(3, mae=True), + }, + ), + KernelInfo( + F.resized_crop_bounding_boxes, + sample_inputs_fn=sample_inputs_resized_crop_bounding_boxes, + ), + KernelInfo( + F.resized_crop_mask, + sample_inputs_fn=sample_inputs_resized_crop_mask, + ), + KernelInfo( + F.resized_crop_video, + sample_inputs_fn=sample_inputs_resized_crop_video, + closeness_kwargs=cuda_vs_cpu_pixel_difference(), + ), + ] +) + +_PAD_PARAMS = combinations_grid( + padding=[[1], [1, 1], [1, 1, 2, 2]], + padding_mode=["constant", "symmetric", "edge", "reflect"], +) + + +def sample_inputs_pad_image_tensor(): + make_pad_image_loaders = functools.partial( + make_image_loaders, sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], dtypes=[torch.float32] + ) + + for image_loader, padding in itertools.product( + make_pad_image_loaders(), + [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]], + ): + yield ArgsKwargs(image_loader, padding=padding) + + for image_loader in make_pad_image_loaders(): + for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): + yield ArgsKwargs(image_loader, padding=[1], fill=fill) + + for image_loader, padding_mode in itertools.product( + # We branch for non-constant padding and integer inputs + make_pad_image_loaders(dtypes=[torch.uint8]), + ["constant", "symmetric", "edge", "reflect"], + ): + yield ArgsKwargs(image_loader, padding=[1], padding_mode=padding_mode) + + # `torch.nn.functional.pad` does not support symmetric padding, and thus we have a custom implementation. Besides + # negative padding, this is already handled by the inputs above. 
+ for image_loader in make_pad_image_loaders(): + yield ArgsKwargs(image_loader, padding=[-1], padding_mode="symmetric") + + +def reference_inputs_pad_image_tensor(): + for image_loader, params in itertools.product( + make_image_loaders(extra_dims=[()], dtypes=[torch.uint8]), _PAD_PARAMS + ): + for fill in get_fills( + num_channels=image_loader.num_channels, + dtype=image_loader.dtype, + ): + # FIXME: PIL kernel doesn't support sequences of length 1 if the number of channels is larger. Shouldn't it? + if isinstance(fill, (list, tuple)): + continue + + yield ArgsKwargs(image_loader, fill=fill, **params) + + +def sample_inputs_pad_bounding_boxes(): + for bounding_boxes_loader, padding in itertools.product( + make_bounding_box_loaders(), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] + ): + yield ArgsKwargs( + bounding_boxes_loader, + format=bounding_boxes_loader.format, + canvas_size=bounding_boxes_loader.canvas_size, + padding=padding, + padding_mode="constant", + ) + + +def sample_inputs_pad_mask(): + for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_categories=[10], num_objects=[5]): + yield ArgsKwargs(mask_loader, padding=[1]) + + +def reference_inputs_pad_mask(): + for mask_loader, fill, params in itertools.product( + make_mask_loaders(num_objects=[1], extra_dims=[()]), [None, 127], _PAD_PARAMS + ): + yield ArgsKwargs(mask_loader, fill=fill, **params) + + +def sample_inputs_pad_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, padding=[1]) + + +def reference_pad_bounding_boxes(bounding_boxes, *, format, canvas_size, padding, padding_mode): + + left, right, top, bottom = _parse_pad_padding(padding) + + affine_matrix = np.array( + [ + [1, 0, left], + [0, 1, top], + ], + dtype="float64" if bounding_boxes.dtype == torch.float64 else "float32", + ) + + height = canvas_size[0] + top + bottom + width = canvas_size[1] + left + right + + expected_bboxes = reference_affine_bounding_boxes_helper( + bounding_boxes, format=format, canvas_size=(height, width), affine_matrix=affine_matrix + ) + return expected_bboxes, (height, width) + + +def reference_inputs_pad_bounding_boxes(): + for bounding_boxes_loader, padding in itertools.product( + make_bounding_box_loaders(extra_dims=((), (4,))), [1, (1,), (1, 2), (1, 2, 3, 4), [1], [1, 2], [1, 2, 3, 4]] + ): + yield ArgsKwargs( + bounding_boxes_loader, + format=bounding_boxes_loader.format, + canvas_size=bounding_boxes_loader.canvas_size, + padding=padding, + padding_mode="constant", + ) + + +def pad_xfail_jit_fill_condition(args_kwargs): + fill = args_kwargs.kwargs.get("fill") + if not isinstance(fill, (list, tuple)): + return False + elif isinstance(fill, tuple): + return True + else: # isinstance(fill, list): + return all(isinstance(f, int) for f in fill) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.pad_image, + sample_inputs_fn=sample_inputs_pad_image_tensor, + reference_fn=pil_reference_wrapper(F._pad_image_pil), + reference_inputs_fn=reference_inputs_pad_image_tensor, + float32_vs_uint8=float32_vs_uint8_fill_adapter, + closeness_kwargs=float32_vs_uint8_pixel_difference(), + test_marks=[ + xfail_jit_python_scalar_arg("padding"), + xfail_jit( + "F.pad only supports vector fills for list of floats", condition=pad_xfail_jit_fill_condition + ), + ], + ), + KernelInfo( + F.pad_bounding_boxes, + sample_inputs_fn=sample_inputs_pad_bounding_boxes, + reference_fn=reference_pad_bounding_boxes, + 
reference_inputs_fn=reference_inputs_pad_bounding_boxes, + test_marks=[ + xfail_jit_python_scalar_arg("padding"), + ], + ), + KernelInfo( + F.pad_mask, + sample_inputs_fn=sample_inputs_pad_mask, + reference_fn=pil_reference_wrapper(F._pad_image_pil), + reference_inputs_fn=reference_inputs_pad_mask, + float32_vs_uint8=float32_vs_uint8_fill_adapter, + ), + KernelInfo( + F.pad_video, + sample_inputs_fn=sample_inputs_pad_video, + ), + ] +) + +_PERSPECTIVE_COEFFS = [ + [1.2405, 0.1772, -6.9113, 0.0463, 1.251, -5.235, 0.00013, 0.0018], + [0.7366, -0.11724, 1.45775, -0.15012, 0.73406, 2.6019, -0.0072, -0.0063], +] +_STARTPOINTS = [[0, 1], [2, 3], [4, 5], [6, 7]] +_ENDPOINTS = [[9, 8], [7, 6], [5, 4], [3, 2]] + + +def sample_inputs_perspective_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): + for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): + yield ArgsKwargs( + image_loader, startpoints=None, endpoints=None, fill=fill, coefficients=_PERSPECTIVE_COEFFS[0] + ) + + yield ArgsKwargs(make_image_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS) + + +def reference_inputs_perspective_image_tensor(): + for image_loader, coefficients, interpolation in itertools.product( + make_image_loaders_for_interpolation(), + _PERSPECTIVE_COEFFS, + [ + F.InterpolationMode.NEAREST, + F.InterpolationMode.BILINEAR, + ], + ): + for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): + # FIXME: PIL kernel doesn't support sequences of length 1 if the number of channels is larger. Shouldn't it? + if isinstance(fill, (list, tuple)): + continue + + yield ArgsKwargs( + image_loader, + startpoints=None, + endpoints=None, + interpolation=interpolation, + fill=fill, + coefficients=coefficients, + ) + + +def sample_inputs_perspective_bounding_boxes(): + for bounding_boxes_loader in make_bounding_box_loaders(): + yield ArgsKwargs( + bounding_boxes_loader, + format=bounding_boxes_loader.format, + canvas_size=bounding_boxes_loader.canvas_size, + startpoints=None, + endpoints=None, + coefficients=_PERSPECTIVE_COEFFS[0], + ) + + format = tv_tensors.BoundingBoxFormat.XYXY + loader = make_bounding_box_loader(format=format) + yield ArgsKwargs( + loader, format=format, canvas_size=loader.canvas_size, startpoints=_STARTPOINTS, endpoints=_ENDPOINTS + ) + + +def sample_inputs_perspective_mask(): + for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): + yield ArgsKwargs(mask_loader, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0]) + + yield ArgsKwargs(make_detection_mask_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS) + + +def reference_inputs_perspective_mask(): + for mask_loader, perspective_coeffs in itertools.product( + make_mask_loaders(extra_dims=[()], num_objects=[1]), _PERSPECTIVE_COEFFS + ): + yield ArgsKwargs(mask_loader, startpoints=None, endpoints=None, coefficients=perspective_coeffs) + + +def sample_inputs_perspective_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, startpoints=None, endpoints=None, coefficients=_PERSPECTIVE_COEFFS[0]) + + yield ArgsKwargs(make_video_loader(), startpoints=_STARTPOINTS, endpoints=_ENDPOINTS) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.perspective_image, + sample_inputs_fn=sample_inputs_perspective_image_tensor, + reference_fn=pil_reference_wrapper(F._perspective_image_pil), + 
reference_inputs_fn=reference_inputs_perspective_image_tensor, + float32_vs_uint8=float32_vs_uint8_fill_adapter, + closeness_kwargs={ + **pil_reference_pixel_difference(2, mae=True), + **cuda_vs_cpu_pixel_difference(), + **float32_vs_uint8_pixel_difference(), + **scripted_vs_eager_float64_tolerances("cpu", atol=1e-5, rtol=1e-5), + **scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5), + }, + test_marks=[xfail_jit_python_scalar_arg("fill")], + ), + KernelInfo( + F.perspective_bounding_boxes, + sample_inputs_fn=sample_inputs_perspective_bounding_boxes, + closeness_kwargs={ + **scripted_vs_eager_float64_tolerances("cpu", atol=1e-6, rtol=1e-6), + **scripted_vs_eager_float64_tolerances("cuda", atol=1e-6, rtol=1e-6), + }, + ), + KernelInfo( + F.perspective_mask, + sample_inputs_fn=sample_inputs_perspective_mask, + reference_fn=pil_reference_wrapper(F._perspective_image_pil), + reference_inputs_fn=reference_inputs_perspective_mask, + float32_vs_uint8=True, + closeness_kwargs={ + (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): dict(atol=10, rtol=0), + }, + ), + KernelInfo( + F.perspective_video, + sample_inputs_fn=sample_inputs_perspective_video, + closeness_kwargs={ + **cuda_vs_cpu_pixel_difference(), + **scripted_vs_eager_float64_tolerances("cpu", atol=1e-5, rtol=1e-5), + **scripted_vs_eager_float64_tolerances("cuda", atol=1e-5, rtol=1e-5), + }, + ), + ] +) + + +def _get_elastic_displacement(canvas_size): + return torch.rand(1, *canvas_size, 2) + + +def sample_inputs_elastic_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): + displacement = _get_elastic_displacement(image_loader.canvas_size) + for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): + yield ArgsKwargs(image_loader, displacement=displacement, fill=fill) + + +def reference_inputs_elastic_image_tensor(): + for image_loader, interpolation in itertools.product( + make_image_loaders_for_interpolation(), + [ + F.InterpolationMode.NEAREST, + F.InterpolationMode.BILINEAR, + F.InterpolationMode.BICUBIC, + ], + ): + displacement = _get_elastic_displacement(image_loader.canvas_size) + for fill in get_fills(num_channels=image_loader.num_channels, dtype=image_loader.dtype): + yield ArgsKwargs(image_loader, interpolation=interpolation, displacement=displacement, fill=fill) + + +def sample_inputs_elastic_bounding_boxes(): + for bounding_boxes_loader in make_bounding_box_loaders(): + displacement = _get_elastic_displacement(bounding_boxes_loader.canvas_size) + yield ArgsKwargs( + bounding_boxes_loader, + format=bounding_boxes_loader.format, + canvas_size=bounding_boxes_loader.canvas_size, + displacement=displacement, + ) + + +def sample_inputs_elastic_mask(): + for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE]): + displacement = _get_elastic_displacement(mask_loader.shape[-2:]) + yield ArgsKwargs(mask_loader, displacement=displacement) + + +def sample_inputs_elastic_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + displacement = _get_elastic_displacement(video_loader.shape[-2:]) + yield ArgsKwargs(video_loader, displacement=displacement) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.elastic_image, + sample_inputs_fn=sample_inputs_elastic_image_tensor, + reference_inputs_fn=reference_inputs_elastic_image_tensor, + float32_vs_uint8=float32_vs_uint8_fill_adapter, + closeness_kwargs={ + **float32_vs_uint8_pixel_difference(6, mae=True), + 
**cuda_vs_cpu_pixel_difference(), + }, + test_marks=[xfail_jit_python_scalar_arg("fill")], + ), + KernelInfo( + F.elastic_bounding_boxes, + sample_inputs_fn=sample_inputs_elastic_bounding_boxes, + ), + KernelInfo( + F.elastic_mask, + sample_inputs_fn=sample_inputs_elastic_mask, + ), + KernelInfo( + F.elastic_video, + sample_inputs_fn=sample_inputs_elastic_video, + closeness_kwargs=cuda_vs_cpu_pixel_difference(), + ), + ] +) + + +_CENTER_CROP_SPATIAL_SIZES = [(16, 16), (7, 33), (31, 9)] +_CENTER_CROP_OUTPUT_SIZES = [[4, 3], [42, 70], [4], 3, (5, 2), (6,)] + + +def sample_inputs_center_crop_image_tensor(): + for image_loader, output_size in itertools.product( + make_image_loaders(sizes=[(16, 17)], color_spaces=["RGB"], dtypes=[torch.float32]), + [ + # valid `output_size` types for which cropping is applied to both dimensions + *[5, (4,), (2, 3), [6], [3, 2]], + # `output_size`'s for which at least one dimension needs to be padded + *[[4, 18], [17, 5], [17, 18]], + ], + ): + yield ArgsKwargs(image_loader, output_size=output_size) + + +def reference_inputs_center_crop_image_tensor(): + for image_loader, output_size in itertools.product( + make_image_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()], dtypes=[torch.uint8]), + _CENTER_CROP_OUTPUT_SIZES, + ): + yield ArgsKwargs(image_loader, output_size=output_size) + + +def sample_inputs_center_crop_bounding_boxes(): + for bounding_boxes_loader, output_size in itertools.product(make_bounding_box_loaders(), _CENTER_CROP_OUTPUT_SIZES): + yield ArgsKwargs( + bounding_boxes_loader, + format=bounding_boxes_loader.format, + canvas_size=bounding_boxes_loader.canvas_size, + output_size=output_size, + ) + + +def sample_inputs_center_crop_mask(): + for mask_loader in make_mask_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_categories=[10], num_objects=[5]): + height, width = mask_loader.shape[-2:] + yield ArgsKwargs(mask_loader, output_size=(height // 2, width // 2)) + + +def reference_inputs_center_crop_mask(): + for mask_loader, output_size in itertools.product( + make_mask_loaders(sizes=_CENTER_CROP_SPATIAL_SIZES, extra_dims=[()], num_objects=[1]), _CENTER_CROP_OUTPUT_SIZES + ): + yield ArgsKwargs(mask_loader, output_size=output_size) + + +def sample_inputs_center_crop_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + height, width = video_loader.shape[-2:] + yield ArgsKwargs(video_loader, output_size=(height // 2, width // 2)) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.center_crop_image, + sample_inputs_fn=sample_inputs_center_crop_image_tensor, + reference_fn=pil_reference_wrapper(F._center_crop_image_pil), + reference_inputs_fn=reference_inputs_center_crop_image_tensor, + float32_vs_uint8=True, + test_marks=[ + xfail_jit_python_scalar_arg("output_size"), + ], + ), + KernelInfo( + F.center_crop_bounding_boxes, + sample_inputs_fn=sample_inputs_center_crop_bounding_boxes, + test_marks=[ + xfail_jit_python_scalar_arg("output_size"), + ], + ), + KernelInfo( + F.center_crop_mask, + sample_inputs_fn=sample_inputs_center_crop_mask, + reference_fn=pil_reference_wrapper(F._center_crop_image_pil), + reference_inputs_fn=reference_inputs_center_crop_mask, + float32_vs_uint8=True, + test_marks=[ + xfail_jit_python_scalar_arg("output_size"), + ], + ), + KernelInfo( + F.center_crop_video, + sample_inputs_fn=sample_inputs_center_crop_video, + ), + ] +) + + +def sample_inputs_gaussian_blur_image_tensor(): + make_gaussian_blur_image_loaders = functools.partial(make_image_loaders, sizes=[(7, 
33)], color_spaces=["RGB"]) + + for image_loader, kernel_size in itertools.product(make_gaussian_blur_image_loaders(), [5, (3, 3), [3, 3]]): + yield ArgsKwargs(image_loader, kernel_size=kernel_size) + + for image_loader, sigma in itertools.product( + make_gaussian_blur_image_loaders(), [None, (3.0, 3.0), [2.0, 2.0], 4.0, [1.5], (3.14,)] + ): + yield ArgsKwargs(image_loader, kernel_size=5, sigma=sigma) + + +def sample_inputs_gaussian_blur_video(): + for video_loader in make_video_loaders(sizes=[(7, 33)], num_frames=[5]): + yield ArgsKwargs(video_loader, kernel_size=[3, 3]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.gaussian_blur_image, + sample_inputs_fn=sample_inputs_gaussian_blur_image_tensor, + closeness_kwargs=cuda_vs_cpu_pixel_difference(), + test_marks=[ + xfail_jit_python_scalar_arg("kernel_size"), + xfail_jit_python_scalar_arg("sigma"), + ], + ), + KernelInfo( + F.gaussian_blur_video, + sample_inputs_fn=sample_inputs_gaussian_blur_video, + closeness_kwargs=cuda_vs_cpu_pixel_difference(), + ), + ] +) + + +def sample_inputs_equalize_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader) + + +def reference_inputs_equalize_image_tensor(): + # We are not using `make_image_loaders` here since that uniformly samples the values over the whole value range. + # Since the whole point of this kernel is to transform an arbitrary distribution of values into a uniform one, + # the information gain is low if we already provide something really close to the expected value. + def make_uniform_band_image(shape, dtype, device, *, low_factor, high_factor, memory_format): + if dtype.is_floating_point: + low = low_factor + high = high_factor + else: + max_value = torch.iinfo(dtype).max + low = int(low_factor * max_value) + high = int(high_factor * max_value) + return torch.testing.make_tensor(shape, dtype=dtype, device=device, low=low, high=high).to( + memory_format=memory_format, copy=True + ) + + def make_beta_distributed_image(shape, dtype, device, *, alpha, beta, memory_format): + image = torch.distributions.Beta(alpha, beta).sample(shape) + if not dtype.is_floating_point: + image.mul_(torch.iinfo(dtype).max).round_() + return image.to(dtype=dtype, device=device, memory_format=memory_format, copy=True) + + canvas_size = (256, 256) + for dtype, color_space, fn in itertools.product( + [torch.uint8], + ["GRAY", "RGB"], + [ + lambda shape, dtype, device, memory_format: torch.zeros(shape, dtype=dtype, device=device).to( + memory_format=memory_format, copy=True + ), + lambda shape, dtype, device, memory_format: torch.full( + shape, 1.0 if dtype.is_floating_point else torch.iinfo(dtype).max, dtype=dtype, device=device + ).to(memory_format=memory_format, copy=True), + *[ + functools.partial(make_uniform_band_image, low_factor=low_factor, high_factor=high_factor) + for low_factor, high_factor in [ + (0.0, 0.25), + (0.25, 0.75), + (0.75, 1.0), + ] + ], + *[ + functools.partial(make_beta_distributed_image, alpha=alpha, beta=beta) + for alpha, beta in [ + (0.5, 0.5), + (2, 2), + (2, 5), + (5, 2), + ] + ], + ], + ): + image_loader = ImageLoader(fn, shape=(get_num_channels(color_space), *canvas_size), dtype=dtype) + yield ArgsKwargs(image_loader) + + +def sample_inputs_equalize_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.equalize_image, + 
kernel_name="equalize_image_tensor", + sample_inputs_fn=sample_inputs_equalize_image_tensor, + reference_fn=pil_reference_wrapper(F._equalize_image_pil), + float32_vs_uint8=True, + reference_inputs_fn=reference_inputs_equalize_image_tensor, + ), + KernelInfo( + F.equalize_video, + sample_inputs_fn=sample_inputs_equalize_video, + ), + ] +) + + +def sample_inputs_invert_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader) + + +def reference_inputs_invert_image_tensor(): + for image_loader in make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]): + yield ArgsKwargs(image_loader) + + +def sample_inputs_invert_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.invert_image, + kernel_name="invert_image_tensor", + sample_inputs_fn=sample_inputs_invert_image_tensor, + reference_fn=pil_reference_wrapper(F._invert_image_pil), + reference_inputs_fn=reference_inputs_invert_image_tensor, + float32_vs_uint8=True, + ), + KernelInfo( + F.invert_video, + sample_inputs_fn=sample_inputs_invert_video, + ), + ] +) + + +_POSTERIZE_BITS = [1, 4, 8] + + +def sample_inputs_posterize_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader, bits=_POSTERIZE_BITS[0]) + + +def reference_inputs_posterize_image_tensor(): + for image_loader, bits in itertools.product( + make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), + _POSTERIZE_BITS, + ): + yield ArgsKwargs(image_loader, bits=bits) + + +def sample_inputs_posterize_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, bits=_POSTERIZE_BITS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.posterize_image, + kernel_name="posterize_image_tensor", + sample_inputs_fn=sample_inputs_posterize_image_tensor, + reference_fn=pil_reference_wrapper(F._posterize_image_pil), + reference_inputs_fn=reference_inputs_posterize_image_tensor, + float32_vs_uint8=True, + closeness_kwargs=float32_vs_uint8_pixel_difference(), + ), + KernelInfo( + F.posterize_video, + sample_inputs_fn=sample_inputs_posterize_video, + ), + ] +) + + +def _get_solarize_thresholds(dtype): + for factor in [0.1, 0.5]: + max_value = get_max_value(dtype) + yield (float if dtype.is_floating_point else int)(max_value * factor) + + +def sample_inputs_solarize_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader, threshold=next(_get_solarize_thresholds(image_loader.dtype))) + + +def reference_inputs_solarize_image_tensor(): + for image_loader in make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]): + for threshold in _get_solarize_thresholds(image_loader.dtype): + yield ArgsKwargs(image_loader, threshold=threshold) + + +def uint8_to_float32_threshold_adapter(other_args, kwargs): + return other_args, dict(threshold=kwargs["threshold"] / 255) + + +def sample_inputs_solarize_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, threshold=next(_get_solarize_thresholds(video_loader.dtype))) + + 
+KERNEL_INFOS.extend( + [ + KernelInfo( + F.solarize_image, + kernel_name="solarize_image_tensor", + sample_inputs_fn=sample_inputs_solarize_image_tensor, + reference_fn=pil_reference_wrapper(F._solarize_image_pil), + reference_inputs_fn=reference_inputs_solarize_image_tensor, + float32_vs_uint8=uint8_to_float32_threshold_adapter, + closeness_kwargs=float32_vs_uint8_pixel_difference(), + ), + KernelInfo( + F.solarize_video, + sample_inputs_fn=sample_inputs_solarize_video, + ), + ] +) + + +def sample_inputs_autocontrast_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader) + + +def reference_inputs_autocontrast_image_tensor(): + for image_loader in make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]): + yield ArgsKwargs(image_loader) + + +def sample_inputs_autocontrast_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.autocontrast_image, + kernel_name="autocontrast_image_tensor", + sample_inputs_fn=sample_inputs_autocontrast_image_tensor, + reference_fn=pil_reference_wrapper(F._autocontrast_image_pil), + reference_inputs_fn=reference_inputs_autocontrast_image_tensor, + float32_vs_uint8=True, + closeness_kwargs={ + **pil_reference_pixel_difference(), + **float32_vs_uint8_pixel_difference(), + }, + ), + KernelInfo( + F.autocontrast_video, + sample_inputs_fn=sample_inputs_autocontrast_video, + ), + ] +) + +_ADJUST_SHARPNESS_FACTORS = [0.1, 0.5] + + +def sample_inputs_adjust_sharpness_image_tensor(): + for image_loader in make_image_loaders( + sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE, (2, 2)], + color_spaces=("GRAY", "RGB"), + ): + yield ArgsKwargs(image_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) + + +def reference_inputs_adjust_sharpness_image_tensor(): + for image_loader, sharpness_factor in itertools.product( + make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), + _ADJUST_SHARPNESS_FACTORS, + ): + yield ArgsKwargs(image_loader, sharpness_factor=sharpness_factor) + + +def sample_inputs_adjust_sharpness_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, sharpness_factor=_ADJUST_SHARPNESS_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_sharpness_image, + kernel_name="adjust_sharpness_image_tensor", + sample_inputs_fn=sample_inputs_adjust_sharpness_image_tensor, + reference_fn=pil_reference_wrapper(F._adjust_sharpness_image_pil), + reference_inputs_fn=reference_inputs_adjust_sharpness_image_tensor, + float32_vs_uint8=True, + closeness_kwargs=float32_vs_uint8_pixel_difference(2), + ), + KernelInfo( + F.adjust_sharpness_video, + sample_inputs_fn=sample_inputs_adjust_sharpness_video, + ), + ] +) + + +_ADJUST_CONTRAST_FACTORS = [0.1, 0.5] + + +def sample_inputs_adjust_contrast_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0]) + + +def reference_inputs_adjust_contrast_image_tensor(): + for image_loader, contrast_factor in itertools.product( + make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), + _ADJUST_CONTRAST_FACTORS, + ): + yield ArgsKwargs(image_loader, contrast_factor=contrast_factor) 
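A short sketch of what the tolerance helpers used throughout this file produce (the helper name `_illustrate_pixel_difference_tolerances` is hypothetical and not part of this patch): `pixel_difference_closeness_kwargs` expresses an "off by N uint8 steps" budget and rescales it to the target dtype's value range, and helpers such as `cuda_vs_cpu_pixel_difference` key that budget by (test id, dtype, device) so the `closeness_kwargs` of a `KernelInfo`, like the adjust_contrast entries that follow, can simply be merged with `**`.

def _illustrate_pixel_difference_tolerances():
    # Illustration only -- shows the dictionaries produced by the tolerance helpers near the top of this file.
    # A budget of 2 uint8 steps stays 2.0 for uint8 images ...
    uint8_kwargs = pixel_difference_closeness_kwargs(2)
    assert uint8_kwargs["rtol"] == 0 and uint8_kwargs["mae"] is False
    assert abs(uint8_kwargs["atol"] - 2.0) < 1e-12
    # ... and becomes 2 / 255 for float images in the [0, 1] range.
    float_kwargs = pixel_difference_closeness_kwargs(2, dtype=torch.float32)
    assert abs(float_kwargs["atol"] - 2 / 255) < 1e-12
    # The per-test helpers key the tolerance by (test id, dtype, device), e.g. for the CUDA-vs-CPU comparison:
    assert set(cuda_vs_cpu_pixel_difference()) == {
        (("TestKernels", "test_cuda_vs_cpu"), torch.uint8, "cuda"),
        (("TestKernels", "test_cuda_vs_cpu"), torch.float32, "cuda"),
    }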
+ + +def sample_inputs_adjust_contrast_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, contrast_factor=_ADJUST_CONTRAST_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_contrast_image, + kernel_name="adjust_contrast_image_tensor", + sample_inputs_fn=sample_inputs_adjust_contrast_image_tensor, + reference_fn=pil_reference_wrapper(F._adjust_contrast_image_pil), + reference_inputs_fn=reference_inputs_adjust_contrast_image_tensor, + float32_vs_uint8=True, + closeness_kwargs={ + **pil_reference_pixel_difference(), + **float32_vs_uint8_pixel_difference(2), + **cuda_vs_cpu_pixel_difference(), + (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): pixel_difference_closeness_kwargs(1), + }, + ), + KernelInfo( + F.adjust_contrast_video, + sample_inputs_fn=sample_inputs_adjust_contrast_video, + closeness_kwargs={ + **cuda_vs_cpu_pixel_difference(), + (("TestKernels", "test_against_reference"), torch.uint8, "cpu"): pixel_difference_closeness_kwargs(1), + }, + ), + ] +) + +_ADJUST_GAMMA_GAMMAS_GAINS = [ + (0.5, 2.0), + (0.0, 1.0), +] + + +def sample_inputs_adjust_gamma_image_tensor(): + gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0] + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader, gamma=gamma, gain=gain) + + +def reference_inputs_adjust_gamma_image_tensor(): + for image_loader, (gamma, gain) in itertools.product( + make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), + _ADJUST_GAMMA_GAMMAS_GAINS, + ): + yield ArgsKwargs(image_loader, gamma=gamma, gain=gain) + + +def sample_inputs_adjust_gamma_video(): + gamma, gain = _ADJUST_GAMMA_GAMMAS_GAINS[0] + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, gamma=gamma, gain=gain) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_gamma_image, + kernel_name="adjust_gamma_image_tensor", + sample_inputs_fn=sample_inputs_adjust_gamma_image_tensor, + reference_fn=pil_reference_wrapper(F._adjust_gamma_image_pil), + reference_inputs_fn=reference_inputs_adjust_gamma_image_tensor, + float32_vs_uint8=True, + closeness_kwargs={ + **pil_reference_pixel_difference(), + **float32_vs_uint8_pixel_difference(), + }, + ), + KernelInfo( + F.adjust_gamma_video, + sample_inputs_fn=sample_inputs_adjust_gamma_video, + ), + ] +) + + +_ADJUST_HUE_FACTORS = [-0.1, 0.5] + + +def sample_inputs_adjust_hue_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader, hue_factor=_ADJUST_HUE_FACTORS[0]) + + +def reference_inputs_adjust_hue_image_tensor(): + for image_loader, hue_factor in itertools.product( + make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), + _ADJUST_HUE_FACTORS, + ): + yield ArgsKwargs(image_loader, hue_factor=hue_factor) + + +def sample_inputs_adjust_hue_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, hue_factor=_ADJUST_HUE_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_hue_image, + kernel_name="adjust_hue_image_tensor", + sample_inputs_fn=sample_inputs_adjust_hue_image_tensor, + reference_fn=pil_reference_wrapper(F._adjust_hue_image_pil), + reference_inputs_fn=reference_inputs_adjust_hue_image_tensor, + 
float32_vs_uint8=True, + closeness_kwargs={ + **pil_reference_pixel_difference(2, mae=True), + **float32_vs_uint8_pixel_difference(), + }, + ), + KernelInfo( + F.adjust_hue_video, + sample_inputs_fn=sample_inputs_adjust_hue_video, + ), + ] +) + +_ADJUST_SATURATION_FACTORS = [0.1, 0.5] + + +def sample_inputs_adjust_saturation_image_tensor(): + for image_loader in make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=("GRAY", "RGB")): + yield ArgsKwargs(image_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0]) + + +def reference_inputs_adjust_saturation_image_tensor(): + for image_loader, saturation_factor in itertools.product( + make_image_loaders(color_spaces=("GRAY", "RGB"), extra_dims=[()], dtypes=[torch.uint8]), + _ADJUST_SATURATION_FACTORS, + ): + yield ArgsKwargs(image_loader, saturation_factor=saturation_factor) + + +def sample_inputs_adjust_saturation_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[3]): + yield ArgsKwargs(video_loader, saturation_factor=_ADJUST_SATURATION_FACTORS[0]) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.adjust_saturation_image, + kernel_name="adjust_saturation_image_tensor", + sample_inputs_fn=sample_inputs_adjust_saturation_image_tensor, + reference_fn=pil_reference_wrapper(F._adjust_saturation_image_pil), + reference_inputs_fn=reference_inputs_adjust_saturation_image_tensor, + float32_vs_uint8=True, + closeness_kwargs={ + **pil_reference_pixel_difference(), + **float32_vs_uint8_pixel_difference(2), + **cuda_vs_cpu_pixel_difference(), + }, + ), + KernelInfo( + F.adjust_saturation_video, + sample_inputs_fn=sample_inputs_adjust_saturation_video, + closeness_kwargs=cuda_vs_cpu_pixel_difference(), + ), + ] +) + + +def sample_inputs_clamp_bounding_boxes(): + for bounding_boxes_loader in make_bounding_box_loaders(): + yield ArgsKwargs( + bounding_boxes_loader, + format=bounding_boxes_loader.format, + canvas_size=bounding_boxes_loader.canvas_size, + ) + + +KERNEL_INFOS.append( + KernelInfo( + F.clamp_bounding_boxes, + sample_inputs_fn=sample_inputs_clamp_bounding_boxes, + logs_usage=True, + ) +) + +_FIVE_TEN_CROP_SIZES = [7, (6,), [5], (6, 5), [7, 6]] + + +def _get_five_ten_crop_canvas_size(size): + if isinstance(size, int): + crop_height = crop_width = size + elif len(size) == 1: + crop_height = crop_width = size[0] + else: + crop_height, crop_width = size + return 2 * crop_height, 2 * crop_width + + +def sample_inputs_five_crop_image_tensor(): + for size in _FIVE_TEN_CROP_SIZES: + for image_loader in make_image_loaders( + sizes=[_get_five_ten_crop_canvas_size(size)], + color_spaces=["RGB"], + dtypes=[torch.float32], + ): + yield ArgsKwargs(image_loader, size=size) + + +def reference_inputs_five_crop_image_tensor(): + for size in _FIVE_TEN_CROP_SIZES: + for image_loader in make_image_loaders( + sizes=[_get_five_ten_crop_canvas_size(size)], extra_dims=[()], dtypes=[torch.uint8] + ): + yield ArgsKwargs(image_loader, size=size) + + +def sample_inputs_five_crop_video(): + size = _FIVE_TEN_CROP_SIZES[0] + for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_canvas_size(size)]): + yield ArgsKwargs(video_loader, size=size) + + +def sample_inputs_ten_crop_image_tensor(): + for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): + for image_loader in make_image_loaders( + sizes=[_get_five_ten_crop_canvas_size(size)], + color_spaces=["RGB"], + dtypes=[torch.float32], + ): + yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) + + +def 
reference_inputs_ten_crop_image_tensor(): + for size, vertical_flip in itertools.product(_FIVE_TEN_CROP_SIZES, [False, True]): + for image_loader in make_image_loaders( + sizes=[_get_five_ten_crop_canvas_size(size)], extra_dims=[()], dtypes=[torch.uint8] + ): + yield ArgsKwargs(image_loader, size=size, vertical_flip=vertical_flip) + + +def sample_inputs_ten_crop_video(): + size = _FIVE_TEN_CROP_SIZES[0] + for video_loader in make_video_loaders(sizes=[_get_five_ten_crop_canvas_size(size)]): + yield ArgsKwargs(video_loader, size=size) + + +def multi_crop_pil_reference_wrapper(pil_kernel): + def wrapper(input_tensor, *other_args, **kwargs): + output = pil_reference_wrapper(pil_kernel)(input_tensor, *other_args, **kwargs) + return type(output)( + F.to_dtype_image(F.to_image(output_pil), dtype=input_tensor.dtype, scale=True) for output_pil in output + ) + + return wrapper + + +_common_five_ten_crop_marks = [ + xfail_jit_python_scalar_arg("size"), + mark_framework_limitation(("TestKernels", "test_batched_vs_single"), "Custom batching needed."), +] + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.five_crop_image, + sample_inputs_fn=sample_inputs_five_crop_image_tensor, + reference_fn=multi_crop_pil_reference_wrapper(F._five_crop_image_pil), + reference_inputs_fn=reference_inputs_five_crop_image_tensor, + test_marks=_common_five_ten_crop_marks, + ), + KernelInfo( + F.five_crop_video, + sample_inputs_fn=sample_inputs_five_crop_video, + test_marks=_common_five_ten_crop_marks, + ), + KernelInfo( + F.ten_crop_image, + sample_inputs_fn=sample_inputs_ten_crop_image_tensor, + reference_fn=multi_crop_pil_reference_wrapper(F._ten_crop_image_pil), + reference_inputs_fn=reference_inputs_ten_crop_image_tensor, + test_marks=_common_five_ten_crop_marks, + ), + KernelInfo( + F.ten_crop_video, + sample_inputs_fn=sample_inputs_ten_crop_video, + test_marks=_common_five_ten_crop_marks, + ), + ] +) + +_NORMALIZE_MEANS_STDS = [ + ((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)), + ([0.0, 0.0, 0.0], [1.0, 1.0, 1.0]), + (0.5, 2.0), +] + + +def sample_inputs_normalize_image_tensor(): + for image_loader, (mean, std) in itertools.product( + make_image_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], dtypes=[torch.float32]), + _NORMALIZE_MEANS_STDS, + ): + yield ArgsKwargs(image_loader, mean=mean, std=std) + + +def reference_normalize_image_tensor(image, mean, std, inplace=False): + mean = torch.tensor(mean).view(-1, 1, 1) + std = torch.tensor(std).view(-1, 1, 1) + + sub = torch.Tensor.sub_ if inplace else torch.Tensor.sub + return sub(image, mean).div_(std) + + +def reference_inputs_normalize_image_tensor(): + yield ArgsKwargs( + make_image_loader(size=(32, 32), color_space="RGB", extra_dims=[1]), + mean=[0.5, 0.5, 0.5], + std=[1.0, 1.0, 1.0], + ) + + +def sample_inputs_normalize_video(): + mean, std = _NORMALIZE_MEANS_STDS[0] + for video_loader in make_video_loaders( + sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], num_frames=[3], dtypes=[torch.float32] + ): + yield ArgsKwargs(video_loader, mean=mean, std=std) + + +KERNEL_INFOS.extend( + [ + KernelInfo( + F.normalize_image, + kernel_name="normalize_image_tensor", + sample_inputs_fn=sample_inputs_normalize_image_tensor, + reference_fn=reference_normalize_image_tensor, + reference_inputs_fn=reference_inputs_normalize_image_tensor, + test_marks=[ + xfail_jit_python_scalar_arg("mean"), + xfail_jit_python_scalar_arg("std"), + ], + ), + KernelInfo( + F.normalize_video, + sample_inputs_fn=sample_inputs_normalize_video, + ), + ] +) + + +def 
sample_inputs_uniform_temporal_subsample_video(): + for video_loader in make_video_loaders(sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], num_frames=[4]): + yield ArgsKwargs(video_loader, num_samples=2) + + +def reference_uniform_temporal_subsample_video(x, num_samples): + # Copy-pasted from + # https://github.com/facebookresearch/pytorchvideo/blob/c8d23d8b7e597586a9e2d18f6ed31ad8aa379a7a/pytorchvideo/transforms/functional.py#L19 + t = x.shape[-4] + assert num_samples > 0 and t > 0 + # Sample by nearest neighbor interpolation if num_samples > t. + indices = torch.linspace(0, t - 1, num_samples) + indices = torch.clamp(indices, 0, t - 1).long() + return torch.index_select(x, -4, indices) + + +def reference_inputs_uniform_temporal_subsample_video(): + for video_loader in make_video_loaders( + sizes=[DEFAULT_PORTRAIT_SPATIAL_SIZE], color_spaces=["RGB"], num_frames=[10] + ): + for num_samples in range(1, video_loader.shape[-4] + 1): + yield ArgsKwargs(video_loader, num_samples) + + +KERNEL_INFOS.append( + KernelInfo( + F.uniform_temporal_subsample_video, + sample_inputs_fn=sample_inputs_uniform_temporal_subsample_video, + reference_fn=reference_uniform_temporal_subsample_video, + reference_inputs_fn=reference_inputs_uniform_temporal_subsample_video, + ) +) diff --git a/test/transforms_v2_legacy_utils.py b/test/transforms_v2_legacy_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9dead79342246c354211b59da5aa6233c9fd4191 --- /dev/null +++ b/test/transforms_v2_legacy_utils.py @@ -0,0 +1,633 @@ +""" +As the name implies, these are legacy utilities that will hopefully be removed soon. The future of +transforms v2 testing is in test/test_transforms_v2_refactored.py. All new tests should be +implemented there and must not use any of the utilities here. + +The following legacy modules depend on this module: + +- transforms_v2_kernel_infos.py +- transforms_v2_dispatcher_infos.py +- test_transforms_v2_functional.py +- test_transforms_v2_consistency.py +- test_transforms.py + +When all the logic is ported from the files above to test_transforms_v2_refactored.py, delete +all the legacy modules, including this one, and drop the _refactored prefix from the name. +""" + +import collections.abc +import dataclasses +import enum +import itertools +import pathlib +from collections import defaultdict +from typing import Callable, Sequence, Tuple, Union + +import PIL.Image +import pytest +import torch + +from torchvision import tv_tensors +from torchvision.transforms._functional_tensor import _max_value as get_max_value +from torchvision.transforms.v2.functional import to_dtype_image, to_image, to_pil_image + + +def combinations_grid(**kwargs): + """Creates a grid of input combinations. + + Each element in the returned sequence is a dictionary containing one possible combination as values.
+ + Example: + >>> combinations_grid(foo=("bar", "baz"), spam=("eggs", "ham")) + [ + {'foo': 'bar', 'spam': 'eggs'}, + {'foo': 'bar', 'spam': 'ham'}, + {'foo': 'baz', 'spam': 'eggs'}, + {'foo': 'baz', 'spam': 'ham'} + ] + """ + return [dict(zip(kwargs.keys(), values)) for values in itertools.product(*kwargs.values())] + + +DEFAULT_SIZE = (17, 11) + +NUM_CHANNELS_MAP = { + "GRAY": 1, + "GRAY_ALPHA": 2, + "RGB": 3, + "RGBA": 4, +} + + +def make_image( + size=DEFAULT_SIZE, + *, + color_space="RGB", + batch_dims=(), + dtype=None, + device="cpu", + memory_format=torch.contiguous_format, +): + num_channels = NUM_CHANNELS_MAP[color_space] + dtype = dtype or torch.uint8 + max_value = get_max_value(dtype) + data = torch.testing.make_tensor( + (*batch_dims, num_channels, *size), + low=0, + high=max_value, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + if color_space in {"GRAY_ALPHA", "RGBA"}: + data[..., -1, :, :] = max_value + + return tv_tensors.Image(data) + + +def make_image_tensor(*args, **kwargs): + return make_image(*args, **kwargs).as_subclass(torch.Tensor) + + +def make_image_pil(*args, **kwargs): + return to_pil_image(make_image(*args, **kwargs)) + + +def make_bounding_boxes( + canvas_size=DEFAULT_SIZE, + *, + format=tv_tensors.BoundingBoxFormat.XYXY, + batch_dims=(), + dtype=None, + device="cpu", +): + def sample_position(values, max_value): + # We cannot use torch.randint directly here, because it only allows integer scalars as values for low and high. + # However, if we have batch_dims, we need tensors as limits. + return torch.stack([torch.randint(max_value - v, ()) for v in values.flatten().tolist()]).reshape(values.shape) + + if isinstance(format, str): + format = tv_tensors.BoundingBoxFormat[format] + + dtype = dtype or torch.float32 + + if any(dim == 0 for dim in batch_dims): + return tv_tensors.BoundingBoxes( + torch.empty(*batch_dims, 4, dtype=dtype, device=device), format=format, canvas_size=canvas_size + ) + + h, w = [torch.randint(1, c, batch_dims) for c in canvas_size] + y = sample_position(h, canvas_size[0]) + x = sample_position(w, canvas_size[1]) + + if format is tv_tensors.BoundingBoxFormat.XYWH: + parts = (x, y, w, h) + elif format is tv_tensors.BoundingBoxFormat.XYXY: + x1, y1 = x, y + x2 = x1 + w + y2 = y1 + h + parts = (x1, y1, x2, y2) + elif format is tv_tensors.BoundingBoxFormat.CXCYWH: + cx = x + w / 2 + cy = y + h / 2 + parts = (cx, cy, w, h) + else: + raise ValueError(f"Format {format} is not supported") + + return tv_tensors.BoundingBoxes( + torch.stack(parts, dim=-1).to(dtype=dtype, device=device), format=format, canvas_size=canvas_size + ) + + +def make_detection_mask(size=DEFAULT_SIZE, *, num_objects=5, batch_dims=(), dtype=None, device="cpu"): + """Make a "detection" mask, i.e. (*, N, H, W), where each object is encoded as one of N boolean masks""" + return tv_tensors.Mask( + torch.testing.make_tensor( + (*batch_dims, num_objects, *size), + low=0, + high=2, + dtype=dtype or torch.bool, + device=device, + ) + ) + + +def make_segmentation_mask(size=DEFAULT_SIZE, *, num_categories=10, batch_dims=(), dtype=None, device="cpu"): + """Make a "segmentation" mask, i.e. 
(*, H, W), where the category is encoded as pixel value""" + return tv_tensors.Mask( + torch.testing.make_tensor( + (*batch_dims, *size), + low=0, + high=num_categories, + dtype=dtype or torch.uint8, + device=device, + ) + ) + + +def make_video(size=DEFAULT_SIZE, *, num_frames=3, batch_dims=(), **kwargs): + return tv_tensors.Video(make_image(size, batch_dims=(*batch_dims, num_frames), **kwargs)) + + +def make_video_tensor(*args, **kwargs): + return make_video(*args, **kwargs).as_subclass(torch.Tensor) + + +DEFAULT_SQUARE_SPATIAL_SIZE = 15 +DEFAULT_LANDSCAPE_SPATIAL_SIZE = (7, 33) +DEFAULT_PORTRAIT_SPATIAL_SIZE = (31, 9) +DEFAULT_SPATIAL_SIZES = ( + DEFAULT_LANDSCAPE_SPATIAL_SIZE, + DEFAULT_PORTRAIT_SPATIAL_SIZE, + DEFAULT_SQUARE_SPATIAL_SIZE, +) + + +def _parse_size(size, *, name="size"): + if size == "random": + raise ValueError("This should never happen") + elif isinstance(size, int) and size > 0: + return (size, size) + elif ( + isinstance(size, collections.abc.Sequence) + and len(size) == 2 + and all(isinstance(length, int) and length > 0 for length in size) + ): + return tuple(size) + else: + raise pytest.UsageError( + f"'{name}' can either be `'random'`, a positive integer, or a sequence of two positive integers," + f"but got {size} instead." + ) + + +def get_num_channels(color_space): + num_channels = NUM_CHANNELS_MAP.get(color_space) + if not num_channels: + raise pytest.UsageError(f"Can't determine the number of channels for color space {color_space}") + return num_channels + + +VALID_EXTRA_DIMS = ((), (4,), (2, 3)) +DEGENERATE_BATCH_DIMS = ((0,), (5, 0), (0, 5)) + +DEFAULT_EXTRA_DIMS = (*VALID_EXTRA_DIMS, *DEGENERATE_BATCH_DIMS) + + +def from_loader(loader_fn): + def wrapper(*args, **kwargs): + device = kwargs.pop("device", "cpu") + loader = loader_fn(*args, **kwargs) + return loader.load(device) + + return wrapper + + +def from_loaders(loaders_fn): + def wrapper(*args, **kwargs): + device = kwargs.pop("device", "cpu") + loaders = loaders_fn(*args, **kwargs) + for loader in loaders: + yield loader.load(device) + + return wrapper + + +@dataclasses.dataclass +class TensorLoader: + fn: Callable[[Sequence[int], torch.dtype, Union[str, torch.device]], torch.Tensor] + shape: Sequence[int] + dtype: torch.dtype + + def load(self, device): + return self.fn(self.shape, self.dtype, device) + + +@dataclasses.dataclass +class ImageLoader(TensorLoader): + spatial_size: Tuple[int, int] = dataclasses.field(init=False) + num_channels: int = dataclasses.field(init=False) + memory_format: torch.memory_format = torch.contiguous_format + canvas_size: Tuple[int, int] = dataclasses.field(init=False) + + def __post_init__(self): + self.spatial_size = self.canvas_size = self.shape[-2:] + self.num_channels = self.shape[-3] + + def load(self, device): + return self.fn(self.shape, self.dtype, device, memory_format=self.memory_format) + + +def make_image_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + *, + color_space="RGB", + extra_dims=(), + dtype=torch.float32, + constant_alpha=True, + memory_format=torch.contiguous_format, +): + if not constant_alpha: + raise ValueError("This should never happen") + size = _parse_size(size) + num_channels = get_num_channels(color_space) + + def fn(shape, dtype, device, memory_format): + *batch_dims, _, height, width = shape + return make_image( + (height, width), + color_space=color_space, + batch_dims=batch_dims, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + + return ImageLoader(fn, shape=(*extra_dims, num_channels, *size), dtype=dtype, 
memory_format=memory_format) + + +def make_image_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + color_spaces=( + "GRAY", + "GRAY_ALPHA", + "RGB", + "RGBA", + ), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.float32, torch.float64, torch.uint8), + constant_alpha=True, +): + for params in combinations_grid(size=sizes, color_space=color_spaces, extra_dims=extra_dims, dtype=dtypes): + yield make_image_loader(**params, constant_alpha=constant_alpha) + + +make_images = from_loaders(make_image_loaders) + + +def make_image_loader_for_interpolation( + size=(233, 147), *, color_space="RGB", dtype=torch.uint8, memory_format=torch.contiguous_format +): + size = _parse_size(size) + num_channels = get_num_channels(color_space) + + def fn(shape, dtype, device, memory_format): + height, width = shape[-2:] + + image_pil = ( + PIL.Image.open(pathlib.Path(__file__).parent / "assets" / "encode_jpeg" / "grace_hopper_517x606.jpg") + .resize((width, height)) + .convert( + { + "GRAY": "L", + "GRAY_ALPHA": "LA", + "RGB": "RGB", + "RGBA": "RGBA", + }[color_space] + ) + ) + + image_tensor = to_image(image_pil) + if memory_format == torch.contiguous_format: + image_tensor = image_tensor.to(device=device, memory_format=memory_format, copy=True) + else: + image_tensor = image_tensor.to(device=device) + image_tensor = to_dtype_image(image_tensor, dtype=dtype, scale=True) + + return tv_tensors.Image(image_tensor) + + return ImageLoader(fn, shape=(num_channels, *size), dtype=dtype, memory_format=memory_format) + + +def make_image_loaders_for_interpolation( + sizes=((233, 147),), + color_spaces=("RGB",), + dtypes=(torch.uint8,), + memory_formats=(torch.contiguous_format, torch.channels_last), +): + for params in combinations_grid(size=sizes, color_space=color_spaces, dtype=dtypes, memory_format=memory_formats): + yield make_image_loader_for_interpolation(**params) + + +@dataclasses.dataclass +class BoundingBoxesLoader(TensorLoader): + format: tv_tensors.BoundingBoxFormat + spatial_size: Tuple[int, int] + canvas_size: Tuple[int, int] = dataclasses.field(init=False) + + def __post_init__(self): + self.canvas_size = self.spatial_size + + +def make_bounding_box_loader(*, extra_dims=(), format, spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, dtype=torch.float32): + if isinstance(format, str): + format = tv_tensors.BoundingBoxFormat[format] + + spatial_size = _parse_size(spatial_size, name="spatial_size") + + def fn(shape, dtype, device): + *batch_dims, num_coordinates = shape + if num_coordinates != 4: + raise pytest.UsageError() + + return make_bounding_boxes( + format=format, canvas_size=spatial_size, batch_dims=batch_dims, dtype=dtype, device=device + ) + + return BoundingBoxesLoader(fn, shape=(*extra_dims[-1:], 4), dtype=dtype, format=format, spatial_size=spatial_size) + + +def make_bounding_box_loaders( + *, + extra_dims=tuple(d for d in DEFAULT_EXTRA_DIMS if len(d) < 2), + formats=tuple(tv_tensors.BoundingBoxFormat), + spatial_size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + dtypes=(torch.float32, torch.float64, torch.int64), +): + for params in combinations_grid(extra_dims=extra_dims, format=formats, dtype=dtypes): + yield make_bounding_box_loader(**params, spatial_size=spatial_size) + + +make_multiple_bounding_boxes = from_loaders(make_bounding_box_loaders) + + +class MaskLoader(TensorLoader): + pass + + +def make_detection_mask_loader(size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_objects=5, extra_dims=(), dtype=torch.uint8): + # This produces "detection" masks, i.e. 
`(*, N, H, W)`, where `N` denotes the number of objects + size = _parse_size(size) + + def fn(shape, dtype, device): + *batch_dims, num_objects, height, width = shape + return make_detection_mask( + (height, width), num_objects=num_objects, batch_dims=batch_dims, dtype=dtype, device=device + ) + + return MaskLoader(fn, shape=(*extra_dims, num_objects, *size), dtype=dtype) + + +def make_detection_mask_loaders( + sizes=DEFAULT_SPATIAL_SIZES, + num_objects=(1, 0, 5), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + for params in combinations_grid(size=sizes, num_objects=num_objects, extra_dims=extra_dims, dtype=dtypes): + yield make_detection_mask_loader(**params) + + +make_detection_masks = from_loaders(make_detection_mask_loaders) + + +def make_segmentation_mask_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, *, num_categories=10, extra_dims=(), dtype=torch.uint8 +): + # This produces "segmentation" masks, i.e. `(*, H, W)`, where the category is encoded in the values + size = _parse_size(size) + + def fn(shape, dtype, device): + *batch_dims, height, width = shape + return make_segmentation_mask( + (height, width), num_categories=num_categories, batch_dims=batch_dims, dtype=dtype, device=device + ) + + return MaskLoader(fn, shape=(*extra_dims, *size), dtype=dtype) + + +def make_segmentation_mask_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + num_categories=(1, 2, 10), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + for params in combinations_grid(size=sizes, num_categories=num_categories, extra_dims=extra_dims, dtype=dtypes): + yield make_segmentation_mask_loader(**params) + + +make_segmentation_masks = from_loaders(make_segmentation_mask_loaders) + + +def make_mask_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + num_objects=(1, 0, 5), + num_categories=(1, 2, 10), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8,), +): + yield from make_detection_mask_loaders(sizes=sizes, num_objects=num_objects, extra_dims=extra_dims, dtypes=dtypes) + yield from make_segmentation_mask_loaders( + sizes=sizes, num_categories=num_categories, extra_dims=extra_dims, dtypes=dtypes + ) + + +make_masks = from_loaders(make_mask_loaders) + + +class VideoLoader(ImageLoader): + pass + + +def make_video_loader( + size=DEFAULT_PORTRAIT_SPATIAL_SIZE, + *, + color_space="RGB", + num_frames=3, + extra_dims=(), + dtype=torch.uint8, +): + size = _parse_size(size) + + def fn(shape, dtype, device, memory_format): + *batch_dims, num_frames, _, height, width = shape + return make_video( + (height, width), + num_frames=num_frames, + batch_dims=batch_dims, + color_space=color_space, + dtype=dtype, + device=device, + memory_format=memory_format, + ) + + return VideoLoader(fn, shape=(*extra_dims, num_frames, get_num_channels(color_space), *size), dtype=dtype) + + +def make_video_loaders( + *, + sizes=DEFAULT_SPATIAL_SIZES, + color_spaces=( + "GRAY", + "RGB", + ), + num_frames=(1, 0, 3), + extra_dims=DEFAULT_EXTRA_DIMS, + dtypes=(torch.uint8, torch.float32, torch.float64), +): + for params in combinations_grid( + size=sizes, color_space=color_spaces, num_frames=num_frames, extra_dims=extra_dims, dtype=dtypes + ): + yield make_video_loader(**params) + + +make_videos = from_loaders(make_video_loaders) + + +class TestMark: + def __init__( + self, + # Tuple of test class name and test function name that identifies the test the mark is applied to. If there is + # no test class, i.e. a standalone test function, use `None`. + test_id, + # `pytest.mark.*` to apply, e.g. 
`pytest.mark.skip` or `pytest.mark.xfail` + mark, + *, + # Callable that will be passed an `ArgsKwargs` and should return a boolean to indicate if the mark will be + # applied. If omitted, defaults to always apply. + condition=None, + ): + self.test_id = test_id + self.mark = mark + self.condition = condition or (lambda args_kwargs: True) + + +def mark_framework_limitation(test_id, reason, condition=None): + # The purpose of this function is to have a single entry point for skip marks that are only there because the test + # framework cannot handle the kernel in general or a specific parameter combination. + # As development progresses, we can change the `mark.skip` to `mark.xfail` from time to time to see if the skip is + # still justified. + # We don't want to use `mark.xfail` all the time, because that actually runs the test until an error happens. Thus, + # we would be wasting CI resources for no reason most of the time. + return TestMark(test_id, pytest.mark.skip(reason=reason), condition=condition) + + +class InfoBase: + def __init__( + self, + *, + # Identifier of the info that shows up in the parametrization. + id, + # Test markers that will be (conditionally) applied to an `ArgsKwargs` parametrization. + # See the `TestMark` class for details. + test_marks=None, + # Additional parameters, e.g. `rtol=1e-3`, passed to `assert_close`. Keys are a 3-tuple of `test_id` (see + # `TestMark`), the dtype, and the device. + closeness_kwargs=None, + ): + self.id = id + + self.test_marks = test_marks or [] + test_marks_map = defaultdict(list) + for test_mark in self.test_marks: + test_marks_map[test_mark.test_id].append(test_mark) + self._test_marks_map = dict(test_marks_map) + + self.closeness_kwargs = closeness_kwargs or dict() + + def get_marks(self, test_id, args_kwargs): + return [ + test_mark.mark for test_mark in self._test_marks_map.get(test_id, []) if test_mark.condition(args_kwargs) + ] + + def get_closeness_kwargs(self, test_id, *, dtype, device): + if not (isinstance(test_id, tuple) and len(test_id) == 2): + msg = "`test_id` should be a `Tuple[Optional[str], str]` denoting the test class and function name" + if callable(test_id): + msg += ". Did you forget to add the `test_id` fixture to the parameters of the test?" + else: + msg += f", but got {test_id} instead."
+ raise pytest.UsageError(msg) + if isinstance(device, torch.device): + device = device.type + return self.closeness_kwargs.get((test_id, dtype, device), dict()) + + +class ArgsKwargs: + def __init__(self, *args, **kwargs): + self.args = args + self.kwargs = kwargs + + def __iter__(self): + yield self.args + yield self.kwargs + + def load(self, device="cpu"): + return ArgsKwargs( + *(arg.load(device) if isinstance(arg, TensorLoader) else arg for arg in self.args), + **{ + keyword: arg.load(device) if isinstance(arg, TensorLoader) else arg + for keyword, arg in self.kwargs.items() + }, + ) + + +def parametrized_error_message(*args, **kwargs): + def to_str(obj): + if isinstance(obj, torch.Tensor) and obj.numel() > 30: + return f"tensor(shape={list(obj.shape)}, dtype={obj.dtype}, device={obj.device})" + elif isinstance(obj, enum.Enum): + return f"{type(obj).__name__}.{obj.name}" + else: + return repr(obj) + + if args or kwargs: + postfix = "\n".join( + [ + "", + "Failure happened for the following parameters:", + "", + *[to_str(arg) for arg in args], + *[f"{name}={to_str(kwarg)}" for name, kwarg in kwargs.items()], + ] + ) + else: + postfix = "" + + def wrapper(msg): + return msg + postfix + + return wrapper diff --git a/torchvision/__init__.py b/torchvision/__init__.py index 06e9b42301f4403404ca392f24490636163995eb..5e8f06e3d0f98a63426f070d9b9b28d910b2c096 100644 --- a/torchvision/__init__.py +++ b/torchvision/__init__.py @@ -1,8 +1,9 @@ import os import warnings +from modulefinder import Module import torch -from torchvision import datasets, io, models, ops, transforms, utils +from torchvision import _meta_registrations, datasets, io, models, ops, transforms, utils from .extension import _HAS_OPS @@ -71,11 +72,16 @@ def set_video_backend(backend): backend, please compile torchvision from source. """ global _video_backend - if backend not in ["pyav", "video_reader"]: - raise ValueError("Invalid video backend '%s'. Options are 'pyav' and 'video_reader'" % backend) + if backend not in ["pyav", "video_reader", "cuda"]: + raise ValueError("Invalid video backend '%s'. Options are 'pyav', 'video_reader' and 'cuda'" % backend) if backend == "video_reader" and not io._HAS_VIDEO_OPT: + # TODO: better messages message = "video_reader video backend is not available. Please compile torchvision from source and try again" - warnings.warn(message) + raise RuntimeError(message) + elif backend == "cuda" and not io._HAS_GPU_VIDEO_DECODER: + # TODO: better messages + message = "cuda video backend is not available." + raise RuntimeError(message) else: _video_backend = backend @@ -93,3 +99,9 @@ def get_video_backend(): def _is_tracing(): return torch._C._get_tracing_state() + + +def disable_beta_transforms_warning(): + # Noop, only exists to avoid breaking existing code. 
+ # See https://github.com/pytorch/vision/issues/7896 + pass diff --git a/torchvision/_internally_replaced_utils.py b/torchvision/_internally_replaced_utils.py index 18afc3ed93a8272600d73cc240047a0a49f23991..d9a6e261ea277989f4362037352cb24da6564460 100644 --- a/torchvision/_internally_replaced_utils.py +++ b/torchvision/_internally_replaced_utils.py @@ -28,7 +28,6 @@ def _get_extension_path(lib_name): if os.name == "nt": # Register the main torchvision library location on the default DLL path import ctypes - import sys kernel32 = ctypes.WinDLL("kernel32.dll", use_last_error=True) with_load_library_flags = hasattr(kernel32, "AddDllDirectory") @@ -37,14 +36,7 @@ def _get_extension_path(lib_name): if with_load_library_flags: kernel32.AddDllDirectory.restype = ctypes.c_void_p - if sys.version_info >= (3, 8): - os.add_dll_directory(lib_dir) - elif with_load_library_flags: - res = kernel32.AddDllDirectory(lib_dir) - if res is None: - err = ctypes.WinError(ctypes.get_last_error()) - err.strerror += f' Error adding "{lib_dir}" to the DLL directories.' - raise err + os.add_dll_directory(lib_dir) kernel32.SetErrorMode(prev_error_mode) diff --git a/torchvision/_meta_registrations.py b/torchvision/_meta_registrations.py new file mode 100644 index 0000000000000000000000000000000000000000..9831cfdcb456ef5d569f2a76b3de366f9a59f8f9 --- /dev/null +++ b/torchvision/_meta_registrations.py @@ -0,0 +1,50 @@ +import functools + +import torch +import torch.library + +# Ensure that torch.ops.torchvision is visible +import torchvision.extension # noqa: F401 + + +@functools.lru_cache(None) +def get_meta_lib(): + return torch.library.Library("torchvision", "IMPL", "Meta") + + +def register_meta(op_name, overload_name="default"): + def wrapper(fn): + if torchvision.extension._has_ops(): + get_meta_lib().impl(getattr(getattr(torch.ops.torchvision, op_name), overload_name), fn) + return fn + + return wrapper + + +@register_meta("roi_align") +def meta_roi_align(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned): + torch._check(rois.size(1) == 5, lambda: "rois must have shape as Tensor[K, 5]") + torch._check( + input.dtype == rois.dtype, + lambda: ( + "Expected tensor for input to have the same type as tensor for rois; " + f"but type {input.dtype} does not equal {rois.dtype}" + ), + ) + num_rois = rois.size(0) + _, channels, height, width = input.size() + return input.new_empty((num_rois, channels, pooled_height, pooled_width)) + + +@register_meta("_roi_align_backward") +def meta_roi_align_backward( + grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio, aligned +): + torch._check( + grad.dtype == rois.dtype, + lambda: ( + "Expected tensor for grad to have the same type as tensor for rois; " + f"but type {grad.dtype} does not equal {rois.dtype}" + ), + ) + return grad.new_empty((batch_size, channels, height, width)) diff --git a/torchvision/csrc/io/decoder/decoder.cpp b/torchvision/csrc/io/decoder/decoder.cpp index f13e2c3ffcfdfdae59be744c5fabc211fb7d0fd3..a895eed2d397488d65641c73d4227a53ef8ac59d 100644 --- a/torchvision/csrc/io/decoder/decoder.cpp +++ b/torchvision/csrc/io/decoder/decoder.cpp @@ -312,6 +312,8 @@ bool Decoder::init( } } + av_dict_set_int(&options, "probesize", params_.probeSize, 0); + interrupted_ = false; // ffmpeg avformat_open_input call can hang if media source doesn't respond diff --git a/torchvision/csrc/io/decoder/defs.h b/torchvision/csrc/io/decoder/defs.h index 
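The new torchvision/_meta_registrations.py above registers a shape-only ("meta") implementation for roi_align and its backward, so tracing and torch.compile can infer output shapes without running the real kernel. A minimal sketch of what that enables, assuming torchvision was built with its C++ ops so the registration actually runs (the shapes below are made up for illustration):

import torch
import torchvision  # importing torchvision runs _meta_registrations

feats = torch.empty(2, 256, 32, 32, device="meta")  # [N, C, H, W] feature map
rois = torch.empty(10, 5, device="meta")            # [K, 5] boxes, first column is the batch index

# Dispatches to meta_roi_align: only the output shape [K, C, pooled_h, pooled_w] is computed.
out = torch.ops.torchvision.roi_align(feats, rois, 0.25, 7, 7, 2, False)
print(out.shape)  # torch.Size([10, 256, 7, 7])
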
dac6293d366ff7580b64050d48119b30f71b2fd3..6be50f8abc2dc1bd7e10d279f267c67542a585e7 100644 --- a/torchvision/csrc/io/decoder/defs.h +++ b/torchvision/csrc/io/decoder/defs.h @@ -165,7 +165,7 @@ struct MediaFormat { struct DecoderParameters { // local file, remote file, http url, rtmp stream uri, etc. anything that // ffmpeg can recognize - std::string uri; + std::string uri{std::string()}; // timeout on getting bytes for decoding size_t timeoutMs{1000}; // logging level, default AV_LOG_PANIC @@ -213,6 +213,12 @@ struct DecoderParameters { // Skip packets that fail with EPERM errors and continue decoding. bool skipOperationNotPermittedPackets{false}; + + // probing size in bytes, i.e. the size of the data to analyze to get stream + // information. A higher value will enable detecting more information in case + // it is dispersed into the stream, but will increase latency. Must be an + // integer not lesser than 32. It is 5000000 by default. + int64_t probeSize{5000000}; }; struct DecoderHeader { @@ -295,7 +301,7 @@ struct DecoderMetadata { }; /** * Abstract class for decoding media bytes - * It has two diffrent modes. Internal media bytes retrieval for given uri and + * It has two different modes. Internal media bytes retrieval for given uri and * external media bytes provider in case of memory streams */ class MediaDecoder { diff --git a/torchvision/csrc/io/decoder/memory_buffer.cpp b/torchvision/csrc/io/decoder/memory_buffer.cpp index a7b0128e3edecfa3b18bc3730a8e75e384d2738a..4e420c3b3cd685e8fbda2fd84f6f9256dbfc2229 100644 --- a/torchvision/csrc/io/decoder/memory_buffer.cpp +++ b/torchvision/csrc/io/decoder/memory_buffer.cpp @@ -61,7 +61,7 @@ DecoderInCallback MemoryBuffer::getCallback( } // seek mode if (!timeoutMs) { - // seek capabilty, yes - supported + // seek capability, yes - supported return 0; } return object.seek(size, whence); diff --git a/torchvision/csrc/io/decoder/sync_decoder_test.cpp b/torchvision/csrc/io/decoder/sync_decoder_test.cpp index 936d1e94f46b535387142c39d9f7da51584ba658..980725c2fcb43fdd6dee2b61ee230c93673f3de8 100644 --- a/torchvision/csrc/io/decoder/sync_decoder_test.cpp +++ b/torchvision/csrc/io/decoder/sync_decoder_test.cpp @@ -368,7 +368,7 @@ TEST(SyncDecoder, TestMemoryBufferNoSeekableWithFullRead) { } // seek mode if (!timeoutMs) { - // seek capabilty, yes - no + // seek capability, yes - no return -1; } return object.seek(size, whence); @@ -408,7 +408,7 @@ TEST(SyncDecoder, TestMemoryBufferNoSeekableWithPartialRead) { } // seek mode if (!timeoutMs) { - // seek capabilty, yes - no + // seek capability, yes - no return -1; } return object.seek(size, whence); diff --git a/torchvision/csrc/io/decoder/video_sampler.cpp b/torchvision/csrc/io/decoder/video_sampler.cpp index 62ec0709be1d90bf6ff47c33919410fc15601414..8b712609e3439cd6478968e7a5410a276cb9758b 100644 --- a/torchvision/csrc/io/decoder/video_sampler.cpp +++ b/torchvision/csrc/io/decoder/video_sampler.cpp @@ -181,6 +181,23 @@ bool VideoSampler::init(const SamplerParameters& params) { // set output format params_ = params; + if (params.in.video.format == AV_PIX_FMT_YUV420P) { + /* When the video width and height are not multiples of 8, + * and there is no size change in the conversion, + * a blurry screen will appear on the right side + * This problem was discovered in 2012 and + * continues to exist in version 4.1.3 in 2019 + * This problem can be avoided by increasing SWS_ACCURATE_RND + * details https://trac.ffmpeg.org/ticket/1582 + */ + if ((params.in.video.width & 0x7) || (params.in.video.height & 
0x7)) { + VLOG(1) << "The width " << params.in.video.width << " and height " + << params.in.video.height << " the image is not a multiple of 8, " + << "the decoding speed may be reduced"; + swsFlags_ |= SWS_ACCURATE_RND; + } + } + scaleContext_ = sws_getContext( params.in.video.width, params.in.video.height, diff --git a/torchvision/csrc/io/image/cpu/decode_image.cpp b/torchvision/csrc/io/image/cpu/decode_image.cpp index 1cc05dc76cadcb563a777c444139799186d1977e..da4dc5833dea5dc4c8ce78772412e2881c84aadd 100644 --- a/torchvision/csrc/io/image/cpu/decode_image.cpp +++ b/torchvision/csrc/io/image/cpu/decode_image.cpp @@ -7,6 +7,8 @@ namespace vision { namespace image { torch::Tensor decode_image(const torch::Tensor& data, ImageReadMode mode) { + // Check that tensor is a CPU tensor + TORCH_CHECK(data.device() == torch::kCPU, "Expected a CPU tensor"); // Check that the input tensor dtype is uint8 TORCH_CHECK(data.dtype() == torch::kU8, "Expected a torch.uint8 tensor"); // Check that the input tensor is 1-dimensional diff --git a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp index 6ec644d003ee9d4211c5cb27e81e124b03a7bbe7..63a4e5b42ec8f137356fe4f5be848a3233522882 100644 --- a/torchvision/csrc/io/image/cpu/decode_jpeg.cpp +++ b/torchvision/csrc/io/image/cpu/decode_jpeg.cpp @@ -67,6 +67,58 @@ static void torch_jpeg_set_source_mgr( src->pub.next_input_byte = src->data; } +inline unsigned char clamped_cmyk_rgb_convert( + unsigned char k, + unsigned char cmy) { + // Inspired from Pillow: + // https://github.com/python-pillow/Pillow/blob/07623d1a7cc65206a5355fba2ae256550bfcaba6/src/libImaging/Convert.c#L568-L569 + int v = k * cmy + 128; + v = ((v >> 8) + v) >> 8; + return std::clamp(k - v, 0, 255); +} + +void convert_line_cmyk_to_rgb( + j_decompress_ptr cinfo, + const unsigned char* cmyk_line, + unsigned char* rgb_line) { + int width = cinfo->output_width; + for (int i = 0; i < width; ++i) { + int c = cmyk_line[i * 4 + 0]; + int m = cmyk_line[i * 4 + 1]; + int y = cmyk_line[i * 4 + 2]; + int k = cmyk_line[i * 4 + 3]; + + rgb_line[i * 3 + 0] = clamped_cmyk_rgb_convert(k, 255 - c); + rgb_line[i * 3 + 1] = clamped_cmyk_rgb_convert(k, 255 - m); + rgb_line[i * 3 + 2] = clamped_cmyk_rgb_convert(k, 255 - y); + } +} + +inline unsigned char rgb_to_gray(int r, int g, int b) { + // Inspired from Pillow: + // https://github.com/python-pillow/Pillow/blob/07623d1a7cc65206a5355fba2ae256550bfcaba6/src/libImaging/Convert.c#L226 + return (r * 19595 + g * 38470 + b * 7471 + 0x8000) >> 16; +} + +void convert_line_cmyk_to_gray( + j_decompress_ptr cinfo, + const unsigned char* cmyk_line, + unsigned char* gray_line) { + int width = cinfo->output_width; + for (int i = 0; i < width; ++i) { + int c = cmyk_line[i * 4 + 0]; + int m = cmyk_line[i * 4 + 1]; + int y = cmyk_line[i * 4 + 2]; + int k = cmyk_line[i * 4 + 3]; + + int r = clamped_cmyk_rgb_convert(k, 255 - c); + int g = clamped_cmyk_rgb_convert(k, 255 - m); + int b = clamped_cmyk_rgb_convert(k, 255 - y); + + gray_line[i] = rgb_to_gray(r, g, b); + } +} + } // namespace torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { @@ -102,20 +154,29 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { jpeg_read_header(&cinfo, TRUE); int channels = cinfo.num_components; + bool cmyk_to_rgb_or_gray = false; if (mode != IMAGE_READ_MODE_UNCHANGED) { switch (mode) { case IMAGE_READ_MODE_GRAY: - if (cinfo.jpeg_color_space != JCS_GRAYSCALE) { + if (cinfo.jpeg_color_space == JCS_CMYK || + 
cinfo.jpeg_color_space == JCS_YCCK) { + cinfo.out_color_space = JCS_CMYK; + cmyk_to_rgb_or_gray = true; + } else { cinfo.out_color_space = JCS_GRAYSCALE; - channels = 1; } + channels = 1; break; case IMAGE_READ_MODE_RGB: - if (cinfo.jpeg_color_space != JCS_RGB) { + if (cinfo.jpeg_color_space == JCS_CMYK || + cinfo.jpeg_color_space == JCS_YCCK) { + cinfo.out_color_space = JCS_CMYK; + cmyk_to_rgb_or_gray = true; + } else { cinfo.out_color_space = JCS_RGB; - channels = 3; } + channels = 3; break; /* * Libjpeg does not support converting from CMYK to grayscale etc. There @@ -139,12 +200,28 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { auto tensor = torch::empty({int64_t(height), int64_t(width), channels}, torch::kU8); auto ptr = tensor.data_ptr(); + torch::Tensor cmyk_line_tensor; + if (cmyk_to_rgb_or_gray) { + cmyk_line_tensor = torch::empty({int64_t(width), 4}, torch::kU8); + } + while (cinfo.output_scanline < cinfo.output_height) { /* jpeg_read_scanlines expects an array of pointers to scanlines. * Here the array is only one element long, but you could ask for * more than one scanline at a time if that's more convenient. */ - jpeg_read_scanlines(&cinfo, &ptr, 1); + if (cmyk_to_rgb_or_gray) { + auto cmyk_line_ptr = cmyk_line_tensor.data_ptr(); + jpeg_read_scanlines(&cinfo, &cmyk_line_ptr, 1); + + if (channels == 3) { + convert_line_cmyk_to_rgb(&cinfo, cmyk_line_ptr, ptr); + } else if (channels == 1) { + convert_line_cmyk_to_gray(&cinfo, cmyk_line_ptr, ptr); + } + } else { + jpeg_read_scanlines(&cinfo, &ptr, 1); + } ptr += stride; } @@ -152,8 +229,23 @@ torch::Tensor decode_jpeg(const torch::Tensor& data, ImageReadMode mode) { jpeg_destroy_decompress(&cinfo); return tensor.permute({2, 0, 1}); } +#endif // #if !JPEG_FOUND + +int64_t _jpeg_version() { +#if JPEG_FOUND + return JPEG_LIB_VERSION; +#else + return -1; +#endif +} +bool _is_compiled_against_turbo() { +#ifdef LIBJPEG_TURBO_VERSION + return true; +#else + return false; #endif +} } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_jpeg.h b/torchvision/csrc/io/image/cpu/decode_jpeg.h index 97ed3d51a54e625989d695c42ccf78f1e2e79d9f..254e94680b6726c499f22d247d33d0e01fb524c0 100644 --- a/torchvision/csrc/io/image/cpu/decode_jpeg.h +++ b/torchvision/csrc/io/image/cpu/decode_jpeg.h @@ -10,5 +10,8 @@ C10_EXPORT torch::Tensor decode_jpeg( const torch::Tensor& data, ImageReadMode mode = IMAGE_READ_MODE_UNCHANGED); +C10_EXPORT int64_t _jpeg_version(); +C10_EXPORT bool _is_compiled_against_turbo(); + } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_png.cpp b/torchvision/csrc/io/image/cpu/decode_png.cpp index b1ceaf1badd8dad866fc51d8570de5a78bdfe7ef..d27eafe45a754caa643cbd4309eb18776865cb55 100644 --- a/torchvision/csrc/io/image/cpu/decode_png.cpp +++ b/torchvision/csrc/io/image/cpu/decode_png.cpp @@ -49,6 +49,7 @@ torch::Tensor decode_png( png_destroy_read_struct(&png_ptr, &info_ptr, nullptr); TORCH_CHECK(false, "Internal error."); } + TORCH_CHECK(datap_len >= 8, "Content is too small for png!") auto is_png = !png_sig_cmp(datap, 0, 8); TORCH_CHECK(is_png, "Content is not png!") diff --git a/torchvision/csrc/io/image/image.cpp b/torchvision/csrc/io/image/image.cpp index 3c9d632f03057df9d3f15535a1e63af030163c8a..b5952739a7a041e5b07ac40c8a15851bb4efbb5d 100644 --- a/torchvision/csrc/io/image/image.cpp +++ b/torchvision/csrc/io/image/image.cpp @@ -19,15 +19,18 @@ PyMODINIT_FUNC PyInit_image(void) { namespace vision { namespace image { 
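One detail worth spelling out in the CMYK support added to decode_jpeg.cpp above: clamped_cmyk_rgb_convert avoids an integer division by 255 with the fixed-point trick ((v >> 8) + v) >> 8, which approximates v / 255 once the +128 rounding bias is added. A small Python sketch of the same arithmetic (illustrative only, mirroring the C++ helper):

def clamped_cmyk_rgb_convert(k: int, cmy: int) -> int:
    v = k * cmy + 128               # +128 biases the truncation towards rounding
    v = ((v >> 8) + v) >> 8         # ~= v * 257 / 65536 ~= v / 255
    return max(0, min(255, k - v))  # ~= k * (255 - cmy) / 255

# e.g. k=200, cmy=128: 200 - round(200 * 128 / 255) = 200 - 100 = 100
assert clamped_cmyk_rgb_convert(200, 128) == 100

Since the callers pass cmy = 255 - c (and likewise for m and y), each RGB channel comes out as roughly k * c / 255.
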
-static auto registry = torch::RegisterOperators() - .op("image::decode_png", &decode_png) - .op("image::encode_png", &encode_png) - .op("image::decode_jpeg", &decode_jpeg) - .op("image::encode_jpeg", &encode_jpeg) - .op("image::read_file", &read_file) - .op("image::write_file", &write_file) - .op("image::decode_image", &decode_image) - .op("image::decode_jpeg_cuda", &decode_jpeg_cuda); +static auto registry = + torch::RegisterOperators() + .op("image::decode_png", &decode_png) + .op("image::encode_png", &encode_png) + .op("image::decode_jpeg", &decode_jpeg) + .op("image::encode_jpeg", &encode_jpeg) + .op("image::read_file", &read_file) + .op("image::write_file", &write_file) + .op("image::decode_image", &decode_image) + .op("image::decode_jpeg_cuda", &decode_jpeg_cuda) + .op("image::_jpeg_version", &_jpeg_version) + .op("image::_is_compiled_against_turbo", &_is_compiled_against_turbo); } // namespace image } // namespace vision diff --git a/torchvision/csrc/io/video/video.cpp b/torchvision/csrc/io/video/video.cpp index 38b350145957faf6bc783b2c3500baf96385f981..2167ea695ec8948c590b1873cb8cd3b98e40e4cc 100644 --- a/torchvision/csrc/io/video/video.cpp +++ b/torchvision/csrc/io/video/video.cpp @@ -156,14 +156,34 @@ void Video::_getDecoderParams( } // _get decoder params -Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { - C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); +void Video::initFromFile( + std::string videoPath, + std::string stream, + int64_t numThreads) { + TORCH_CHECK(!initialized, "Video object can only be initialized once"); + initialized = true; + params.uri = videoPath; + _init(stream, numThreads); +} + +void Video::initFromMemory( + torch::Tensor videoTensor, + std::string stream, + int64_t numThreads) { + TORCH_CHECK(!initialized, "Video object can only be initialized once"); + initialized = true; + callback = MemoryBuffer::getCallback( + videoTensor.data_ptr(), videoTensor.size(0)); + _init(stream, numThreads); +} + +void Video::_init(std::string stream, int64_t numThreads) { // set number of threads global numThreads_ = numThreads; // parse stream information current_stream = _parseStream(stream); // note that in the initial call we want to get all streams - Video::_getDecoderParams( + _getDecoderParams( 0, // video start 0, // headerOnly std::get<0>(current_stream), // stream info - remove that @@ -175,11 +195,6 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { std::string logMessage, logType; - // TODO: add read from memory option - params.uri = videoPath; - logType = "file"; - logMessage = videoPath; - // locals std::vector audioFPS, videoFPS; std::vector audioDuration, videoDuration, ccDuration, subsDuration; @@ -190,7 +205,8 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { c10::Dict> subsMetadata; // callback and metadata defined in struct - succeeded = decoder.init(params, std::move(callback), &metadata); + DecoderInCallback tmp_callback = callback; + succeeded = decoder.init(params, std::move(tmp_callback), &metadata); if (succeeded) { for (const auto& header : metadata) { double fps = double(header.fps); @@ -225,16 +241,24 @@ Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { streamsMetadata.insert("subtitles", subsMetadata); streamsMetadata.insert("cc", ccMetadata); - succeeded = Video::setCurrentStream(stream); + succeeded = setCurrentStream(stream); LOG(INFO) << "\nDecoder inited with: " << succeeded << "\n"; if 
(std::get<1>(current_stream) != -1) { LOG(INFO) << "Stream index set to " << std::get<1>(current_stream) << ". If you encounter trouble, consider switching it to automatic stream discovery. \n"; } +} + +Video::Video(std::string videoPath, std::string stream, int64_t numThreads) { + C10_LOG_API_USAGE_ONCE("torchvision.csrc.io.video.video.Video"); + if (!videoPath.empty()) { + initFromFile(videoPath, stream, numThreads); + } } // video bool Video::setCurrentStream(std::string stream = "video") { + TORCH_CHECK(initialized, "Video object has to be initialized first"); if ((!stream.empty()) && (_parseStream(stream) != current_stream)) { current_stream = _parseStream(stream); } @@ -256,19 +280,23 @@ bool Video::setCurrentStream(std::string stream = "video") { ); // callback and metadata defined in Video.h - return (decoder.init(params, std::move(callback), &metadata)); + DecoderInCallback tmp_callback = callback; + return (decoder.init(params, std::move(tmp_callback), &metadata)); } std::tuple Video::getCurrentStream() const { + TORCH_CHECK(initialized, "Video object has to be initialized first"); return current_stream; } c10::Dict>> Video:: getStreamMetadata() const { + TORCH_CHECK(initialized, "Video object has to be initialized first"); return streamsMetadata; } void Video::Seek(double ts, bool fastSeek = false) { + TORCH_CHECK(initialized, "Video object has to be initialized first"); // initialize the class variables used for seeking and retrurn _getDecoderParams( ts, // video start @@ -282,20 +310,23 @@ void Video::Seek(double ts, bool fastSeek = false) { ); // callback and metadata defined in Video.h - succeeded = decoder.init(params, std::move(callback), &metadata); + DecoderInCallback tmp_callback = callback; + succeeded = decoder.init(params, std::move(tmp_callback), &metadata); + LOG(INFO) << "Decoder init at seek " << succeeded << "\n"; } std::tuple Video::Next() { + TORCH_CHECK(initialized, "Video object has to be initialized first"); // if failing to decode simply return a null tensor (note, should we - // raise an exeption?) + // raise an exception?) double frame_pts_s; torch::Tensor outFrame = torch::zeros({0}, torch::kByte); // decode single frame DecoderOutputMessage out; int64_t res = decoder.decode(&out, decoderTimeoutMs); - // if successfull + // if successful if (res == 0) { frame_pts_s = double(double(out.header.pts) * 1e-6); @@ -345,6 +376,8 @@ std::tuple Video::Next() { static auto registerVideo = torch::class_
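The refactor above splits Video construction into initFromFile and initFromMemory, both funnelling into the shared _init helper, with the decoder callback copied before each decoder.init call so it can be reused; this is what allows decoding from an in-memory buffer instead of a file path. A rough sketch of how that might look from Python, with the caveat that the torch::class_ registration is cut off above, so the binding names used here (init_from_memory, and a three-argument constructor mirroring the C++ signature) are assumptions rather than confirmed API:

import torch
import torchvision  # assumes torchvision was built with the video decoder extension

with open("clip.mp4", "rb") as f:  # path is illustrative
    data = torch.frombuffer(bytearray(f.read()), dtype=torch.uint8)

video = torch.classes.torchvision.Video("", "video", 0)  # empty path defers initialization
video.init_from_memory(data, "video", 0)                 # assumed binding for Video::initFromMemory
# Calling either init a second time would trip the new TORCH_CHECK(!initialized, ...) guard.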