Unverified commit eb00e2ad, authored by Nicolas Hug and committed by GitHub

Fix sphinx warnings and turn warnings into errors (#3290)


Co-authored-by: Vasilis Vryniotis <datumbox@users.noreply.github.com>
parent 59d3af53
@@ -2,7 +2,7 @@
#
# You can set these variables from the command line.
-SPHINXOPTS =
+SPHINXOPTS = -W # turn warnings into errors
SPHINXBUILD = sphinx-build
SPHINXPROJ = torchvision
SOURCEDIR = source
......
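For context (not part of the diff): with ``-W``, ``sphinx-build`` exits non-zero as soon as any warning is emitted, so CI can fail the docs build. A minimal sketch of checking this from Python, assuming the ``source`` and ``build/html`` paths implied by the Makefile above::

    import subprocess

    # With -W, a single Sphinx warning is enough to get a non-zero exit code.
    result = subprocess.run(["sphinx-build", "-W", "source", "build/html"])
    if result.returncode != 0:
        raise SystemExit("docs build failed: warnings are treated as errors")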
@@ -23,8 +23,23 @@
import torch
import torchvision
import pytorch_sphinx_theme
+from sphinxcontrib import googleanalytics
+
+# Wrap sphinxcontrib-googleanalytics setup() function to avoid a Sphinx warning:
+# "WARNING: extension 'sphinxcontrib.googleanalytics' returned an unsupported
+# object from its setup() function; it should return None or a metadata
+# dictionary"
+_googleanalytics_setup_original = googleanalytics.setup
+
+
+def _googleanalytics_setup_wrapper(app):
+    _googleanalytics_setup_original(app)
+    return {"version": "0.1"}
+
+
+googleanalytics.setup = _googleanalytics_setup_wrapper
# -- General configuration ------------------------------------------------
# If your documentation needs a minimal Sphinx version, state it here.
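The wrapper above is a general monkey-patching pattern: Sphinx warns when an extension's ``setup()`` returns anything other than ``None`` or a metadata dict, so the patched ``setup()`` forwards the call and returns a dict. A generic sketch of the same idea (``patch_extension_setup`` is a hypothetical helper, not in the commit)::

    import functools

    def patch_extension_setup(module, version="0.1"):
        # Replace module.setup with a wrapper that returns the metadata
        # dictionary Sphinx expects, silencing the warning.
        original = module.setup

        @functools.wraps(original)
        def wrapper(app):
            original(app)
            return {"version": version}

        module.setup = wrapper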
@@ -48,6 +63,8 @@ extensions = [
]
napoleon_use_ivar = True
+napoleon_numpy_docstring = False
+napoleon_google_docstring = True
googleanalytics_id = 'UA-90545585-1'
googleanalytics_enabled = True
......
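With ``napoleon_google_docstring = True`` and ``napoleon_numpy_docstring = False``, napoleon parses only Google-style sections. A minimal example of the docstring format this configuration expects (illustrative function, not from the codebase)::

    def area(width, height):
        """Compute a rectangle's area.

        Args:
            width (int): Width in pixels.
            height (int): Height in pixels.

        Returns:
            int: The area, ``width * height``.
        """
        return width * height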
@@ -155,7 +155,7 @@ MNIST
.. autoclass:: MNIST
Omniglot
-~~~~~~
+~~~~~~~~
.. autoclass:: Omniglot
......
@@ -18,7 +18,7 @@ Video
Fine-grained video API
--------------------
+----------------------
In addition to the :mod:`read_video` function, we provide a high-performance
lower-level API for more fine-grained control compared to the :mod:`read_video` function.
......
@@ -17,12 +17,15 @@ class CelebA(VisionDataset):
target_type (string or list, optional): Type of target to use, ``attr``, ``identity``, ``bbox``,
or ``landmarks``. Can also be a list to output a tuple with all specified target types.
The targets represent:
-``attr`` (np.array shape=(40,) dtype=int): binary (0, 1) labels for attributes
-``identity`` (int): label for each person (data points with the same identity are the same person)
-``bbox`` (np.array shape=(4,) dtype=int): bounding box (x, y, width, height)
-``landmarks`` (np.array shape=(10,) dtype=int): landmark points (lefteye_x, lefteye_y, righteye_x,
+- ``attr`` (np.array shape=(40,) dtype=int): binary (0, 1) labels for attributes
+- ``identity`` (int): label for each person (data points with the same identity are the same person)
+- ``bbox`` (np.array shape=(4,) dtype=int): bounding box (x, y, width, height)
+- ``landmarks`` (np.array shape=(10,) dtype=int): landmark points (lefteye_x, lefteye_y, righteye_x,
+  righteye_y, nose_x, nose_y, leftmouth_x, leftmouth_y, rightmouth_x, rightmouth_y)
Defaults to ``attr``. If empty, ``None`` will be returned as target.
transform (callable, optional): A function/transform that takes in a PIL image
and returns a transformed version. E.g., ``transforms.ToTensor``
target_transform (callable, optional): A function/transform that takes in the
......
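As described above, passing a list for ``target_type`` makes ``__getitem__`` return a tuple of targets in the same order; a short sketch (the ``root`` path and ``download=True`` are placeholders)::

    from torchvision import datasets

    celeba = datasets.CelebA(
        root="data",
        split="train",
        target_type=["attr", "identity"],   # -> target is an (attr, identity) tuple
        download=True,
    )
    img, (attr, identity) = celeba[0]
    print(attr.shape)                       # 40 binary attribute labels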
@@ -37,10 +37,12 @@ class HMDB51(VisionDataset):
and returns a transformed version.
Returns:
-video (Tensor[T, H, W, C]): the `T` video frames
-audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
+tuple: A 3-tuple with the following entries:
+
+    - video (Tensor[T, H, W, C]): The `T` video frames
+    - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
+      and `L` is the number of points
-label (int): class of the video clip
+    - label (int): class of the video clip
"""
data_url = "http://serre-lab.clps.brown.edu/wp-content/uploads/2013/10/hmdb51_org.rar"
......
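The revised ``Returns`` section above documents ``__getitem__``'s 3-tuple; the same contract applies to ``Kinetics400`` and ``UCF101`` below. A sketch, with placeholder paths::

    from torchvision.datasets import HMDB51

    hmdb = HMDB51("data/hmdb51", "data/splits", frames_per_clip=16)
    video, audio, label = hmdb[0]   # the documented (video, audio, label) tuple
    print(video.shape)              # [T, H, W, C] with T == frames_per_clip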
@@ -30,10 +30,12 @@ class Kinetics400(VisionDataset):
and returns a transformed version.
Returns:
-video (Tensor[T, H, W, C]): the `T` video frames
-audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
+tuple: A 3-tuple with the following entries:
+
+    - video (Tensor[T, H, W, C]): the `T` video frames
+    - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
+      and `L` is the number of points
-label (int): class of the video clip
+    - label (int): class of the video clip
"""
def __init__(self, root, frames_per_clip, step_between_clips=1, frame_rate=None,
......
@@ -318,7 +318,7 @@ class QMNIST(MNIST):
"""`QMNIST <https://github.com/facebookresearch/qmnist>`_ Dataset.
Args:
-root (string): Root directory of dataset whose ``processed''
+root (string): Root directory of dataset whose ``processed``
subdir contains torch binary files with the datasets.
what (string,optional): Can be 'train', 'test', 'test10k',
'test50k', or 'nist' for respectively the mnist compatible
@@ -342,7 +342,6 @@ class QMNIST(MNIST):
train (bool,optional,compatibility): When argument 'what' is
not specified, this boolean decides whether to load the
training set or the testing set. Default: True.
"""
subsets = {
......
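For reference, the ``what`` argument documented above selects the partition directly; a sketch with a placeholder root::

    from torchvision.datasets import QMNIST

    test50k = QMNIST("data", what="test50k", download=True)
    img, target = test50k[0]    # MNIST-compatible (image, class label) pair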
@@ -7,6 +7,7 @@ from .utils import download_and_extract_archive, check_integrity, list_dir, list_files
class Omniglot(VisionDataset):
"""`Omniglot <https://github.com/brendenlake/omniglot>`_ Dataset.
Args:
root (string): Root directory of dataset where directory
``omniglot-py`` exists.
......
@@ -26,7 +26,6 @@ class STL10(VisionDataset):
download (bool, optional): If true, downloads the dataset from the internet and
puts it in root directory. If dataset is already downloaded, it is not
downloaded again.
"""
base_folder = 'stl10_binary'
url = "http://ai.stanford.edu/~acoates/stl10/stl10_binary.tar.gz"
......
@@ -35,10 +35,12 @@ class UCF101(VisionDataset):
and returns a transformed version.
Returns:
-video (Tensor[T, H, W, C]): the `T` video frames
-audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
+tuple: A 3-tuple with the following entries:
+
+    - video (Tensor[T, H, W, C]): the `T` video frames
+    - audio(Tensor[K, L]): the audio frames, where `K` is the number of channels
+      and `L` is the number of points
-label (int): class of the video clip
+    - label (int): class of the video clip
"""
def __init__(self, root, annotation_path, frames_per_clip, step_between_clips=1,
......
@@ -51,9 +51,9 @@ class VideoReader:
Example:
The following example creates a :mod:`VideoReader` object, seeks to the 2s
point, and returns a single frame::
    import torchvision
    video_path = "path_to_a_test_video"
    reader = torchvision.io.VideoReader(video_path, "video")
    reader.seek(2.0)
    frame = next(reader)
@@ -61,18 +61,23 @@ class VideoReader:
:mod:`VideoReader` implements the iterable API, which makes it suitable
for use in conjunction with :mod:`itertools` for more advanced reading.
As such, we can use a :mod:`VideoReader` instance inside for loops::
    reader.seek(2)
    for frame in reader:
        frames.append(frame['data'])

    # additionally, `seek` implements a fluent API, so we can do
    for frame in reader.seek(2):
        frames.append(frame['data'])
With :mod:`itertools`, we can read all frames between 2 and 5 seconds with the
following code::
    for frame in itertools.takewhile(lambda x: x['pts'] <= 5, reader.seek(2)):
        frames.append(frame['data'])
and similarly, reading 10 frames after the 2s timestamp can be achieved
as follows::
    for frame in itertools.islice(reader.seek(2), 10):
        frames.append(frame['data'])
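Putting the pieces together, a self-contained version of the snippets above (the video path is a placeholder)::

    import itertools
    import torchvision

    reader = torchvision.io.VideoReader("path_to_a_test_video", "video")
    frames = []
    # frames between the 2s and 5s timestamps, via the fluent seek API
    for frame in itertools.takewhile(lambda x: x['pts'] <= 5, reader.seek(2)):
        frames.append(frame['data'])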
......
@@ -126,7 +126,7 @@ def encode_png(input: torch.Tensor, compression_level: int = 6) -> torch.Tensor:
between 0 and 9. Default: 6
Returns:
-output (Tensor[1]): A one dimensional int8 tensor that contains the raw bytes of the
+Tensor[1]: A one dimensional int8 tensor that contains the raw bytes of the
PNG file.
"""
output = torch.ops.image.encode_png(input, compression_level)
......
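A short usage sketch for ``encode_png`` (the random image and output path are illustrative)::

    import torch
    from torchvision.io import encode_png

    image = torch.randint(0, 256, (3, 64, 64), dtype=torch.uint8)  # CHW, uint8
    data = encode_png(image, compression_level=6)  # 1-D tensor of raw PNG bytes
    with open("out.png", "wb") as f:
        f.write(data.numpy().tobytes())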
@@ -253,10 +253,8 @@ def read_video(
Returns:
vframes (Tensor[T, H, W, C]): the `T` video frames
-aframes (Tensor[K, L]): the audio frames, where `K` is the number of channels and `L` is the
-    number of points
-info (Dict): metadata for the video and audio. Can contain the fields video_fps (float)
-    and audio_fps (int)
+aframes (Tensor[K, L]): the audio frames, where `K` is the number of channels and `L` is the number of points
+info (Dict): metadata for the video and audio. Can contain the fields video_fps (float) and audio_fps (int)
"""
from torchvision import get_video_backend
......
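The tidied ``Returns`` section above maps directly to the call below (the file name is a placeholder)::

    from torchvision.io import read_video

    vframes, aframes, info = read_video("clip.mp4", pts_unit="sec")
    print(vframes.shape)           # [T, H, W, C]
    print(info.get("video_fps"))   # float, when the container reports it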
@@ -308,6 +308,7 @@ def fasterrcnn_resnet50_fpn(pretrained=False, progress=True,
During training, the model expects both the input tensors, as well as a targets (list of dictionary),
containing:
+
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
@@ -318,6 +319,7 @@
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows:
+
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
- labels (``Int64Tensor[N]``): the predicted labels for each image
......
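The inference contract spelled out above (and repeated for the detection models below) looks like this in practice; a sketch using random inputs::

    import torch
    import torchvision

    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    model.eval()                              # inference: input tensors only
    x = [torch.rand(3, 300, 400), torch.rand(3, 500, 400)]
    predictions = model(x)                    # List[Dict] with boxes/labels/scores
    print(predictions[0]["boxes"].shape)      # [N, 4], in [x1, y1, x2, y2]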
@@ -26,6 +26,7 @@ class KeypointRCNN(FasterRCNN):
During training, the model expects both the input tensors, as well as a targets (list of dictionary),
containing:
+
- boxes (FloatTensor[N, 4]): the ground-truth boxes in [x1, y1, x2, y2] format, with values of x
between 0 and W and values of y between 0 and H
- labels (Int64Tensor[N]): the class label for each ground-truth box
@@ -38,6 +39,7 @@ class KeypointRCNN(FasterRCNN):
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a List[Dict[Tensor]], one for each input image. The fields of the Dict are as
follows:
+
- boxes (FloatTensor[N, 4]): the predicted boxes in [x1, y1, x2, y2] format, with values of x
between 0 and W and values of y between 0 and H
- labels (Int64Tensor[N]): the predicted labels for each image
@@ -283,6 +285,7 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
During training, the model expects both the input tensors, as well as a targets (list of dictionary),
containing:
+
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
@@ -295,6 +298,7 @@ def keypointrcnn_resnet50_fpn(pretrained=False, progress=True,
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows:
+
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
- labels (``Int64Tensor[N]``): the predicted labels for each image
......
@@ -278,6 +278,7 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
During training, the model expects both the input tensors, as well as a targets (list of dictionary),
containing:
+
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
@@ -289,6 +290,7 @@ def maskrcnn_resnet50_fpn(pretrained=False, progress=True,
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows:
+
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values of ``x``
between ``0`` and ``W`` and values of ``y`` between ``0`` and ``H``
- labels (``Int64Tensor[N]``): the predicted labels for each image
......
@@ -575,6 +575,7 @@ def retinanet_resnet50_fpn(pretrained=False, progress=True,
During training, the model expects both the input tensors, as well as a targets (list of dictionary),
containing:
+
- boxes (``FloatTensor[N, 4]``): the ground-truth boxes in ``[x1, y1, x2, y2]`` format, with values
between ``0`` and ``H`` and ``0`` and ``W``
- labels (``Int64Tensor[N]``): the class label for each ground-truth box
@@ -585,6 +586,7 @@ def retinanet_resnet50_fpn(pretrained=False, progress=True,
During inference, the model requires only the input tensors, and returns the post-processed
predictions as a ``List[Dict[Tensor]]``, one for each input image. The fields of the ``Dict`` are as
follows:
+
- boxes (``FloatTensor[N, 4]``): the predicted boxes in ``[x1, y1, x2, y2]`` format, with values between
``0`` and ``H`` and ``0`` and ``W``
- labels (``Int64Tensor[N]``): the predicted labels for each image
......
@@ -982,9 +982,9 @@ def affine(
of length 1: ``[value, ]``.
If input is PIL Image, this option is only available for ``Pillow>=5.0.0``.
fillcolor (sequence, int, float): deprecated argument and will be removed since v0.10.0.
-    Please use `arg`:fill: instead.
+    Please use the ``fill`` parameter instead.
resample (int, optional): deprecated argument and will be removed since v0.10.0.
-    Please use `arg`:interpolation: instead.
+    Please use the ``interpolation`` parameter instead.
Returns:
PIL Image or Tensor: Transformed image.
......
@@ -1179,7 +1179,7 @@ class RandomRotation(torch.nn.Module):
image. If given a number, the value is used for all bands respectively.
If input is PIL Image, this option is only available for ``Pillow>=5.2.0``.
resample (int, optional): deprecated argument and will be removed since v0.10.0.
-    Please use `arg`:interpolation: instead.
+    Please use the ``interpolation`` parameter instead.
.. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
@@ -1284,9 +1284,9 @@ class RandomAffine(torch.nn.Module):
image. If given a number, the value is used for all bands respectively.
If input is PIL Image, this option is only available for ``Pillow>=5.0.0``.
fillcolor (sequence or number, optional): deprecated argument and will be removed since v0.10.0.
-    Please use `arg`:fill: instead.
+    Please use the ``fill`` parameter instead.
resample (int, optional): deprecated argument and will be removed since v0.10.0.
-    Please use `arg`:interpolation: instead.
+    Please use the ``interpolation`` parameter instead.
.. _filters: https://pillow.readthedocs.io/en/latest/handbook/concepts.html#filters
......
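Since ``fillcolor`` and ``resample`` are deprecated in favour of ``fill`` and ``interpolation``, a sketch of the replacement spelling (assuming a torchvision version where ``InterpolationMode`` is available)::

    from torchvision import transforms
    from torchvision.transforms import InterpolationMode

    t = transforms.RandomAffine(
        degrees=15,
        fill=0,                                   # replaces fillcolor
        interpolation=InterpolationMode.BILINEAR, # replaces resample
    )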