fix prototype resource loading (#5447)

* fix prototype resource loading
* revert unrelated change

fix prototype resource loading (#5447)
* fix prototype resource loading
* revert unrelated change
c530b623 · Philip Meier · GitHub · e88a5549 · c530b623
Unverified Commit c530b623 authored Feb 21, 2022 by Philip Meier Committed by GitHub Feb 21, 2022
Hide whitespace changes
Inline Side-by-side

Showing with 20 additions and 10 deletions

torchvision/prototype/datasets/utils/_resource.py torchvision/prototype/datasets/utils/_resource.py +20 -10

No files found.
--- a/torchvision/prototype/datasets/utils/_resource.py
+++ b/torchvision/prototype/datasets/utils/_resource.py
@@ -88,20 +88,30 @@ class OnlineResource(abc.ABC):
        root = pathlib.Path(root)
        path = root / self.file_name
        # Instead of the raw file, there might also be files with fewer suffixes after decompression or directories
-        # with no suffixes at all. Thus, we look for all paths that share the same name without suffixes as the raw
+        # with no suffixes at all.
-        # file.
+        stem = path.name.replace("".join(path.suffixes), "")
-        path_candidates = {file for file in path.parent.glob(path.name.replace("".join(path.suffixes), "") + "*")}
-        # If we don't find anything, we try to download the raw file.
+        # In a first step, we check for a folder with the same stem as the raw file. If it exists, we use it since
-        if not path_candidates:
+        # extracted files give the best I/O performance. Note that OnlineResource._extract() makes sure that an archive
-            path_candidates = {self.download(root, skip_integrity_check=skip_integrity_check)}
+        # is always extracted in a folder with the corresponding file name.
+        folder_candidate = path.parent / stem
+        if folder_candidate.exists() and folder_candidate.is_dir():
+            return self._loader(folder_candidate)
+        # If there is no folder, we look for all files that share the same stem as the raw file, but might have a
+        # different suffix.
+        file_candidates = {file for file in path.parent.glob(stem + ".*")}
+        # If we don't find anything, we download the raw file.
+        if not file_candidates:
+            file_candidates = {self.download(root, skip_integrity_check=skip_integrity_check)}
        # If the only thing we find is the raw file, we use it and optionally perform some preprocessing steps.
-        if path_candidates == {path}:
+        if file_candidates == {path}:
            if self._preprocess is not None:
                path = self._preprocess(path)
-        # Otherwise we use the path with the fewest suffixes. This gives us the extracted > decompressed > raw priority
+        # Otherwise, we use the path with the fewest suffixes. This gives us the decompressed > raw priority that we
-        # that we want.
+        # want for the best I/O performance.
        else:
-            path = min(path_candidates, key=lambda path: len(path.suffixes))
+            path = min(file_candidates, key=lambda path: len(path.suffixes))
        return self._loader(path)
    @abc.abstractmethod