[Dataset] Add VOCSuperpixels dataset in LRGB (#6389)

8edcad2d · paoxiaode · GitHub · 403dba62 · 8edcad2d · 8edcad2d
Unverified Commit 8edcad2d authored Oct 08, 2023 by paoxiaode Committed by GitHub Oct 08, 2023
4 changed files
--- a/docs/source/api/python/dgl.data.rst
+++ b/docs/source/api/python/dgl.data.rst
@@ -70,6 +70,7 @@ Datasets for node classification/regression tasks
    MovieLensDataset
    PeptidesStructuralDataset
    PeptidesFunctionalDataset
+    VOCSuperpixelsDataset
 Edge Prediction Datasets
 ---------------------------------------

--- a/python/dgl/data/__init__.py
+++ b/python/dgl/data/__init__.py
@@ -75,7 +75,11 @@ from .heterophilous_graphs import (
 # Exception handling was added to prevent crashes for users who are using other
 # datasets.
 try:
-    from .lrgb import PeptidesFunctionalDataset, PeptidesStructuralDataset
+    from .lrgb import (
+        PeptidesFunctionalDataset,
+        PeptidesStructuralDataset,
+        VOCSuperpixelsDataset,
+    )
 except ImportError:
    pass
 from .pattern import PATTERNDataset

--- a/python/dgl/data/lrgb.py
+++ b/python/dgl/data/lrgb.py
@@ -10,7 +10,14 @@ from .. import backend as F
 from ..convert import graph as dgl_graph
 from .dgl_dataset import DGLDataset
-from .utils import download, load_graphs, save_graphs, Subset
+from .utils import (
+    download,
+    extract_archive,
+    load_graphs,
+    makedirs,
+    save_graphs,
+    Subset,
+)
 class PeptidesStructuralDataset(DGLDataset):
@@ -48,7 +55,7 @@ class PeptidesStructuralDataset(DGLDataset):
    Parameters
    ----------
    raw_dir : str
-        Raw file directory to download/contains the input data directory.
+        Directory to store all the downloaded raw datasets.
        Default: "~/.dgl/".
    force_reload : bool
        Whether to reload the dataset.
@@ -79,6 +86,9 @@ class PeptidesStructuralDataset(DGLDataset):
        ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
        edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
+    >>> # a tensor can also be used as the index, but the transform is then ignored
+    >>> # get train dataset
    >>> split_dict = dataset.get_idx_split()
    >>> trainset = dataset[split_dict["train"]]
    >>> graph, label = trainset[0]
@@ -86,6 +96,16 @@ class PeptidesStructuralDataset(DGLDataset):
    Graph(num_nodes=338, num_edges=682,
        ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
        edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
+    >>> # get subset of dataset
+    >>> import torch
+    >>> idx = torch.tensor([0, 1, 2])
+    >>> dataset_subset = dataset[idx]
+    >>> graph, label = dataset_subset[0]
+    >>> graph
+    Graph(num_nodes=119, num_edges=244,
+        ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
+        edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
    """
    def __init__(
@@ -234,7 +254,21 @@ class PeptidesStructuralDataset(DGLDataset):
        return len(self.graphs)
    def __getitem__(self, idx):
-        """Get datapoint with index"""
+        """Get the idx-th sample.
+        Parameters
+        ----------
+        idx : int or tensor
+            The sample index. If idx is a tensor, the transform is ignored.
+        Returns
+        -------
+        (:class:`dgl.DGLGraph`, Tensor)
+            Graph with node feature stored in ``feat`` field and its label.
+        or
+        :class:`dgl.data.utils.Subset`
+            Subset of the dataset at specified indices
+        """
        if F.is_tensor(idx) and idx.dim() == 1:
            return Subset(self, idx.cpu())
@@ -271,7 +305,7 @@ class PeptidesFunctionalDataset(DGLDataset):
    Parameters
    ----------
    raw_dir : str
-        Raw file directory to download/contains the input data directory.
+        Directory to store all the downloaded raw datasets.
        Default: "~/.dgl/".
    force_reload : bool
        Whether to reload the dataset.
@@ -302,6 +336,9 @@ class PeptidesFunctionalDataset(DGLDataset):
        ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
        edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
+    >>> # a tensor can also be used as the index, but the transform is then ignored
+    >>> # get train dataset
    >>> split_dict = dataset.get_idx_split()
    >>> trainset = dataset[split_dict["train"]]
    >>> graph, label = trainset[0]
@@ -310,6 +347,15 @@ class PeptidesFunctionalDataset(DGLDataset):
        ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
        edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
+    >>> # get subset of dataset
+    >>> import torch
+    >>> idx = torch.tensor([0, 1, 2])
+    >>> dataset_subset = dataset[idx]
+    >>> graph, label = dataset_subset[0]
+    >>> graph
+    Graph(num_nodes=119, num_edges=244,
+        ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
+        edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
    """
    def __init__(
@@ -442,7 +488,21 @@ class PeptidesFunctionalDataset(DGLDataset):
        return len(self.graphs)
    def __getitem__(self, idx):
-        """Get datapoint with index"""
+        """Get the idx-th sample.
+        Parameters
+        ----------
+        idx : int or tensor
+            The sample index. If idx is a tensor, the transform is ignored.
+        Returns
+        -------
+        (:class:`dgl.DGLGraph`, Tensor)
+            Graph with node feature stored in ``feat`` field and its label.
+        or
+        :class:`dgl.data.utils.Subset`
+            Subset of the dataset at specified indices
+        """
        if F.is_tensor(idx) and idx.dim() == 1:
            return Subset(self, idx.cpu())
@@ -450,3 +510,246 @@ class PeptidesFunctionalDataset(DGLDataset):
            return self.graphs[idx], self.labels[idx]
        else:
            return self._transform(self.graphs[idx]), self.labels[idx]
+class VOCSuperpixelsDataset(DGLDataset):
+    r"""VOCSuperpixels dataset for the node classification task.
+
+    DGL dataset of Pascal VOC Superpixels which contains image superpixels
+    and a semantic segmentation label for each node superpixel.
+
+    Color map:
+
+    0=background, 1=aeroplane, 2=bicycle, 3=bird, 4=boat, 5=bottle,
+    6=bus, 7=car, 8=cat, 9=chair, 10=cow,
+    11=diningtable, 12=dog, 13=horse, 14=motorbike, 15=person,
+    16=potted plant, 17=sheep, 18=sofa, 19=train, 20=tv/monitor
+
+    Reference `<https://arxiv.org/abs/2206.08164>`_
+
+    Statistics:
+
+    - Train examples: 8,498
+    - Valid examples: 1,428
+    - Test examples: 1,429
+    - Average number of nodes: 479.40
+    - Average number of edges: 2,710.48
+
+    Parameters
+    ----------
+    raw_dir : str
+        Directory to store all the downloaded raw datasets.
+        Default: "~/.dgl/".
+    split : str
+        Should be chosen from ["train", "val", "test"]
+        Default: "train".
+    construct_format : str, optional
+        Option to select the graph construction format.
+        Should be chosen from the following formats:
+
+        "edge_wt_only_coord": the graphs are 8-nn graphs with the edge weights
+        computed based on only spatial coordinates of superpixel nodes.
+
+        "edge_wt_coord_feat": the graphs are 8-nn graphs with the edge weights
+        computed based on combination of spatial coordinates and feature
+        values of superpixel nodes.
+
+        "edge_wt_region_boundary": the graphs are region boundary graphs
+        where two regions (i.e. superpixel nodes) have an edge between them
+        if they share a boundary in the original image.
+
+        Default: "edge_wt_region_boundary".
+    slic_compactness : int, optional
+        Option to select compactness of slic that was used for superpixels
+        Should be chosen from [10, 30]
+        Default: 30.
+    force_reload : bool
+        Whether to reload the dataset.
+        Default: None.
+    verbose : bool
+        Whether to print out progress information.
+        Default: None.
+    transform : callable, optional
+        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
+        a transformed version. The :class:`~dgl.DGLGraph` object will be
+        transformed before every access.
+
+    Examples
+    --------
+    >>> from dgl.data import VOCSuperpixelsDataset
+
+    >>> train_dataset = VOCSuperpixelsDataset(split="train")
+    >>> len(train_dataset)
+    8498
+    >>> train_dataset.num_classes
+    21
+    >>> graph = train_dataset[0]
+    >>> graph
+    Graph(num_nodes=460, num_edges=2632,
+        ndata_schemes={'feat': Scheme(shape=(14,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int32)}
+        edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float32)})
+
+    >>> # a tensor can also be used as the index, but the transform is then ignored
+    >>> import torch
+    >>> idx = torch.tensor([0, 1, 2])
+    >>> train_dataset_subset = train_dataset[idx]
+    >>> train_dataset_subset[0]
+    Graph(num_nodes=460, num_edges=2632,
+        ndata_schemes={'feat': Scheme(shape=(14,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int32)}
+        edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float32)})
+    """
+
+    # Download locations, keyed by slic compactness and then by construction
+    # format. The literals are triple-quoted, so each value is surrounded by
+    # newlines and indentation; it is stripped where it is consumed.
+    urls = {
+        10: {
+            "edge_wt_only_coord": """
+            https://www.dropbox.com/s/rk6pfnuh7tq3t37/voc_superpixels_edge_wt_only_coord.zip?dl=1
+            """,
+            "edge_wt_coord_feat": """
+            https://www.dropbox.com/s/2a53nmfp6llqg8y/voc_superpixels_edge_wt_coord_feat.zip?dl=1
+            """,
+            "edge_wt_region_boundary": """
+            https://www.dropbox.com/s/6pfz2mccfbkj7r3/voc_superpixels_edge_wt_region_boundary.zip?dl=1
+            """,
+        },
+        30: {
+            "edge_wt_only_coord": """
+            https://www.dropbox.com/s/toqulkdpb1jrswk/voc_superpixels_edge_wt_only_coord.zip?dl=1
+            """,
+            "edge_wt_coord_feat": """
+            https://www.dropbox.com/s/xywki8ysj63584d/voc_superpixels_edge_wt_coord_feat.zip?dl=1
+            """,
+            "edge_wt_region_boundary": """
+            https://www.dropbox.com/s/8x722ai272wqwl4/voc_superpixels_edge_wt_region_boundary.zip?dl=1
+            """,
+        },
+    }
+
+    def __init__(
+        self,
+        raw_dir=None,
+        split="train",
+        construct_format="edge_wt_region_boundary",
+        slic_compactness=30,
+        force_reload=None,
+        verbose=None,
+        transform=None,
+    ):
+        # Validate before touching any state; the assertions (and their
+        # messages) are kept so callers observe the same exception type.
+        assert split in ["train", "val", "test"], "split not valid."
+        assert construct_format in [
+            "edge_wt_only_coord",
+            "edge_wt_coord_feat",
+            "edge_wt_region_boundary",
+        ], "construct_format not valid."
+        assert slic_compactness in [10, 30], "slic_compactness not valid."
+
+        # These attributes must exist before the base-class constructor runs
+        # because the path properties below are derived from them.
+        self.construct_format = construct_format
+        self.slic_compactness = slic_compactness
+        self.split = split
+
+        super().__init__(
+            name="PascalVOC-SP",
+            raw_dir=raw_dir,
+            # Drop the newlines/indentation that the triple-quoted literals
+            # in ``urls`` carry so that a clean URL is handed to the
+            # downloader.
+            url=self.urls[slic_compactness][construct_format].strip(),
+            force_reload=force_reload,
+            verbose=verbose,
+            transform=transform,
+        )
+
+    @property
+    def save_path(self):
+        r"""Directory holding the data of the selected configuration."""
+        return os.path.join(
+            self.raw_path,
+            "slic_compactness_" + str(self.slic_compactness),
+            self.construct_format,
+        )
+
+    @property
+    def raw_data_path(self):
+        r"""Path of the raw pickle file of the selected split."""
+        return os.path.join(self.save_path, f"{self.split}.pickle")
+
+    @property
+    def graph_path(self):
+        r"""Path of the cached, processed graphs of the selected split."""
+        return os.path.join(self.save_path, f"processed_{self.split}.pkl")
+
+    @property
+    def num_classes(self):
+        r"""Number of classes for each node."""
+        return 21
+
+    def __len__(self):
+        r"""The number of examples in the dataset."""
+        return len(self.graphs)
+
+    def download(self):
+        r"""Download the archive, extract it and move it to ``save_path``."""
+        # Local import: ``shutil`` is only needed by this method.
+        import shutil
+
+        archive_name = "voc_superpixels_" + self.construct_format
+        zip_file_path = os.path.join(self.raw_path, archive_name + ".zip")
+        path = download(self.url, path=zip_file_path)
+        extract_archive(path, self.raw_path, overwrite=True)
+
+        # Create only the parent of the destination. Renaming a directory
+        # onto an already existing one fails on Windows, and on POSIX it
+        # fails whenever the existing directory is not empty, so a stale
+        # destination is removed before the freshly extracted data is moved.
+        os.makedirs(os.path.dirname(self.save_path), exist_ok=True)
+        if os.path.isdir(self.save_path):
+            shutil.rmtree(self.save_path)
+        os.rename(os.path.join(self.raw_path, archive_name), self.save_path)
+
+        # The archive is no longer needed once its content is in place.
+        os.unlink(path)
+
+    def process(self):
+        r"""Convert the raw samples of the split into DGL graphs."""
+        # NOTE: unpickling can execute arbitrary code; this file is expected
+        # to come only from the archive fetched by ``download``.
+        with open(self.raw_data_path, "rb") as f:
+            raw_graphs = pickle.load(f)
+
+        self.graphs = []
+        # Each raw sample is a tuple (x, edge_attr, edge_index, y)
+        #     Shape of x : [num_nodes, 14]
+        #     Shape of edge_attr : [num_edges, 1] or [num_edges, 2]
+        #     Shape of edge_index : [2, num_edges]
+        #     Shape of y : [num_nodes]
+        for raw in tqdm(raw_graphs, desc=f"Processing {self.split} dataset"):
+            node_feat, edge_feat, edge_index, node_label = (
+                raw[0],
+                raw[1],
+                raw[2],
+                raw[3],
+            )
+            g = dgl_graph(
+                (edge_index[0], edge_index[1]),
+                num_nodes=len(node_label),
+            )
+            g.ndata["feat"] = node_feat.to(F.float32)
+            g.edata["feat"] = edge_feat.to(F.float32)
+            g.ndata["label"] = F.tensor(node_label)
+            self.graphs.append(g)
+
+    def load(self):
+        r"""Load the processed graphs from ``graph_path``."""
+        with open(self.graph_path, "rb") as f:
+            self.graphs = pickle.load(f)
+
+    def save(self):
+        r"""Pickle the processed graphs to ``graph_path``."""
+        with open(self.graph_path, "wb") as f:
+            pickle.dump(self.graphs, f)
+
+    def has_cache(self):
+        r"""Return whether the processed graphs file already exists."""
+        return os.path.exists(self.graph_path)
+
+    def __getitem__(self, idx):
+        r"""Get the idx-th sample.
+
+        Parameters
+        ----------
+        idx : int or tensor
+            The sample index. If idx is a tensor, the transform is ignored.
+
+        Returns
+        -------
+        :class:`dgl.DGLGraph`
+            graph structure, node features, node labels and edge features.
+
+            - ``ndata['feat']``: node features
+            - ``ndata['label']``: node labels
+            - ``edata['feat']``: edge features
+        or
+        :class:`dgl.data.utils.Subset`
+            Subset of the dataset at specified indices
+        """
+        # A 1-D tensor selects several samples at once.
+        if F.is_tensor(idx) and idx.dim() == 1:
+            return Subset(self, idx.cpu())
+
+        if self._transform is None:
+            return self.graphs[idx]
+        else:
+            return self._transform(self.graphs[idx])
--- a/tests/integration/test_data.py
+++ b/tests/integration/test_data.py
@@ -90,6 +90,23 @@ def test_peptides_functional():
    assert dataset1.num_classes == label.shape[0]
+@unittest.skipIf(
+    F._default_context_str == "gpu",
+    reason="Datasets don't need to be tested on GPU.",
+)
+@unittest.skipIf(
+    dgl.backend.backend_name != "pytorch", reason="only supports pytorch"
+)
+def test_VOC_superpixels():
+    """Check that ``transform`` is applied when a sample is accessed."""
+    plain_dataset = data.VOCSuperpixelsDataset()
+    plain_graph = plain_dataset[0]
+
+    add_self_loop = dgl.AddSelfLoop(allow_duplicate=True)
+    looped_dataset = data.VOCSuperpixelsDataset(transform=add_self_loop)
+    looped_graph = looped_dataset[0]
+
+    # The transformed sample is expected to have exactly one extra edge per
+    # node compared with the untransformed sample.
+    extra_edges = looped_graph.num_edges() - plain_graph.num_edges()
+    assert extra_edges == plain_graph.num_nodes()
 @unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",