"docs/vscode:/vscode.git/clone" did not exist on "9dc84448aca9718f9e1175cf83a6a9c10467882a"
Unverified Commit 8edcad2d authored by paoxiaode's avatar paoxiaode Committed by GitHub
Browse files

[Dataset] Add VOCSuperpixels dataset in LRGB (#6389)

parent 403dba62
......@@ -70,6 +70,7 @@ Datasets for node classification/regression tasks
MovieLensDataset
PeptidesStructuralDataset
PeptidesFunctionalDataset
VOCSuperpixelsDataset
Edge Prediction Datasets
---------------------------------------
......
......@@ -75,7 +75,11 @@ from .heterophilous_graphs import (
# Exception handling was added to prevent crashes for users who are using other
# datasets.
try:
from .lrgb import PeptidesFunctionalDataset, PeptidesStructuralDataset
from .lrgb import (
PeptidesFunctionalDataset,
PeptidesStructuralDataset,
VOCSuperpixelsDataset,
)
except ImportError:
pass
from .pattern import PATTERNDataset
......
......@@ -10,7 +10,14 @@ from .. import backend as F
from ..convert import graph as dgl_graph
from .dgl_dataset import DGLDataset
from .utils import download, load_graphs, save_graphs, Subset
from .utils import (
download,
extract_archive,
load_graphs,
makedirs,
save_graphs,
Subset,
)
class PeptidesStructuralDataset(DGLDataset):
......@@ -48,7 +55,7 @@ class PeptidesStructuralDataset(DGLDataset):
Parameters
----------
raw_dir : str
Raw file directory to download/contains the input data directory.
Directory to store all the downloaded raw datasets.
Default: "~/.dgl/".
force_reload : bool
Whether to reload the dataset.
......@@ -79,6 +86,9 @@ class PeptidesStructuralDataset(DGLDataset):
ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
>>> # accept tensor to be index, but will ignore transform parameter
>>> # get train dataset
>>> split_dict = dataset.get_idx_split()
>>> trainset = dataset[split_dict["train"]]
>>> graph, label = trainset[0]
......@@ -86,6 +96,16 @@ class PeptidesStructuralDataset(DGLDataset):
Graph(num_nodes=338, num_edges=682,
ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
>>> # get subset of dataset
>>> import torch
>>> idx = torch.tensor([0, 1, 2])
>>> dataset_subset = dataset[idx]
>>> graph, label = dataset_subset[0]
>>> graph
Graph(num_nodes=119, num_edges=244,
ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
"""
def __init__(
......@@ -234,7 +254,21 @@ class PeptidesStructuralDataset(DGLDataset):
return len(self.graphs)
def __getitem__(self, idx):
"""Get datapoint with index"""
"""Get the idx-th sample.
Parameters
---------
idx : int or tensor
The sample index, if idx is tensor will ignore transform.
Returns
-------
(:class:`dgl.DGLGraph`, Tensor)
Graph with node feature stored in ``feat`` field and its label.
or
:class:`dgl.data.utils.Subset`
Subset of the dataset at specified indices
"""
if F.is_tensor(idx) and idx.dim() == 1:
return Subset(self, idx.cpu())
......@@ -271,7 +305,7 @@ class PeptidesFunctionalDataset(DGLDataset):
Parameters
----------
raw_dir : str
Raw file directory to download/contains the input data directory.
Directory to store all the downloaded raw datasets.
Default: "~/.dgl/".
force_reload : bool
Whether to reload the dataset.
......@@ -302,6 +336,9 @@ class PeptidesFunctionalDataset(DGLDataset):
ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
>>> # accept tensor to be index, but will ignore transform parameter
>>> # get train dataset
>>> split_dict = dataset.get_idx_split()
>>> trainset = dataset[split_dict["train"]]
>>> graph, label = trainset[0]
......@@ -310,6 +347,15 @@ class PeptidesFunctionalDataset(DGLDataset):
ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
>>> # get subset of dataset
>>> import torch
>>> idx = torch.tensor([0, 1, 2])
>>> dataset_subset = dataset[idx]
>>> graph, label = dataset_subset[0]
>>> graph
Graph(num_nodes=119, num_edges=244,
ndata_schemes={'feat': Scheme(shape=(9,), dtype=torch.int64)}
edata_schemes={'feat': Scheme(shape=(3,), dtype=torch.int64)})
"""
def __init__(
......@@ -442,7 +488,21 @@ class PeptidesFunctionalDataset(DGLDataset):
return len(self.graphs)
def __getitem__(self, idx):
    """Get the idx-th sample.

    Parameters
    ----------
    idx : int or tensor
        The sample index, if idx is tensor will ignore transform.

    Returns
    -------
    (:class:`dgl.DGLGraph`, Tensor)
        Graph with node feature stored in ``feat`` field and its label.
    or
    :class:`dgl.data.utils.Subset`
        Subset of the dataset at specified indices
    """
    # A 1-D index tensor selects a view over several samples; the per-sample
    # transform is deliberately NOT applied in this case (see class docstring).
    if F.is_tensor(idx) and idx.dim() == 1:
        return Subset(self, idx.cpu())
    if self._transform is None:
        return self.graphs[idx], self.labels[idx]
    else:
        return self._transform(self.graphs[idx]), self.labels[idx]
class VOCSuperpixelsDataset(DGLDataset):
    r"""VOCSuperpixels dataset for the node classification task.

    DGL dataset of Pascal VOC Superpixels which contains image superpixels
    and a semantic segmentation label for each node superpixel.

    Color map:

    0=background, 1=aeroplane, 2=bicycle, 3=bird, 4=boat, 5=bottle,
    6=bus, 7=car, 8=cat, 9=chair, 10=cow,
    11=diningtable, 12=dog, 13=horse, 14=motorbike, 15=person,
    16=potted plant, 17=sheep, 18=sofa, 19=train, 20=tv/monitor

    Reference `<https://arxiv.org/abs/2206.08164.pdf>`_

    Statistics:

    - Train examples: 8,498
    - Valid examples: 1,428
    - Test examples: 1,429
    - Average number of nodes: 479.40
    - Average number of edges: 2,710.48

    Parameters
    ----------
    raw_dir : str
        Directory to store all the downloaded raw datasets.
        Default: "~/.dgl/".
    split : str
        Should be chosen from ["train", "val", "test"]
        Default: "train".
    construct_format : str, optional
        Option to select the graph construction format.
        Should be chosen from the following formats:

        "edge_wt_only_coord": the graphs are 8-nn graphs with the edge weights
        computed based on only spatial coordinates of superpixel nodes.

        "edge_wt_coord_feat": the graphs are 8-nn graphs with the edge weights
        computed based on combination of spatial coordinates and feature
        values of superpixel nodes.

        "edge_wt_region_boundary": the graphs region boundary graphs where two
        regions (i.e. superpixel nodes) have an edge between them if they share
        a boundary in the original image.

        Default: "edge_wt_region_boundary".
    slic_compactness : int, optional
        Option to select compactness of slic that was used for superpixels
        Should be chosen from [10, 30]
        Default: 30.
    force_reload : bool
        Whether to reload the dataset.
        Default: False.
    verbose : bool
        Whether to print out progress information.
        Default: False.
    transform : callable, optional
        A transform that takes in a :class:`~dgl.DGLGraph` object and returns
        a transformed version. The :class:`~dgl.DGLGraph` object will be
        transformed before every access.

    Examples
    --------
    >>> from dgl.data import VOCSuperpixelsDataset

    >>> train_dataset = VOCSuperpixelsDataset(split="train")
    >>> len(train_dataset)
    8498
    >>> train_dataset.num_classes
    21
    >>> graph = train_dataset[0]
    >>> graph
    Graph(num_nodes=460, num_edges=2632,
        ndata_schemes={'feat': Scheme(shape=(14,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int32)}
        edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float32)})

    >>> # accept tensor to be index, but will ignore transform parameter
    >>> import torch
    >>> idx = torch.tensor([0, 1, 2])
    >>> train_dataset_subset = train_dataset[idx]
    >>> train_dataset_subset[0]
    Graph(num_nodes=460, num_edges=2632,
        ndata_schemes={'feat': Scheme(shape=(14,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int32)}
        edata_schemes={'feat': Scheme(shape=(2,), dtype=torch.float32)})
    """

    # Download links keyed first by SLIC compactness, then by construction
    # format. The surrounding whitespace in the triple-quoted literals is
    # preserved from upstream; the HTTP layer tolerates it.
    urls = {
        10: {
            "edge_wt_only_coord": """
        https://www.dropbox.com/s/rk6pfnuh7tq3t37/voc_superpixels_edge_wt_only_coord.zip?dl=1
        """,
            "edge_wt_coord_feat": """
        https://www.dropbox.com/s/2a53nmfp6llqg8y/voc_superpixels_edge_wt_coord_feat.zip?dl=1
        """,
            "edge_wt_region_boundary": """
        https://www.dropbox.com/s/6pfz2mccfbkj7r3/voc_superpixels_edge_wt_region_boundary.zip?dl=1
        """,
        },
        30: {
            "edge_wt_only_coord": """
        https://www.dropbox.com/s/toqulkdpb1jrswk/voc_superpixels_edge_wt_only_coord.zip?dl=1
        """,
            "edge_wt_coord_feat": """
        https://www.dropbox.com/s/xywki8ysj63584d/voc_superpixels_edge_wt_coord_feat.zip?dl=1
        """,
            "edge_wt_region_boundary": """
        https://www.dropbox.com/s/8x722ai272wqwl4/voc_superpixels_edge_wt_region_boundary.zip?dl=1
        """,
        },
    }

    def __init__(
        self,
        raw_dir=None,
        split="train",
        construct_format="edge_wt_region_boundary",
        slic_compactness=30,
        force_reload=None,
        verbose=None,
        transform=None,
    ):
        self.construct_format = construct_format
        self.slic_compactness = slic_compactness
        # Validate user options up front so a typo fails fast instead of
        # producing a KeyError deep inside the download machinery.
        assert split in ["train", "val", "test"], "split not valid."
        assert construct_format in [
            "edge_wt_only_coord",
            "edge_wt_coord_feat",
            "edge_wt_region_boundary",
        ], "construct_format not valid."
        assert slic_compactness in [10, 30], "slic_compactness not valid."
        self.split = split
        super(VOCSuperpixelsDataset, self).__init__(
            name="PascalVOC-SP",
            raw_dir=raw_dir,
            url=self.urls[self.slic_compactness][self.construct_format],
            force_reload=force_reload,
            verbose=verbose,
            transform=transform,
        )

    @property
    def save_path(self):
        # Cache directory, namespaced by the two dataset variants so that
        # different (slic_compactness, construct_format) combinations never
        # collide on disk.
        return os.path.join(
            self.raw_path,
            "slic_compactness_" + str(self.slic_compactness),
            self.construct_format,
        )

    @property
    def raw_data_path(self):
        # Raw per-split pickle extracted from the downloaded archive.
        return os.path.join(self.save_path, f"{self.split}.pickle")

    @property
    def graph_path(self):
        # Processed per-split cache holding the converted DGL graphs.
        return os.path.join(self.save_path, f"processed_{self.split}.pkl")

    @property
    def num_classes(self):
        r"""Number of classes for each node."""
        return 21

    def __len__(self):
        r"""The number of examples in the dataset."""
        return len(self.graphs)

    def download(self):
        """Download and extract the archive, then move it into ``save_path``."""
        zip_file_path = os.path.join(
            self.raw_path, "voc_superpixels_" + self.construct_format + ".zip"
        )
        path = download(self.url, path=zip_file_path)
        extract_archive(path, self.raw_path, overwrite=True)
        makedirs(self.save_path)
        # NOTE(review): os.rename onto an existing directory only succeeds on
        # POSIX when the target is empty; makedirs above creates save_path
        # empty, so this works — verify on Windows, where rename to an
        # existing directory raises.
        os.rename(
            os.path.join(
                self.raw_path, "voc_superpixels_" + self.construct_format
            ),
            self.save_path,
        )
        os.unlink(path)

    def process(self):
        """Convert the raw pickled tuples into DGL graphs."""
        # NOTE: unpickling executes arbitrary code; the data comes from the
        # pinned upstream URLs above, so only load archives fetched by this
        # class.
        with open(self.raw_data_path, "rb") as f:
            graphs = pickle.load(f)
        self.graphs = []
        for idx in tqdm(
            range(len(graphs)), desc=f"Processing {self.split} dataset"
        ):
            graph = graphs[idx]
            # Each `graph` is a tuple (x, edge_attr, edge_index, y):
            #   x          : [num_nodes, 14] node features
            #   edge_attr  : [num_edges, 1] or [num_edges, 2] edge features
            #   edge_index : [2, num_edges] COO connectivity
            #   y          : [num_nodes] node labels
            DGLgraph = dgl_graph(
                (graph[2][0], graph[2][1]),
                num_nodes=len(graph[3]),
            )
            DGLgraph.ndata["feat"] = graph[0].to(F.float32)
            DGLgraph.edata["feat"] = graph[1].to(F.float32)
            DGLgraph.ndata["label"] = F.tensor(graph[3])
            self.graphs.append(DGLgraph)

    def load(self):
        """Load the processed graphs from the cache file."""
        # Keep the file handle and the unpickled payload in separate names
        # (the original rebound ``f`` to the loaded object).
        with open(self.graph_path, "rb") as f:
            self.graphs = pickle.load(f)

    def save(self):
        """Persist the processed graphs to the cache file."""
        with open(self.graph_path, "wb") as f:
            pickle.dump(self.graphs, f)

    def has_cache(self):
        """Return True when a processed cache exists for this split."""
        return os.path.exists(self.graph_path)

    def __getitem__(self, idx):
        r"""Get the idx-th sample.

        Parameters
        ----------
        idx : int or tensor
            The sample index, if idx is tensor will ignore transform.

        Returns
        -------
        :class:`dgl.DGLGraph`
            graph structure, node features, node labels and edge features.

            - ``ndata['feat']``: node features
            - ``ndata['label']``: node labels
            - ``edata['feat']``: edge features
        or
        :class:`dgl.data.utils.Subset`
            Subset of the dataset at specified indices
        """
        # A 1-D index tensor selects a view over several samples; the
        # per-sample transform is deliberately NOT applied in this case.
        if F.is_tensor(idx) and idx.dim() == 1:
            return Subset(self, idx.cpu())
        if self._transform is None:
            return self.graphs[idx]
        else:
            return self._transform(self.graphs[idx])
......@@ -90,6 +90,23 @@ def test_peptides_functional():
assert dataset1.num_classes == label.shape[0]
@unittest.skipIf(
    F._default_context_str == "gpu",
    reason="Datasets don't need to be tested on GPU.",
)
@unittest.skipIf(
    dgl.backend.backend_name != "pytorch", reason="only supports pytorch"
)
def test_VOC_superpixels():
    # Smoke-test VOCSuperpixelsDataset: load the first graph with and without
    # a transform, and check the transform is applied on access.
    transform = dgl.AddSelfLoop(allow_duplicate=True)
    dataset1 = data.VOCSuperpixelsDataset()
    g1 = dataset1[0]
    dataset2 = data.VOCSuperpixelsDataset(transform=transform)
    g2 = dataset2[0]
    # AddSelfLoop(allow_duplicate=True) adds one self-loop per node, so the
    # transformed graph has exactly num_nodes() more edges than the raw one.
    assert g2.num_edges() - g1.num_edges() == g1.num_nodes()
@unittest.skipIf(
F._default_context_str == "gpu",
reason="Datasets don't need to be tested on GPU.",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment