Add a note about the order of TUDataset (#3549)

a7b5085a · esang · GitHub · cd6d1138 · a7b5085a
Unverified Commit a7b5085a authored Nov 28, 2021 by esang Committed by GitHub Nov 28, 2021
Show whitespace changes
Inline Side-by-side

Showing with 21 additions and 16 deletions

python/dgl/data/tu.py python/dgl/data/tu.py +21 -16

No files found.
--- a/python/dgl/data/tu.py
+++ b/python/dgl/data/tu.py
@@ -34,6 +34,14 @@ class LegacyTUDataset(DGLBuiltinDataset):
    num_labels : int
        Number of classes

+    Notes
+    -----
+    LegacyTUDataset uses the provided node features by default. If no features are provided, it uses one-hot node labels instead.
+    If node labels are not provided either, it uses a constant as the node feature.
+
+    The dataset sorts graphs by their labels.
+    Shuffling the dataset is recommended before a manual train/val split.
+
    Examples
    --------
    >>> data = LegacyTUDataset('DD')
@@ -59,11 +67,6 @@ class LegacyTUDataset(DGLBuiltinDataset):
    Graph(num_nodes=9539, num_edges=47382,
          ndata_schemes={'feat': Scheme(shape=(89,), dtype=torch.float32), '_ID': Scheme(shape=(), dtype=torch.int64)}
          edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
-
-    Notes
-    -----
-    LegacyTUDataset uses provided node feature by default. If no feature provided, it uses one-hot node label instead.
-    If neither labels provided, it uses constant for node feature.
    """

    _url = r"https://www.chrsmrrs.com/graphkerneldatasets/{}.zip"
@@ -259,6 +262,18 @@ class TUDataset(DGLBuiltinDataset):
    as per the original data.  Other frameworks such as PyTorch Geometric removes the
    duplicates by default.  You can remove the duplicate edges with :func:`dgl.to_simple`.

+    Graphs may have node labels, node attributes, edge labels, and edge attributes,
+    varying from dataset to dataset.
+
+    Labels are mapped to :math:`\lbrace 0,\cdots,n-1 \rbrace` where :math:`n` is the
+    number of labels (some datasets have raw labels :math:`\lbrace -1, 1 \rbrace` which
+    will be mapped to :math:`\lbrace 0, 1 \rbrace`). In previous versions, the minimum
+    label was added so that :math:`\lbrace -1, 1 \rbrace` was mapped to
+    :math:`\lbrace 0, 2 \rbrace`.
+
+    The dataset sorts graphs by their labels.
+    Shuffling the dataset is recommended before a manual train/val split.
+
    Examples
    --------
    >>> data = TUDataset('DD')
@@ -285,16 +300,6 @@ class TUDataset(DGLBuiltinDataset):
          ndata_schemes={'node_labels': Scheme(shape=(1,), dtype=torch.int64), '_ID': Scheme(shape=(), dtype=torch.int64)}
          edata_schemes={'_ID': Scheme(shape=(), dtype=torch.int64)})
    
-    Notes
-    -----
-    Graphs may have node labels, node attributes, edge labels, and edge attributes,
-    varing from different dataset.
-
-    Labels are mapped to :math:`\lbrace 0,\cdots,n-1 \rbrace` where :math:`n` is the
-    number of labels (some datasets have raw labels :math:`\lbrace -1, 1 \rbrace` which
-    will be mapped to :math:`\lbrace 0, 1 \rbrace`). In previous versions, the minimum
-    label was added so that :math:`\lbrace -1, 1 \rbrace` was mapped to
-    :math:`\lbrace 0, 2 \rbrace`.
    """

    _url = r"https://www.chrsmrrs.com/graphkerneldatasets/{}.zip"