Unverified commit d9caeaaa authored by Xinyu Yao, committed by GitHub

[GraphBolt] Update dataset names exposed to users in examples. (#7280)


Co-authored-by: Ubuntu <ubuntu@ip-172-31-0-133.us-west-2.compute.internal>
parent e9661d3b
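The user-facing effect of this commit: examples now refer to builtin datasets by their plain OGB names, and BuiltinDataset appends the `-seeds` suffix internally (see the BuiltinDataset hunk below). A minimal sketch of the intended usage after the change, assuming DGL 2.2+ with GraphBolt available:

import dgl.graphbolt as gb

# Users pass the plain name; __init__ appends "-seeds" internally
# before resolving the download/on-disk path.
dataset = gb.BuiltinDataset("ogbn-products").load()

# Names that already contain "seeds" are left untouched, so the
# older suffixed form keeps working.
legacy = gb.BuiltinDataset("ogbn-products-seeds").load()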
@@ -205,7 +205,7 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
-    dataset = gb.BuiltinDataset("ogbn-products-seeds").load()
+    dataset = gb.BuiltinDataset("ogbn-products").load()
     datamodule = DataModule(
         dataset,
         [10, 10, 10],
@@ -385,7 +385,7 @@ def main(args):
     # Load and preprocess dataset.
     print("Loading data")
-    dataset = gb.BuiltinDataset("ogbl-citation2-seeds").load()
+    dataset = gb.BuiltinDataset("ogbl-citation2").load()
     # Move the dataset to the selected storage.
     if args.storage_device == "pinned":
@@ -364,12 +364,8 @@ def parse_args():
     parser.add_argument(
         "--dataset",
         type=str,
-        default="ogbn-products-seeds",
-        choices=[
-            "ogbn-arxiv-seeds",
-            "ogbn-products-seeds",
-            "ogbn-papers100M-seeds",
-        ],
+        default="ogbn-products",
+        choices=["ogbn-arxiv", "ogbn-products", "ogbn-papers100M"],
         help="The dataset we can use for node classification example. Currently"
         " ogbn-products, ogbn-arxiv, ogbn-papers100M datasets are supported.",
     )
@@ -208,9 +208,8 @@ def main():
     parser.add_argument(
         "--dataset",
         type=str,
-        default="ogbn-products-seeds",
-        help='Name of the dataset to use (e.g., "ogbn-products-seeds",'
-        + ' "ogbn-arxiv-seeds")',
+        default="ogbn-products",
+        help='Name of the dataset to use (e.g., "ogbn-products", "ogbn-arxiv")',
     )
     parser.add_argument(
         "--epochs", type=int, default=10, help="Number of training epochs."
@@ -324,12 +324,8 @@ def parse_args():
     parser.add_argument(
         "--dataset",
         type=str,
-        default="ogbn-products-seeds",
-        choices=[
-            "ogbn-arxiv-seeds",
-            "ogbn-products-seeds",
-            "ogbn-papers100M-seeds",
-        ],
+        default="ogbn-products",
+        choices=["ogbn-arxiv", "ogbn-products", "ogbn-papers100M"],
         help="The dataset we can use for node classification example. Currently"
         " ogbn-products, ogbn-arxiv, ogbn-papers100M datasets are supported.",
     )
@@ -158,7 +158,7 @@ if __name__ == "__main__":
     # Load and preprocess dataset.
     print("Loading data...")
-    dataset = gb.BuiltinDataset("cora-seeds").load()
+    dataset = gb.BuiltinDataset("cora").load()
     # If a CUDA device is selected, we pin the graph and the features so that
     # the GPU can access them.
@@ -117,7 +117,7 @@ if __name__ == "__main__":
     # Load and preprocess dataset.
    print("Loading data...")
-    dataset = gb.BuiltinDataset("cora-seeds").load()
+    dataset = gb.BuiltinDataset("cora").load()
     # If a CUDA device is selected, we pin the graph and the features so that
     # the GPU can access them.
@@ -153,7 +153,7 @@ def extract_embed(node_embed, input_nodes):
 def extract_node_features(name, block, data, node_embed, device):
     """Extract the node features from embedding layer or raw features."""
-    if name == "ogbn-mag-seeds":
+    if name == "ogbn-mag":
         input_nodes = {
             k: v.to(device) for k, v in block.srcdata[dgl.NID].items()
         }
@@ -419,8 +419,8 @@ def evaluate(
     model.eval()
     category = "paper"
     # An evaluator for the dataset.
-    if name == "ogbn-mag-seeds":
-        evaluator = Evaluator(name="ogbn-mag")
+    if name == "ogbn-mag":
+        evaluator = Evaluator(name=name)
     else:
         evaluator = MAG240MEvaluator()
@@ -578,7 +578,7 @@ def main(args):
     # `institution` are generated in advance and stored in the feature store.
     # For `ogbn-mag`, we generate the features on the fly.
     embed_layer = None
-    if args.dataset == "ogbn-mag-seeds":
+    if args.dataset == "ogbn-mag":
         # Create the embedding layer and move it to the appropriate device.
         embed_layer = rel_graph_embed(g, feat_size).to(device)
         print(
@@ -652,9 +652,9 @@ if __name__ == "__main__":
     parser.add_argument(
         "--dataset",
         type=str,
-        default="ogbn-mag-seeds",
-        choices=["ogbn-mag-seeds", "ogb-lsc-mag240m"],
-        help="Dataset name. Possible values: ogbn-mag-seeds, ogb-lsc-mag240m",
+        default="ogbn-mag",
+        choices=["ogbn-mag", "ogb-lsc-mag240m"],
+        help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m",
     )
     parser.add_argument("--num_epochs", type=int, default=3)
     parser.add_argument("--num_workers", type=int, default=0)
@@ -242,7 +242,7 @@ if __name__ == "__main__":
     # Load and preprocess dataset.
     print("Loading data")
     device = torch.device("cpu" if args.mode == "cpu" else "cuda")
-    dataset = gb.BuiltinDataset("ogbn-products-seeds").load()
+    dataset = gb.BuiltinDataset("ogbn-products").load()
     g = dataset.graph
     features = dataset.feature
@@ -1015,6 +1015,11 @@ class BuiltinDataset(OnDiskDataset):
     _all_datasets = _datasets + _large_datasets

     def __init__(self, name: str, root: str = "datasets") -> OnDiskDataset:
+        # For users on DGL 2.2 or later, we prefer the datasets with the
+        # `seeds` suffix. This hack should be removed once the datasets with
+        # the `seeds` suffix have fully replaced the previous ones.
+        if "seeds" not in name:
+            name += "-seeds"
         dataset_dir = os.path.join(root, name)
         if not os.path.exists(dataset_dir):
             if name not in self._all_datasets:
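Since the check above is a substring test rather than a suffix test, its behavior is easy to sketch in isolation. A hypothetical standalone helper (`resolve_builtin_name` is not a GraphBolt API, only an illustration of the logic in `__init__`):

def resolve_builtin_name(name: str) -> str:
    # Mirror of the hack in BuiltinDataset.__init__: append "-seeds"
    # unless the name already mentions "seeds" anywhere.
    if "seeds" not in name:
        name += "-seeds"
    return name

assert resolve_builtin_name("ogbn-products") == "ogbn-products-seeds"
assert resolve_builtin_name("ogbn-products-seeds") == "ogbn-products-seeds"
# Substring, not suffix: a name merely containing "seeds" is kept as is.
assert resolve_builtin_name("seeds-of-change") == "seeds-of-change"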
@@ -2481,8 +2481,10 @@ def test_BuiltinDataset():
     with tempfile.TemporaryDirectory() as test_dir:
         # Case 1: download from DGL S3 storage.
         dataset_name = "test-dataset-231207"
-        # Add dataset to the builtin dataset list for testing only.
-        gb.BuiltinDataset._all_datasets.append(dataset_name)
+        # Add the dataset to the builtin dataset list for testing only. Since
+        # the `seeds` suffix is appended to dataset names when downloading, we
+        # append the `-seeds` suffix to the dataset name here as well.
+        gb.BuiltinDataset._all_datasets.append(dataset_name + "-seeds")
         dataset = gb.BuiltinDataset(name=dataset_name, root=test_dir).load()
         assert dataset.graph is not None
         assert dataset.feature is not None
@@ -2499,7 +2501,7 @@ def test_BuiltinDataset():
         dataset = None
         # Case 3: dataset is not available.
-        dataset_name = "fake_name"
+        dataset_name = "fake_name-seeds"
         with pytest.raises(
             RuntimeError,
             match=rf"Dataset {dataset_name} is not available.*",
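The switch to the suffixed name in the match pattern follows from the `__init__` hack: the name embedded in the error message is the normalized, suffixed one. A hedged sketch of the failure path this test exercises (error wording assumed from the test's `match` pattern):

import pytest
import dgl.graphbolt as gb

# "fake_name" is normalized to "fake_name-seeds" before the
# availability check, so the RuntimeError names the suffixed form.
with pytest.raises(
    RuntimeError, match=r"Dataset fake_name-seeds is not available"
):
    gb.BuiltinDataset(name="fake_name")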