"...en/git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "2e8d18e6994a91f251e1a26fa3d05d6dad1be212"
Unverified commit d9caeaaa, authored by Xinyu Yao, committed by GitHub

[GraphBolt] Update dataset names that are exposed to users in examples. (#7280)


Co-authored-by: Ubuntu <ubuntu@ip-172-31-0-133.us-west-2.compute.internal>
parent e9661d3b
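
The gist of the change: example scripts now pass the plain dataset names, and BuiltinDataset appends the `-seeds` suffix internally when resolving the on-disk dataset (see the dataset.py hunk below). A minimal before/after sketch of the user-facing call, using names taken from this diff:

import dgl.graphbolt as gb

# Before: examples had to spell out the suffixed dataset name.
# dataset = gb.BuiltinDataset("ogbn-products-seeds").load()

# After: examples use the plain OGB name; the loader appends
# "-seeds" internally before locating or downloading the dataset.
dataset = gb.BuiltinDataset("ogbn-products").load()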
@@ -205,7 +205,7 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
-    dataset = gb.BuiltinDataset("ogbn-products-seeds").load()
+    dataset = gb.BuiltinDataset("ogbn-products").load()
     datamodule = DataModule(
         dataset,
         [10, 10, 10],
......
@@ -385,7 +385,7 @@ def main(args):
     # Load and preprocess dataset.
     print("Loading data")
-    dataset = gb.BuiltinDataset("ogbl-citation2-seeds").load()
+    dataset = gb.BuiltinDataset("ogbl-citation2").load()
     # Move the dataset to the selected storage.
     if args.storage_device == "pinned":
......
@@ -364,12 +364,8 @@ def parse_args():
     parser.add_argument(
         "--dataset",
         type=str,
-        default="ogbn-products-seeds",
-        choices=[
-            "ogbn-arxiv-seeds",
-            "ogbn-products-seeds",
-            "ogbn-papers100M-seeds",
-        ],
+        default="ogbn-products",
+        choices=["ogbn-arxiv", "ogbn-products", "ogbn-papers100M"],
         help="The dataset we can use for node classification example. Currently"
         " ogbn-products, ogbn-arxiv, ogbn-papers100M datasets are supported.",
     )
......
@@ -208,9 +208,8 @@ def main():
     parser.add_argument(
         "--dataset",
         type=str,
-        default="ogbn-products-seeds",
-        help='Name of the dataset to use (e.g., "ogbn-products-seeds",'
-        ' "ogbn-arxiv-seeds")',
+        default="ogbn-products",
+        help='Name of the dataset to use (e.g., "ogbn-products", "ogbn-arxiv")',
     )
     parser.add_argument(
         "--epochs", type=int, default=10, help="Number of training epochs."
......
@@ -324,12 +324,8 @@ def parse_args():
     parser.add_argument(
         "--dataset",
         type=str,
-        default="ogbn-products-seeds",
-        choices=[
-            "ogbn-arxiv-seeds",
-            "ogbn-products-seeds",
-            "ogbn-papers100M-seeds",
-        ],
+        default="ogbn-products",
+        choices=["ogbn-arxiv", "ogbn-products", "ogbn-papers100M"],
         help="The dataset we can use for node classification example. Currently"
         " ogbn-products, ogbn-arxiv, ogbn-papers100M datasets are supported.",
     )
......
@@ -158,7 +158,7 @@ if __name__ == "__main__":
     # Load and preprocess dataset.
     print("Loading data...")
-    dataset = gb.BuiltinDataset("cora-seeds").load()
+    dataset = gb.BuiltinDataset("cora").load()
     # If a CUDA device is selected, we pin the graph and the features so that
     # the GPU can access them.
......
@@ -117,7 +117,7 @@ if __name__ == "__main__":
     # Load and preprocess dataset.
     print("Loading data...")
-    dataset = gb.BuiltinDataset("cora-seeds").load()
+    dataset = gb.BuiltinDataset("cora").load()
     # If a CUDA device is selected, we pin the graph and the features so that
     # the GPU can access them.
......
@@ -153,7 +153,7 @@ def extract_embed(node_embed, input_nodes):
 def extract_node_features(name, block, data, node_embed, device):
     """Extract the node features from embedding layer or raw features."""
-    if name == "ogbn-mag-seeds":
+    if name == "ogbn-mag":
         input_nodes = {
             k: v.to(device) for k, v in block.srcdata[dgl.NID].items()
         }
@@ -419,8 +419,8 @@ def evaluate(
     model.eval()
     category = "paper"
     # An evaluator for the dataset.
-    if name == "ogbn-mag-seeds":
-        evaluator = Evaluator(name="ogbn-mag")
+    if name == "ogbn-mag":
+        evaluator = Evaluator(name=name)
     else:
         evaluator = MAG240MEvaluator()
@@ -578,7 +578,7 @@ def main(args):
     # `institution` are generated in advance and stored in the feature store.
     # For `ogbn-mag`, we generate the features on the fly.
     embed_layer = None
-    if args.dataset == "ogbn-mag-seeds":
+    if args.dataset == "ogbn-mag":
         # Create the embedding layer and move it to the appropriate device.
         embed_layer = rel_graph_embed(g, feat_size).to(device)
         print(
@@ -652,9 +652,9 @@ if __name__ == "__main__":
     parser.add_argument(
         "--dataset",
         type=str,
-        default="ogbn-mag-seeds",
-        choices=["ogbn-mag-seeds", "ogb-lsc-mag240m"],
-        help="Dataset name. Possible values: ogbn-mag-seeds, ogb-lsc-mag240m",
+        default="ogbn-mag",
+        choices=["ogbn-mag", "ogb-lsc-mag240m"],
+        help="Dataset name. Possible values: ogbn-mag, ogb-lsc-mag240m",
     )
     parser.add_argument("--num_epochs", type=int, default=3)
     parser.add_argument("--num_workers", type=int, default=0)
......
@@ -242,7 +242,7 @@ if __name__ == "__main__":
     # Load and preprocess dataset.
     print("Loading data")
     device = torch.device("cpu" if args.mode == "cpu" else "cuda")
-    dataset = gb.BuiltinDataset("ogbn-products-seeds").load()
+    dataset = gb.BuiltinDataset("ogbn-products").load()
     g = dataset.graph
     features = dataset.feature
......
@@ -1015,6 +1015,11 @@ class BuiltinDataset(OnDiskDataset):
     _all_datasets = _datasets + _large_datasets

     def __init__(self, name: str, root: str = "datasets") -> OnDiskDataset:
+        # For users on DGL 2.2 or later, we prefer the datasets with the
+        # `seeds` suffix. This hack should be removed once the `seeds`
+        # datasets have fully replaced the previous ones.
+        if "seeds" not in name:
+            name += "-seeds"
         dataset_dir = os.path.join(root, name)
         if not os.path.exists(dataset_dir):
             if name not in self._all_datasets:
......
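
The normalization added above is a plain string check; here is a standalone sketch of its behavior (normalize_builtin_name is a hypothetical helper for illustration, not part of the DGL API):

def normalize_builtin_name(name: str) -> str:
    # Mirror the suffix hack in BuiltinDataset.__init__: map user-facing
    # names onto the `-seeds` datasets that DGL 2.2+ downloads.
    if "seeds" not in name:
        name += "-seeds"
    return name

assert normalize_builtin_name("ogbn-products") == "ogbn-products-seeds"
# Names already carrying the suffix pass through unchanged.
assert normalize_builtin_name("ogbn-mag-seeds") == "ogbn-mag-seeds"

Note the substring check: any name containing "seeds" anywhere is left alone, which is loose but sufficient for the builtin dataset list.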
@@ -2481,8 +2481,10 @@ def test_BuiltinDataset():
     with tempfile.TemporaryDirectory() as test_dir:
         # Case 1: download from DGL S3 storage.
         dataset_name = "test-dataset-231207"
-        # Add dataset to the builtin dataset list for testing only.
-        gb.BuiltinDataset._all_datasets.append(dataset_name)
+        # Add dataset to the builtin dataset list for testing only. Since we
+        # append the `seeds` suffix to dataset names when downloading, we
+        # register the name with the `-seeds` suffix here.
+        gb.BuiltinDataset._all_datasets.append(dataset_name + "-seeds")
         dataset = gb.BuiltinDataset(name=dataset_name, root=test_dir).load()
         assert dataset.graph is not None
         assert dataset.feature is not None
@@ -2499,7 +2501,7 @@ def test_BuiltinDataset():
         dataset = None

         # Case 3: dataset is not available.
-        dataset_name = "fake_name"
+        dataset_name = "fake_name-seeds"
         with pytest.raises(
             RuntimeError,
             match=rf"Dataset {dataset_name} is not available.*",
......
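
One consequence the updated test pins down: because the constructor suffixes the name before validating it, the "not available" error reports the suffixed name, so the test pre-suffixes dataset_name to keep the regex aligned. A hedged sketch of the same expectation (assuming the check fires when the dataset is constructed, as the dataset.py hunk suggests):

import tempfile

import pytest

import dgl.graphbolt as gb

with tempfile.TemporaryDirectory() as test_dir:
    # "fake_name" is normalized to "fake_name-seeds" before validation,
    # so the error message carries the suffixed name.
    with pytest.raises(
        RuntimeError, match=r"Dataset fake_name-seeds is not available.*"
    ):
        gb.BuiltinDataset(name="fake_name", root=test_dir)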