Unverified Commit 67d93458 authored by Rhett Ying's avatar Rhett Ying Committed by GitHub
Browse files

[GraphBolt] update OnDiskDataset tutorial (#6793)

parent 3cf82462
...@@ -127,20 +127,20 @@ ...@@ -127,20 +127,20 @@
"# Edge type: \"user:like:item\"\n", "# Edge type: \"user:like:item\"\n",
"like_edges_path = os.path.join(base_dir, \"like-edges.csv\")\n", "like_edges_path = os.path.join(base_dir, \"like-edges.csv\")\n",
"like_edges = np.random.randint(0, num_nodes, size=(num_edges, 2))\n", "like_edges = np.random.randint(0, num_nodes, size=(num_edges, 2))\n",
"print(f\"Part of [user:like:item] edges: {like_edges[:10, :]}\")\n", "print(f\"Part of [user:like:item] edges: {like_edges[:10, :]}\\n\")\n",
"\n", "\n",
"df = pd.DataFrame(like_edges)\n", "df = pd.DataFrame(like_edges)\n",
"df.to_csv(like_edges_path, index=False, header=False)\n", "df.to_csv(like_edges_path, index=False, header=False)\n",
"print(f\"[user:like:item] edges are saved into {like_edges_path}\")\n", "print(f\"[user:like:item] edges are saved into {like_edges_path}\\n\")\n",
"\n", "\n",
"# Edge type: \"user:follow:user\"\n", "# Edge type: \"user:follow:user\"\n",
"follow_edges_path = os.path.join(base_dir, \"follow-edges.csv\")\n", "follow_edges_path = os.path.join(base_dir, \"follow-edges.csv\")\n",
"follow_edges = np.random.randint(0, num_nodes, size=(num_edges, 2))\n", "follow_edges = np.random.randint(0, num_nodes, size=(num_edges, 2))\n",
"print(f\"Part of [user:follow:user] edges: {follow_edges[:10, :]}\")\n", "print(f\"Part of [user:follow:user] edges: {follow_edges[:10, :]}\\n\")\n",
"\n", "\n",
"df = pd.DataFrame(follow_edges)\n", "df = pd.DataFrame(follow_edges)\n",
"df.to_csv(follow_edges_path, index=False, header=False)\n", "df.to_csv(follow_edges_path, index=False, header=False)\n",
"print(f\"[user:follow:user] edges are saved into {follow_edges_path}\")" "print(f\"[user:follow:user] edges are saved into {follow_edges_path}\\n\")"
], ],
"metadata": { "metadata": {
"id": "HcBt4G5BmSjr" "id": "HcBt4G5BmSjr"
...@@ -166,56 +166,56 @@ ...@@ -166,56 +166,56 @@
"node_user_feat_0 = np.random.rand(num_nodes, 5)\n", "node_user_feat_0 = np.random.rand(num_nodes, 5)\n",
"print(f\"Part of node[user] feature [feat_0]: {node_user_feat_0[:10, :]}\")\n", "print(f\"Part of node[user] feature [feat_0]: {node_user_feat_0[:10, :]}\")\n",
"np.save(node_user_feat_0_path, node_user_feat_0)\n", "np.save(node_user_feat_0_path, node_user_feat_0)\n",
"print(f\"Node[user] feature [feat_0] is saved to {node_user_feat_0_path}\")\n", "print(f\"Node[user] feature [feat_0] is saved to {node_user_feat_0_path}\\n\")\n",
"\n", "\n",
"# Generate another node[user] feature in torch tensor\n", "# Generate another node[user] feature in torch tensor\n",
"node_user_feat_1_path = os.path.join(base_dir, \"node-user-feat-1.pt\")\n", "node_user_feat_1_path = os.path.join(base_dir, \"node-user-feat-1.pt\")\n",
"node_user_feat_1 = torch.rand(num_nodes, 5)\n", "node_user_feat_1 = torch.rand(num_nodes, 5)\n",
"print(f\"Part of node[user] feature [feat_1]: {node_user_feat_1[:10, :]}\")\n", "print(f\"Part of node[user] feature [feat_1]: {node_user_feat_1[:10, :]}\")\n",
"torch.save(node_user_feat_1, node_user_feat_1_path)\n", "torch.save(node_user_feat_1, node_user_feat_1_path)\n",
"print(f\"Node[user] feature [feat_1] is saved to {node_user_feat_1_path}\")\n", "print(f\"Node[user] feature [feat_1] is saved to {node_user_feat_1_path}\\n\")\n",
"\n", "\n",
"# Generate node[item] feature in numpy array.\n", "# Generate node[item] feature in numpy array.\n",
"node_item_feat_0_path = os.path.join(base_dir, \"node-item-feat-0.npy\")\n", "node_item_feat_0_path = os.path.join(base_dir, \"node-item-feat-0.npy\")\n",
"node_item_feat_0 = np.random.rand(num_nodes, 5)\n", "node_item_feat_0 = np.random.rand(num_nodes, 5)\n",
"print(f\"Part of node[item] feature [feat_0]: {node_item_feat_0[:10, :]}\")\n", "print(f\"Part of node[item] feature [feat_0]: {node_item_feat_0[:10, :]}\")\n",
"np.save(node_item_feat_0_path, node_item_feat_0)\n", "np.save(node_item_feat_0_path, node_item_feat_0)\n",
"print(f\"Node[item] feature [feat_0] is saved to {node_item_feat_0_path}\")\n", "print(f\"Node[item] feature [feat_0] is saved to {node_item_feat_0_path}\\n\")\n",
"\n", "\n",
"# Generate another node[item] feature in torch tensor\n", "# Generate another node[item] feature in torch tensor\n",
"node_item_feat_1_path = os.path.join(base_dir, \"node-item-feat-1.pt\")\n", "node_item_feat_1_path = os.path.join(base_dir, \"node-item-feat-1.pt\")\n",
"node_item_feat_1 = torch.rand(num_nodes, 5)\n", "node_item_feat_1 = torch.rand(num_nodes, 5)\n",
"print(f\"Part of node[item] feature [feat_1]: {node_item_feat_1[:10, :]}\")\n", "print(f\"Part of node[item] feature [feat_1]: {node_item_feat_1[:10, :]}\")\n",
"torch.save(node_item_feat_1, node_item_feat_1_path)\n", "torch.save(node_item_feat_1, node_item_feat_1_path)\n",
"print(f\"Node[item] feature [feat_1] is saved to {node_item_feat_1_path}\")\n", "print(f\"Node[item] feature [feat_1] is saved to {node_item_feat_1_path}\\n\")\n",
"\n", "\n",
"# Generate edge[user:like:item] feature in numpy array.\n", "# Generate edge[user:like:item] feature in numpy array.\n",
"edge_like_feat_0_path = os.path.join(base_dir, \"edge-like-feat-0.npy\")\n", "edge_like_feat_0_path = os.path.join(base_dir, \"edge-like-feat-0.npy\")\n",
"edge_like_feat_0 = np.random.rand(num_edges, 5)\n", "edge_like_feat_0 = np.random.rand(num_edges, 5)\n",
"print(f\"Part of edge[user:like:item] feature [feat_0]: {edge_like_feat_0[:10, :]}\")\n", "print(f\"Part of edge[user:like:item] feature [feat_0]: {edge_like_feat_0[:10, :]}\")\n",
"np.save(edge_like_feat_0_path, edge_like_feat_0)\n", "np.save(edge_like_feat_0_path, edge_like_feat_0)\n",
"print(f\"Edge[user:like:item] feature [feat_0] is saved to {edge_like_feat_0_path}\")\n", "print(f\"Edge[user:like:item] feature [feat_0] is saved to {edge_like_feat_0_path}\\n\")\n",
"\n", "\n",
"# Generate another edge[user:like:item] feature in torch tensor\n", "# Generate another edge[user:like:item] feature in torch tensor\n",
"edge_like_feat_1_path = os.path.join(base_dir, \"edge-like-feat-1.pt\")\n", "edge_like_feat_1_path = os.path.join(base_dir, \"edge-like-feat-1.pt\")\n",
"edge_like_feat_1 = torch.rand(num_edges, 5)\n", "edge_like_feat_1 = torch.rand(num_edges, 5)\n",
"print(f\"Part of edge[user:like:item] feature [feat_1]: {edge_like_feat_1[:10, :]}\")\n", "print(f\"Part of edge[user:like:item] feature [feat_1]: {edge_like_feat_1[:10, :]}\")\n",
"torch.save(edge_like_feat_1, edge_like_feat_1_path)\n", "torch.save(edge_like_feat_1, edge_like_feat_1_path)\n",
"print(f\"Edge[user:like:item] feature [feat_1] is saved to {edge_like_feat_1_path}\")\n", "print(f\"Edge[user:like:item] feature [feat_1] is saved to {edge_like_feat_1_path}\\n\")\n",
"\n", "\n",
"# Generate edge[user:follow:user] feature in numpy array.\n", "# Generate edge[user:follow:user] feature in numpy array.\n",
"edge_follow_feat_0_path = os.path.join(base_dir, \"edge-follow-feat-0.npy\")\n", "edge_follow_feat_0_path = os.path.join(base_dir, \"edge-follow-feat-0.npy\")\n",
"edge_follow_feat_0 = np.random.rand(num_edges, 5)\n", "edge_follow_feat_0 = np.random.rand(num_edges, 5)\n",
"print(f\"Part of edge[user:follow:user] feature [feat_0]: {edge_follow_feat_0[:10, :]}\")\n", "print(f\"Part of edge[user:follow:user] feature [feat_0]: {edge_follow_feat_0[:10, :]}\")\n",
"np.save(edge_follow_feat_0_path, edge_follow_feat_0)\n", "np.save(edge_follow_feat_0_path, edge_follow_feat_0)\n",
"print(f\"Edge[user:follow:user] feature [feat_0] is saved to {edge_follow_feat_0_path}\")\n", "print(f\"Edge[user:follow:user] feature [feat_0] is saved to {edge_follow_feat_0_path}\\n\")\n",
"\n", "\n",
"# Generate another edge[user:follow:user] feature in torch tensor\n", "# Generate another edge[user:follow:user] feature in torch tensor\n",
"edge_follow_feat_1_path = os.path.join(base_dir, \"edge-follow-feat-1.pt\")\n", "edge_follow_feat_1_path = os.path.join(base_dir, \"edge-follow-feat-1.pt\")\n",
"edge_follow_feat_1 = torch.rand(num_edges, 5)\n", "edge_follow_feat_1 = torch.rand(num_edges, 5)\n",
"print(f\"Part of edge[user:follow:user] feature [feat_1]: {edge_follow_feat_1[:10, :]}\")\n", "print(f\"Part of edge[user:follow:user] feature [feat_1]: {edge_follow_feat_1[:10, :]}\")\n",
"torch.save(edge_follow_feat_1, edge_follow_feat_1_path)\n", "torch.save(edge_follow_feat_1, edge_follow_feat_1_path)\n",
"print(f\"Edge[user:follow:user] feature [feat_1] is saved to {edge_follow_feat_1_path}\")" "print(f\"Edge[user:follow:user] feature [feat_1] is saved to {edge_follow_feat_1_path}\\n\")"
], ],
"metadata": { "metadata": {
"id": "_PVu1u5brBhF" "id": "_PVu1u5brBhF"
...@@ -262,84 +262,84 @@ ...@@ -262,84 +262,84 @@
"nc_train_user_ids = user_ids[:num_trains]\n", "nc_train_user_ids = user_ids[:num_trains]\n",
"print(f\"Part of train ids[user] for node classification: {nc_train_user_ids[:10]}\")\n", "print(f\"Part of train ids[user] for node classification: {nc_train_user_ids[:10]}\")\n",
"np.save(nc_train_user_ids_path, nc_train_user_ids)\n", "np.save(nc_train_user_ids_path, nc_train_user_ids)\n",
"print(f\"NC train ids[user] are saved to {nc_train_user_ids_path}\")\n", "print(f\"NC train ids[user] are saved to {nc_train_user_ids_path}\\n\")\n",
"\n", "\n",
"# Train labels for user.\n", "# Train labels for user.\n",
"nc_train_user_labels_path = os.path.join(base_dir, \"nc-train-user-labels.pt\")\n", "nc_train_user_labels_path = os.path.join(base_dir, \"nc-train-user-labels.pt\")\n",
"nc_train_user_labels = torch.randint(0, 10, (num_trains,))\n", "nc_train_user_labels = torch.randint(0, 10, (num_trains,))\n",
"print(f\"Part of train labels[user] for node classification: {nc_train_user_labels[:10]}\")\n", "print(f\"Part of train labels[user] for node classification: {nc_train_user_labels[:10]}\")\n",
"torch.save(nc_train_user_labels, nc_train_user_labels_path)\n", "torch.save(nc_train_user_labels, nc_train_user_labels_path)\n",
"print(f\"NC train labels[user] are saved to {nc_train_user_labels_path}\")\n", "print(f\"NC train labels[user] are saved to {nc_train_user_labels_path}\\n\")\n",
"\n", "\n",
"# Train IDs for item.\n", "# Train IDs for item.\n",
"nc_train_item_ids_path = os.path.join(base_dir, \"nc-train-item-ids.npy\")\n", "nc_train_item_ids_path = os.path.join(base_dir, \"nc-train-item-ids.npy\")\n",
"nc_train_item_ids = item_ids[:num_trains]\n", "nc_train_item_ids = item_ids[:num_trains]\n",
"print(f\"Part of train ids[item] for node classification: {nc_train_item_ids[:10]}\")\n", "print(f\"Part of train ids[item] for node classification: {nc_train_item_ids[:10]}\")\n",
"np.save(nc_train_item_ids_path, nc_train_item_ids)\n", "np.save(nc_train_item_ids_path, nc_train_item_ids)\n",
"print(f\"NC train ids[item] are saved to {nc_train_item_ids_path}\")\n", "print(f\"NC train ids[item] are saved to {nc_train_item_ids_path}\\n\")\n",
"\n", "\n",
"# Train labels for item.\n", "# Train labels for item.\n",
"nc_train_item_labels_path = os.path.join(base_dir, \"nc-train-item-labels.pt\")\n", "nc_train_item_labels_path = os.path.join(base_dir, \"nc-train-item-labels.pt\")\n",
"nc_train_item_labels = torch.randint(0, 10, (num_trains,))\n", "nc_train_item_labels = torch.randint(0, 10, (num_trains,))\n",
"print(f\"Part of train labels[item] for node classification: {nc_train_item_labels[:10]}\")\n", "print(f\"Part of train labels[item] for node classification: {nc_train_item_labels[:10]}\")\n",
"torch.save(nc_train_item_labels, nc_train_item_labels_path)\n", "torch.save(nc_train_item_labels, nc_train_item_labels_path)\n",
"print(f\"NC train labels[item] are saved to {nc_train_item_labels_path}\")\n", "print(f\"NC train labels[item] are saved to {nc_train_item_labels_path}\\n\")\n",
"\n", "\n",
"# Val IDs for user.\n", "# Val IDs for user.\n",
"nc_val_user_ids_path = os.path.join(base_dir, \"nc-val-user-ids.npy\")\n", "nc_val_user_ids_path = os.path.join(base_dir, \"nc-val-user-ids.npy\")\n",
"nc_val_user_ids = user_ids[num_trains:num_trains+num_vals]\n", "nc_val_user_ids = user_ids[num_trains:num_trains+num_vals]\n",
"print(f\"Part of val ids[user] for node classification: {nc_val_user_ids[:10]}\")\n", "print(f\"Part of val ids[user] for node classification: {nc_val_user_ids[:10]}\")\n",
"np.save(nc_val_user_ids_path, nc_val_user_ids)\n", "np.save(nc_val_user_ids_path, nc_val_user_ids)\n",
"print(f\"NC val ids[user] are saved to {nc_val_user_ids_path}\")\n", "print(f\"NC val ids[user] are saved to {nc_val_user_ids_path}\\n\")\n",
"\n", "\n",
"# Val labels for user.\n", "# Val labels for user.\n",
"nc_val_user_labels_path = os.path.join(base_dir, \"nc-val-user-labels.pt\")\n", "nc_val_user_labels_path = os.path.join(base_dir, \"nc-val-user-labels.pt\")\n",
"nc_val_user_labels = torch.randint(0, 10, (num_vals,))\n", "nc_val_user_labels = torch.randint(0, 10, (num_vals,))\n",
"print(f\"Part of val labels[user] for node classification: {nc_val_user_labels[:10]}\")\n", "print(f\"Part of val labels[user] for node classification: {nc_val_user_labels[:10]}\")\n",
"torch.save(nc_val_user_labels, nc_val_user_labels_path)\n", "torch.save(nc_val_user_labels, nc_val_user_labels_path)\n",
"print(f\"NC val labels[user] are saved to {nc_val_user_labels_path}\")\n", "print(f\"NC val labels[user] are saved to {nc_val_user_labels_path}\\n\")\n",
"\n", "\n",
"# Val IDs for item.\n", "# Val IDs for item.\n",
"nc_val_item_ids_path = os.path.join(base_dir, \"nc-val-item-ids.npy\")\n", "nc_val_item_ids_path = os.path.join(base_dir, \"nc-val-item-ids.npy\")\n",
"nc_val_item_ids = item_ids[num_trains:num_trains+num_vals]\n", "nc_val_item_ids = item_ids[num_trains:num_trains+num_vals]\n",
"print(f\"Part of val ids[item] for node classification: {nc_val_item_ids[:10]}\")\n", "print(f\"Part of val ids[item] for node classification: {nc_val_item_ids[:10]}\")\n",
"np.save(nc_val_item_ids_path, nc_val_item_ids)\n", "np.save(nc_val_item_ids_path, nc_val_item_ids)\n",
"print(f\"NC val ids[item] are saved to {nc_val_item_ids_path}\")\n", "print(f\"NC val ids[item] are saved to {nc_val_item_ids_path}\\n\")\n",
"\n", "\n",
"# Val labels for item.\n", "# Val labels for item.\n",
"nc_val_item_labels_path = os.path.join(base_dir, \"nc-val-item-labels.pt\")\n", "nc_val_item_labels_path = os.path.join(base_dir, \"nc-val-item-labels.pt\")\n",
"nc_val_item_labels = torch.randint(0, 10, (num_vals,))\n", "nc_val_item_labels = torch.randint(0, 10, (num_vals,))\n",
"print(f\"Part of val labels[item] for node classification: {nc_val_item_labels[:10]}\")\n", "print(f\"Part of val labels[item] for node classification: {nc_val_item_labels[:10]}\")\n",
"torch.save(nc_val_item_labels, nc_val_item_labels_path)\n", "torch.save(nc_val_item_labels, nc_val_item_labels_path)\n",
"print(f\"NC val labels[item] are saved to {nc_val_item_labels_path}\")\n", "print(f\"NC val labels[item] are saved to {nc_val_item_labels_path}\\n\")\n",
"\n", "\n",
"# Test IDs for user.\n", "# Test IDs for user.\n",
"nc_test_user_ids_path = os.path.join(base_dir, \"nc-test-user-ids.npy\")\n", "nc_test_user_ids_path = os.path.join(base_dir, \"nc-test-user-ids.npy\")\n",
"nc_test_user_ids = user_ids[-num_tests:]\n", "nc_test_user_ids = user_ids[-num_tests:]\n",
"print(f\"Part of test ids[user] for node classification: {nc_test_user_ids[:10]}\")\n", "print(f\"Part of test ids[user] for node classification: {nc_test_user_ids[:10]}\")\n",
"np.save(nc_test_user_ids_path, nc_test_user_ids)\n", "np.save(nc_test_user_ids_path, nc_test_user_ids)\n",
"print(f\"NC test ids[user] are saved to {nc_test_user_ids_path}\")\n", "print(f\"NC test ids[user] are saved to {nc_test_user_ids_path}\\n\")\n",
"\n", "\n",
"# Test labels for user.\n", "# Test labels for user.\n",
"nc_test_user_labels_path = os.path.join(base_dir, \"nc-test-user-labels.pt\")\n", "nc_test_user_labels_path = os.path.join(base_dir, \"nc-test-user-labels.pt\")\n",
"nc_test_user_labels = torch.randint(0, 10, (num_tests,))\n", "nc_test_user_labels = torch.randint(0, 10, (num_tests,))\n",
"print(f\"Part of test labels[user] for node classification: {nc_test_user_labels[:10]}\")\n", "print(f\"Part of test labels[user] for node classification: {nc_test_user_labels[:10]}\")\n",
"torch.save(nc_test_user_labels, nc_test_user_labels_path)\n", "torch.save(nc_test_user_labels, nc_test_user_labels_path)\n",
"print(f\"NC test labels[user] are saved to {nc_test_user_labels_path}\")\n", "print(f\"NC test labels[user] are saved to {nc_test_user_labels_path}\\n\")\n",
"\n", "\n",
"# Test IDs for item.\n", "# Test IDs for item.\n",
"nc_test_item_ids_path = os.path.join(base_dir, \"nc-test-item-ids.npy\")\n", "nc_test_item_ids_path = os.path.join(base_dir, \"nc-test-item-ids.npy\")\n",
"nc_test_item_ids = item_ids[-num_tests:]\n", "nc_test_item_ids = item_ids[-num_tests:]\n",
"print(f\"Part of test ids[item] for node classification: {nc_test_item_ids[:10]}\")\n", "print(f\"Part of test ids[item] for node classification: {nc_test_item_ids[:10]}\")\n",
"np.save(nc_test_item_ids_path, nc_test_item_ids)\n", "np.save(nc_test_item_ids_path, nc_test_item_ids)\n",
"print(f\"NC test ids[item] are saved to {nc_test_item_ids_path}\")\n", "print(f\"NC test ids[item] are saved to {nc_test_item_ids_path}\\n\")\n",
"\n", "\n",
"# Test labels for item.\n", "# Test labels for item.\n",
"nc_test_item_labels_path = os.path.join(base_dir, \"nc-test-item-labels.pt\")\n", "nc_test_item_labels_path = os.path.join(base_dir, \"nc-test-item-labels.pt\")\n",
"nc_test_item_labels = torch.randint(0, 10, (num_tests,))\n", "nc_test_item_labels = torch.randint(0, 10, (num_tests,))\n",
"print(f\"Part of test labels[item] for node classification: {nc_test_item_labels[:10]}\")\n", "print(f\"Part of test labels[item] for node classification: {nc_test_item_labels[:10]}\")\n",
"torch.save(nc_test_item_labels, nc_test_item_labels_path)\n", "torch.save(nc_test_item_labels, nc_test_item_labels_path)\n",
"print(f\"NC test labels[item] are saved to {nc_test_item_labels_path}\")" "print(f\"NC test labels[item] are saved to {nc_test_item_labels_path}\\n\")"
], ],
"metadata": { "metadata": {
"id": "S5-fyBbHzTCO" "id": "S5-fyBbHzTCO"
...@@ -370,70 +370,70 @@ ...@@ -370,70 +370,70 @@
"lp_train_like_node_pairs = like_edges[:num_trains, :]\n", "lp_train_like_node_pairs = like_edges[:num_trains, :]\n",
"print(f\"Part of train node pairs[user:like:item] for link prediction: {lp_train_like_node_pairs[:10]}\")\n", "print(f\"Part of train node pairs[user:like:item] for link prediction: {lp_train_like_node_pairs[:10]}\")\n",
"np.save(lp_train_like_node_pairs_path, lp_train_like_node_pairs)\n", "np.save(lp_train_like_node_pairs_path, lp_train_like_node_pairs)\n",
"print(f\"LP train node pairs[user:like:item] are saved to {lp_train_like_node_pairs_path}\")\n", "print(f\"LP train node pairs[user:like:item] are saved to {lp_train_like_node_pairs_path}\\n\")\n",
"\n", "\n",
"# Train node pairs for user:follow:user.\n", "# Train node pairs for user:follow:user.\n",
"lp_train_follow_node_pairs_path = os.path.join(base_dir, \"lp-train-follow-node-pairs.npy\")\n", "lp_train_follow_node_pairs_path = os.path.join(base_dir, \"lp-train-follow-node-pairs.npy\")\n",
"lp_train_follow_node_pairs = follow_edges[:num_trains, :]\n", "lp_train_follow_node_pairs = follow_edges[:num_trains, :]\n",
"print(f\"Part of train node pairs[user:follow:user] for link prediction: {lp_train_follow_node_pairs[:10]}\")\n", "print(f\"Part of train node pairs[user:follow:user] for link prediction: {lp_train_follow_node_pairs[:10]}\")\n",
"np.save(lp_train_follow_node_pairs_path, lp_train_follow_node_pairs)\n", "np.save(lp_train_follow_node_pairs_path, lp_train_follow_node_pairs)\n",
"print(f\"LP train node pairs[user:follow:user] are saved to {lp_train_follow_node_pairs_path}\")\n", "print(f\"LP train node pairs[user:follow:user] are saved to {lp_train_follow_node_pairs_path}\\n\")\n",
"\n", "\n",
"# Val node pairs for user:like:item.\n", "# Val node pairs for user:like:item.\n",
"lp_val_like_node_pairs_path = os.path.join(base_dir, \"lp-val-like-node-pairs.npy\")\n", "lp_val_like_node_pairs_path = os.path.join(base_dir, \"lp-val-like-node-pairs.npy\")\n",
"lp_val_like_node_pairs = like_edges[num_trains:num_trains+num_vals, :]\n", "lp_val_like_node_pairs = like_edges[num_trains:num_trains+num_vals, :]\n",
"print(f\"Part of val node pairs[user:like:item] for link prediction: {lp_val_like_node_pairs[:10]}\")\n", "print(f\"Part of val node pairs[user:like:item] for link prediction: {lp_val_like_node_pairs[:10]}\")\n",
"np.save(lp_val_like_node_pairs_path, lp_val_like_node_pairs)\n", "np.save(lp_val_like_node_pairs_path, lp_val_like_node_pairs)\n",
"print(f\"LP val node pairs[user:like:item] are saved to {lp_val_like_node_pairs_path}\")\n", "print(f\"LP val node pairs[user:like:item] are saved to {lp_val_like_node_pairs_path}\\n\")\n",
"\n", "\n",
"# Val negative dsts for user:like:item.\n", "# Val negative dsts for user:like:item.\n",
"lp_val_like_neg_dsts_path = os.path.join(base_dir, \"lp-val-like-neg-dsts.pt\")\n", "lp_val_like_neg_dsts_path = os.path.join(base_dir, \"lp-val-like-neg-dsts.pt\")\n",
"lp_val_like_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n", "lp_val_like_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n",
"print(f\"Part of val negative dsts[user:like:item] for link prediction: {lp_val_like_neg_dsts[:10]}\")\n", "print(f\"Part of val negative dsts[user:like:item] for link prediction: {lp_val_like_neg_dsts[:10]}\")\n",
"torch.save(lp_val_like_neg_dsts, lp_val_like_neg_dsts_path)\n", "torch.save(lp_val_like_neg_dsts, lp_val_like_neg_dsts_path)\n",
"print(f\"LP val negative dsts[user:like:item] are saved to {lp_val_like_neg_dsts_path}\")\n", "print(f\"LP val negative dsts[user:like:item] are saved to {lp_val_like_neg_dsts_path}\\n\")\n",
"\n", "\n",
"# Val node pairs for user:follow:user.\n", "# Val node pairs for user:follow:user.\n",
"lp_val_follow_node_pairs_path = os.path.join(base_dir, \"lp-val-follow-node-pairs.npy\")\n", "lp_val_follow_node_pairs_path = os.path.join(base_dir, \"lp-val-follow-node-pairs.npy\")\n",
"lp_val_follow_node_pairs = follow_edges[num_trains:num_trains+num_vals, :]\n", "lp_val_follow_node_pairs = follow_edges[num_trains:num_trains+num_vals, :]\n",
"print(f\"Part of val node pairs[user:follow:user] for link prediction: {lp_val_follow_node_pairs[:10]}\")\n", "print(f\"Part of val node pairs[user:follow:user] for link prediction: {lp_val_follow_node_pairs[:10]}\")\n",
"np.save(lp_val_follow_node_pairs_path, lp_val_follow_node_pairs)\n", "np.save(lp_val_follow_node_pairs_path, lp_val_follow_node_pairs)\n",
"print(f\"LP val node pairs[user:follow:user] are saved to {lp_val_follow_node_pairs_path}\")\n", "print(f\"LP val node pairs[user:follow:user] are saved to {lp_val_follow_node_pairs_path}\\n\")\n",
"\n", "\n",
"# Val negative dsts for user:follow:user.\n", "# Val negative dsts for user:follow:user.\n",
"lp_val_follow_neg_dsts_path = os.path.join(base_dir, \"lp-val-follow-neg-dsts.pt\")\n", "lp_val_follow_neg_dsts_path = os.path.join(base_dir, \"lp-val-follow-neg-dsts.pt\")\n",
"lp_val_follow_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n", "lp_val_follow_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n",
"print(f\"Part of val negative dsts[user:follow:user] for link prediction: {lp_val_follow_neg_dsts[:10]}\")\n", "print(f\"Part of val negative dsts[user:follow:user] for link prediction: {lp_val_follow_neg_dsts[:10]}\")\n",
"torch.save(lp_val_follow_neg_dsts, lp_val_follow_neg_dsts_path)\n", "torch.save(lp_val_follow_neg_dsts, lp_val_follow_neg_dsts_path)\n",
"print(f\"LP val negative dsts[user:follow:user] are saved to {lp_val_follow_neg_dsts_path}\")\n", "print(f\"LP val negative dsts[user:follow:user] are saved to {lp_val_follow_neg_dsts_path}\\n\")\n",
"\n", "\n",
"# Test node pairs for user:like:item: the last `num_tests` rows of `like_edges`.\n", "# Test node pairs for user:like:item: the last `num_tests` rows of `like_edges`.\n",
"lp_test_like_node_pairs_path = os.path.join(base_dir, \"lp-test-like-node-pairs.npy\")\n", "lp_test_like_node_pairs_path = os.path.join(base_dir, \"lp-test-like-node-pairs.npy\")\n",
"lp_test_like_node_pairs = like_edges[-num_tests:, :]\n", "lp_test_like_node_pairs = like_edges[-num_tests:, :]\n",
"print(f\"Part of test node pairs[user:like:item] for link prediction: {lp_test_like_node_pairs[:10]}\")\n", "print(f\"Part of test node pairs[user:like:item] for link prediction: {lp_test_like_node_pairs[:10]}\")\n",
"np.save(lp_test_like_node_pairs_path, lp_test_like_node_pairs)\n", "np.save(lp_test_like_node_pairs_path, lp_test_like_node_pairs)\n",
"print(f\"LP test node pairs[user:like:item] are saved to {lp_test_like_node_pairs_path}\")\n", "print(f\"LP test node pairs[user:like:item] are saved to {lp_test_like_node_pairs_path}\\n\")\n",
"\n", "\n",
"# Test negative dsts for user:like:item.\n", "# Test negative dsts for user:like:item.\n",
"lp_test_like_neg_dsts_path = os.path.join(base_dir, \"lp-test-like-neg-dsts.pt\")\n", "lp_test_like_neg_dsts_path = os.path.join(base_dir, \"lp-test-like-neg-dsts.pt\")\n",
"lp_test_like_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n", "lp_test_like_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n",
"print(f\"Part of test negative dsts[user:like:item] for link prediction: {lp_test_like_neg_dsts[:10]}\")\n", "print(f\"Part of test negative dsts[user:like:item] for link prediction: {lp_test_like_neg_dsts[:10]}\")\n",
"torch.save(lp_test_like_neg_dsts, lp_test_like_neg_dsts_path)\n", "torch.save(lp_test_like_neg_dsts, lp_test_like_neg_dsts_path)\n",
"print(f\"LP test negative dsts[user:like:item] are saved to {lp_test_like_neg_dsts_path}\")\n", "print(f\"LP test negative dsts[user:like:item] are saved to {lp_test_like_neg_dsts_path}\\n\")\n",
"\n", "\n",
"# Test node pairs for user:follow:user: the last `num_tests` rows of `follow_edges`.\n", "# Test node pairs for user:follow:user: the last `num_tests` rows of `follow_edges`.\n",
"lp_test_follow_node_pairs_path = os.path.join(base_dir, \"lp-test-follow-node-pairs.npy\")\n", "lp_test_follow_node_pairs_path = os.path.join(base_dir, \"lp-test-follow-node-pairs.npy\")\n",
"lp_test_follow_node_pairs = follow_edges[-num_tests:, :]\n", "lp_test_follow_node_pairs = follow_edges[-num_tests:, :]\n",
"print(f\"Part of test node pairs[user:follow:user] for link prediction: {lp_test_follow_node_pairs[:10]}\")\n", "print(f\"Part of test node pairs[user:follow:user] for link prediction: {lp_test_follow_node_pairs[:10]}\")\n",
"np.save(lp_test_follow_node_pairs_path, lp_test_follow_node_pairs)\n", "np.save(lp_test_follow_node_pairs_path, lp_test_follow_node_pairs)\n",
"print(f\"LP test node pairs[user:follow:user] are saved to {lp_test_follow_node_pairs_path}\")\n", "print(f\"LP test node pairs[user:follow:user] are saved to {lp_test_follow_node_pairs_path}\\n\")\n",
"\n", "\n",
"# Test negative dsts for user:follow:user.\n", "# Test negative dsts for user:follow:user.\n",
"lp_test_follow_neg_dsts_path = os.path.join(base_dir, \"lp-test-follow-neg-dsts.pt\")\n", "lp_test_follow_neg_dsts_path = os.path.join(base_dir, \"lp-test-follow-neg-dsts.pt\")\n",
"lp_test_follow_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n", "lp_test_follow_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n",
"print(f\"Part of test negative dsts[user:follow:user] for link prediction: {lp_test_follow_neg_dsts[:10]}\")\n", "print(f\"Part of test negative dsts[user:follow:user] for link prediction: {lp_test_follow_neg_dsts[:10]}\")\n",
"torch.save(lp_test_follow_neg_dsts, lp_test_follow_neg_dsts_path)\n", "torch.save(lp_test_follow_neg_dsts, lp_test_follow_neg_dsts_path)\n",
"print(f\"LP test negative dsts[user:follow:user] are saved to {lp_test_follow_neg_dsts_path}\")" "print(f\"LP test negative dsts[user:follow:user] are saved to {lp_test_follow_neg_dsts_path}\\n\")"
], ],
"metadata": { "metadata": {
"id": "u0jCnXIcAQy4" "id": "u0jCnXIcAQy4"
...@@ -447,7 +447,14 @@ ...@@ -447,7 +447,14 @@
"## Organize Data into YAML File\n", "## Organize Data into YAML File\n",
"Now we need to create a `metadata.yaml` file which contains the paths and data types of the graph structure, feature data, and training/validation/test sets. Please note that all paths should be relative to `metadata.yaml`.\n", "Now we need to create a `metadata.yaml` file which contains the paths and data types of the graph structure, feature data, and training/validation/test sets. Please note that all paths should be relative to `metadata.yaml`.\n",
"\n", "\n",
"For heterogeneous graph, we need to specify the node/edge type in **type** fields. For edge type, canonical etype is required which is a string that's concatenated by source node type, etype, and destination node type together with `:`." "For heterogeneous graph, we need to specify the node/edge type in **type** fields. For edge type, canonical etype is required which is a string that's concatenated by source node type, etype, and destination node type together with `:`.\n",
"\n",
"Notes:\n",
"- All paths should be relative to `metadata.yaml`.\n",
"- The fields below are optional and are not specified in the example below.\n",
" - `in_memory`: indicates whether to load data into memory or use `mmap`. Default is `True`.\n",
"\n",
"Please refer to [YAML specification](https://github.com/dmlc/dgl/blob/master/docs/source/stochastic_training/ondisk-dataset-specification.rst) for more details."
], ],
"metadata": { "metadata": {
"id": "wbk6-wxRK-6S" "id": "wbk6-wxRK-6S"
...@@ -476,49 +483,41 @@ ...@@ -476,49 +483,41 @@
" type: user\n", " type: user\n",
" name: feat_0\n", " name: feat_0\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(node_user_feat_0_path)}\n", " path: {os.path.basename(node_user_feat_0_path)}\n",
" - domain: node\n", " - domain: node\n",
" type: user\n", " type: user\n",
" name: feat_1\n", " name: feat_1\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(node_user_feat_1_path)}\n", " path: {os.path.basename(node_user_feat_1_path)}\n",
" - domain: node\n", " - domain: node\n",
" type: item\n", " type: item\n",
" name: feat_0\n", " name: feat_0\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(node_item_feat_0_path)}\n", " path: {os.path.basename(node_item_feat_0_path)}\n",
" - domain: node\n", " - domain: node\n",
" type: item\n", " type: item\n",
" name: feat_1\n", " name: feat_1\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(node_item_feat_1_path)}\n", " path: {os.path.basename(node_item_feat_1_path)}\n",
" - domain: edge\n", " - domain: edge\n",
" type: \"user:like:item\"\n", " type: \"user:like:item\"\n",
" name: feat_0\n", " name: feat_0\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(edge_like_feat_0_path)}\n", " path: {os.path.basename(edge_like_feat_0_path)}\n",
" - domain: edge\n", " - domain: edge\n",
" type: \"user:like:item\"\n", " type: \"user:like:item\"\n",
" name: feat_1\n", " name: feat_1\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(edge_like_feat_1_path)}\n", " path: {os.path.basename(edge_like_feat_1_path)}\n",
" - domain: edge\n", " - domain: edge\n",
" type: \"user:follow:user\"\n", " type: \"user:follow:user\"\n",
" name: feat_0\n", " name: feat_0\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(edge_follow_feat_0_path)}\n", " path: {os.path.basename(edge_follow_feat_0_path)}\n",
" - domain: edge\n", " - domain: edge\n",
" type: \"user:follow:user\"\n", " type: \"user:follow:user\"\n",
" name: feat_1\n", " name: feat_1\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(edge_follow_feat_1_path)}\n", " path: {os.path.basename(edge_follow_feat_1_path)}\n",
" tasks:\n", " tasks:\n",
" - name: node_classification\n", " - name: node_classification\n",
...@@ -528,66 +527,54 @@ ...@@ -528,66 +527,54 @@
" - type: user\n", " - type: user\n",
" name: seed_nodes\n", " name: seed_nodes\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_train_user_ids_path)}\n", " path: {os.path.basename(nc_train_user_ids_path)}\n",
" - type: user\n", " - type: user\n",
" name: labels\n", " name: labels\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_train_user_labels_path)}\n", " path: {os.path.basename(nc_train_user_labels_path)}\n",
" - type: item\n", " - type: item\n",
" name: seed_nodes\n", " name: seed_nodes\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_train_item_ids_path)}\n", " path: {os.path.basename(nc_train_item_ids_path)}\n",
" - type: item\n", " - type: item\n",
" name: labels\n", " name: labels\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_train_item_labels_path)}\n", " path: {os.path.basename(nc_train_item_labels_path)}\n",
" validation_set:\n", " validation_set:\n",
" - data:\n", " - data:\n",
" - type: user\n", " - type: user\n",
" name: seed_nodes\n", " name: seed_nodes\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_val_user_ids_path)}\n", " path: {os.path.basename(nc_val_user_ids_path)}\n",
" - type: user\n", " - type: user\n",
" name: labels\n", " name: labels\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_val_user_labels_path)}\n", " path: {os.path.basename(nc_val_user_labels_path)}\n",
" - type: item\n", " - type: item\n",
" name: seed_nodes\n", " name: seed_nodes\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_val_item_ids_path)}\n", " path: {os.path.basename(nc_val_item_ids_path)}\n",
" - type: item\n", " - type: item\n",
" name: labels\n", " name: labels\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_val_item_labels_path)}\n", " path: {os.path.basename(nc_val_item_labels_path)}\n",
" test_set:\n", " test_set:\n",
" - data:\n", " - data:\n",
" - type: user\n", " - type: user\n",
" name: seed_nodes\n", " name: seed_nodes\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_test_user_ids_path)}\n", " path: {os.path.basename(nc_test_user_ids_path)}\n",
" - type: user\n", " - type: user\n",
" name: labels\n", " name: labels\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_test_user_labels_path)}\n", " path: {os.path.basename(nc_test_user_labels_path)}\n",
" - type: item\n", " - type: item\n",
" name: seed_nodes\n", " name: seed_nodes\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_test_item_ids_path)}\n", " path: {os.path.basename(nc_test_item_ids_path)}\n",
" - type: item\n", " - type: item\n",
" name: labels\n", " name: labels\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_test_item_labels_path)}\n", " path: {os.path.basename(nc_test_item_labels_path)}\n",
" - name: link_prediction\n", " - name: link_prediction\n",
" num_classes: 10\n", " num_classes: 10\n",
...@@ -596,56 +583,46 @@ ...@@ -596,56 +583,46 @@
" - type: \"user:like:item\"\n", " - type: \"user:like:item\"\n",
" name: node_pairs\n", " name: node_pairs\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_train_like_node_pairs_path)}\n", " path: {os.path.basename(lp_train_like_node_pairs_path)}\n",
" - type: \"user:follow:user\"\n", " - type: \"user:follow:user\"\n",
" name: node_pairs\n", " name: node_pairs\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_train_follow_node_pairs_path)}\n", " path: {os.path.basename(lp_train_follow_node_pairs_path)}\n",
" validation_set:\n", " validation_set:\n",
" - data:\n", " - data:\n",
" - type: \"user:like:item\"\n", " - type: \"user:like:item\"\n",
" name: node_pairs\n", " name: node_pairs\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_val_like_node_pairs_path)}\n", " path: {os.path.basename(lp_val_like_node_pairs_path)}\n",
" - type: \"user:like:item\"\n", " - type: \"user:like:item\"\n",
" name: negative_dsts\n", " name: negative_dsts\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(lp_val_like_neg_dsts_path)}\n", " path: {os.path.basename(lp_val_like_neg_dsts_path)}\n",
" - type: \"user:follow:user\"\n", " - type: \"user:follow:user\"\n",
" name: node_pairs\n", " name: node_pairs\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_val_follow_node_pairs_path)}\n", " path: {os.path.basename(lp_val_follow_node_pairs_path)}\n",
" - type: \"user:follow:user\"\n", " - type: \"user:follow:user\"\n",
" name: negative_dsts\n", " name: negative_dsts\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(lp_val_follow_neg_dsts_path)}\n", " path: {os.path.basename(lp_val_follow_neg_dsts_path)}\n",
" test_set:\n", " test_set:\n",
" - data:\n", " - data:\n",
" - type: \"user:like:item\"\n", " - type: \"user:like:item\"\n",
" name: node_pairs\n", " name: node_pairs\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_test_like_node_pairs_path)}\n", " path: {os.path.basename(lp_test_like_node_pairs_path)}\n",
" - type: \"user:like:item\"\n", " - type: \"user:like:item\"\n",
" name: negative_dsts\n", " name: negative_dsts\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(lp_test_like_neg_dsts_path)}\n", " path: {os.path.basename(lp_test_like_neg_dsts_path)}\n",
" - type: \"user:follow:user\"\n", " - type: \"user:follow:user\"\n",
" name: node_pairs\n", " name: node_pairs\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_test_follow_node_pairs_path)}\n", " path: {os.path.basename(lp_test_follow_node_pairs_path)}\n",
" - type: \"user:follow:user\"\n", " - type: \"user:follow:user\"\n",
" name: negative_dsts\n", " name: negative_dsts\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(lp_test_follow_neg_dsts_path)}\n", " path: {os.path.basename(lp_test_follow_neg_dsts_path)}\n",
"\"\"\"\n", "\"\"\"\n",
"metadata_path = os.path.join(base_dir, \"metadata.yaml\")\n", "metadata_path = os.path.join(base_dir, \"metadata.yaml\")\n",
...@@ -677,16 +654,16 @@ ...@@ -677,16 +654,16 @@
"source": [ "source": [
"dataset = gb.OnDiskDataset(base_dir).load()\n", "dataset = gb.OnDiskDataset(base_dir).load()\n",
"graph = dataset.graph\n", "graph = dataset.graph\n",
"print(f\"Loaded graph: {graph}\")\n", "print(f\"Loaded graph: {graph}\\n\")\n",
"\n", "\n",
"feature = dataset.feature\n", "feature = dataset.feature\n",
"print(f\"Loaded feature store: {feature}\")\n", "print(f\"Loaded feature store: {feature}\\n\")\n",
"\n", "\n",
"tasks = dataset.tasks\n", "tasks = dataset.tasks\n",
"nc_task = tasks[0]\n", "nc_task = tasks[0]\n",
"print(f\"Loaded node classification task: {nc_task}\")\n", "print(f\"Loaded node classification task: {nc_task}\\n\")\n",
"lp_task = tasks[1]\n", "lp_task = tasks[1]\n",
"print(f\"Loaded link prediction task: {lp_task}\")" "print(f\"Loaded link prediction task: {lp_task}\\n\")"
], ],
"metadata": { "metadata": {
"id": "W58CZoSzOiyo" "id": "W58CZoSzOiyo"
......
...@@ -152,28 +152,28 @@ ...@@ -152,28 +152,28 @@
"node_feat_0 = np.random.rand(num_nodes, 5)\n", "node_feat_0 = np.random.rand(num_nodes, 5)\n",
"print(f\"Part of node feature [feat_0]: {node_feat_0[:10, :]}\")\n", "print(f\"Part of node feature [feat_0]: {node_feat_0[:10, :]}\")\n",
"np.save(node_feat_0_path, node_feat_0)\n", "np.save(node_feat_0_path, node_feat_0)\n",
"print(f\"Node feature [feat_0] is saved to {node_feat_0_path}\")\n", "print(f\"Node feature [feat_0] is saved to {node_feat_0_path}\\n\")\n",
"\n", "\n",
"# Generate another node feature in torch tensor\n", "# Generate another node feature in torch tensor\n",
"node_feat_1_path = os.path.join(base_dir, \"node-feat-1.pt\")\n", "node_feat_1_path = os.path.join(base_dir, \"node-feat-1.pt\")\n",
"node_feat_1 = torch.rand(num_nodes, 5)\n", "node_feat_1 = torch.rand(num_nodes, 5)\n",
"print(f\"Part of node feature [feat_1]: {node_feat_1[:10, :]}\")\n", "print(f\"Part of node feature [feat_1]: {node_feat_1[:10, :]}\")\n",
"torch.save(node_feat_1, node_feat_1_path)\n", "torch.save(node_feat_1, node_feat_1_path)\n",
"print(f\"Node feature [feat_1] is saved to {node_feat_1_path}\")\n", "print(f\"Node feature [feat_1] is saved to {node_feat_1_path}\\n\")\n",
"\n", "\n",
"# Generate edge feature in numpy array.\n", "# Generate edge feature in numpy array.\n",
"edge_feat_0_path = os.path.join(base_dir, \"edge-feat-0.npy\")\n", "edge_feat_0_path = os.path.join(base_dir, \"edge-feat-0.npy\")\n",
"edge_feat_0 = np.random.rand(num_edges, 5)\n", "edge_feat_0 = np.random.rand(num_edges, 5)\n",
"print(f\"Part of edge feature [feat_0]: {edge_feat_0[:10, :]}\")\n", "print(f\"Part of edge feature [feat_0]: {edge_feat_0[:10, :]}\")\n",
"np.save(edge_feat_0_path, edge_feat_0)\n", "np.save(edge_feat_0_path, edge_feat_0)\n",
"print(f\"Edge feature [feat_0] is saved to {edge_feat_0_path}\")\n", "print(f\"Edge feature [feat_0] is saved to {edge_feat_0_path}\\n\")\n",
"\n", "\n",
"# Generate another edge feature in torch tensor\n", "# Generate another edge feature in torch tensor\n",
"edge_feat_1_path = os.path.join(base_dir, \"edge-feat-1.pt\")\n", "edge_feat_1_path = os.path.join(base_dir, \"edge-feat-1.pt\")\n",
"edge_feat_1 = torch.rand(num_edges, 5)\n", "edge_feat_1 = torch.rand(num_edges, 5)\n",
"print(f\"Part of edge feature [feat_1]: {edge_feat_1[:10, :]}\")\n", "print(f\"Part of edge feature [feat_1]: {edge_feat_1[:10, :]}\")\n",
"torch.save(edge_feat_1, edge_feat_1_path)\n", "torch.save(edge_feat_1, edge_feat_1_path)\n",
"print(f\"Edge feature [feat_1] is saved to {edge_feat_1_path}\")\n" "print(f\"Edge feature [feat_1] is saved to {edge_feat_1_path}\\n\")\n"
], ],
"metadata": { "metadata": {
"id": "_PVu1u5brBhF" "id": "_PVu1u5brBhF"
...@@ -215,37 +215,37 @@ ...@@ -215,37 +215,37 @@
"nc_train_ids = ids[:num_trains]\n", "nc_train_ids = ids[:num_trains]\n",
"print(f\"Part of train ids for node classification: {nc_train_ids[:10]}\")\n", "print(f\"Part of train ids for node classification: {nc_train_ids[:10]}\")\n",
"np.save(nc_train_ids_path, nc_train_ids)\n", "np.save(nc_train_ids_path, nc_train_ids)\n",
"print(f\"NC train ids are saved to {nc_train_ids_path}\")\n", "print(f\"NC train ids are saved to {nc_train_ids_path}\\n\")\n",
"\n", "\n",
"nc_train_labels_path = os.path.join(base_dir, \"nc-train-labels.pt\")\n", "nc_train_labels_path = os.path.join(base_dir, \"nc-train-labels.pt\")\n",
"nc_train_labels = torch.randint(0, 10, (num_trains,))\n", "nc_train_labels = torch.randint(0, 10, (num_trains,))\n",
"print(f\"Part of train labels for node classification: {nc_train_labels[:10]}\")\n", "print(f\"Part of train labels for node classification: {nc_train_labels[:10]}\")\n",
"torch.save(nc_train_labels, nc_train_labels_path)\n", "torch.save(nc_train_labels, nc_train_labels_path)\n",
"print(f\"NC train labels are saved to {nc_train_labels_path}\")\n", "print(f\"NC train labels are saved to {nc_train_labels_path}\\n\")\n",
"\n", "\n",
"nc_val_ids_path = os.path.join(base_dir, \"nc-val-ids.npy\")\n", "nc_val_ids_path = os.path.join(base_dir, \"nc-val-ids.npy\")\n",
"nc_val_ids = ids[num_trains:num_trains+num_vals]\n", "nc_val_ids = ids[num_trains:num_trains+num_vals]\n",
"print(f\"Part of val ids for node classification: {nc_val_ids[:10]}\")\n", "print(f\"Part of val ids for node classification: {nc_val_ids[:10]}\")\n",
"np.save(nc_val_ids_path, nc_val_ids)\n", "np.save(nc_val_ids_path, nc_val_ids)\n",
"print(f\"NC val ids are saved to {nc_val_ids_path}\")\n", "print(f\"NC val ids are saved to {nc_val_ids_path}\\n\")\n",
"\n", "\n",
"nc_val_labels_path = os.path.join(base_dir, \"nc-val-labels.pt\")\n", "nc_val_labels_path = os.path.join(base_dir, \"nc-val-labels.pt\")\n",
"nc_val_labels = torch.randint(0, 10, (num_vals,))\n", "nc_val_labels = torch.randint(0, 10, (num_vals,))\n",
"print(f\"Part of val labels for node classification: {nc_val_labels[:10]}\")\n", "print(f\"Part of val labels for node classification: {nc_val_labels[:10]}\")\n",
"torch.save(nc_val_labels, nc_val_labels_path)\n", "torch.save(nc_val_labels, nc_val_labels_path)\n",
"print(f\"NC val labels are saved to {nc_val_labels_path}\")\n", "print(f\"NC val labels are saved to {nc_val_labels_path}\\n\")\n",
"\n", "\n",
"nc_test_ids_path = os.path.join(base_dir, \"nc-test-ids.npy\")\n", "nc_test_ids_path = os.path.join(base_dir, \"nc-test-ids.npy\")\n",
"nc_test_ids = ids[-num_tests:]\n", "nc_test_ids = ids[-num_tests:]\n",
"print(f\"Part of test ids for node classification: {nc_test_ids[:10]}\")\n", "print(f\"Part of test ids for node classification: {nc_test_ids[:10]}\")\n",
"np.save(nc_test_ids_path, nc_test_ids)\n", "np.save(nc_test_ids_path, nc_test_ids)\n",
"print(f\"NC test ids are saved to {nc_test_ids_path}\")\n", "print(f\"NC test ids are saved to {nc_test_ids_path}\\n\")\n",
"\n", "\n",
"nc_test_labels_path = os.path.join(base_dir, \"nc-test-labels.pt\")\n", "nc_test_labels_path = os.path.join(base_dir, \"nc-test-labels.pt\")\n",
"nc_test_labels = torch.randint(0, 10, (num_tests,))\n", "nc_test_labels = torch.randint(0, 10, (num_tests,))\n",
"print(f\"Part of test labels for node classification: {nc_test_labels[:10]}\")\n", "print(f\"Part of test labels for node classification: {nc_test_labels[:10]}\")\n",
"torch.save(nc_test_labels, nc_test_labels_path)\n", "torch.save(nc_test_labels, nc_test_labels_path)\n",
"print(f\"NC test labels are saved to {nc_test_labels_path}\")" "print(f\"NC test labels are saved to {nc_test_labels_path}\\n\")"
], ],
"metadata": { "metadata": {
"id": "S5-fyBbHzTCO" "id": "S5-fyBbHzTCO"
...@@ -274,31 +274,31 @@ ...@@ -274,31 +274,31 @@
"lp_train_node_pairs = edges[:num_trains, :]\n", "lp_train_node_pairs = edges[:num_trains, :]\n",
"print(f\"Part of train node pairs for link prediction: {lp_train_node_pairs[:10]}\")\n", "print(f\"Part of train node pairs for link prediction: {lp_train_node_pairs[:10]}\")\n",
"np.save(lp_train_node_pairs_path, lp_train_node_pairs)\n", "np.save(lp_train_node_pairs_path, lp_train_node_pairs)\n",
"print(f\"LP train node pairs are saved to {lp_train_node_pairs_path}\")\n", "print(f\"LP train node pairs are saved to {lp_train_node_pairs_path}\\n\")\n",
"\n", "\n",
"lp_val_node_pairs_path = os.path.join(base_dir, \"lp-val-node-pairs.npy\")\n", "lp_val_node_pairs_path = os.path.join(base_dir, \"lp-val-node-pairs.npy\")\n",
"lp_val_node_pairs = edges[num_trains:num_trains+num_vals, :]\n", "lp_val_node_pairs = edges[num_trains:num_trains+num_vals, :]\n",
"print(f\"Part of val node pairs for link prediction: {lp_val_node_pairs[:10]}\")\n", "print(f\"Part of val node pairs for link prediction: {lp_val_node_pairs[:10]}\")\n",
"np.save(lp_val_node_pairs_path, lp_val_node_pairs)\n", "np.save(lp_val_node_pairs_path, lp_val_node_pairs)\n",
"print(f\"LP val node pairs are saved to {lp_val_node_pairs_path}\")\n", "print(f\"LP val node pairs are saved to {lp_val_node_pairs_path}\\n\")\n",
"\n", "\n",
"lp_val_neg_dsts_path = os.path.join(base_dir, \"lp-val-neg-dsts.pt\")\n", "lp_val_neg_dsts_path = os.path.join(base_dir, \"lp-val-neg-dsts.pt\")\n",
"lp_val_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n", "lp_val_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n",
"print(f\"Part of val negative dsts for link prediction: {lp_val_neg_dsts[:10]}\")\n", "print(f\"Part of val negative dsts for link prediction: {lp_val_neg_dsts[:10]}\")\n",
"torch.save(lp_val_neg_dsts, lp_val_neg_dsts_path)\n", "torch.save(lp_val_neg_dsts, lp_val_neg_dsts_path)\n",
"print(f\"LP val negative dsts are saved to {lp_val_neg_dsts_path}\")\n", "print(f\"LP val negative dsts are saved to {lp_val_neg_dsts_path}\\n\")\n",
"\n", "\n",
"lp_test_node_pairs_path = os.path.join(base_dir, \"lp-test-node-pairs.npy\")\n", "lp_test_node_pairs_path = os.path.join(base_dir, \"lp-test-node-pairs.npy\")\n",
"# Slice (note the colon) so the last `num_tests` edges are kept as rows; a bare index would select a single edge.\n", "# Slice (note the colon) so the last `num_tests` edges are kept as rows; a bare index would select a single edge.\n",
"lp_test_node_pairs = edges[-num_tests:, :]\n", "lp_test_node_pairs = edges[-num_tests:, :]\n",
"print(f\"Part of test node pairs for link prediction: {lp_test_node_pairs[:10]}\")\n", "print(f\"Part of test node pairs for link prediction: {lp_test_node_pairs[:10]}\")\n",
"np.save(lp_test_node_pairs_path, lp_test_node_pairs)\n", "np.save(lp_test_node_pairs_path, lp_test_node_pairs)\n",
"print(f\"LP test node pairs are saved to {lp_test_node_pairs_path}\")\n", "print(f\"LP test node pairs are saved to {lp_test_node_pairs_path}\\n\")\n",
"\n", "\n",
"lp_test_neg_dsts_path = os.path.join(base_dir, \"lp-test-neg-dsts.pt\")\n", "lp_test_neg_dsts_path = os.path.join(base_dir, \"lp-test-neg-dsts.pt\")\n",
"lp_test_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n", "lp_test_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n",
"print(f\"Part of test negative dsts for link prediction: {lp_test_neg_dsts[:10]}\")\n", "print(f\"Part of test negative dsts for link prediction: {lp_test_neg_dsts[:10]}\")\n",
"torch.save(lp_test_neg_dsts, lp_test_neg_dsts_path)\n", "torch.save(lp_test_neg_dsts, lp_test_neg_dsts_path)\n",
"print(f\"LP test negative dsts are saved to {lp_test_neg_dsts_path}\")" "print(f\"LP test negative dsts are saved to {lp_test_neg_dsts_path}\\n\")"
], ],
"metadata": { "metadata": {
"id": "u0jCnXIcAQy4" "id": "u0jCnXIcAQy4"
...@@ -310,7 +310,14 @@ ...@@ -310,7 +310,14 @@
"cell_type": "markdown", "cell_type": "markdown",
"source": [ "source": [
"## Organize Data into YAML File\n", "## Organize Data into YAML File\n",
"Now we need to create a `metadata.yaml` file which contains the paths, data types of graph structure, feature data, training/validation/test sets. Please note that all paths should be relative to `metadata.yaml`." "Now we need to create a `metadata.yaml` file which contains the paths, data types of graph structure, feature data, training/validation/test sets.\n",
"\n",
"Notes:\n",
"- All paths should be relative to `metadata.yaml`.\n",
"- The fields below are optional and are not specified in the example below.\n",
" - `in_memory`: indicates whether to load data into memory or `mmap`. Default is `True`.\n",
"\n",
"Please refer to [YAML specification](https://github.com/dmlc/dgl/blob/master/docs/source/stochastic_training/ondisk-dataset-specification.rst) for more details."
], ],
"metadata": { "metadata": {
"id": "wbk6-wxRK-6S" "id": "wbk6-wxRK-6S"
...@@ -331,22 +338,18 @@ ...@@ -331,22 +338,18 @@
" - domain: node\n", " - domain: node\n",
" name: feat_0\n", " name: feat_0\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(node_feat_0_path)}\n", " path: {os.path.basename(node_feat_0_path)}\n",
" - domain: node\n", " - domain: node\n",
" name: feat_1\n", " name: feat_1\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(node_feat_1_path)}\n", " path: {os.path.basename(node_feat_1_path)}\n",
" - domain: edge\n", " - domain: edge\n",
" name: feat_0\n", " name: feat_0\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(edge_feat_0_path)}\n", " path: {os.path.basename(edge_feat_0_path)}\n",
" - domain: edge\n", " - domain: edge\n",
" name: feat_1\n", " name: feat_1\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(edge_feat_1_path)}\n", " path: {os.path.basename(edge_feat_1_path)}\n",
" tasks:\n", " tasks:\n",
" - name: node_classification\n", " - name: node_classification\n",
...@@ -355,31 +358,25 @@ ...@@ -355,31 +358,25 @@
" - data:\n", " - data:\n",
" - name: seed_nodes\n", " - name: seed_nodes\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_train_ids_path)}\n", " path: {os.path.basename(nc_train_ids_path)}\n",
" - name: labels\n", " - name: labels\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_train_labels_path)}\n", " path: {os.path.basename(nc_train_labels_path)}\n",
" validation_set:\n", " validation_set:\n",
" - data:\n", " - data:\n",
" - name: seed_nodes\n", " - name: seed_nodes\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_val_ids_path)}\n", " path: {os.path.basename(nc_val_ids_path)}\n",
" - name: labels\n", " - name: labels\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_val_labels_path)}\n", " path: {os.path.basename(nc_val_labels_path)}\n",
" test_set:\n", " test_set:\n",
" - data:\n", " - data:\n",
" - name: seed_nodes\n", " - name: seed_nodes\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_test_ids_path)}\n", " path: {os.path.basename(nc_test_ids_path)}\n",
" - name: labels\n", " - name: labels\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_test_labels_path)}\n", " path: {os.path.basename(nc_test_labels_path)}\n",
" - name: link_prediction\n", " - name: link_prediction\n",
" num_classes: 10\n", " num_classes: 10\n",
...@@ -387,27 +384,22 @@ ...@@ -387,27 +384,22 @@
" - data:\n", " - data:\n",
" - name: node_pairs\n", " - name: node_pairs\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_train_node_pairs_path)}\n", " path: {os.path.basename(lp_train_node_pairs_path)}\n",
" validation_set:\n", " validation_set:\n",
" - data:\n", " - data:\n",
" - name: node_pairs\n", " - name: node_pairs\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_val_node_pairs_path)}\n", " path: {os.path.basename(lp_val_node_pairs_path)}\n",
" - name: negative_dsts\n", " - name: negative_dsts\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(lp_val_neg_dsts_path)}\n", " path: {os.path.basename(lp_val_neg_dsts_path)}\n",
" test_set:\n", " test_set:\n",
" - data:\n", " - data:\n",
" - name: node_pairs\n", " - name: node_pairs\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_test_node_pairs_path)}\n", " path: {os.path.basename(lp_test_node_pairs_path)}\n",
" - name: negative_dsts\n", " - name: negative_dsts\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(lp_test_neg_dsts_path)}\n", " path: {os.path.basename(lp_test_neg_dsts_path)}\n",
"\"\"\"\n", "\"\"\"\n",
"metadata_path = os.path.join(base_dir, \"metadata.yaml\")\n", "metadata_path = os.path.join(base_dir, \"metadata.yaml\")\n",
...@@ -439,16 +431,16 @@ ...@@ -439,16 +431,16 @@
"source": [ "source": [
"dataset = gb.OnDiskDataset(base_dir).load()\n", "dataset = gb.OnDiskDataset(base_dir).load()\n",
"graph = dataset.graph\n", "graph = dataset.graph\n",
"print(f\"Loaded graph: {graph}\")\n", "print(f\"Loaded graph: {graph}\\n\")\n",
"\n", "\n",
"feature = dataset.feature\n", "feature = dataset.feature\n",
"print(f\"Loaded feature store: {feature}\")\n", "print(f\"Loaded feature store: {feature}\\n\")\n",
"\n", "\n",
"tasks = dataset.tasks\n", "tasks = dataset.tasks\n",
"nc_task = tasks[0]\n", "nc_task = tasks[0]\n",
"print(f\"Loaded node classification task: {nc_task}\")\n", "print(f\"Loaded node classification task: {nc_task}\\n\")\n",
"lp_task = tasks[1]\n", "lp_task = tasks[1]\n",
"print(f\"Loaded link prediction task: {lp_task}\")" "print(f\"Loaded link prediction task: {lp_task}\\n\")"
], ],
"metadata": { "metadata": {
"id": "W58CZoSzOiyo" "id": "W58CZoSzOiyo"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment