Unverified Commit 6db323b3 authored by Rhett Ying, committed by GitHub

[GraphBolt] update OnDiskDataset tutorial (#6783)

parent 97ed294d
@@ -22,7 +22,7 @@
"\n", "\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dmlc/dgl/blob/master/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb) [![GitHub](https://img.shields.io/badge/-View%20on%20GitHub-181717?logo=github&logoColor=ffffff)](https://github.com/dmlc/dgl/blob/master/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb)\n", "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dmlc/dgl/blob/master/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb) [![GitHub](https://img.shields.io/badge/-View%20on%20GitHub-181717?logo=github&logoColor=ffffff)](https://github.com/dmlc/dgl/blob/master/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb)\n",
"\n", "\n",
"This tutorial shows how to create `OnDiskDataset` for heterogeneous graph that could be used in **GraphBolt** framework.\n", "This tutorial shows how to create `OnDiskDataset` for heterogeneous graph that could be used in **GraphBolt** framework. The major difference from creating dataset for homogeneous graph is that we need to specify node/edge types for edges, feature data, training/validation/test sets.\n",
"\n", "\n",
"By the end of this tutorial, you will be able to\n", "By the end of this tutorial, you will be able to\n",
"- organize graph structure data.\n", "- organize graph structure data.\n",
@@ -102,10 +102,10 @@
"cell_type": "markdown", "cell_type": "markdown",
"source": [ "source": [
"### Generate graph structure data\n", "### Generate graph structure data\n",
"For heterogeneous graph, we just need to save edges(namely node pairs) into **CSV** file.\n", "For heterogeneous graph, we need to save different edge edges(namely node pairs) into separate **CSV** files.\n",
"\n", "\n",
"Note:\n", "Note:\n",
"when saving to file, do not save index and header.*italicized text*\n" "when saving to file, do not save index and header.\n"
], ],
"metadata": { "metadata": {
"id": "qhNtIn_xhlnl" "id": "qhNtIn_xhlnl"
@@ -116,17 +116,31 @@
"source": [ "source": [
"import numpy as np\n", "import numpy as np\n",
"import pandas as pd\n", "import pandas as pd\n",
"\n",
"# For simplicity, we create a heterogeneous graph with\n",
"# 2 node types: `user`, `item`\n",
"# 2 edge types: `user:like:item`, `user:follow:user`\n",
"# And each node/edge type has the same number of nodes/edges.\n",
"num_nodes = 1000\n", "num_nodes = 1000\n",
"num_edges = 10 * num_nodes\n", "num_edges = 10 * num_nodes\n",
"edges_path = os.path.join(base_dir, \"edges.csv\")\n",
"edges = np.random.randint(0, num_nodes, size=(num_edges, 2))\n",
"\n", "\n",
"print(f\"Part of edges: {edges[:10, :]}\")\n", "# Edge type: \"user:like:item\"\n",
"like_edges_path = os.path.join(base_dir, \"like-edges.csv\")\n",
"like_edges = np.random.randint(0, num_nodes, size=(num_edges, 2))\n",
"print(f\"Part of [user:like:item] edges: {like_edges[:10, :]}\")\n",
"\n",
"df = pd.DataFrame(like_edges)\n",
"df.to_csv(like_edges_path, index=False, header=False)\n",
"print(f\"[user:like:item] edges are saved into {like_edges_path}\")\n",
"\n", "\n",
"df = pd.DataFrame(edges)\n", "# Edge type: \"user:follow:user\"\n",
"df.to_csv(edges_path, index=False, header=False)\n", "follow_edges_path = os.path.join(base_dir, \"follow-edges.csv\")\n",
"follow_edges = np.random.randint(0, num_nodes, size=(num_edges, 2))\n",
"print(f\"Part of [user:follow:user] edges: {follow_edges[:10, :]}\")\n",
"\n", "\n",
"print(f\"Edges are saved into {edges_path}\")" "df = pd.DataFrame(follow_edges)\n",
"df.to_csv(follow_edges_path, index=False, header=False)\n",
"print(f\"[user:follow:user] edges are saved into {follow_edges_path}\")"
], ],
"metadata": { "metadata": {
"id": "HcBt4G5BmSjr" "id": "HcBt4G5BmSjr"
@@ -138,7 +152,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"source": [ "source": [
"### Generate feature data for graph\n", "### Generate feature data for graph\n",
"For feature data, numpy arrays and torch tensors are supported for now." "For feature data, numpy arrays and torch tensors are supported for now. Let's generate feature data for each node/edge type."
], ],
"metadata": { "metadata": {
"id": "kh-4cPtzpcaH" "id": "kh-4cPtzpcaH"
@@ -147,33 +161,61 @@
{ {
"cell_type": "code", "cell_type": "code",
"source": [ "source": [
"# Generate node feature in numpy array.\n", "# Generate node[user] feature in numpy array.\n",
"node_feat_0_path = os.path.join(base_dir, \"node-feat-0.npy\")\n", "node_user_feat_0_path = os.path.join(base_dir, \"node-user-feat-0.npy\")\n",
"node_feat_0 = np.random.rand(num_nodes, 5)\n", "node_user_feat_0 = np.random.rand(num_nodes, 5)\n",
"print(f\"Part of node feature [feat_0]: {node_feat_0[:10, :]}\")\n", "print(f\"Part of node[user] feature [feat_0]: {node_user_feat_0[:10, :]}\")\n",
"np.save(node_feat_0_path, node_feat_0)\n", "np.save(node_user_feat_0_path, node_user_feat_0)\n",
"print(f\"Node feature [feat_0] is saved to {node_feat_0_path}\")\n", "print(f\"Node[user] feature [feat_0] is saved to {node_user_feat_0_path}\")\n",
"\n", "\n",
"# Generate another node feature in torch tensor\n", "# Generate another node[user] feature in torch tensor\n",
"node_feat_1_path = os.path.join(base_dir, \"node-feat-1.pt\")\n", "node_user_feat_1_path = os.path.join(base_dir, \"node-user-feat-1.pt\")\n",
"node_feat_1 = torch.rand(num_nodes, 5)\n", "node_user_feat_1 = torch.rand(num_nodes, 5)\n",
"print(f\"Part of node feature [feat_1]: {node_feat_1[:10, :]}\")\n", "print(f\"Part of node[user] feature [feat_1]: {node_user_feat_1[:10, :]}\")\n",
"torch.save(node_feat_1, node_feat_1_path)\n", "torch.save(node_user_feat_1, node_user_feat_1_path)\n",
"print(f\"Node feature [feat_1] is saved to {node_feat_1_path}\")\n", "print(f\"Node[user] feature [feat_1] is saved to {node_user_feat_1_path}\")\n",
"\n", "\n",
"# Generate edge feature in numpy array.\n", "# Generate node[item] feature in numpy array.\n",
"edge_feat_0_path = os.path.join(base_dir, \"edge-feat-0.npy\")\n", "node_item_feat_0_path = os.path.join(base_dir, \"node-item-feat-0.npy\")\n",
"edge_feat_0 = np.random.rand(num_edges, 5)\n", "node_item_feat_0 = np.random.rand(num_nodes, 5)\n",
"print(f\"Part of edge feature [feat_0]: {edge_feat_0[:10, :]}\")\n", "print(f\"Part of node[item] feature [feat_0]: {node_item_feat_0[:10, :]}\")\n",
"np.save(edge_feat_0_path, edge_feat_0)\n", "np.save(node_item_feat_0_path, node_item_feat_0)\n",
"print(f\"Edge feature [feat_0] is saved to {edge_feat_0_path}\")\n", "print(f\"Node[item] feature [feat_0] is saved to {node_item_feat_0_path}\")\n",
"\n", "\n",
"# Generate another edge feature in torch tensor\n", "# Generate another node[item] feature in torch tensor\n",
"edge_feat_1_path = os.path.join(base_dir, \"edge-feat-1.pt\")\n", "node_item_feat_1_path = os.path.join(base_dir, \"node-item-feat-1.pt\")\n",
"edge_feat_1 = torch.rand(num_edges, 5)\n", "node_item_feat_1 = torch.rand(num_nodes, 5)\n",
"print(f\"Part of edge feature [feat_1]: {edge_feat_1[:10, :]}\")\n", "print(f\"Part of node[item] feature [feat_1]: {node_item_feat_1[:10, :]}\")\n",
"torch.save(edge_feat_1, edge_feat_1_path)\n", "torch.save(node_item_feat_1, node_item_feat_1_path)\n",
"print(f\"Edge feature [feat_1] is saved to {edge_feat_1_path}\")\n" "print(f\"Node[item] feature [feat_1] is saved to {node_item_feat_1_path}\")\n",
"\n",
"# Generate edge[user:like:item] feature in numpy array.\n",
"edge_like_feat_0_path = os.path.join(base_dir, \"edge-like-feat-0.npy\")\n",
"edge_like_feat_0 = np.random.rand(num_edges, 5)\n",
"print(f\"Part of edge[user:like:item] feature [feat_0]: {edge_like_feat_0[:10, :]}\")\n",
"np.save(edge_like_feat_0_path, edge_like_feat_0)\n",
"print(f\"Edge[user:like:item] feature [feat_0] is saved to {edge_like_feat_0_path}\")\n",
"\n",
"# Generate another edge[user:like:item] feature in torch tensor\n",
"edge_like_feat_1_path = os.path.join(base_dir, \"edge-like-feat-1.pt\")\n",
"edge_like_feat_1 = torch.rand(num_edges, 5)\n",
"print(f\"Part of edge[user:like:item] feature [feat_1]: {edge_like_feat_1[:10, :]}\")\n",
"torch.save(edge_like_feat_1, edge_like_feat_1_path)\n",
"print(f\"Edge[user:like:item] feature [feat_1] is saved to {edge_like_feat_1_path}\")\n",
"\n",
"# Generate edge[user:follow:user] feature in numpy array.\n",
"edge_follow_feat_0_path = os.path.join(base_dir, \"edge-follow-feat-0.npy\")\n",
"edge_follow_feat_0 = np.random.rand(num_edges, 5)\n",
"print(f\"Part of edge[user:follow:user] feature [feat_0]: {edge_follow_feat_0[:10, :]}\")\n",
"np.save(edge_follow_feat_0_path, edge_follow_feat_0)\n",
"print(f\"Edge[user:follow:user] feature [feat_0] is saved to {edge_follow_feat_0_path}\")\n",
"\n",
"# Generate another edge[user:follow:user] feature in torch tensor\n",
"edge_follow_feat_1_path = os.path.join(base_dir, \"edge-follow-feat-1.pt\")\n",
"edge_follow_feat_1 = torch.rand(num_edges, 5)\n",
"print(f\"Part of edge[user:follow:user] feature [feat_1]: {edge_follow_feat_1[:10, :]}\")\n",
"torch.save(edge_follow_feat_1, edge_follow_feat_1_path)\n",
"print(f\"Edge[user:follow:user] feature [feat_1] is saved to {edge_follow_feat_1_path}\")"
], ],
"metadata": { "metadata": {
"id": "_PVu1u5brBhF" "id": "_PVu1u5brBhF"
@@ -204,48 +246,100 @@
{ {
"cell_type": "code", "cell_type": "code",
"source": [ "source": [
"# For illustration, let's generate item sets for each node type.\n",
"num_trains = int(num_nodes * 0.6)\n", "num_trains = int(num_nodes * 0.6)\n",
"num_vals = int(num_nodes * 0.2)\n", "num_vals = int(num_nodes * 0.2)\n",
"num_tests = num_nodes - num_trains - num_vals\n", "num_tests = num_nodes - num_trains - num_vals\n",
"\n", "\n",
"ids = np.arange(num_nodes)\n", "user_ids = np.arange(num_nodes)\n",
"np.random.shuffle(ids)\n", "np.random.shuffle(user_ids)\n",
"\n", "\n",
"nc_train_ids_path = os.path.join(base_dir, \"nc-train-ids.npy\")\n", "item_ids = np.arange(num_nodes)\n",
"nc_train_ids = ids[:num_trains]\n", "np.random.shuffle(item_ids)\n",
"print(f\"Part of train ids for node classification: {nc_train_ids[:10]}\")\n", "\n",
"np.save(nc_train_ids_path, nc_train_ids)\n", "# Train IDs for user.\n",
"print(f\"NC train ids are saved to {nc_train_ids_path}\")\n", "nc_train_user_ids_path = os.path.join(base_dir, \"nc-train-user-ids.npy\")\n",
"\n", "nc_train_user_ids = user_ids[:num_trains]\n",
"nc_train_labels_path = os.path.join(base_dir, \"nc-train-labels.pt\")\n", "print(f\"Part of train ids[user] for node classification: {nc_train_user_ids[:10]}\")\n",
"nc_train_labels = torch.randint(0, 10, (num_trains,))\n", "np.save(nc_train_user_ids_path, nc_train_user_ids)\n",
"print(f\"Part of train labels for node classification: {nc_train_labels[:10]}\")\n", "print(f\"NC train ids[user] are saved to {nc_train_user_ids_path}\")\n",
"torch.save(nc_train_labels, nc_train_labels_path)\n", "\n",
"print(f\"NC train labels are saved to {nc_train_labels_path}\")\n", "# Train labels for user.\n",
"\n", "nc_train_user_labels_path = os.path.join(base_dir, \"nc-train-user-labels.pt\")\n",
"nc_val_ids_path = os.path.join(base_dir, \"nc-val-ids.npy\")\n", "nc_train_user_labels = torch.randint(0, 10, (num_trains,))\n",
"nc_val_ids = ids[num_trains:num_trains+num_vals]\n", "print(f\"Part of train labels[user] for node classification: {nc_train_user_labels[:10]}\")\n",
"print(f\"Part of val ids for node classification: {nc_val_ids[:10]}\")\n", "torch.save(nc_train_user_labels, nc_train_user_labels_path)\n",
"np.save(nc_val_ids_path, nc_val_ids)\n", "print(f\"NC train labels[user] are saved to {nc_train_user_labels_path}\")\n",
"print(f\"NC val ids are saved to {nc_val_ids_path}\")\n", "\n",
"\n", "# Train IDs for item.\n",
"nc_val_labels_path = os.path.join(base_dir, \"nc-val-labels.pt\")\n", "nc_train_item_ids_path = os.path.join(base_dir, \"nc-train-item-ids.npy\")\n",
"nc_val_labels = torch.randint(0, 10, (num_vals,))\n", "nc_train_item_ids = item_ids[:num_trains]\n",
"print(f\"Part of val labels for node classification: {nc_val_labels[:10]}\")\n", "print(f\"Part of train ids[item] for node classification: {nc_train_item_ids[:10]}\")\n",
"torch.save(nc_val_labels, nc_val_labels_path)\n", "np.save(nc_train_item_ids_path, nc_train_item_ids)\n",
"print(f\"NC val labels are saved to {nc_val_labels_path}\")\n", "print(f\"NC train ids[item] are saved to {nc_train_item_ids_path}\")\n",
"\n", "\n",
"nc_test_ids_path = os.path.join(base_dir, \"nc-test-ids.npy\")\n", "# Train labels for item.\n",
"nc_test_ids = ids[-num_tests:]\n", "nc_train_item_labels_path = os.path.join(base_dir, \"nc-train-item-labels.pt\")\n",
"print(f\"Part of test ids for node classification: {nc_test_ids[:10]}\")\n", "nc_train_item_labels = torch.randint(0, 10, (num_trains,))\n",
"np.save(nc_test_ids_path, nc_test_ids)\n", "print(f\"Part of train labels[item] for node classification: {nc_train_item_labels[:10]}\")\n",
"print(f\"NC test ids are saved to {nc_test_ids_path}\")\n", "torch.save(nc_train_item_labels, nc_train_item_labels_path)\n",
"\n", "print(f\"NC train labels[item] are saved to {nc_train_item_labels_path}\")\n",
"nc_test_labels_path = os.path.join(base_dir, \"nc-test-labels.pt\")\n", "\n",
"nc_test_labels = torch.randint(0, 10, (num_tests,))\n", "# Val IDs for user.\n",
"print(f\"Part of test labels for node classification: {nc_test_labels[:10]}\")\n", "nc_val_user_ids_path = os.path.join(base_dir, \"nc-val-user-ids.npy\")\n",
"torch.save(nc_test_labels, nc_test_labels_path)\n", "nc_val_user_ids = user_ids[num_trains:num_trains+num_vals]\n",
"print(f\"NC test labels are saved to {nc_test_labels_path}\")" "print(f\"Part of val ids[user] for node classification: {nc_val_user_ids[:10]}\")\n",
"np.save(nc_val_user_ids_path, nc_val_user_ids)\n",
"print(f\"NC val ids[user] are saved to {nc_val_user_ids_path}\")\n",
"\n",
"# Val labels for user.\n",
"nc_val_user_labels_path = os.path.join(base_dir, \"nc-val-user-labels.pt\")\n",
"nc_val_user_labels = torch.randint(0, 10, (num_vals,))\n",
"print(f\"Part of val labels[user] for node classification: {nc_val_user_labels[:10]}\")\n",
"torch.save(nc_val_user_labels, nc_val_user_labels_path)\n",
"print(f\"NC val labels[user] are saved to {nc_val_user_labels_path}\")\n",
"\n",
"# Val IDs for item.\n",
"nc_val_item_ids_path = os.path.join(base_dir, \"nc-val-item-ids.npy\")\n",
"nc_val_item_ids = item_ids[num_trains:num_trains+num_vals]\n",
"print(f\"Part of val ids[item] for node classification: {nc_val_item_ids[:10]}\")\n",
"np.save(nc_val_item_ids_path, nc_val_item_ids)\n",
"print(f\"NC val ids[item] are saved to {nc_val_item_ids_path}\")\n",
"\n",
"# Val labels for item.\n",
"nc_val_item_labels_path = os.path.join(base_dir, \"nc-val-item-labels.pt\")\n",
"nc_val_item_labels = torch.randint(0, 10, (num_vals,))\n",
"print(f\"Part of val labels[item] for node classification: {nc_val_item_labels[:10]}\")\n",
"torch.save(nc_val_item_labels, nc_val_item_labels_path)\n",
"print(f\"NC val labels[item] are saved to {nc_val_item_labels_path}\")\n",
"\n",
"# Test IDs for user.\n",
"nc_test_user_ids_path = os.path.join(base_dir, \"nc-test-user-ids.npy\")\n",
"nc_test_user_ids = user_ids[-num_tests:]\n",
"print(f\"Part of test ids[user] for node classification: {nc_test_user_ids[:10]}\")\n",
"np.save(nc_test_user_ids_path, nc_test_user_ids)\n",
"print(f\"NC test ids[user] are saved to {nc_test_user_ids_path}\")\n",
"\n",
"# Test labels for user.\n",
"nc_test_user_labels_path = os.path.join(base_dir, \"nc-test-user-labels.pt\")\n",
"nc_test_user_labels = torch.randint(0, 10, (num_tests,))\n",
"print(f\"Part of test labels[user] for node classification: {nc_test_user_labels[:10]}\")\n",
"torch.save(nc_test_user_labels, nc_test_user_labels_path)\n",
"print(f\"NC test labels[user] are saved to {nc_test_user_labels_path}\")\n",
"\n",
"# Test IDs for item.\n",
"nc_test_item_ids_path = os.path.join(base_dir, \"nc-test-item-ids.npy\")\n",
"nc_test_item_ids = item_ids[-num_tests:]\n",
"print(f\"Part of test ids[item] for node classification: {nc_test_item_ids[:10]}\")\n",
"np.save(nc_test_item_ids_path, nc_test_item_ids)\n",
"print(f\"NC test ids[item] are saved to {nc_test_item_ids_path}\")\n",
"\n",
"# Test labels for item.\n",
"nc_test_item_labels_path = os.path.join(base_dir, \"nc-test-item-labels.pt\")\n",
"nc_test_item_labels = torch.randint(0, 10, (num_tests,))\n",
"print(f\"Part of test labels[item] for node classification: {nc_test_item_labels[:10]}\")\n",
"torch.save(nc_test_item_labels, nc_test_item_labels_path)\n",
"print(f\"NC test labels[item] are saved to {nc_test_item_labels_path}\")"
], ],
"metadata": { "metadata": {
"id": "S5-fyBbHzTCO" "id": "S5-fyBbHzTCO"
@@ -266,39 +360,80 @@
{ {
"cell_type": "code", "cell_type": "code",
"source": [ "source": [
"# For illustration, let's generate item sets for each edge type.\n",
"num_trains = int(num_edges * 0.6)\n", "num_trains = int(num_edges * 0.6)\n",
"num_vals = int(num_edges * 0.2)\n", "num_vals = int(num_edges * 0.2)\n",
"num_tests = num_edges - num_trains - num_vals\n", "num_tests = num_edges - num_trains - num_vals\n",
"\n", "\n",
"lp_train_node_pairs_path = os.path.join(base_dir, \"lp-train-node-pairs.npy\")\n", "# Train node pairs for user:like:item.\n",
"lp_train_node_pairs = edges[:num_trains, :]\n", "lp_train_like_node_pairs_path = os.path.join(base_dir, \"lp-train-like-node-pairs.npy\")\n",
"print(f\"Part of train node pairs for link prediction: {lp_train_node_pairs[:10]}\")\n", "lp_train_like_node_pairs = like_edges[:num_trains, :]\n",
"np.save(lp_train_node_pairs_path, lp_train_node_pairs)\n", "print(f\"Part of train node pairs[user:like:item] for link prediction: {lp_train_like_node_pairs[:10]}\")\n",
"print(f\"LP train node pairs are saved to {lp_train_node_pairs_path}\")\n", "np.save(lp_train_like_node_pairs_path, lp_train_like_node_pairs)\n",
"\n", "print(f\"LP train node pairs[user:like:item] are saved to {lp_train_like_node_pairs_path}\")\n",
"lp_val_node_pairs_path = os.path.join(base_dir, \"lp-val-node-pairs.npy\")\n", "\n",
"lp_val_node_pairs = edges[num_trains:num_trains+num_vals, :]\n", "# Train node pairs for user:follow:user.\n",
"print(f\"Part of val node pairs for link prediction: {lp_val_node_pairs[:10]}\")\n", "lp_train_follow_node_pairs_path = os.path.join(base_dir, \"lp-train-follow-node-pairs.npy\")\n",
"np.save(lp_val_node_pairs_path, lp_val_node_pairs)\n", "lp_train_follow_node_pairs = follow_edges[:num_trains, :]\n",
"print(f\"LP val node pairs are saved to {lp_val_node_pairs_path}\")\n", "print(f\"Part of train node pairs[user:follow:user] for link prediction: {lp_train_follow_node_pairs[:10]}\")\n",
"\n", "np.save(lp_train_follow_node_pairs_path, lp_train_follow_node_pairs)\n",
"lp_val_neg_dsts_path = os.path.join(base_dir, \"lp-val-neg-dsts.pt\")\n", "print(f\"LP train node pairs[user:follow:user] are saved to {lp_train_follow_node_pairs_path}\")\n",
"lp_val_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n", "\n",
"print(f\"Part of val negative dsts for link prediction: {lp_val_neg_dsts[:10]}\")\n", "# Val node pairs for user:like:item.\n",
"torch.save(lp_val_neg_dsts, lp_val_neg_dsts_path)\n", "lp_val_like_node_pairs_path = os.path.join(base_dir, \"lp-val-like-node-pairs.npy\")\n",
"print(f\"LP val negative dsts are saved to {lp_val_neg_dsts_path}\")\n", "lp_val_like_node_pairs = like_edges[num_trains:num_trains+num_vals, :]\n",
"\n", "print(f\"Part of val node pairs[user:like:item] for link prediction: {lp_val_like_node_pairs[:10]}\")\n",
"lp_test_node_pairs_path = os.path.join(base_dir, \"lp-test-node-pairs.npy\")\n", "np.save(lp_val_like_node_pairs_path, lp_val_like_node_pairs)\n",
"lp_test_node_pairs = edges[-num_tests, :]\n", "print(f\"LP val node pairs[user:like:item] are saved to {lp_val_like_node_pairs_path}\")\n",
"print(f\"Part of test node pairs for link prediction: {lp_test_node_pairs[:10]}\")\n", "\n",
"np.save(lp_test_node_pairs_path, lp_test_node_pairs)\n", "# Val negative dsts for user:like:item.\n",
"print(f\"LP test node pairs are saved to {lp_test_node_pairs_path}\")\n", "lp_val_like_neg_dsts_path = os.path.join(base_dir, \"lp-val-like-neg-dsts.pt\")\n",
"\n", "lp_val_like_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n",
"lp_test_neg_dsts_path = os.path.join(base_dir, \"lp-test-neg-dsts.pt\")\n", "print(f\"Part of val negative dsts[user:like:item] for link prediction: {lp_val_like_neg_dsts[:10]}\")\n",
"lp_test_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n", "torch.save(lp_val_like_neg_dsts, lp_val_like_neg_dsts_path)\n",
"print(f\"Part of test negative dsts for link prediction: {lp_test_neg_dsts[:10]}\")\n", "print(f\"LP val negative dsts[user:like:item] are saved to {lp_val_like_neg_dsts_path}\")\n",
"torch.save(lp_test_neg_dsts, lp_test_neg_dsts_path)\n", "\n",
"print(f\"LP test negative dsts are saved to {lp_test_neg_dsts_path}\")" "# Val node pairs for user:follow:user.\n",
"lp_val_follow_node_pairs_path = os.path.join(base_dir, \"lp-val-follow-node-pairs.npy\")\n",
"lp_val_follow_node_pairs = follow_edges[num_trains:num_trains+num_vals, :]\n",
"print(f\"Part of val node pairs[user:follow:user] for link prediction: {lp_val_follow_node_pairs[:10]}\")\n",
"np.save(lp_val_follow_node_pairs_path, lp_val_follow_node_pairs)\n",
"print(f\"LP val node pairs[user:follow:user] are saved to {lp_val_follow_node_pairs_path}\")\n",
"\n",
"# Val negative dsts for user:follow:user.\n",
"lp_val_follow_neg_dsts_path = os.path.join(base_dir, \"lp-val-follow-neg-dsts.pt\")\n",
"lp_val_follow_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n",
"print(f\"Part of val negative dsts[user:follow:user] for link prediction: {lp_val_follow_neg_dsts[:10]}\")\n",
"torch.save(lp_val_follow_neg_dsts, lp_val_follow_neg_dsts_path)\n",
"print(f\"LP val negative dsts[user:follow:user] are saved to {lp_val_follow_neg_dsts_path}\")\n",
"\n",
"# Test node paris for user:like:item.\n",
"lp_test_like_node_pairs_path = os.path.join(base_dir, \"lp-test-like-node-pairs.npy\")\n",
"lp_test_like_node_pairs = like_edges[-num_tests, :]\n",
"print(f\"Part of test node pairs[user:like:item] for link prediction: {lp_test_like_node_pairs[:10]}\")\n",
"np.save(lp_test_like_node_pairs_path, lp_test_like_node_pairs)\n",
"print(f\"LP test node pairs[user:like:item] are saved to {lp_test_like_node_pairs_path}\")\n",
"\n",
"# Test negative dsts for user:like:item.\n",
"lp_test_like_neg_dsts_path = os.path.join(base_dir, \"lp-test-like-neg-dsts.pt\")\n",
"lp_test_like_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n",
"print(f\"Part of test negative dsts[user:like:item] for link prediction: {lp_test_like_neg_dsts[:10]}\")\n",
"torch.save(lp_test_like_neg_dsts, lp_test_like_neg_dsts_path)\n",
"print(f\"LP test negative dsts[user:like:item] are saved to {lp_test_like_neg_dsts_path}\")\n",
"\n",
"# Test node paris for user:follow:user.\n",
"lp_test_follow_node_pairs_path = os.path.join(base_dir, \"lp-test-follow-node-pairs.npy\")\n",
"lp_test_follow_node_pairs = follow_edges[-num_tests, :]\n",
"print(f\"Part of test node pairs[user:follow:user] for link prediction: {lp_test_follow_node_pairs[:10]}\")\n",
"np.save(lp_test_follow_node_pairs_path, lp_test_follow_node_pairs)\n",
"print(f\"LP test node pairs[user:follow:user] are saved to {lp_test_follow_node_pairs_path}\")\n",
"\n",
"# Test negative dsts for user:follow:user.\n",
"lp_test_follow_neg_dsts_path = os.path.join(base_dir, \"lp-test-follow-neg-dsts.pt\")\n",
"lp_test_follow_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n",
"print(f\"Part of test negative dsts[user:follow:user] for link prediction: {lp_test_follow_neg_dsts[:10]}\")\n",
"torch.save(lp_test_follow_neg_dsts, lp_test_follow_neg_dsts_path)\n",
"print(f\"LP test negative dsts[user:follow:user] are saved to {lp_test_follow_neg_dsts_path}\")"
], ],
"metadata": { "metadata": {
"id": "u0jCnXIcAQy4" "id": "u0jCnXIcAQy4"
@@ -310,7 +445,9 @@
"cell_type": "markdown", "cell_type": "markdown",
"source": [ "source": [
"## Organize Data into YAML File\n", "## Organize Data into YAML File\n",
"Now we need to create a `metadata.yaml` file which contains the paths, dadta types of graph structure, feature data, training/validation/test sets. Please note that all path should be relative to `metadata.yaml`." "Now we need to create a `metadata.yaml` file which contains the paths, dadta types of graph structure, feature data, training/validation/test sets. Please note that all path should be relative to `metadata.yaml`.\n",
"\n",
"For heterogeneous graph, we need to specify the node/edge type in **type** fields. For edge type, canonical etype is required which is a string that's concatenated by source node type, etype, and destination node type together with `:`."
], ],
"metadata": { "metadata": {
"id": "wbk6-wxRK-6S" "id": "wbk6-wxRK-6S"
@@ -323,92 +460,193 @@
" dataset_name: heterogeneous_graph_nc_lp\n", " dataset_name: heterogeneous_graph_nc_lp\n",
" graph:\n", " graph:\n",
" nodes:\n", " nodes:\n",
" - num: {num_nodes}\n", " - type: user\n",
" num: {num_nodes}\n",
" - type: item\n",
" num: {num_nodes}\n",
" edges:\n", " edges:\n",
" - format: csv\n", " - type: \"user:like:item\"\n",
" path: {os.path.basename(edges_path)}\n", " format: csv\n",
" path: {os.path.basename(like_edges_path)}\n",
" - type: \"user:follow:user\"\n",
" format: csv\n",
" path: {os.path.basename(follow_edges_path)}\n",
" feature_data:\n", " feature_data:\n",
" - domain: node\n", " - domain: node\n",
" type: user\n",
" name: feat_0\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(node_user_feat_0_path)}\n",
" - domain: node\n",
" type: user\n",
" name: feat_1\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(node_user_feat_1_path)}\n",
" - domain: node\n",
" type: item\n",
" name: feat_0\n", " name: feat_0\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(node_feat_0_path)}\n", " path: {os.path.basename(node_item_feat_0_path)}\n",
" - domain: node\n", " - domain: node\n",
" type: item\n",
" name: feat_1\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(node_item_feat_1_path)}\n",
" - domain: edge\n",
" type: \"user:like:item\"\n",
" name: feat_0\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(edge_like_feat_0_path)}\n",
" - domain: edge\n",
" type: \"user:like:item\"\n",
" name: feat_1\n", " name: feat_1\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(node_feat_1_path)}\n", " path: {os.path.basename(edge_like_feat_1_path)}\n",
" - domain: edge\n", " - domain: edge\n",
" type: \"user:follow:user\"\n",
" name: feat_0\n", " name: feat_0\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(edge_feat_0_path)}\n", " path: {os.path.basename(edge_follow_feat_0_path)}\n",
" - domain: edge\n", " - domain: edge\n",
" type: \"user:follow:user\"\n",
" name: feat_1\n", " name: feat_1\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(edge_feat_1_path)}\n", " path: {os.path.basename(edge_follow_feat_1_path)}\n",
" tasks:\n", " tasks:\n",
" - name: node_classification\n", " - name: node_classification\n",
" num_classes: 10\n", " num_classes: 10\n",
" train_set:\n", " train_set:\n",
" - data:\n", " - data:\n",
" - name: seed_nodes\n", " - type: user\n",
" name: seed_nodes\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(nc_train_ids_path)}\n", " path: {os.path.basename(nc_train_user_ids_path)}\n",
" - name: labels\n", " - type: user\n",
" name: labels\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(nc_train_labels_path)}\n", " path: {os.path.basename(nc_train_user_labels_path)}\n",
" - type: item\n",
" name: seed_nodes\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_train_item_ids_path)}\n",
" - type: item\n",
" name: labels\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_train_item_labels_path)}\n",
" validation_set:\n", " validation_set:\n",
" - data:\n", " - data:\n",
" - name: seed_nodes\n", " - type: user\n",
" name: seed_nodes\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(nc_val_ids_path)}\n", " path: {os.path.basename(nc_val_user_ids_path)}\n",
" - name: labels\n", " - type: user\n",
" name: labels\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(nc_val_labels_path)}\n", " path: {os.path.basename(nc_val_user_labels_path)}\n",
" - type: item\n",
" name: seed_nodes\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_val_item_ids_path)}\n",
" - type: item\n",
" name: labels\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_val_item_labels_path)}\n",
" test_set:\n", " test_set:\n",
" - data:\n", " - data:\n",
" - name: seed_nodes\n", " - type: user\n",
" name: seed_nodes\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(nc_test_ids_path)}\n", " path: {os.path.basename(nc_test_user_ids_path)}\n",
" - name: labels\n", " - type: user\n",
" name: labels\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(nc_test_labels_path)}\n", " path: {os.path.basename(nc_test_user_labels_path)}\n",
" - type: item\n",
" name: seed_nodes\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_test_item_ids_path)}\n",
" - type: item\n",
" name: labels\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_test_item_labels_path)}\n",
" - name: link_prediction\n", " - name: link_prediction\n",
" num_classes: 10\n", " num_classes: 10\n",
" train_set:\n", " train_set:\n",
" - data:\n", " - data:\n",
" - name: node_pairs\n", " - type: \"user:like:item\"\n",
" name: node_pairs\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_train_like_node_pairs_path)}\n",
" - type: \"user:follow:user\"\n",
" name: node_pairs\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(lp_train_node_pairs_path)}\n", " path: {os.path.basename(lp_train_follow_node_pairs_path)}\n",
" validation_set:\n", " validation_set:\n",
" - data:\n", " - data:\n",
" - name: node_pairs\n", " - type: \"user:like:item\"\n",
" name: node_pairs\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(lp_val_node_pairs_path)}\n", " path: {os.path.basename(lp_val_like_node_pairs_path)}\n",
" - name: negative_dsts\n", " - type: \"user:like:item\"\n",
" name: negative_dsts\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(lp_val_neg_dsts_path)}\n", " path: {os.path.basename(lp_val_like_neg_dsts_path)}\n",
" - type: \"user:follow:user\"\n",
" name: node_pairs\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_val_follow_node_pairs_path)}\n",
" - type: \"user:follow:user\"\n",
" name: negative_dsts\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(lp_val_follow_neg_dsts_path)}\n",
" test_set:\n", " test_set:\n",
" - data:\n", " - data:\n",
" - name: node_pairs\n", " - type: \"user:like:item\"\n",
" name: node_pairs\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_test_like_node_pairs_path)}\n",
" - type: \"user:like:item\"\n",
" name: negative_dsts\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(lp_test_like_neg_dsts_path)}\n",
" - type: \"user:follow:user\"\n",
" name: node_pairs\n",
" format: numpy\n", " format: numpy\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(lp_test_node_pairs_path)}\n", " path: {os.path.basename(lp_test_follow_node_pairs_path)}\n",
" - name: negative_dsts\n", " - type: \"user:follow:user\"\n",
" name: negative_dsts\n",
" format: torch\n", " format: torch\n",
" in_memory: true\n", " in_memory: true\n",
" path: {os.path.basename(lp_test_neg_dsts_path)}\n", " path: {os.path.basename(lp_test_follow_neg_dsts_path)}\n",
"\"\"\"\n", "\"\"\"\n",
"metadata_path = os.path.join(base_dir, \"metadata.yaml\")\n", "metadata_path = os.path.join(base_dir, \"metadata.yaml\")\n",
"with open(metadata_path, \"w\") as f:\n", "with open(metadata_path, \"w\") as f:\n",
@@ -457,4 +695,4 @@
"outputs": [] "outputs": []
} }
] ]
} }
\ No newline at end of file
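Once `metadata.yaml` and all the files it references are in place, the dataset can be instantiated and loaded. A minimal sketch, assuming the standard GraphBolt entry point `dgl.graphbolt.OnDiskDataset` (the notebook's remaining cells presumably do the equivalent):

```python
import dgl.graphbolt as gb

# base_dir is the directory that contains metadata.yaml.
dataset = gb.OnDiskDataset(base_dir).load()
print(dataset.dataset_name)  # heterogeneous_graph_nc_lp
graph = dataset.graph        # sampling graph built from the CSV edge files
feature = dataset.feature    # features declared under feature_data
tasks = dataset.tasks        # node_classification and link_prediction tasks
```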
@@ -105,7 +105,7 @@
"For homogeneous graph, we just need to save edges(namely node pairs) into **CSV** file.\n", "For homogeneous graph, we just need to save edges(namely node pairs) into **CSV** file.\n",
"\n", "\n",
"Note:\n", "Note:\n",
"when saving to file, do not save index and header.*italicized text*\n" "when saving to file, do not save index and header.\n"
], ],
"metadata": { "metadata": {
"id": "qhNtIn_xhlnl" "id": "qhNtIn_xhlnl"
@@ -457,4 +457,4 @@
"outputs": [] "outputs": []
} }
] ]
} }
\ No newline at end of file