Unverified Commit 6db323b3 authored by Rhett Ying's avatar Rhett Ying Committed by GitHub
Browse files

[GraphBolt] update OnDiskDataset tutorial (#6783)

parent 97ed294d
......@@ -22,7 +22,7 @@
"\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dmlc/dgl/blob/master/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb) [![GitHub](https://img.shields.io/badge/-View%20on%20GitHub-181717?logo=github&logoColor=ffffff)](https://github.com/dmlc/dgl/blob/master/notebooks/stochastic_training/ondisk_dataset_heterograph.ipynb)\n",
"\n",
"This tutorial shows how to create `OnDiskDataset` for heterogeneous graph that could be used in **GraphBolt** framework.\n",
"This tutorial shows how to create an `OnDiskDataset` for a heterogeneous graph that can be used in the **GraphBolt** framework. The major difference from creating a dataset for a homogeneous graph is that node/edge types must be specified for edges, feature data, and training/validation/test sets.\n",
"\n",
"By the end of this tutorial, you will be able to\n",
"- organize graph structure data.\n",
......@@ -102,10 +102,10 @@
"cell_type": "markdown",
"source": [
"### Generate graph structure data\n",
"For heterogeneous graph, we just need to save edges(namely node pairs) into **CSV** file.\n",
"For a heterogeneous graph, we need to save the edges (namely node pairs) of each edge type into separate **CSV** files.\n",
"\n",
"Note:\n",
"when saving to file, do not save index and header.*italicized text*\n"
"when saving to file, do not save index and header.\n"
],
"metadata": {
"id": "qhNtIn_xhlnl"
......@@ -116,17 +116,31 @@
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# For simplicity, we create a heterogeneous graph with\n",
"# 2 node types: `user`, `item`\n",
"# 2 edge types: `user:like:item`, `user:follow:user`\n",
"# And each node/edge type has the same number of nodes/edges.\n",
"num_nodes = 1000\n",
"num_edges = 10 * num_nodes\n",
"edges_path = os.path.join(base_dir, \"edges.csv\")\n",
"edges = np.random.randint(0, num_nodes, size=(num_edges, 2))\n",
"\n",
"print(f\"Part of edges: {edges[:10, :]}\")\n",
"# Edge type: \"user:like:item\"\n",
"like_edges_path = os.path.join(base_dir, \"like-edges.csv\")\n",
"like_edges = np.random.randint(0, num_nodes, size=(num_edges, 2))\n",
"print(f\"Part of [user:like:item] edges: {like_edges[:10, :]}\")\n",
"\n",
"df = pd.DataFrame(like_edges)\n",
"df.to_csv(like_edges_path, index=False, header=False)\n",
"print(f\"[user:like:item] edges are saved into {like_edges_path}\")\n",
"\n",
"df = pd.DataFrame(edges)\n",
"df.to_csv(edges_path, index=False, header=False)\n",
"# Edge type: \"user:follow:user\"\n",
"follow_edges_path = os.path.join(base_dir, \"follow-edges.csv\")\n",
"follow_edges = np.random.randint(0, num_nodes, size=(num_edges, 2))\n",
"print(f\"Part of [user:follow:user] edges: {follow_edges[:10, :]}\")\n",
"\n",
"print(f\"Edges are saved into {edges_path}\")"
"df = pd.DataFrame(follow_edges)\n",
"df.to_csv(follow_edges_path, index=False, header=False)\n",
"print(f\"[user:follow:user] edges are saved into {follow_edges_path}\")"
],
"metadata": {
"id": "HcBt4G5BmSjr"
......@@ -138,7 +152,7 @@
"cell_type": "markdown",
"source": [
"### Generate feature data for graph\n",
"For feature data, numpy arrays and torch tensors are supported for now."
"For feature data, numpy arrays and torch tensors are supported for now. Let's generate feature data for each node/edge type."
],
"metadata": {
"id": "kh-4cPtzpcaH"
......@@ -147,33 +161,61 @@
{
"cell_type": "code",
"source": [
"# Generate node feature in numpy array.\n",
"node_feat_0_path = os.path.join(base_dir, \"node-feat-0.npy\")\n",
"node_feat_0 = np.random.rand(num_nodes, 5)\n",
"print(f\"Part of node feature [feat_0]: {node_feat_0[:10, :]}\")\n",
"np.save(node_feat_0_path, node_feat_0)\n",
"print(f\"Node feature [feat_0] is saved to {node_feat_0_path}\")\n",
"\n",
"# Generate another node feature in torch tensor\n",
"node_feat_1_path = os.path.join(base_dir, \"node-feat-1.pt\")\n",
"node_feat_1 = torch.rand(num_nodes, 5)\n",
"print(f\"Part of node feature [feat_1]: {node_feat_1[:10, :]}\")\n",
"torch.save(node_feat_1, node_feat_1_path)\n",
"print(f\"Node feature [feat_1] is saved to {node_feat_1_path}\")\n",
"\n",
"# Generate edge feature in numpy array.\n",
"edge_feat_0_path = os.path.join(base_dir, \"edge-feat-0.npy\")\n",
"edge_feat_0 = np.random.rand(num_edges, 5)\n",
"print(f\"Part of edge feature [feat_0]: {edge_feat_0[:10, :]}\")\n",
"np.save(edge_feat_0_path, edge_feat_0)\n",
"print(f\"Edge feature [feat_0] is saved to {edge_feat_0_path}\")\n",
"\n",
"# Generate another edge feature in torch tensor\n",
"edge_feat_1_path = os.path.join(base_dir, \"edge-feat-1.pt\")\n",
"edge_feat_1 = torch.rand(num_edges, 5)\n",
"print(f\"Part of edge feature [feat_1]: {edge_feat_1[:10, :]}\")\n",
"torch.save(edge_feat_1, edge_feat_1_path)\n",
"print(f\"Edge feature [feat_1] is saved to {edge_feat_1_path}\")\n"
"# Generate node[user] feature in numpy array.\n",
"node_user_feat_0_path = os.path.join(base_dir, \"node-user-feat-0.npy\")\n",
"node_user_feat_0 = np.random.rand(num_nodes, 5)\n",
"print(f\"Part of node[user] feature [feat_0]: {node_user_feat_0[:10, :]}\")\n",
"np.save(node_user_feat_0_path, node_user_feat_0)\n",
"print(f\"Node[user] feature [feat_0] is saved to {node_user_feat_0_path}\")\n",
"\n",
"# Generate another node[user] feature in torch tensor\n",
"node_user_feat_1_path = os.path.join(base_dir, \"node-user-feat-1.pt\")\n",
"node_user_feat_1 = torch.rand(num_nodes, 5)\n",
"print(f\"Part of node[user] feature [feat_1]: {node_user_feat_1[:10, :]}\")\n",
"torch.save(node_user_feat_1, node_user_feat_1_path)\n",
"print(f\"Node[user] feature [feat_1] is saved to {node_user_feat_1_path}\")\n",
"\n",
"# Generate node[item] feature in numpy array.\n",
"node_item_feat_0_path = os.path.join(base_dir, \"node-item-feat-0.npy\")\n",
"node_item_feat_0 = np.random.rand(num_nodes, 5)\n",
"print(f\"Part of node[item] feature [feat_0]: {node_item_feat_0[:10, :]}\")\n",
"np.save(node_item_feat_0_path, node_item_feat_0)\n",
"print(f\"Node[item] feature [feat_0] is saved to {node_item_feat_0_path}\")\n",
"\n",
"# Generate another node[item] feature in torch tensor\n",
"node_item_feat_1_path = os.path.join(base_dir, \"node-item-feat-1.pt\")\n",
"node_item_feat_1 = torch.rand(num_nodes, 5)\n",
"print(f\"Part of node[item] feature [feat_1]: {node_item_feat_1[:10, :]}\")\n",
"torch.save(node_item_feat_1, node_item_feat_1_path)\n",
"print(f\"Node[item] feature [feat_1] is saved to {node_item_feat_1_path}\")\n",
"\n",
"# Generate edge[user:like:item] feature in numpy array.\n",
"edge_like_feat_0_path = os.path.join(base_dir, \"edge-like-feat-0.npy\")\n",
"edge_like_feat_0 = np.random.rand(num_edges, 5)\n",
"print(f\"Part of edge[user:like:item] feature [feat_0]: {edge_like_feat_0[:10, :]}\")\n",
"np.save(edge_like_feat_0_path, edge_like_feat_0)\n",
"print(f\"Edge[user:like:item] feature [feat_0] is saved to {edge_like_feat_0_path}\")\n",
"\n",
"# Generate another edge[user:like:item] feature in torch tensor\n",
"edge_like_feat_1_path = os.path.join(base_dir, \"edge-like-feat-1.pt\")\n",
"edge_like_feat_1 = torch.rand(num_edges, 5)\n",
"print(f\"Part of edge[user:like:item] feature [feat_1]: {edge_like_feat_1[:10, :]}\")\n",
"torch.save(edge_like_feat_1, edge_like_feat_1_path)\n",
"print(f\"Edge[user:like:item] feature [feat_1] is saved to {edge_like_feat_1_path}\")\n",
"\n",
"# Generate edge[user:follow:user] feature in numpy array.\n",
"edge_follow_feat_0_path = os.path.join(base_dir, \"edge-follow-feat-0.npy\")\n",
"edge_follow_feat_0 = np.random.rand(num_edges, 5)\n",
"print(f\"Part of edge[user:follow:user] feature [feat_0]: {edge_follow_feat_0[:10, :]}\")\n",
"np.save(edge_follow_feat_0_path, edge_follow_feat_0)\n",
"print(f\"Edge[user:follow:user] feature [feat_0] is saved to {edge_follow_feat_0_path}\")\n",
"\n",
"# Generate another edge[user:follow:user] feature in torch tensor\n",
"edge_follow_feat_1_path = os.path.join(base_dir, \"edge-follow-feat-1.pt\")\n",
"edge_follow_feat_1 = torch.rand(num_edges, 5)\n",
"print(f\"Part of edge[user:follow:user] feature [feat_1]: {edge_follow_feat_1[:10, :]}\")\n",
"torch.save(edge_follow_feat_1, edge_follow_feat_1_path)\n",
"print(f\"Edge[user:follow:user] feature [feat_1] is saved to {edge_follow_feat_1_path}\")"
],
"metadata": {
"id": "_PVu1u5brBhF"
......@@ -204,48 +246,100 @@
{
"cell_type": "code",
"source": [
"# For illustration, let's generate item sets for each node type.\n",
"num_trains = int(num_nodes * 0.6)\n",
"num_vals = int(num_nodes * 0.2)\n",
"num_tests = num_nodes - num_trains - num_vals\n",
"\n",
"ids = np.arange(num_nodes)\n",
"np.random.shuffle(ids)\n",
"\n",
"nc_train_ids_path = os.path.join(base_dir, \"nc-train-ids.npy\")\n",
"nc_train_ids = ids[:num_trains]\n",
"print(f\"Part of train ids for node classification: {nc_train_ids[:10]}\")\n",
"np.save(nc_train_ids_path, nc_train_ids)\n",
"print(f\"NC train ids are saved to {nc_train_ids_path}\")\n",
"\n",
"nc_train_labels_path = os.path.join(base_dir, \"nc-train-labels.pt\")\n",
"nc_train_labels = torch.randint(0, 10, (num_trains,))\n",
"print(f\"Part of train labels for node classification: {nc_train_labels[:10]}\")\n",
"torch.save(nc_train_labels, nc_train_labels_path)\n",
"print(f\"NC train labels are saved to {nc_train_labels_path}\")\n",
"\n",
"nc_val_ids_path = os.path.join(base_dir, \"nc-val-ids.npy\")\n",
"nc_val_ids = ids[num_trains:num_trains+num_vals]\n",
"print(f\"Part of val ids for node classification: {nc_val_ids[:10]}\")\n",
"np.save(nc_val_ids_path, nc_val_ids)\n",
"print(f\"NC val ids are saved to {nc_val_ids_path}\")\n",
"\n",
"nc_val_labels_path = os.path.join(base_dir, \"nc-val-labels.pt\")\n",
"nc_val_labels = torch.randint(0, 10, (num_vals,))\n",
"print(f\"Part of val labels for node classification: {nc_val_labels[:10]}\")\n",
"torch.save(nc_val_labels, nc_val_labels_path)\n",
"print(f\"NC val labels are saved to {nc_val_labels_path}\")\n",
"\n",
"nc_test_ids_path = os.path.join(base_dir, \"nc-test-ids.npy\")\n",
"nc_test_ids = ids[-num_tests:]\n",
"print(f\"Part of test ids for node classification: {nc_test_ids[:10]}\")\n",
"np.save(nc_test_ids_path, nc_test_ids)\n",
"print(f\"NC test ids are saved to {nc_test_ids_path}\")\n",
"\n",
"nc_test_labels_path = os.path.join(base_dir, \"nc-test-labels.pt\")\n",
"nc_test_labels = torch.randint(0, 10, (num_tests,))\n",
"print(f\"Part of test labels for node classification: {nc_test_labels[:10]}\")\n",
"torch.save(nc_test_labels, nc_test_labels_path)\n",
"print(f\"NC test labels are saved to {nc_test_labels_path}\")"
"user_ids = np.arange(num_nodes)\n",
"np.random.shuffle(user_ids)\n",
"\n",
"item_ids = np.arange(num_nodes)\n",
"np.random.shuffle(item_ids)\n",
"\n",
"# Train IDs for user.\n",
"nc_train_user_ids_path = os.path.join(base_dir, \"nc-train-user-ids.npy\")\n",
"nc_train_user_ids = user_ids[:num_trains]\n",
"print(f\"Part of train ids[user] for node classification: {nc_train_user_ids[:10]}\")\n",
"np.save(nc_train_user_ids_path, nc_train_user_ids)\n",
"print(f\"NC train ids[user] are saved to {nc_train_user_ids_path}\")\n",
"\n",
"# Train labels for user.\n",
"nc_train_user_labels_path = os.path.join(base_dir, \"nc-train-user-labels.pt\")\n",
"nc_train_user_labels = torch.randint(0, 10, (num_trains,))\n",
"print(f\"Part of train labels[user] for node classification: {nc_train_user_labels[:10]}\")\n",
"torch.save(nc_train_user_labels, nc_train_user_labels_path)\n",
"print(f\"NC train labels[user] are saved to {nc_train_user_labels_path}\")\n",
"\n",
"# Train IDs for item.\n",
"nc_train_item_ids_path = os.path.join(base_dir, \"nc-train-item-ids.npy\")\n",
"nc_train_item_ids = item_ids[:num_trains]\n",
"print(f\"Part of train ids[item] for node classification: {nc_train_item_ids[:10]}\")\n",
"np.save(nc_train_item_ids_path, nc_train_item_ids)\n",
"print(f\"NC train ids[item] are saved to {nc_train_item_ids_path}\")\n",
"\n",
"# Train labels for item.\n",
"nc_train_item_labels_path = os.path.join(base_dir, \"nc-train-item-labels.pt\")\n",
"nc_train_item_labels = torch.randint(0, 10, (num_trains,))\n",
"print(f\"Part of train labels[item] for node classification: {nc_train_item_labels[:10]}\")\n",
"torch.save(nc_train_item_labels, nc_train_item_labels_path)\n",
"print(f\"NC train labels[item] are saved to {nc_train_item_labels_path}\")\n",
"\n",
"# Val IDs for user.\n",
"nc_val_user_ids_path = os.path.join(base_dir, \"nc-val-user-ids.npy\")\n",
"nc_val_user_ids = user_ids[num_trains:num_trains+num_vals]\n",
"print(f\"Part of val ids[user] for node classification: {nc_val_user_ids[:10]}\")\n",
"np.save(nc_val_user_ids_path, nc_val_user_ids)\n",
"print(f\"NC val ids[user] are saved to {nc_val_user_ids_path}\")\n",
"\n",
"# Val labels for user.\n",
"nc_val_user_labels_path = os.path.join(base_dir, \"nc-val-user-labels.pt\")\n",
"nc_val_user_labels = torch.randint(0, 10, (num_vals,))\n",
"print(f\"Part of val labels[user] for node classification: {nc_val_user_labels[:10]}\")\n",
"torch.save(nc_val_user_labels, nc_val_user_labels_path)\n",
"print(f\"NC val labels[user] are saved to {nc_val_user_labels_path}\")\n",
"\n",
"# Val IDs for item.\n",
"nc_val_item_ids_path = os.path.join(base_dir, \"nc-val-item-ids.npy\")\n",
"nc_val_item_ids = item_ids[num_trains:num_trains+num_vals]\n",
"print(f\"Part of val ids[item] for node classification: {nc_val_item_ids[:10]}\")\n",
"np.save(nc_val_item_ids_path, nc_val_item_ids)\n",
"print(f\"NC val ids[item] are saved to {nc_val_item_ids_path}\")\n",
"\n",
"# Val labels for item.\n",
"nc_val_item_labels_path = os.path.join(base_dir, \"nc-val-item-labels.pt\")\n",
"nc_val_item_labels = torch.randint(0, 10, (num_vals,))\n",
"print(f\"Part of val labels[item] for node classification: {nc_val_item_labels[:10]}\")\n",
"torch.save(nc_val_item_labels, nc_val_item_labels_path)\n",
"print(f\"NC val labels[item] are saved to {nc_val_item_labels_path}\")\n",
"\n",
"# Test IDs for user.\n",
"nc_test_user_ids_path = os.path.join(base_dir, \"nc-test-user-ids.npy\")\n",
"nc_test_user_ids = user_ids[-num_tests:]\n",
"print(f\"Part of test ids[user] for node classification: {nc_test_user_ids[:10]}\")\n",
"np.save(nc_test_user_ids_path, nc_test_user_ids)\n",
"print(f\"NC test ids[user] are saved to {nc_test_user_ids_path}\")\n",
"\n",
"# Test labels for user.\n",
"nc_test_user_labels_path = os.path.join(base_dir, \"nc-test-user-labels.pt\")\n",
"nc_test_user_labels = torch.randint(0, 10, (num_tests,))\n",
"print(f\"Part of test labels[user] for node classification: {nc_test_user_labels[:10]}\")\n",
"torch.save(nc_test_user_labels, nc_test_user_labels_path)\n",
"print(f\"NC test labels[user] are saved to {nc_test_user_labels_path}\")\n",
"\n",
"# Test IDs for item.\n",
"nc_test_item_ids_path = os.path.join(base_dir, \"nc-test-item-ids.npy\")\n",
"nc_test_item_ids = item_ids[-num_tests:]\n",
"print(f\"Part of test ids[item] for node classification: {nc_test_item_ids[:10]}\")\n",
"np.save(nc_test_item_ids_path, nc_test_item_ids)\n",
"print(f\"NC test ids[item] are saved to {nc_test_item_ids_path}\")\n",
"\n",
"# Test labels for item.\n",
"nc_test_item_labels_path = os.path.join(base_dir, \"nc-test-item-labels.pt\")\n",
"nc_test_item_labels = torch.randint(0, 10, (num_tests,))\n",
"print(f\"Part of test labels[item] for node classification: {nc_test_item_labels[:10]}\")\n",
"torch.save(nc_test_item_labels, nc_test_item_labels_path)\n",
"print(f\"NC test labels[item] are saved to {nc_test_item_labels_path}\")"
],
"metadata": {
"id": "S5-fyBbHzTCO"
......@@ -266,39 +360,80 @@
{
"cell_type": "code",
"source": [
"# For illustration, let's generate item sets for each edge type.\n",
"num_trains = int(num_edges * 0.6)\n",
"num_vals = int(num_edges * 0.2)\n",
"num_tests = num_edges - num_trains - num_vals\n",
"\n",
"lp_train_node_pairs_path = os.path.join(base_dir, \"lp-train-node-pairs.npy\")\n",
"lp_train_node_pairs = edges[:num_trains, :]\n",
"print(f\"Part of train node pairs for link prediction: {lp_train_node_pairs[:10]}\")\n",
"np.save(lp_train_node_pairs_path, lp_train_node_pairs)\n",
"print(f\"LP train node pairs are saved to {lp_train_node_pairs_path}\")\n",
"\n",
"lp_val_node_pairs_path = os.path.join(base_dir, \"lp-val-node-pairs.npy\")\n",
"lp_val_node_pairs = edges[num_trains:num_trains+num_vals, :]\n",
"print(f\"Part of val node pairs for link prediction: {lp_val_node_pairs[:10]}\")\n",
"np.save(lp_val_node_pairs_path, lp_val_node_pairs)\n",
"print(f\"LP val node pairs are saved to {lp_val_node_pairs_path}\")\n",
"\n",
"lp_val_neg_dsts_path = os.path.join(base_dir, \"lp-val-neg-dsts.pt\")\n",
"lp_val_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n",
"print(f\"Part of val negative dsts for link prediction: {lp_val_neg_dsts[:10]}\")\n",
"torch.save(lp_val_neg_dsts, lp_val_neg_dsts_path)\n",
"print(f\"LP val negative dsts are saved to {lp_val_neg_dsts_path}\")\n",
"\n",
"lp_test_node_pairs_path = os.path.join(base_dir, \"lp-test-node-pairs.npy\")\n",
"lp_test_node_pairs = edges[-num_tests, :]\n",
"print(f\"Part of test node pairs for link prediction: {lp_test_node_pairs[:10]}\")\n",
"np.save(lp_test_node_pairs_path, lp_test_node_pairs)\n",
"print(f\"LP test node pairs are saved to {lp_test_node_pairs_path}\")\n",
"\n",
"lp_test_neg_dsts_path = os.path.join(base_dir, \"lp-test-neg-dsts.pt\")\n",
"lp_test_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n",
"print(f\"Part of test negative dsts for link prediction: {lp_test_neg_dsts[:10]}\")\n",
"torch.save(lp_test_neg_dsts, lp_test_neg_dsts_path)\n",
"print(f\"LP test negative dsts are saved to {lp_test_neg_dsts_path}\")"
"# Train node pairs for user:like:item.\n",
"lp_train_like_node_pairs_path = os.path.join(base_dir, \"lp-train-like-node-pairs.npy\")\n",
"lp_train_like_node_pairs = like_edges[:num_trains, :]\n",
"print(f\"Part of train node pairs[user:like:item] for link prediction: {lp_train_like_node_pairs[:10]}\")\n",
"np.save(lp_train_like_node_pairs_path, lp_train_like_node_pairs)\n",
"print(f\"LP train node pairs[user:like:item] are saved to {lp_train_like_node_pairs_path}\")\n",
"\n",
"# Train node pairs for user:follow:user.\n",
"lp_train_follow_node_pairs_path = os.path.join(base_dir, \"lp-train-follow-node-pairs.npy\")\n",
"lp_train_follow_node_pairs = follow_edges[:num_trains, :]\n",
"print(f\"Part of train node pairs[user:follow:user] for link prediction: {lp_train_follow_node_pairs[:10]}\")\n",
"np.save(lp_train_follow_node_pairs_path, lp_train_follow_node_pairs)\n",
"print(f\"LP train node pairs[user:follow:user] are saved to {lp_train_follow_node_pairs_path}\")\n",
"\n",
"# Val node pairs for user:like:item.\n",
"lp_val_like_node_pairs_path = os.path.join(base_dir, \"lp-val-like-node-pairs.npy\")\n",
"lp_val_like_node_pairs = like_edges[num_trains:num_trains+num_vals, :]\n",
"print(f\"Part of val node pairs[user:like:item] for link prediction: {lp_val_like_node_pairs[:10]}\")\n",
"np.save(lp_val_like_node_pairs_path, lp_val_like_node_pairs)\n",
"print(f\"LP val node pairs[user:like:item] are saved to {lp_val_like_node_pairs_path}\")\n",
"\n",
"# Val negative dsts for user:like:item.\n",
"lp_val_like_neg_dsts_path = os.path.join(base_dir, \"lp-val-like-neg-dsts.pt\")\n",
"lp_val_like_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n",
"print(f\"Part of val negative dsts[user:like:item] for link prediction: {lp_val_like_neg_dsts[:10]}\")\n",
"torch.save(lp_val_like_neg_dsts, lp_val_like_neg_dsts_path)\n",
"print(f\"LP val negative dsts[user:like:item] are saved to {lp_val_like_neg_dsts_path}\")\n",
"\n",
"# Val node pairs for user:follow:user.\n",
"lp_val_follow_node_pairs_path = os.path.join(base_dir, \"lp-val-follow-node-pairs.npy\")\n",
"lp_val_follow_node_pairs = follow_edges[num_trains:num_trains+num_vals, :]\n",
"print(f\"Part of val node pairs[user:follow:user] for link prediction: {lp_val_follow_node_pairs[:10]}\")\n",
"np.save(lp_val_follow_node_pairs_path, lp_val_follow_node_pairs)\n",
"print(f\"LP val node pairs[user:follow:user] are saved to {lp_val_follow_node_pairs_path}\")\n",
"\n",
"# Val negative dsts for user:follow:user.\n",
"lp_val_follow_neg_dsts_path = os.path.join(base_dir, \"lp-val-follow-neg-dsts.pt\")\n",
"lp_val_follow_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n",
"print(f\"Part of val negative dsts[user:follow:user] for link prediction: {lp_val_follow_neg_dsts[:10]}\")\n",
"torch.save(lp_val_follow_neg_dsts, lp_val_follow_neg_dsts_path)\n",
"print(f\"LP val negative dsts[user:follow:user] are saved to {lp_val_follow_neg_dsts_path}\")\n",
"\n",
"# Test node pairs for user:like:item.\n",
"# Note the trailing colon in `-num_tests:`: it slices the last `num_tests` rows.\n",
"# `-num_tests` alone would pick a single row and yield a 1-D array of shape (2,).\n",
"lp_test_like_node_pairs_path = os.path.join(base_dir, \"lp-test-like-node-pairs.npy\")\n",
"lp_test_like_node_pairs = like_edges[-num_tests:, :]\n",
"print(f\"Part of test node pairs[user:like:item] for link prediction: {lp_test_like_node_pairs[:10]}\")\n",
"np.save(lp_test_like_node_pairs_path, lp_test_like_node_pairs)\n",
"print(f\"LP test node pairs[user:like:item] are saved to {lp_test_like_node_pairs_path}\")\n",
"\n",
"# Test negative dsts for user:like:item.\n",
"lp_test_like_neg_dsts_path = os.path.join(base_dir, \"lp-test-like-neg-dsts.pt\")\n",
"lp_test_like_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n",
"print(f\"Part of test negative dsts[user:like:item] for link prediction: {lp_test_like_neg_dsts[:10]}\")\n",
"torch.save(lp_test_like_neg_dsts, lp_test_like_neg_dsts_path)\n",
"print(f\"LP test negative dsts[user:like:item] are saved to {lp_test_like_neg_dsts_path}\")\n",
"\n",
"# Test node pairs for user:follow:user.\n",
"# Note the trailing colon in `-num_tests:`: it slices the last `num_tests` rows.\n",
"# `-num_tests` alone would pick a single row and yield a 1-D array of shape (2,).\n",
"lp_test_follow_node_pairs_path = os.path.join(base_dir, \"lp-test-follow-node-pairs.npy\")\n",
"lp_test_follow_node_pairs = follow_edges[-num_tests:, :]\n",
"print(f\"Part of test node pairs[user:follow:user] for link prediction: {lp_test_follow_node_pairs[:10]}\")\n",
"np.save(lp_test_follow_node_pairs_path, lp_test_follow_node_pairs)\n",
"print(f\"LP test node pairs[user:follow:user] are saved to {lp_test_follow_node_pairs_path}\")\n",
"\n",
"# Test negative dsts for user:follow:user.\n",
"lp_test_follow_neg_dsts_path = os.path.join(base_dir, \"lp-test-follow-neg-dsts.pt\")\n",
"lp_test_follow_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n",
"print(f\"Part of test negative dsts[user:follow:user] for link prediction: {lp_test_follow_neg_dsts[:10]}\")\n",
"torch.save(lp_test_follow_neg_dsts, lp_test_follow_neg_dsts_path)\n",
"print(f\"LP test negative dsts[user:follow:user] are saved to {lp_test_follow_neg_dsts_path}\")"
],
"metadata": {
"id": "u0jCnXIcAQy4"
......@@ -310,7 +445,9 @@
"cell_type": "markdown",
"source": [
"## Organize Data into YAML File\n",
"Now we need to create a `metadata.yaml` file which contains the paths, dadta types of graph structure, feature data, training/validation/test sets. Please note that all path should be relative to `metadata.yaml`."
"Now we need to create a `metadata.yaml` file which contains the paths and data types of the graph structure, feature data, and training/validation/test sets. Please note that all paths should be relative to `metadata.yaml`.\n",
"\n",
"For a heterogeneous graph, we need to specify the node/edge type in the **type** fields. For edge types, the canonical etype is required: a string formed by joining the source node type, etype, and destination node type with `:`."
],
"metadata": {
"id": "wbk6-wxRK-6S"
......@@ -323,92 +460,193 @@
" dataset_name: heterogeneous_graph_nc_lp\n",
" graph:\n",
" nodes:\n",
" - num: {num_nodes}\n",
" - type: user\n",
" num: {num_nodes}\n",
" - type: item\n",
" num: {num_nodes}\n",
" edges:\n",
" - format: csv\n",
" path: {os.path.basename(edges_path)}\n",
" - type: \"user:like:item\"\n",
" format: csv\n",
" path: {os.path.basename(like_edges_path)}\n",
" - type: \"user:follow:user\"\n",
" format: csv\n",
" path: {os.path.basename(follow_edges_path)}\n",
" feature_data:\n",
" - domain: node\n",
" type: user\n",
" name: feat_0\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(node_user_feat_0_path)}\n",
" - domain: node\n",
" type: user\n",
" name: feat_1\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(node_user_feat_1_path)}\n",
" - domain: node\n",
" type: item\n",
" name: feat_0\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(node_feat_0_path)}\n",
" path: {os.path.basename(node_item_feat_0_path)}\n",
" - domain: node\n",
" type: item\n",
" name: feat_1\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(node_item_feat_1_path)}\n",
" - domain: edge\n",
" type: \"user:like:item\"\n",
" name: feat_0\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(edge_like_feat_0_path)}\n",
" - domain: edge\n",
" type: \"user:like:item\"\n",
" name: feat_1\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(node_feat_1_path)}\n",
" path: {os.path.basename(edge_like_feat_1_path)}\n",
" - domain: edge\n",
" type: \"user:follow:user\"\n",
" name: feat_0\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(edge_feat_0_path)}\n",
" path: {os.path.basename(edge_follow_feat_0_path)}\n",
" - domain: edge\n",
" type: \"user:follow:user\"\n",
" name: feat_1\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(edge_feat_1_path)}\n",
" path: {os.path.basename(edge_follow_feat_1_path)}\n",
" tasks:\n",
" - name: node_classification\n",
" num_classes: 10\n",
" train_set:\n",
" - data:\n",
" - name: seed_nodes\n",
" - type: user\n",
" name: seed_nodes\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_train_ids_path)}\n",
" - name: labels\n",
" path: {os.path.basename(nc_train_user_ids_path)}\n",
" - type: user\n",
" name: labels\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_train_labels_path)}\n",
" path: {os.path.basename(nc_train_user_labels_path)}\n",
" - type: item\n",
" name: seed_nodes\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_train_item_ids_path)}\n",
" - type: item\n",
" name: labels\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_train_item_labels_path)}\n",
" validation_set:\n",
" - data:\n",
" - name: seed_nodes\n",
" - type: user\n",
" name: seed_nodes\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_val_ids_path)}\n",
" - name: labels\n",
" path: {os.path.basename(nc_val_user_ids_path)}\n",
" - type: user\n",
" name: labels\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_val_labels_path)}\n",
" path: {os.path.basename(nc_val_user_labels_path)}\n",
" - type: item\n",
" name: seed_nodes\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_val_item_ids_path)}\n",
" - type: item\n",
" name: labels\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_val_item_labels_path)}\n",
" test_set:\n",
" - data:\n",
" - name: seed_nodes\n",
" - type: user\n",
" name: seed_nodes\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_test_ids_path)}\n",
" - name: labels\n",
" path: {os.path.basename(nc_test_user_ids_path)}\n",
" - type: user\n",
" name: labels\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_test_labels_path)}\n",
" path: {os.path.basename(nc_test_user_labels_path)}\n",
" - type: item\n",
" name: seed_nodes\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(nc_test_item_ids_path)}\n",
" - type: item\n",
" name: labels\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(nc_test_item_labels_path)}\n",
" - name: link_prediction\n",
" num_classes: 10\n",
" train_set:\n",
" - data:\n",
" - name: node_pairs\n",
" - type: \"user:like:item\"\n",
" name: node_pairs\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_train_like_node_pairs_path)}\n",
" - type: \"user:follow:user\"\n",
" name: node_pairs\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_train_node_pairs_path)}\n",
" path: {os.path.basename(lp_train_follow_node_pairs_path)}\n",
" validation_set:\n",
" - data:\n",
" - name: node_pairs\n",
" - type: \"user:like:item\"\n",
" name: node_pairs\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_val_node_pairs_path)}\n",
" - name: negative_dsts\n",
" path: {os.path.basename(lp_val_like_node_pairs_path)}\n",
" - type: \"user:like:item\"\n",
" name: negative_dsts\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(lp_val_neg_dsts_path)}\n",
" path: {os.path.basename(lp_val_like_neg_dsts_path)}\n",
" - type: \"user:follow:user\"\n",
" name: node_pairs\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_val_follow_node_pairs_path)}\n",
" - type: \"user:follow:user\"\n",
" name: negative_dsts\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(lp_val_follow_neg_dsts_path)}\n",
" test_set:\n",
" - data:\n",
" - name: node_pairs\n",
" - type: \"user:like:item\"\n",
" name: node_pairs\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_test_like_node_pairs_path)}\n",
" - type: \"user:like:item\"\n",
" name: negative_dsts\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(lp_test_like_neg_dsts_path)}\n",
" - type: \"user:follow:user\"\n",
" name: node_pairs\n",
" format: numpy\n",
" in_memory: true\n",
" path: {os.path.basename(lp_test_node_pairs_path)}\n",
" - name: negative_dsts\n",
" path: {os.path.basename(lp_test_follow_node_pairs_path)}\n",
" - type: \"user:follow:user\"\n",
" name: negative_dsts\n",
" format: torch\n",
" in_memory: true\n",
" path: {os.path.basename(lp_test_neg_dsts_path)}\n",
" path: {os.path.basename(lp_test_follow_neg_dsts_path)}\n",
"\"\"\"\n",
"metadata_path = os.path.join(base_dir, \"metadata.yaml\")\n",
"with open(metadata_path, \"w\") as f:\n",
......
......@@ -105,7 +105,7 @@
"For homogeneous graph, we just need to save edges(namely node pairs) into **CSV** file.\n",
"\n",
"Note:\n",
"when saving to file, do not save index and header.*italicized text*\n"
"when saving to file, do not save index and header.\n"
],
"metadata": {
"id": "qhNtIn_xhlnl"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment