Unverified Commit 67d93458 authored by Rhett Ying, committed by GitHub

[GraphBolt] update OnDiskDataset tutorial (#6793)

parent 3cf82462
@@ -152,28 +152,28 @@
 "node_feat_0 = np.random.rand(num_nodes, 5)\n",
 "print(f\"Part of node feature [feat_0]: {node_feat_0[:10, :]}\")\n",
 "np.save(node_feat_0_path, node_feat_0)\n",
-"print(f\"Node feature [feat_0] is saved to {node_feat_0_path}\")\n",
+"print(f\"Node feature [feat_0] is saved to {node_feat_0_path}\\n\")\n",
 "\n",
 "# Generate another node feature in torch tensor\n",
 "node_feat_1_path = os.path.join(base_dir, \"node-feat-1.pt\")\n",
 "node_feat_1 = torch.rand(num_nodes, 5)\n",
 "print(f\"Part of node feature [feat_1]: {node_feat_1[:10, :]}\")\n",
 "torch.save(node_feat_1, node_feat_1_path)\n",
-"print(f\"Node feature [feat_1] is saved to {node_feat_1_path}\")\n",
+"print(f\"Node feature [feat_1] is saved to {node_feat_1_path}\\n\")\n",
 "\n",
 "# Generate edge feature in numpy array.\n",
 "edge_feat_0_path = os.path.join(base_dir, \"edge-feat-0.npy\")\n",
 "edge_feat_0 = np.random.rand(num_edges, 5)\n",
 "print(f\"Part of edge feature [feat_0]: {edge_feat_0[:10, :]}\")\n",
 "np.save(edge_feat_0_path, edge_feat_0)\n",
-"print(f\"Edge feature [feat_0] is saved to {edge_feat_0_path}\")\n",
+"print(f\"Edge feature [feat_0] is saved to {edge_feat_0_path}\\n\")\n",
 "\n",
 "# Generate another edge feature in torch tensor\n",
 "edge_feat_1_path = os.path.join(base_dir, \"edge-feat-1.pt\")\n",
 "edge_feat_1 = torch.rand(num_edges, 5)\n",
 "print(f\"Part of edge feature [feat_1]: {edge_feat_1[:10, :]}\")\n",
 "torch.save(edge_feat_1, edge_feat_1_path)\n",
-"print(f\"Edge feature [feat_1] is saved to {edge_feat_1_path}\")\n"
+"print(f\"Edge feature [feat_1] is saved to {edge_feat_1_path}\\n\")\n"
 ],
 "metadata": {
 "id": "_PVu1u5brBhF"
@@ -215,37 +215,37 @@
 "nc_train_ids = ids[:num_trains]\n",
 "print(f\"Part of train ids for node classification: {nc_train_ids[:10]}\")\n",
 "np.save(nc_train_ids_path, nc_train_ids)\n",
-"print(f\"NC train ids are saved to {nc_train_ids_path}\")\n",
+"print(f\"NC train ids are saved to {nc_train_ids_path}\\n\")\n",
 "\n",
 "nc_train_labels_path = os.path.join(base_dir, \"nc-train-labels.pt\")\n",
 "nc_train_labels = torch.randint(0, 10, (num_trains,))\n",
 "print(f\"Part of train labels for node classification: {nc_train_labels[:10]}\")\n",
 "torch.save(nc_train_labels, nc_train_labels_path)\n",
-"print(f\"NC train labels are saved to {nc_train_labels_path}\")\n",
+"print(f\"NC train labels are saved to {nc_train_labels_path}\\n\")\n",
 "\n",
 "nc_val_ids_path = os.path.join(base_dir, \"nc-val-ids.npy\")\n",
 "nc_val_ids = ids[num_trains:num_trains+num_vals]\n",
 "print(f\"Part of val ids for node classification: {nc_val_ids[:10]}\")\n",
 "np.save(nc_val_ids_path, nc_val_ids)\n",
-"print(f\"NC val ids are saved to {nc_val_ids_path}\")\n",
+"print(f\"NC val ids are saved to {nc_val_ids_path}\\n\")\n",
 "\n",
 "nc_val_labels_path = os.path.join(base_dir, \"nc-val-labels.pt\")\n",
 "nc_val_labels = torch.randint(0, 10, (num_vals,))\n",
 "print(f\"Part of val labels for node classification: {nc_val_labels[:10]}\")\n",
 "torch.save(nc_val_labels, nc_val_labels_path)\n",
-"print(f\"NC val labels are saved to {nc_val_labels_path}\")\n",
+"print(f\"NC val labels are saved to {nc_val_labels_path}\\n\")\n",
 "\n",
 "nc_test_ids_path = os.path.join(base_dir, \"nc-test-ids.npy\")\n",
 "nc_test_ids = ids[-num_tests:]\n",
 "print(f\"Part of test ids for node classification: {nc_test_ids[:10]}\")\n",
 "np.save(nc_test_ids_path, nc_test_ids)\n",
-"print(f\"NC test ids are saved to {nc_test_ids_path}\")\n",
+"print(f\"NC test ids are saved to {nc_test_ids_path}\\n\")\n",
 "\n",
 "nc_test_labels_path = os.path.join(base_dir, \"nc-test-labels.pt\")\n",
 "nc_test_labels = torch.randint(0, 10, (num_tests,))\n",
 "print(f\"Part of test labels for node classification: {nc_test_labels[:10]}\")\n",
 "torch.save(nc_test_labels, nc_test_labels_path)\n",
-"print(f\"NC test labels are saved to {nc_test_labels_path}\")"
+"print(f\"NC test labels are saved to {nc_test_labels_path}\\n\")"
 ],
 "metadata": {
 "id": "S5-fyBbHzTCO"
@@ -274,31 +274,31 @@
 "lp_train_node_pairs = edges[:num_trains, :]\n",
 "print(f\"Part of train node pairs for link prediction: {lp_train_node_pairs[:10]}\")\n",
 "np.save(lp_train_node_pairs_path, lp_train_node_pairs)\n",
-"print(f\"LP train node pairs are saved to {lp_train_node_pairs_path}\")\n",
+"print(f\"LP train node pairs are saved to {lp_train_node_pairs_path}\\n\")\n",
 "\n",
 "lp_val_node_pairs_path = os.path.join(base_dir, \"lp-val-node-pairs.npy\")\n",
 "lp_val_node_pairs = edges[num_trains:num_trains+num_vals, :]\n",
 "print(f\"Part of val node pairs for link prediction: {lp_val_node_pairs[:10]}\")\n",
 "np.save(lp_val_node_pairs_path, lp_val_node_pairs)\n",
-"print(f\"LP val node pairs are saved to {lp_val_node_pairs_path}\")\n",
+"print(f\"LP val node pairs are saved to {lp_val_node_pairs_path}\\n\")\n",
 "\n",
 "lp_val_neg_dsts_path = os.path.join(base_dir, \"lp-val-neg-dsts.pt\")\n",
 "lp_val_neg_dsts = torch.randint(0, num_nodes, (num_vals, 10))\n",
 "print(f\"Part of val negative dsts for link prediction: {lp_val_neg_dsts[:10]}\")\n",
 "torch.save(lp_val_neg_dsts, lp_val_neg_dsts_path)\n",
-"print(f\"LP val negative dsts are saved to {lp_val_neg_dsts_path}\")\n",
+"print(f\"LP val negative dsts are saved to {lp_val_neg_dsts_path}\\n\")\n",
 "\n",
 "lp_test_node_pairs_path = os.path.join(base_dir, \"lp-test-node-pairs.npy\")\n",
"lp_test_node_pairs = edges[-num_tests, :]\n", "lp_test_node_pairs = edges[-num_tests, :]\n",
"print(f\"Part of test node pairs for link prediction: {lp_test_node_pairs[:10]}\")\n", "print(f\"Part of test node pairs for link prediction: {lp_test_node_pairs[:10]}\")\n",
"np.save(lp_test_node_pairs_path, lp_test_node_pairs)\n", "np.save(lp_test_node_pairs_path, lp_test_node_pairs)\n",
"print(f\"LP test node pairs are saved to {lp_test_node_pairs_path}\")\n", "print(f\"LP test node pairs are saved to {lp_test_node_pairs_path}\\n\")\n",
"\n", "\n",
"lp_test_neg_dsts_path = os.path.join(base_dir, \"lp-test-neg-dsts.pt\")\n", "lp_test_neg_dsts_path = os.path.join(base_dir, \"lp-test-neg-dsts.pt\")\n",
"lp_test_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n", "lp_test_neg_dsts = torch.randint(0, num_nodes, (num_tests, 10))\n",
"print(f\"Part of test negative dsts for link prediction: {lp_test_neg_dsts[:10]}\")\n", "print(f\"Part of test negative dsts for link prediction: {lp_test_neg_dsts[:10]}\")\n",
"torch.save(lp_test_neg_dsts, lp_test_neg_dsts_path)\n", "torch.save(lp_test_neg_dsts, lp_test_neg_dsts_path)\n",
"print(f\"LP test negative dsts are saved to {lp_test_neg_dsts_path}\")" "print(f\"LP test negative dsts are saved to {lp_test_neg_dsts_path}\\n\")"
], ],
"metadata": { "metadata": {
"id": "u0jCnXIcAQy4" "id": "u0jCnXIcAQy4"
@@ -310,7 +310,14 @@
 "cell_type": "markdown",
 "source": [
 "## Organize Data into YAML File\n",
-"Now we need to create a `metadata.yaml` file which contains the paths, dadta types of graph structure, feature data, training/validation/test sets. Please note that all path should be relative to `metadata.yaml`."
+"Now we need to create a `metadata.yaml` file which contains the paths and data types of the graph structure, feature data, and training/validation/test sets.\n",
+"\n",
+"Notes:\n",
+"- All paths should be relative to `metadata.yaml`.\n",
+"- The following fields are optional and are not specified in the example below.\n",
+"  - `in_memory`: indicates whether to load the data into memory or `mmap` it. Default is `True`.\n",
+"\n",
+"Please refer to the [YAML specification](https://github.com/dmlc/dgl/blob/master/docs/source/stochastic_training/ondisk-dataset-specification.rst) for more details."
 ],
 "metadata": {
 "id": "wbk6-wxRK-6S"
@@ -331,22 +338,18 @@
 " - domain: node\n",
 " name: feat_0\n",
 " format: numpy\n",
-" in_memory: true\n",
 " path: {os.path.basename(node_feat_0_path)}\n",
 " - domain: node\n",
 " name: feat_1\n",
 " format: torch\n",
-" in_memory: true\n",
 " path: {os.path.basename(node_feat_1_path)}\n",
 " - domain: edge\n",
 " name: feat_0\n",
 " format: numpy\n",
-" in_memory: true\n",
 " path: {os.path.basename(edge_feat_0_path)}\n",
 " - domain: edge\n",
 " name: feat_1\n",
 " format: torch\n",
-" in_memory: true\n",
 " path: {os.path.basename(edge_feat_1_path)}\n",
 " tasks:\n",
 " - name: node_classification\n",
@@ -355,31 +358,25 @@
 " - data:\n",
 " - name: seed_nodes\n",
 " format: numpy\n",
-" in_memory: true\n",
 " path: {os.path.basename(nc_train_ids_path)}\n",
 " - name: labels\n",
 " format: torch\n",
-" in_memory: true\n",
 " path: {os.path.basename(nc_train_labels_path)}\n",
 " validation_set:\n",
 " - data:\n",
 " - name: seed_nodes\n",
 " format: numpy\n",
-" in_memory: true\n",
 " path: {os.path.basename(nc_val_ids_path)}\n",
 " - name: labels\n",
 " format: torch\n",
-" in_memory: true\n",
 " path: {os.path.basename(nc_val_labels_path)}\n",
 " test_set:\n",
 " - data:\n",
 " - name: seed_nodes\n",
 " format: numpy\n",
-" in_memory: true\n",
 " path: {os.path.basename(nc_test_ids_path)}\n",
 " - name: labels\n",
 " format: torch\n",
-" in_memory: true\n",
 " path: {os.path.basename(nc_test_labels_path)}\n",
 " - name: link_prediction\n",
 " num_classes: 10\n",
@@ -387,27 +384,22 @@
 " - data:\n",
 " - name: node_pairs\n",
 " format: numpy\n",
-" in_memory: true\n",
 " path: {os.path.basename(lp_train_node_pairs_path)}\n",
 " validation_set:\n",
 " - data:\n",
 " - name: node_pairs\n",
 " format: numpy\n",
-" in_memory: true\n",
 " path: {os.path.basename(lp_val_node_pairs_path)}\n",
 " - name: negative_dsts\n",
 " format: torch\n",
-" in_memory: true\n",
 " path: {os.path.basename(lp_val_neg_dsts_path)}\n",
 " test_set:\n",
 " - data:\n",
 " - name: node_pairs\n",
 " format: numpy\n",
-" in_memory: true\n",
 " path: {os.path.basename(lp_test_node_pairs_path)}\n",
 " - name: negative_dsts\n",
 " format: torch\n",
-" in_memory: true\n",
 " path: {os.path.basename(lp_test_neg_dsts_path)}\n",
 "\"\"\"\n",
 "metadata_path = os.path.join(base_dir, \"metadata.yaml\")\n",
@@ -439,16 +431,16 @@
 "source": [
 "dataset = gb.OnDiskDataset(base_dir).load()\n",
 "graph = dataset.graph\n",
-"print(f\"Loaded graph: {graph}\")\n",
+"print(f\"Loaded graph: {graph}\\n\")\n",
 "\n",
 "feature = dataset.feature\n",
-"print(f\"Loaded feature store: {feature}\")\n",
+"print(f\"Loaded feature store: {feature}\\n\")\n",
 "\n",
 "tasks = dataset.tasks\n",
 "nc_task = tasks[0]\n",
-"print(f\"Loaded node classification task: {nc_task}\")\n",
+"print(f\"Loaded node classification task: {nc_task}\\n\")\n",
 "lp_task = tasks[1]\n",
-"print(f\"Loaded link prediction task: {lp_task}\")"
+"print(f\"Loaded link prediction task: {lp_task}\\n\")"
 ],
 "metadata": {
 "id": "W58CZoSzOiyo"
...
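Once loaded, the returned objects can be consumed directly; a minimal sketch of reading a feature back and batching the node-classification training set. The `feature.read(...)` signature and `gb.ItemSampler` usage follow the GraphBolt API as documented, but treat the exact calls here as assumptions:

```python
import dgl.graphbolt as gb
import torch

base_dir = "./ondisk_dataset"  # placeholder; use the notebook's base_dir
dataset = gb.OnDiskDataset(base_dir).load()

# Read a node feature back from the feature store; the (domain, type,
# name, ids) signature is an assumption based on the FeatureStore API.
feat = dataset.feature.read("node", None, "feat_0", torch.tensor([0, 1, 2]))
print(feat.shape)  # expected: torch.Size([3, 5])

# Batch the node-classification training set with an ItemSampler.
nc_task = dataset.tasks[0]
for minibatch in gb.ItemSampler(nc_task.train_set, batch_size=16):
    print(minibatch)
    break
```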