"vscode:/vscode.git/clone" did not exist on "6a970a45c5382f7153d81b924e06b736581a6c3f"
Unverified Commit 432c71ef authored by kylasa's avatar kylasa Committed by GitHub
Browse files

Code changes to fix order sensitivity of the pipeline (#5288)



Following changes are made in this PR.
1. In dataset_utils.py, when reading edges from disk we now follow the order defined by the STR_EDGE_TYPE key in the metadata.json file. This order is implicitly used to assign edge IDs to edge types, and the same order is now used when reading edges from disk.
2. The unit test framework now randomizes the order in which edge types are read from disk, so the tests exercise order-independence of the pipeline.
Co-authored-by: default avatarQuan (Andy) Gan <coin2028@hotmail.com>
parent f5afc6ea
import os
import json
import logging
import numpy as np
import torch
import os
import dgl
import numpy as np
import torch
from distpartitioning import array_readwriter
from distpartitioning.array_readwriter.parquet import ParquetArrayParser
from files import setdir
......@@ -16,12 +16,16 @@ def _chunk_numpy_array(arr, fmt_meta, chunk_sizes, path_fmt, vector_rows=False):
for j, n in enumerate(chunk_sizes):
path = os.path.abspath(path_fmt % j)
arr_chunk = arr[offset: offset + n]
arr_chunk = arr[offset : offset + n]
shape = arr_chunk.shape
logging.info("Chunking %d-%d" % (offset, offset + n))
# If requested we write multi-column arrays as single-column vector Parquet files
array_parser = array_readwriter.get_array_parser(**fmt_meta)
if isinstance(array_parser, ParquetArrayParser) and len(shape) > 1 and shape[1] > 1:
if (
isinstance(array_parser, ParquetArrayParser)
and len(shape) > 1
and shape[1] > 1
):
array_parser.write(path, arr_chunk, vector_rows=vector_rows)
else:
array_parser.write(path, arr_chunk)
......@@ -83,8 +87,15 @@ def _initialize_num_chunks(g, num_chunks, kwargs=None):
def _chunk_graph(
g, name, ndata_paths, edata_paths, num_chunks, data_fmt, edges_format,
vector_rows=False, **kwargs
g,
name,
ndata_paths,
edata_paths,
num_chunks,
data_fmt,
edges_format,
vector_rows=False,
**kwargs,
):
# First deal with ndata and edata that are homogeneous
# (i.e. not a dict-of-dict)
......@@ -139,16 +150,24 @@ def _chunk_graph(
k: v for k, v in zip(g.canonical_etypes, num_edges_per_chunk)
}
idxes_etypestr = {
idx: (etype, etypestrs[etype])
for idx, etype in enumerate(g.canonical_etypes)
}
idxes = np.arange(len(idxes_etypestr))
# Split edge index
metadata["edges"] = {}
with setdir("edge_index"):
for etype in g.canonical_etypes:
etypestr = etypestrs[etype]
np.random.shuffle(idxes)
for idx in idxes:
etype = idxes_etypestr[idx][0]
etypestr = idxes_etypestr[idx][1]
logging.info("Chunking edge index for %s" % etypestr)
edges_meta = {}
if edges_format == 'csv':
if edges_format == "csv":
fmt_meta = {"name": edges_format, "delimiter": " "}
elif edges_format == 'parquet':
elif edges_format == "parquet":
fmt_meta = {"name": edges_format}
else:
raise RuntimeError(f"Invalid edges_fmt: {edges_format}")
......@@ -259,7 +278,7 @@ def chunk_graph(
num_chunks,
output_path,
data_fmt="numpy",
edges_fmt='csv',
edges_fmt="csv",
vector_rows=False,
**kwargs,
):
......@@ -302,14 +321,26 @@ def chunk_graph(
edata[key] = os.path.abspath(edata[key])
with setdir(output_path):
_chunk_graph(
g, name, ndata_paths, edata_paths, num_chunks, data_fmt, edges_fmt,
vector_rows, **kwargs
g,
name,
ndata_paths,
edata_paths,
num_chunks,
data_fmt,
edges_fmt,
vector_rows,
**kwargs,
)
def create_chunked_dataset(
root_dir, num_chunks, data_fmt="numpy", edges_fmt='csv',
vector_rows=False, **kwargs):
root_dir,
num_chunks,
data_fmt="numpy",
edges_fmt="csv",
vector_rows=False,
**kwargs,
):
"""
This function creates a sample dataset, based on MAG240 dataset.
......
......@@ -529,7 +529,8 @@ def get_dataset(input_dir, graph_name, rank, world_size, num_parts, schema_map):
]:
edge_datadict[col] = []
for etype_name, etype_info in edge_data.items():
for etype_name, etype_id in etype_name_idmap.items():
etype_info = edge_data[etype_name]
edge_info = etype_info[constants.STR_DATA]
# edgetype strings are in canonical format, src_node_type:edge_type:dst_node_type
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment