Unverified Commit c8ea9fa4 authored by kylasa, committed by GitHub

[Dist] Flexible pipeline - Initial commit (#4733)

* Flexible pipeline - Initial commit

1. Implementation of flexible pipeline feature.
2. With this implementation, the pipeline now supports multiple partitions per process, and it assumes that num_partitions is always a multiple of num_processes.

* Update test_dist_part.py

* Code changes to address review comments

* Code refactoring of exchange_features function into two functions for better readability

* Updating test_dist_part to fix merge issues with the master branch

* Corrected variable names.

* Fixed code refactoring issues.

* Provide missing function arguments to the exchange_feature function

* Provide the missing function argument to fix the error.

* Provide missing function argument to 'get_shuffle_nids' function.

* Repositioned a variable within its scope.

* Removed a tab that was causing an indentation problem

* Fix an issue with the CI test framework that was the root cause of the CI test failures.

1. We now read files specific to each partition-id and store that data separately in the local process, keyed by the local_part_id.
2. Similarly, the node and edge feature type_ids are tracked under the same keys.
3. These two changes let us retrieve the appropriate feature data during the feature exchange and send it to the correct destination process (see the sketch after this message).

* Correct the parametrization for the CI unit test cases.

* Addressing Rui's code review comments.

* Addressing code review comments.
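For illustration, here is a minimal, self-contained sketch of the two ideas described above: cyclic ownership of partitions by processes (num_partitions must be a multiple of num_processes) and per-partition keying of feature data by local_part_id. The helpers owned_partitions and feature_key are hypothetical names that only mirror the behaviour described in this commit message; they are not part of the pipeline code.

import numpy as np

def owned_partitions(rank, world_size, num_parts):
    # Hypothetical helper: cyclic assignment of partition ids to ranks,
    # valid only when num_parts is a multiple of world_size.
    assert num_parts % world_size == 0, "num_parts must be a multiple of world_size"
    return [part_id for part_id in range(num_parts) if part_id % world_size == rank]

def feature_key(ntype_name, feat_name, part_id, world_size):
    # Hypothetical helper: per-partition key under which a feature tensor is
    # stored locally, e.g. "paper/feat/1" for the second local partition.
    local_part_id = part_id // world_size
    return f"{ntype_name}/{feat_name}/{local_part_id}"

# Example: 6 partitions across 3 processes; rank 1 owns partitions 1 and 4.
rank, world_size, num_parts = 1, 3, 6
node_features = {}
for part_id in owned_partitions(rank, world_size, num_parts):
    node_features[feature_key("paper", "feat", part_id, world_size)] = np.zeros((4, 8))
print(sorted(node_features))  # ['paper/feat/0', 'paper/feat/1']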
parent ee5f0967
@@ -136,11 +136,15 @@ def test_chunk_graph(num_chunks):
test_data(sub_dir, feat, data, g.num_edges(c_etype) // num_chunks) test_data(sub_dir, feat, data, g.num_edges(c_etype) // num_chunks)
def _test_pipeline(num_chunks, num_parts, graph_formats=None): def _test_pipeline(num_chunks, num_parts, world_size, graph_formats=None):
if num_chunks < num_parts: if num_chunks < num_parts:
# num_parts should less/equal than num_chunks # num_parts should less/equal than num_chunks
return return
if num_parts % world_size != 0:
# num_parts should be a multiple of world_size
return
with tempfile.TemporaryDirectory() as root_dir: with tempfile.TemporaryDirectory() as root_dir:
g = create_chunked_dataset(root_dir, num_chunks) g = create_chunked_dataset(root_dir, num_chunks)
@@ -161,12 +165,12 @@ def _test_pipeline(num_chunks, num_parts, graph_formats=None):
assert isinstance(int(header), int) assert isinstance(int(header), int)
# Step2: data dispatch # Step2: data dispatch
partition_dir = os.path.join(root_dir, "parted_data") partition_dir = os.path.join(root_dir, 'parted_data')
out_dir = os.path.join(root_dir, "partitioned") out_dir = os.path.join(root_dir, 'partitioned')
ip_config = os.path.join(root_dir, "ip_config.txt") ip_config = os.path.join(root_dir, 'ip_config.txt')
with open(ip_config, "w") as f: with open(ip_config, 'w') as f:
for i in range(num_parts): for i in range(world_size):
f.write(f"127.0.0.{i + 1}\n") f.write(f'127.0.0.{i + 1}\n')
cmd = "python3 tools/dispatch_data.py" cmd = "python3 tools/dispatch_data.py"
cmd += f" --in-dir {in_dir}" cmd += f" --in-dir {in_dir}"
@@ -209,15 +213,14 @@ def _test_pipeline(num_chunks, num_parts, graph_formats=None):
) )
@pytest.mark.parametrize("num_chunks", [1, 3, 4, 8]) @pytest.mark.parametrize("num_chunks, num_parts, world_size", [[8, 4, 2], [9, 6, 3], [11, 11, 1], [11, 4, 2], [5, 3, 1]])
@pytest.mark.parametrize("num_parts", [1, 3, 4, 8]) def test_pipeline_basics(num_chunks, num_parts, world_size):
def test_pipeline_basics(num_chunks, num_parts): _test_pipeline(num_chunks, num_parts, world_size)
_test_pipeline(num_chunks, num_parts)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"graph_formats", [None, "csc", "coo,csc", "coo,csc,csr"] "graph_formats", [None, "csc", "coo,csc", "coo,csc,csr"]
) )
def test_pipeline_formats(graph_formats): def test_pipeline_formats(graph_formats):
_test_pipeline(4, 4, graph_formats) _test_pipeline(4, 4, 4, graph_formats)
@@ -59,14 +59,12 @@ def submit_jobs(args) -> str:
with open(args.ip_config, "r") as f: with open(args.ip_config, "r") as f:
num_ips = len(f.readlines()) num_ips = len(f.readlines())
assert ( assert (
num_ips == num_parts num_parts % num_ips == 0
), f"The number of lines[{args.ip_config}] should be equal to num_parts[{num_parts}]." ), f"The num_parts[{args.num_parts}] should be a multiple of number of lines(ip addresses)[{args.ip_config}]."
argslist = "" argslist = ""
argslist += "--world-size {} ".format(num_parts) argslist += "--world-size {} ".format(num_ips)
argslist += "--partitions-dir {} ".format( argslist += "--partitions-dir {} ".format(os.path.abspath(args.partitions_dir))
os.path.abspath(args.partitions_dir)
)
argslist += "--input-dir {} ".format(os.path.abspath(args.in_dir)) argslist += "--input-dir {} ".format(os.path.abspath(args.in_dir))
argslist += "--graph-name {} ".format(graph_name) argslist += "--graph-name {} ".format(graph_name)
argslist += "--schema {} ".format(schema_path) argslist += "--schema {} ".format(schema_path)
......
@@ -110,8 +110,7 @@ def create_dgl_object(schema, part_id, node_data, edge_data, edgeid_offset,
#create auxiliary data structures from the schema object #create auxiliary data structures from the schema object
memory_snapshot("CreateDGLObj_Begin", part_id) memory_snapshot("CreateDGLObj_Begin", part_id)
_, global_nid_ranges = get_idranges(schema[constants.STR_NODE_TYPE], _, global_nid_ranges = get_idranges(schema[constants.STR_NODE_TYPE],
schema[constants.STR_NUM_NODES_PER_CHUNK]) schema[constants.STR_NUM_NODES_PER_CHUNK])
_, global_eid_ranges = get_idranges(schema[constants.STR_EDGE_TYPE], _, global_eid_ranges = get_idranges(schema[constants.STR_EDGE_TYPE],
schema[constants.STR_NUM_EDGES_PER_CHUNK]) schema[constants.STR_NUM_EDGES_PER_CHUNK])
......
@@ -7,10 +7,10 @@ import torch
from pyarrow import csv from pyarrow import csv
import constants import constants
from utils import get_idranges from utils import get_idranges, map_partid_rank
def get_dataset(input_dir, graph_name, rank, world_size, schema_map): def get_dataset(input_dir, graph_name, rank, world_size, num_parts, schema_map):
""" """
Function to read the multiple file formatted dataset. Function to read the multiple file formatted dataset.
@@ -24,6 +24,8 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
rank of the current process rank of the current process
world_size : int world_size : int
total number of process in the current execution total number of process in the current execution
num_parts : int
total number of output graph partitions
schema_map : dictionary schema_map : dictionary
this is the dictionary created by reading the graph metadata json file this is the dictionary created by reading the graph metadata json file
for the input graph dataset for the input graph dataset
@@ -105,34 +107,7 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
Data read from each of the node features file is a multi-dimensional tensor data and is read Data read from each of the node features file is a multi-dimensional tensor data and is read
in numpy format, which is also the storage format of node features on the permanent storage. in numpy format, which is also the storage format of node features on the permanent storage.
'''
#iterate over the "node_data" dictionary in the schema_map
#read the node features if exists
#also keep track of the type_nids for which the node_features are read.
dataset_features = schema_map[constants.STR_NODE_DATA]
if((dataset_features is not None) and (len(dataset_features) > 0)):
for ntype_name, ntype_feature_data in dataset_features.items():
#ntype_feature_data is a dictionary
#where key: feature_name, value: dictionary in which keys are "format", "data"
node_feature_tids[ntype_name] = []
for feat_name, feat_data in ntype_feature_data.items():
assert feat_data[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_NUMPY
num_chunks = len(feat_data[constants.STR_DATA])
read_list = np.array_split(np.arange(num_chunks), world_size)
nfeat = []
for idx in read_list[rank]:
nfeat_file = feat_data[constants.STR_DATA][idx]
if not os.path.isabs(nfeat_file):
nfeat_file = os.path.join(input_dir, nfeat_file)
logging.info(f'Loading node feature[{feat_name}] of ntype[{ntype_name}] from {nfeat_file}')
nfeat.append(np.load(nfeat_file))
nfeat = np.concatenate(nfeat)
node_features[ntype_name + '/' + feat_name] = torch.from_numpy(nfeat)
node_feature_tids[ntype_name].append([feat_name, -1, -1])
'''
"node_type" : ["ntype0-name", "ntype1-name", ....], #m node types "node_type" : ["ntype0-name", "ntype1-name", ....], #m node types
"num_nodes_per_chunk" : [ "num_nodes_per_chunk" : [
[a0, a1, ...a<p-1>], #p partitions [a0, a1, ...a<p-1>], #p partitions
@@ -154,25 +129,66 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
which are owned by that particular rank. And using the "num_nodes_per_chunk" information each which are owned by that particular rank. And using the "num_nodes_per_chunk" information each
process can easily compute any nodes per-type node_id and global node_id. process can easily compute any nodes per-type node_id and global node_id.
The node-ids are treated as int64's in order to support billions of nodes in the input graph. The node-ids are treated as int64's in order to support billions of nodes in the input graph.
''' '''
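To make the id bookkeeping described in the docstring above concrete, here is a simplified, self-contained sketch of how per-type id ranges and global-id offsets can be derived from "num_nodes_per_chunk". It is only an approximation of the idea behind get_idranges, not its actual implementation; simple_idranges is a hypothetical name.

import numpy as np

def simple_idranges(ntype_names, num_nodes_per_chunk, num_parts):
    # Returns, per ntype: the (start, end) type-nid range of each partition,
    # plus the global-nid offset at which that ntype begins.
    type_nid_ranges, gnid_offsets = {}, {}
    global_offset = 0
    for ntype, chunk_counts in zip(ntype_names, num_nodes_per_chunk):
        # Group the per-chunk counts into num_parts contiguous groups.
        groups = np.array_split(np.array(chunk_counts, dtype=np.int64), num_parts)
        ends = np.cumsum([g.sum() for g in groups])
        starts = np.concatenate(([0], ends[:-1]))
        type_nid_ranges[ntype] = list(zip(starts.tolist(), ends.tolist()))
        gnid_offsets[ntype] = global_offset
        global_offset += int(ends[-1])
    return type_nid_ranges, gnid_offsets

# Example: 2 ntypes, 4 chunks each, grouped into 2 partitions.
ranges, offsets = simple_idranges(
    ["user", "item"], [[10, 10, 10, 10], [5, 5, 5, 5]], num_parts=2)
print(ranges["user"])  # [(0, 20), (20, 40)]
print(offsets)         # {'user': 0, 'item': 40}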
#read my nodes for each node type #read my nodes for each node type
node_tids, ntype_gnid_offset = get_idranges(schema_map[constants.STR_NODE_TYPE], node_tids, ntype_gnid_offset = get_idranges(schema_map[constants.STR_NODE_TYPE],
schema_map[constants.STR_NUM_NODES_PER_CHUNK], schema_map[constants.STR_NUM_NODES_PER_CHUNK],
num_chunks=world_size) num_chunks=num_parts)
for ntype_name in schema_map[constants.STR_NODE_TYPE]:
if ntype_name in node_feature_tids: #iterate over the "node_data" dictionary in the schema_map
for item in node_feature_tids[ntype_name]: #read the node features if exists
item[1] = node_tids[ntype_name][rank][0] #also keep track of the type_nids for which the node_features are read.
item[2] = node_tids[ntype_name][rank][1] dataset_features = schema_map[constants.STR_NODE_DATA]
if((dataset_features is not None) and (len(dataset_features) > 0)):
for ntype_name, ntype_feature_data in dataset_features.items():
for feat_name, feat_data in ntype_feature_data.items():
assert feat_data[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_NUMPY
# It is guaranteed that num_chunks is always greater
# than or equal to num_partitions.
num_chunks = len(feat_data[constants.STR_DATA])
read_list = np.array_split(np.arange(num_chunks), num_parts)
for local_part_id in range(num_parts):
if map_partid_rank(local_part_id, world_size) == rank:
nfeat = []
nfeat_tids = []
for idx in read_list[local_part_id]:
nfeat_file = feat_data[constants.STR_DATA][idx]
if not os.path.isabs(nfeat_file):
nfeat_file = os.path.join(input_dir, nfeat_file)
logging.info(f'Loading node feature[{feat_name}] of ntype[{ntype_name}] from {nfeat_file}')
nfeat.append(np.load(nfeat_file))
nfeat = np.concatenate(nfeat) if len(nfeat) != 0 else np.array([])
node_features[ntype_name+"/"+feat_name+"/"+str(local_part_id//world_size)] = torch.from_numpy(nfeat)
nfeat_tids.append(node_tids[ntype_name][local_part_id])
node_feature_tids[ntype_name+"/"+feat_name+"/"+str(local_part_id//world_size)] = nfeat_tids
#done building node_features locally. #done building node_features locally.
if len(node_features) <= 0: if len(node_features) <= 0:
logging.info(f'[Rank: {rank}] This dataset does not have any node features') logging.info(f'[Rank: {rank}] This dataset does not have any node features')
else: else:
for k, v in node_features.items(): assert len(node_features) == len(node_feature_tids)
logging.info(f'[Rank: {rank}] node feature name: {k}, feature data shape: {v.size()}')
# Note that the keys in the node_features dictionary are as follows:
# `ntype_name/feat_name/local_part_id`.
# where ntype_name and feat_name are self-explanatory, and
# local_part_id identifies the partition within the current
# process and takes the values 0, 1, 2, ....
for feat_name, feat_info in node_features.items():
logging.info(f'[Rank: {rank}] node feature name: {feat_name}, feature data shape: {feat_info.size()}')
tokens = feat_name.split("/")
assert len(tokens) == 3
# Get the range of type ids mapped to the current node feature.
tids = node_feature_tids[feat_name]
# The length of that type-id range gives the expected number of rows
# for this feature tensor.
count = tids[0][1] - tids[0][0]
assert count == feat_info.size()[0]
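The loop above validates that each per-partition feature tensor has exactly as many rows as its type-id range. A tiny sketch of that check with made-up tensors and tid ranges (the keys follow the "<ntype_name>/<feat_name>/<local_part_id>" convention noted above):

import torch

# Hypothetical per-partition feature tensors, keyed "<ntype>/<feat>/<local_part_id>".
node_features = {
    "user/feat/0": torch.zeros(20, 8),
    "user/feat/1": torch.zeros(15, 8),
}
# Matching type-nid ranges, one (start, end) pair per key.
node_feature_tids = {
    "user/feat/0": [(0, 20)],
    "user/feat/1": [(20, 35)],
}

for key, feat in node_features.items():
    ntype, feat_name, local_part_id = key.split("/")
    start, end = node_feature_tids[key][0]
    assert end - start == feat.size(0), f"row count mismatch for {key}"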
''' '''
Reading edge features now. Reading edge features now.
@@ -214,50 +230,48 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
edge_features = {} edge_features = {}
edge_feature_tids = {} edge_feature_tids = {}
# Read edges for each edge type that are processed by the current process.
edge_tids, _ = get_idranges(schema_map[constants.STR_EDGE_TYPE],
schema_map[constants.STR_NUM_EDGES_PER_CHUNK], num_parts)
# Iterate over the "edge_data" dictionary in the schema_map. # Iterate over the "edge_data" dictionary in the schema_map.
# Read the edge features if exists. # Read the edge features if exists.
# Also keep track of the type_eids for which the edge_features are read. # Also keep track of the type_eids for which the edge_features are read.
dataset_features = schema_map[constants.STR_EDGE_DATA] dataset_features = schema_map[constants.STR_EDGE_DATA]
if dataset_features and (len(dataset_features) > 0): if dataset_features and (len(dataset_features) > 0):
for etype_name, etype_feature_data in dataset_features.items(): for etype_name, etype_feature_data in dataset_features.items():
#etype_feature_data is a dictionary
#where key: feature_name, value: dictionary in which keys are "format", "data"
edge_feature_tids[etype_name] = []
for feat_name, feat_data in etype_feature_data.items(): for feat_name, feat_data in etype_feature_data.items():
assert feat_data[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_NUMPY assert feat_data[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_NUMPY
num_chunks = len(feat_data[constants.STR_DATA]) num_chunks = len(feat_data[constants.STR_DATA])
read_list = np.array_split(np.arange(num_chunks), world_size) read_list = np.array_split(np.arange(num_chunks), num_parts)
efeat = [] for local_part_id in range(num_parts):
for idx in read_list[rank]: if map_partid_rank(local_part_id, world_size) == rank:
efeat_file = feat_data[constants.STR_DATA][idx] efeats = []
if not os.path.isabs(efeat_file): efeat_tids = []
efeat_file = os.path.join(input_dir, efeat_file) for idx in read_list[local_part_id]:
logging.info( feature_fname = feat_data[constants.STR_DATA][idx]
f'Loading edge feature[{feat_name}] of etype[{etype_name}] from {efeat_file}' if (os.path.isabs(feature_fname)):
) logging.info(f'Loading numpy from {feature_fname}')
efeat.append(np.load(efeat_file)) efeats.append(torch.from_numpy(np.load(feature_fname)))
efeat = np.concatenate(efeat) else:
edge_features[etype_name + '/' + feat_name] = torch.from_numpy(efeat) numpy_path = os.path.join(input_dir, feature_fname)
logging.info(f'Loading numpy from {numpy_path}')
edge_feature_tids[etype_name].append([feat_name, -1, -1]) efeats.append(torch.from_numpy(np.load(numpy_path)))
efeat_tids.append(edge_tids[etype_name][local_part_id])
# Read edges for each node types that are processed by the currnet process. edge_features[etype_name+'/'+feat_name+"/"+str(local_part_id//world_size)] = torch.from_numpy(np.concatenate(efeats))
edge_tids, _ = get_idranges(schema_map[constants.STR_EDGE_TYPE], edge_feature_tids[etype_name+"/"+feat_name+"/"+str(local_part_id//world_size)] = efeat_tids
schema_map[constants.STR_NUM_EDGES_PER_CHUNK],
num_chunks=world_size)
for etype_name in schema_map[constants.STR_EDGE_TYPE]:
if etype_name in edge_feature_tids:
for item in edge_feature_tids[etype_name]:
item[1] = edge_tids[etype_name][rank][0]
item[2] = edge_tids[etype_name][rank][1]
# Done with building node_features locally. # Done with building node_features locally.
if len(edge_features) <= 0: if len(edge_features) <= 0:
logging.info(f'[Rank: {rank}] This dataset does not have any edge features') logging.info(f'[Rank: {rank}] This dataset does not have any edge features')
else: else:
for k, v in edge_features.items(): assert len(edge_features) == len(edge_feature_tids)
logging.info(f'[Rank: {rank}] edge feature name: {k}, feature data shape: {v.size()}')
for k, v in edge_features.items():
logging.info(f'[Rank: {rank}] edge feature name: {k}, feature data shape: {v.shape}')
tids = edge_feature_tids[k]
count = tids[0][1] - tids[0][0]
assert count == v.size()[0]
''' '''
Code below is used to read edges from the input dataset with the help of the metadata json file Code below is used to read edges from the input dataset with the help of the metadata json file
@@ -306,7 +320,7 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
etype_name_idmap = {e : idx for idx, e in enumerate(etype_names)} etype_name_idmap = {e : idx for idx, e in enumerate(etype_names)}
edge_tids, _ = get_idranges(schema_map[constants.STR_EDGE_TYPE], edge_tids, _ = get_idranges(schema_map[constants.STR_EDGE_TYPE],
schema_map[constants.STR_NUM_EDGES_PER_CHUNK], schema_map[constants.STR_NUM_EDGES_PER_CHUNK],
num_chunks=world_size) num_chunks=num_parts)
edge_datadict = {} edge_datadict = {}
edge_data = schema_map[constants.STR_EDGES] edge_data = schema_map[constants.STR_EDGES]
@@ -329,10 +343,16 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
dst_ntype_name = tokens[2] dst_ntype_name = tokens[2]
num_chunks = len(edge_info) num_chunks = len(edge_info)
read_list = np.array_split(np.arange(num_chunks), world_size) read_list = np.array_split(np.arange(num_chunks), num_parts)
src_ids = [] src_ids = []
dst_ids = [] dst_ids = []
for idx in read_list[rank]:
curr_partids = []
for part_id in range(num_parts):
if map_partid_rank(part_id, world_size) == rank:
curr_partids.append(read_list[part_id])
for idx in np.concatenate(curr_partids):
edge_file = edge_info[idx] edge_file = edge_info[idx]
if not os.path.isabs(edge_file): if not os.path.isabs(edge_file):
edge_file = os.path.join(input_dir, edge_file) edge_file = os.path.join(input_dir, edge_file)
@@ -355,10 +375,13 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
#currently these are just type_edge_ids... which will be converted to global ids #currently these are just type_edge_ids... which will be converted to global ids
edge_datadict[constants.GLOBAL_SRC_ID].append(src_ids + ntype_gnid_offset[src_ntype_name][0, 0]) edge_datadict[constants.GLOBAL_SRC_ID].append(src_ids + ntype_gnid_offset[src_ntype_name][0, 0])
edge_datadict[constants.GLOBAL_DST_ID].append(dst_ids + ntype_gnid_offset[dst_ntype_name][0, 0]) edge_datadict[constants.GLOBAL_DST_ID].append(dst_ids + ntype_gnid_offset[dst_ntype_name][0, 0])
edge_datadict[constants.GLOBAL_TYPE_EID].append(np.arange(edge_tids[etype_name][rank][0],\
edge_tids[etype_name][rank][1] ,dtype=np.int64))
edge_datadict[constants.ETYPE_ID].append(etype_name_idmap[etype_name] * \ edge_datadict[constants.ETYPE_ID].append(etype_name_idmap[etype_name] * \
np.ones(shape=(src_ids.shape), dtype=np.int64)) np.ones(shape=(src_ids.shape), dtype=np.int64))
for local_part_id in range(num_parts):
if (map_partid_rank(local_part_id, world_size) == rank):
edge_datadict[constants.GLOBAL_TYPE_EID].append(np.arange(edge_tids[etype_name][local_part_id][0],\
edge_tids[etype_name][local_part_id][1] ,dtype=np.int64))
#stitch together to create the final data on the local machine #stitch together to create the final data on the local machine
for col in [constants.GLOBAL_SRC_ID, constants.GLOBAL_DST_ID, constants.GLOBAL_TYPE_EID, constants.ETYPE_ID]: for col in [constants.GLOBAL_SRC_ID, constants.GLOBAL_DST_ID, constants.GLOBAL_TYPE_EID, constants.ETYPE_ID]:
@@ -368,6 +391,7 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
assert edge_datadict[constants.GLOBAL_DST_ID].shape == edge_datadict[constants.GLOBAL_TYPE_EID].shape assert edge_datadict[constants.GLOBAL_DST_ID].shape == edge_datadict[constants.GLOBAL_TYPE_EID].shape
assert edge_datadict[constants.GLOBAL_TYPE_EID].shape == edge_datadict[constants.ETYPE_ID].shape assert edge_datadict[constants.GLOBAL_TYPE_EID].shape == edge_datadict[constants.ETYPE_ID].shape
logging.info(f'[Rank: {rank}] Done reading edge_file: {len(edge_datadict)}, {edge_datadict[constants.GLOBAL_SRC_ID].shape}') logging.info(f'[Rank: {rank}] Done reading edge_file: {len(edge_datadict)}, {edge_datadict[constants.GLOBAL_SRC_ID].shape}')
logging.info(f'Rank: {rank} edge_feat_tids: {edge_feature_tids}')
return node_tids, node_features, node_feature_tids, edge_datadict, edge_tids, edge_features, edge_feature_tids return node_tids, node_features, node_feature_tids, edge_datadict, edge_tids, edge_features, edge_feature_tids
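For clarity, a small standalone sketch of the id conversion used while assembling edge_datadict above: per-type source/destination node ids become global node ids by adding the owning ntype's global-id offset. The offsets and ids below are invented for illustration; in the pipeline the offsets come from get_idranges.

import numpy as np

# Hypothetical per-ntype global-id offsets (first global nid of each ntype).
ntype_gnid_offset = {"user": 0, "item": 1000}

# Per-type source/destination ids of a handful of edges, as read from file.
src_type_nids = np.array([3, 7, 9], dtype=np.int64)   # ntype "user"
dst_type_nids = np.array([0, 2, 5], dtype=np.int64)   # ntype "item"

# Convert per-type node ids to global node ids by adding the ntype offset,
# mirroring how GLOBAL_SRC_ID / GLOBAL_DST_ID are built above.
global_src = src_type_nids + ntype_gnid_offset["user"]
global_dst = dst_type_nids + ntype_gnid_offset["item"]
print(global_src, global_dst)  # [3 7 9] [1000 1002 1005]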
@@ -7,6 +7,7 @@ import copy
from pyarrow import csv from pyarrow import csv
from gloo_wrapper import alltoallv_cpu from gloo_wrapper import alltoallv_cpu
from utils import map_partid_rank
class DistLookupService: class DistLookupService:
@@ -100,7 +101,7 @@ class DistLookupService:
self.ntype_count = np.array(ntype_count, dtype=np.int64) self.ntype_count = np.array(ntype_count, dtype=np.int64)
self.rank = rank self.rank = rank
self.world_size = world_size self.world_size = world_size
def get_partition_ids(self, global_nids): def get_partition_ids(self, global_nids):
''' '''
@@ -237,7 +238,7 @@ class DistLookupService:
# Now the owner_ids (partition-ids) which corresponding to the global_nids. # Now the owner_ids (partition-ids) which corresponding to the global_nids.
return owner_ids return owner_ids
def get_shuffle_nids(self, global_nids, my_global_nids, my_shuffle_global_nids): def get_shuffle_nids(self, global_nids, my_global_nids, my_shuffle_global_nids, world_size):
''' '''
This function is used to retrieve shuffle_global_nids for a given set of incoming This function is used to retrieve shuffle_global_nids for a given set of incoming
global_nids. Note that global_nids are of random order and will contain duplicates global_nids. Note that global_nids are of random order and will contain duplicates
@@ -267,6 +268,8 @@ class DistLookupService:
This process has the node <-> partition id mapping This process has the node <-> partition id mapping
my_shuffle_global_nids : numpy ndarray my_shuffle_global_nids : numpy ndarray
array of shuffle_global_nids which are assigned by the current process/rank array of shuffle_global_nids which are assigned by the current process/rank
world_size : int
total no. of processes in the MPI_WORLD
Returns: Returns:
-------- --------
@@ -278,6 +281,21 @@ class DistLookupService:
# Get the owner_ids (partition-ids or rank). # Get the owner_ids (partition-ids or rank).
owner_ids = self.get_partition_ids(global_nids) owner_ids = self.get_partition_ids(global_nids)
# These owner_ids, which are also partition ids of the nodes in the
# input graph, are in the range 0 - (num_partitions - 1).
# These ids are generated using some kind of graph partitioning method.
# The distributed lookup service, as used by the graph partitioning
# pipeline, stores ntype-ids (also type_nids) and their mapping to the
# associated partition-id.
# These ids are split into `num_process` chunks and the processes in the
# dist. lookup service are assigned ownership of these chunks.
# The pipeline also enforces the following constraint between its input
# parameters num_partitions and num_processes: num_partitions is an
# integer multiple of num_processes, which means each individual node in
# the cluster runs an equal number of processes.
owner_ids = map_partid_rank(owner_ids, world_size)
# Ask these owners to supply for the shuffle_global_nids. # Ask these owners to supply for the shuffle_global_nids.
send_list = [] send_list = []
id_list = [] id_list = []
......
@@ -59,7 +59,7 @@ def get_shuffle_global_nids(rank, world_size, global_nids_ranks, node_data):
ret_val = np.column_stack([global_nids, shuffle_global_nids]) ret_val = np.column_stack([global_nids, shuffle_global_nids])
return ret_val return ret_val
def lookup_shuffle_global_nids_edges(rank, world_size, edge_data, id_lookup, node_data): def lookup_shuffle_global_nids_edges(rank, world_size, num_parts, edge_data, id_lookup, node_data):
''' '''
This function is a helper function used to lookup shuffle-global-nids for a given set of This function is a helper function used to lookup shuffle-global-nids for a given set of
global-nids using a distributed lookup service. global-nids using a distributed lookup service.
@@ -70,6 +70,8 @@ def lookup_shuffle_global_nids_edges(rank, world_size, edge_data, id_lookup, nod
rank of the process rank of the process
world_size : integer world_size : integer
total number of processes used in the process group total number of processes used in the process group
num_parts : integer
total number of output graph partitions
edge_data : dictionary edge_data : dictionary
edge_data is a dicitonary with keys as column names and values as numpy arrays representing edge_data is a dicitonary with keys as column names and values as numpy arrays representing
all the edges present in the current graph partition all the edges present in the current graph partition
@@ -93,40 +95,49 @@ def lookup_shuffle_global_nids_edges(rank, world_size, edge_data, id_lookup, nod
MILLION = 1000 * 1000 MILLION = 1000 * 1000
BATCH_SIZE = 250 * MILLION BATCH_SIZE = 250 * MILLION
memory_snapshot("GlobalToShuffleIDMapBegin: ", rank) memory_snapshot("GlobalToShuffleIDMapBegin: ", rank)
node_list = edge_data[constants.GLOBAL_SRC_ID]
local_nids = []
# Determine the no. of times each process has to send alltoall messages. local_shuffle_nids = []
all_sizes = allgather_sizes([node_list.shape[0]], world_size, return_sizes=True) for local_part_id in range(num_parts//world_size):
max_count = np.amax(all_sizes) local_nids.append(node_data[constants.GLOBAL_NID+"/"+str(local_part_id)])
num_splits = max_count // BATCH_SIZE + 1 local_shuffle_nids.append(node_data[constants.SHUFFLE_GLOBAL_NID+"/"+str(local_part_id)])
# Split the message into batches and send. local_nids = np.concatenate(local_nids)
splits = np.array_split(node_list, num_splits) local_shuffle_nids = np.concatenate(local_shuffle_nids)
shuffle_mappings = []
for item in splits: for local_part_id in range(num_parts//world_size):
shuffle_ids = id_lookup.get_shuffle_nids(item, node_list = edge_data[constants.GLOBAL_SRC_ID+"/"+str(local_part_id)]
node_data[constants.GLOBAL_NID],
node_data[constants.SHUFFLE_GLOBAL_NID]) # Determine the no. of times each process has to send alltoall messages.
shuffle_mappings.append(shuffle_ids) all_sizes = allgather_sizes([node_list.shape[0]], world_size, num_parts, return_sizes=True)
max_count = np.amax(all_sizes)
shuffle_ids = np.concatenate(shuffle_mappings) num_splits = max_count // BATCH_SIZE + 1
assert shuffle_ids.shape[0] == node_list.shape[0]
edge_data[constants.SHUFFLE_GLOBAL_SRC_ID] = shuffle_ids # Split the message into batches and send.
splits = np.array_split(node_list, num_splits)
# Destination end points of edges are owned by the current node and therefore shuffle_mappings = []
# should have corresponding SHUFFLE_GLOBAL_NODE_IDs. for item in splits:
# Here retrieve SHUFFLE_GLOBAL_NODE_IDs for the destination end points of local edges. shuffle_ids = id_lookup.get_shuffle_nids(item, local_nids, local_shuffle_nids, world_size)
uniq_ids, inverse_idx = np.unique(edge_data[constants.GLOBAL_DST_ID], return_inverse=True) shuffle_mappings.append(shuffle_ids)
common, idx1, idx2 = np.intersect1d(uniq_ids, node_data[constants.GLOBAL_NID], assume_unique=True, return_indices=True)
assert len(common) == len(uniq_ids) shuffle_ids = np.concatenate(shuffle_mappings)
assert shuffle_ids.shape[0] == node_list.shape[0]
edge_data[constants.SHUFFLE_GLOBAL_DST_ID] = node_data[constants.SHUFFLE_GLOBAL_NID][idx2][inverse_idx] edge_data[constants.SHUFFLE_GLOBAL_SRC_ID+"/"+str(local_part_id)] = shuffle_ids
assert len(edge_data[constants.SHUFFLE_GLOBAL_DST_ID]) == len(edge_data[constants.GLOBAL_DST_ID])
# Destination end points of edges are owned by the current node and therefore
# should have corresponding SHUFFLE_GLOBAL_NODE_IDs.
# Here retrieve SHUFFLE_GLOBAL_NODE_IDs for the destination end points of local edges.
uniq_ids, inverse_idx = np.unique(edge_data[constants.GLOBAL_DST_ID+"/"+str(local_part_id)], return_inverse=True)
common, idx1, idx2 = np.intersect1d(uniq_ids, node_data[constants.GLOBAL_NID+"/"+str(local_part_id)], assume_unique=True, return_indices=True)
assert len(common) == len(uniq_ids)
edge_data[constants.SHUFFLE_GLOBAL_DST_ID+"/"+str(local_part_id)] = node_data[constants.SHUFFLE_GLOBAL_NID+"/"+str(local_part_id)][idx2][inverse_idx]
assert len(edge_data[constants.SHUFFLE_GLOBAL_DST_ID+"/"+str(local_part_id)]) == len(edge_data[constants.GLOBAL_DST_ID+"/"+str(local_part_id)])
memory_snapshot("GlobalToShuffleIDMap_AfterLookupServiceCalls: ", rank) memory_snapshot("GlobalToShuffleIDMap_AfterLookupServiceCalls: ", rank)
return edge_data return edge_data
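The function above resolves shuffle ids in batches so that a single all-to-all never exceeds BATCH_SIZE entries. Below is a minimal single-process sketch of that batching pattern, with a plain dictionary standing in for the distributed lookup service (id_lookup.get_shuffle_nids); the numbers are made up.

import numpy as np

BATCH_SIZE = 4  # tiny value for illustration; the pipeline uses 250 million

def lookup_in_batches(global_nids, mapping):
    # `mapping` stands in for id_lookup.get_shuffle_nids(); it maps
    # global nids to shuffle-global nids.
    num_splits = global_nids.shape[0] // BATCH_SIZE + 1
    results = []
    for batch in np.array_split(global_nids, num_splits):
        results.append(np.array([mapping[int(nid)] for nid in batch], dtype=np.int64))
    return np.concatenate(results)

global_nids = np.arange(10, dtype=np.int64)
mapping = {int(n): int(n) + 100 for n in global_nids}   # fake shuffle ids
shuffle_ids = lookup_in_batches(global_nids, mapping)
assert shuffle_ids.shape[0] == global_nids.shape[0]
print(shuffle_ids[:3])  # [100 101 102]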
def assign_shuffle_global_nids_nodes(rank, world_size, node_data): def assign_shuffle_global_nids_nodes(rank, world_size, num_parts, node_data):
""" """
Utility function to assign shuffle global ids to nodes at a given rank Utility function to assign shuffle global ids to nodes at a given rank
node_data gets converted from [ntype, global_type_nid, global_nid] node_data gets converted from [ntype, global_type_nid, global_nid]
@@ -144,25 +155,27 @@ def assign_shuffle_global_nids_nodes(rank, world_size, node_data):
rank of the process rank of the process
world_size : integer world_size : integer
total number of processes used in the process group total number of processes used in the process group
ntype_counts: list of tuples num_parts : integer
list of tuples (x,y), where x=ntype and y=no. of nodes whose shuffle_global_nids are needed total number of output graph partitions
node_data : dictionary node_data : dictionary
node_data is a dictionary with keys as column names and values as numpy arrays node_data is a dictionary with keys as column names and values as numpy arrays
""" """
# Compute prefix sum to determine node-id offsets # Compute prefix sum to determine node-id offsets
prefix_sum_nodes = allgather_sizes([node_data[constants.GLOBAL_NID].shape[0]], world_size) local_row_counts = []
for local_part_id in range(num_parts//world_size):
local_row_counts.append(node_data[constants.GLOBAL_NID+"/"+str(local_part_id)].shape[0])
# assigning node-ids from localNodeStartId to (localNodeEndId - 1) # Perform allgather to compute the local offsets.
# Assuming here that the nodeDataArr is sorted based on the nodeType. prefix_sum_nodes = allgather_sizes(local_row_counts, world_size, num_parts)
shuffle_global_nid_start = prefix_sum_nodes[rank]
shuffle_global_nid_end = prefix_sum_nodes[rank + 1]
# add a column with global-ids (after data shuffle) for local_part_id in range(num_parts//world_size):
shuffle_global_nids = np.arange(shuffle_global_nid_start, shuffle_global_nid_end, dtype=np.int64) shuffle_global_nid_start = prefix_sum_nodes[rank + (local_part_id*world_size)]
node_data[constants.SHUFFLE_GLOBAL_NID] = shuffle_global_nids shuffle_global_nid_end = prefix_sum_nodes[rank + 1 + (local_part_id*world_size)]
shuffle_global_nids = np.arange(shuffle_global_nid_start, shuffle_global_nid_end, dtype=np.int64)
node_data[constants.SHUFFLE_GLOBAL_NID+"/"+str(local_part_id)] = shuffle_global_nids
def assign_shuffle_global_nids_edges(rank, world_size, edge_data): def assign_shuffle_global_nids_edges(rank, world_size, num_parts, edge_data):
""" """
Utility function to assign shuffle_global_eids to edges Utility function to assign shuffle_global_eids to edges
edge_data gets converted from [global_src_nid, global_dst_nid, global_type_eid, etype] edge_data gets converted from [global_src_nid, global_dst_nid, global_type_eid, etype]
@@ -174,8 +187,8 @@ def assign_shuffle_global_nids_edges(rank, world_size, edge_data):
rank of the current process rank of the current process
world_size : integer world_size : integer
total count of processes in execution total count of processes in execution
etype_counts : list of tuples num_parts : integer
list of tuples (x,y), x = rank, y = no. of edges total number of output graph partitions
edge_data : numpy ndarray edge_data : numpy ndarray
edge data as read from xxx_edges.txt file edge data as read from xxx_edges.txt file
@@ -187,12 +200,17 @@ def assign_shuffle_global_nids_edges(rank, world_size, edge_data):
""" """
#get prefix sum of edge counts per rank to locate the starting point #get prefix sum of edge counts per rank to locate the starting point
#from which global-ids to edges are assigned in the current rank #from which global-ids to edges are assigned in the current rank
prefix_sum_edges = allgather_sizes([edge_data[constants.GLOBAL_SRC_ID].shape[0]], world_size) local_row_counts = []
shuffle_global_eid_start = prefix_sum_edges[rank] for local_part_id in range(num_parts//world_size):
shuffle_global_eid_end = prefix_sum_edges[rank + 1] local_row_counts.append(edge_data[constants.GLOBAL_SRC_ID+"/"+str(local_part_id)].shape[0])
# assigning edge-ids from localEdgeStart to (localEdgeEndId - 1) shuffle_global_eid_offset = []
# Assuming here that the edge_data is sorted by edge_type prefix_sum_edges = allgather_sizes(local_row_counts, world_size, num_parts)
shuffle_global_eids = np.arange(shuffle_global_eid_start, shuffle_global_eid_end, dtype=np.int64) for local_part_id in range(num_parts//world_size):
edge_data[constants.SHUFFLE_GLOBAL_EID] = shuffle_global_eids shuffle_global_eid_start = prefix_sum_edges[rank + (local_part_id*world_size)]
return shuffle_global_eid_start shuffle_global_eid_end = prefix_sum_edges[rank + 1 + (local_part_id*world_size)]
shuffle_global_eids = np.arange(shuffle_global_eid_start, shuffle_global_eid_end, dtype=np.int64)
edge_data[constants.SHUFFLE_GLOBAL_EID+"/"+str(local_part_id)] = shuffle_global_eids
shuffle_global_eid_offset.append(shuffle_global_eid_start)
return shuffle_global_eid_offset
@@ -2,7 +2,7 @@ import numpy as np
import torch import torch
import torch.distributed as dist import torch.distributed as dist
def allgather_sizes(send_data, world_size, return_sizes=False): def allgather_sizes(send_data, world_size, num_parts, return_sizes=False):
""" """
Perform all gather on list lengths, used to compute prefix sums Perform all gather on list lengths, used to compute prefix sums
to determine the offsets on each ranks. This is used to allocate to determine the offsets on each ranks. This is used to allocate
@@ -14,6 +14,8 @@ def allgather_sizes(send_data, world_size, return_sizes=False):
Data on which allgather is performed. Data on which allgather is performed.
world_size : integer world_size : integer
No. of processes configured for execution No. of processes configured for execution
num_parts : integer
No. of output graph partitions
return_sizes : bool return_sizes : bool
Boolean flag to indicate whether to return raw sizes from each process Boolean flag to indicate whether to return raw sizes from each process
or perform prefix sum on the raw sizes. or perform prefix sum on the raw sizes.
@@ -24,6 +26,9 @@ def allgather_sizes(send_data, world_size, return_sizes=False):
array with the prefix sum array with the prefix sum
""" """
# Assert on the world_size, num_parts
assert (num_parts % world_size) == 0
#compute the length of the local data #compute the length of the local data
send_length = len(send_data) send_length = len(send_data)
out_tensor = torch.as_tensor(send_data, dtype=torch.int64) out_tensor = torch.as_tensor(send_data, dtype=torch.int64)
@@ -38,11 +43,16 @@ def allgather_sizes(send_data, world_size, return_sizes=False):
return torch.cat(in_tensor).numpy() return torch.cat(in_tensor).numpy()
#gather sizes in on array to return to the invoking function #gather sizes in on array to return to the invoking function
rank_sizes = np.zeros(world_size + 1, dtype=np.int64) rank_sizes = np.zeros(num_parts + 1, dtype=np.int64)
part_counts = torch.cat(in_tensor).numpy()
count = rank_sizes[0] count = rank_sizes[0]
for i, t in enumerate(in_tensor): idx = 1
count += t.item() for local_part_id in range(num_parts//world_size):
rank_sizes[i+1] = count for r in range(world_size):
count += part_counts[r*(num_parts//world_size) + local_part_id]
rank_sizes[idx] = count
idx += 1
return rank_sizes return rank_sizes
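Because each rank now contributes num_parts // world_size counts, the gathered counts must be re-ordered into global partition-id order (partition id = local_part_id * world_size + rank under the cyclic mapping) before the prefix sum is taken. The sketch below reproduces that re-ordering in a single process, with the torch.distributed all-gather replaced by a pre-built list of per-rank counts; the helper name is hypothetical.

import numpy as np

def prefix_sums_in_partition_order(per_rank_counts, world_size, num_parts):
    # per_rank_counts[r] holds rank r's counts for its local partitions,
    # i.e. for global partitions r, r + world_size, r + 2*world_size, ...
    assert num_parts % world_size == 0
    parts_per_rank = num_parts // world_size
    part_counts = np.concatenate(per_rank_counts)   # what all_gather would produce
    rank_sizes = np.zeros(num_parts + 1, dtype=np.int64)
    idx, count = 1, 0
    for local_part_id in range(parts_per_rank):
        for r in range(world_size):
            # global partition id = local_part_id * world_size + r
            count += part_counts[r * parts_per_rank + local_part_id]
            rank_sizes[idx] = count
            idx += 1
    return rank_sizes

# world_size=2, num_parts=4: rank 0 owns partitions 0 and 2, rank 1 owns 1 and 3.
prefix = prefix_sums_in_partition_order([[10, 30], [20, 40]], world_size=2, num_parts=4)
print(prefix)  # [  0  10  30  60 100] -> offsets of partitions 0, 1, 2, 3
# A consumer indexes it as prefix[rank + local_part_id * world_size], e.g.
# rank 1, local partition 1 (= global partition 3) starts at prefix[3] = 60.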
......
@@ -188,7 +188,7 @@ def get_gnid_range_map(node_tids):
return ntypes_gid_range return ntypes_gid_range
def write_metadata_json(metadata_list, output_dir, graph_name): def write_metadata_json(input_list, output_dir, graph_name, world_size, num_parts):
""" """
Merge json schema's from each of the rank's on rank-0. Merge json schema's from each of the rank's on rank-0.
This utility function, to be used on rank-0, to create aggregated json file. This utility function, to be used on rank-0, to create aggregated json file.
@@ -202,6 +202,14 @@ def write_metadata_json(metadata_list, output_dir, graph_name):
graph-name : string graph-name : string
a string specifying the graph name a string specifying the graph name
""" """
# Preprocess the input_list, a list of dictionaries.
# Each dictionary contains num_parts/world_size metadata json objects,
# which correspond to the local partitions on the respective rank.
metadata_list = []
for local_part_id in range(num_parts//world_size):
for idx in range(world_size):
metadata_list.append(input_list[idx]["local-part-id-"+str(local_part_id*world_size + idx)])
#Initialize global metadata #Initialize global metadata
graph_metadata = {} graph_metadata = {}
@@ -238,7 +246,7 @@ def write_metadata_json(metadata_list, output_dir, graph_name):
_dump_part_config(f'{output_dir}/metadata.json', graph_metadata) _dump_part_config(f'{output_dir}/metadata.json', graph_metadata)
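The preprocessing at the top of write_metadata_json flattens the per-rank dictionaries back into global partition-id order. A tiny sketch with two ranks and four partitions; the dictionary contents are invented, but the "local-part-id-<global_part_id>" keys follow the code above.

world_size, num_parts = 2, 4

# input_list[rank] maps "local-part-id-<global_part_id>" to that partition's metadata.
input_list = [
    {"local-part-id-0": {"part": 0}, "local-part-id-2": {"part": 2}},  # from rank 0
    {"local-part-id-1": {"part": 1}, "local-part-id-3": {"part": 3}},  # from rank 1
]

metadata_list = []
for local_part_id in range(num_parts // world_size):
    for rank in range(world_size):
        metadata_list.append(input_list[rank]["local-part-id-" + str(local_part_id * world_size + rank)])

print([m["part"] for m in metadata_list])  # [0, 1, 2, 3]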
def augment_edge_data(edge_data, lookup_service, edge_tids, rank, world_size): def augment_edge_data(edge_data, lookup_service, edge_tids, rank, world_size, num_parts):
""" """
Add partition-id (rank which owns an edge) column to the edge_data. Add partition-id (rank which owns an edge) column to the edge_data.
@@ -256,6 +264,8 @@ def augment_edge_data(edge_data, lookup_service, edge_tids, rank, world_size):
rank of the current process rank of the current process
world_size : integer world_size : integer
total no. of process participating in the communication primitives total no. of process participating in the communication primitives
num_parts : integer
total no. of partitions requested for the input graph
Returns: Returns:
-------- --------
@@ -269,16 +279,18 @@ def augment_edge_data(edge_data, lookup_service, edge_tids, rank, world_size):
offset = 0 offset = 0
for etype_name, tid_range in edge_tids.items(): for etype_name, tid_range in edge_tids.items():
assert int(tid_range[0][0]) == 0 assert int(tid_range[0][0]) == 0
assert len(tid_range) == world_size assert len(tid_range) == num_parts
etype_offset[etype_name] = offset + int(tid_range[0][0]) etype_offset[etype_name] = offset + int(tid_range[0][0])
offset += int(tid_range[-1][1]) offset += int(tid_range[-1][1])
global_eids = [] global_eids = []
for etype_name, tid_range in edge_tids.items(): for etype_name, tid_range in edge_tids.items():
global_eid_start = etype_offset[etype_name] for idx in range(num_parts):
begin = global_eid_start + int(tid_range[rank][0]) if map_partid_rank(idx, world_size) == rank:
end = global_eid_start + int(tid_range[rank][1]) global_eid_start = etype_offset[etype_name]
global_eids.append(np.arange(begin, end, dtype=np.int64)) begin = global_eid_start + int(tid_range[idx][0])
end = global_eid_start + int(tid_range[idx][1])
global_eids.append(np.arange(begin, end, dtype=np.int64))
global_eids = np.concatenate(global_eids) global_eids = np.concatenate(global_eids)
assert global_eids.shape[0] == edge_data[constants.ETYPE_ID].shape[0] assert global_eids.shape[0] == edge_data[constants.ETYPE_ID].shape[0]
edge_data[constants.GLOBAL_EID] = global_eids edge_data[constants.GLOBAL_EID] = global_eids
@@ -528,3 +540,22 @@ def memory_snapshot(tag, rank):
mem_string = f'{total:.0f} (MB) total, {peak:.0f} (MB) peak, {used:.0f} (MB) used, {avail:.0f} (MB) avail' mem_string = f'{total:.0f} (MB) total, {peak:.0f} (MB) peak, {used:.0f} (MB) used, {avail:.0f} (MB) avail'
logging.debug(f'[Rank: {rank} MEMORY_SNAPSHOT] {mem_string} - {tag}') logging.debug(f'[Rank: {rank} MEMORY_SNAPSHOT] {mem_string} - {tag}')
def map_partid_rank(partid, world_size):
"""Auxiliary function to map a given partition id to one of the rank in the
MPI_WORLD processes. The range of partition ids is assumed to equal or a
multiple of the total size of MPI_WORLD. In this implementation, we use
a cyclical mapping procedure to convert partition ids to ranks.
Parameters:
-----------
partid : int
partition id, as read from node id to partition id mappings.
Returns:
--------
int :
rank of the process, which will be responsible for the given partition
id.
"""
return partid % world_size
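A short usage note for map_partid_rank, assuming the function defined above is in scope: because the mapping is a plain modulo, it works on scalar partition ids as well as on numpy arrays of partition ids (as in get_shuffle_nids).

import numpy as np

# Scalar: partition 5 with 2 processes is handled by rank 1.
assert map_partid_rank(5, world_size=2) == 1

# Vectorized: an array of owner (partition) ids maps elementwise to ranks.
owner_ids = np.array([0, 1, 2, 3, 4, 5])
print(map_partid_rank(owner_ids, world_size=2))  # [0 1 0 1 0 1]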