Unverified Commit cf19254a authored by Rhett Ying, committed by GitHub

[Dist] enable to partition many chunks into less partitions via pipeline (#4620)

* [Dist] enable to partition many chunks into less partitions via pipeline

* refine

* add meta file for num_parts, add more tests, refine docstring

* remove args.num_parts

* create pydantic class for partition metadata

* refine

* rename json file
Parent commit: 6c1500d4
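For context, here is a minimal sketch of the pipeline this change targets, assuming chunked graph data already produced by chunk_graph; the paths and counts are illustrative, and the dispatch command is abbreviated just as it is in the test below.

import os

num_chunks = 8   # number of chunks the graph was split into
num_parts = 3    # desired number of partitions; may now be smaller than num_chunks

in_dir = 'chunked-data'    # chunked graph data (illustrative path)
part_dir = 'parted_data'   # partition assignments plus partition_meta.json

# Step 1: partition assignment. random_partition.py now also dumps
# partition_meta.json recording num_parts and the algorithm name.
os.system('python3 tools/partition_algo/random_partition.py '
          f'--in_dir {in_dir} --out_dir {part_dir} --num_partitions {num_parts}')

# Step 2: data dispatch. dispatch_data.py reads num_parts back from
# partition_meta.json instead of assuming one partition per chunk.
cmd = f'python3 tools/dispatch_data.py --in-dir {in_dir}'
# ... remaining flags (partitions dir, output dir, ip config) as built up in the test.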
import argparse
import dgl
import json
import numpy as np
import os
import sys
import tempfile
import torch
import pytest, unittest
import dgl
from dgl.data.utils import load_tensors, load_graphs
from chunk_graph import chunk_graph
def test_part_pipeline():
@pytest.mark.parametrize("num_chunks", [1, 2, 3, 4, 8])
@pytest.mark.parametrize("num_parts", [1, 2, 3, 4, 8])
def test_part_pipeline(num_chunks, num_parts):
if num_chunks < num_parts:
# num_parts should be less than or equal to num_chunks
return
# Step0: prepare chunked graph data format
# A synthetic mini MAG240
num_institutions = 20
num_authors = 100
num_papers = 600
num_institutions = 1200
num_authors = 1200
num_papers = 1200
def rand_edges(num_src, num_dst, num_edges):
eids = np.random.choice(num_src * num_dst, num_edges, replace=False)
......@@ -26,9 +29,9 @@ def test_part_pipeline():
return src, dst
num_cite_edges = 2000
num_write_edges = 1000
num_affiliate_edges = 200
num_cite_edges = 24 * 1000
num_write_edges = 12 * 1000
num_affiliate_edges = 2400
# Structure
data_dict = {
......@@ -85,7 +88,6 @@ def test_part_pipeline():
np.save(f, write_year)
output_dir = os.path.join(root_dir, 'chunked-data')
num_chunks = 2
chunk_graph(
g,
'mag240m',
......@@ -159,10 +161,10 @@ def test_part_pipeline():
# Step1: graph partition
in_dir = os.path.join(root_dir, 'chunked-data')
output_dir = os.path.join(root_dir, '2parts')
output_dir = os.path.join(root_dir, 'parted_data')
os.system('python3 tools/partition_algo/random_partition.py '\
'--in_dir {} --out_dir {} --num_partitions {}'.format(
in_dir, output_dir, num_chunks))
in_dir, output_dir, num_parts))
for ntype in ['author', 'institution', 'paper']:
fname = os.path.join(output_dir, '{}.txt'.format(ntype))
with open(fname, 'r') as f:
......@@ -170,12 +172,12 @@ def test_part_pipeline():
assert isinstance(int(header), int)
# Step2: data dispatch
partition_dir = os.path.join(root_dir, '2parts')
partition_dir = os.path.join(root_dir, 'parted_data')
out_dir = os.path.join(root_dir, 'partitioned')
ip_config = os.path.join(root_dir, 'ip_config.txt')
with open(ip_config, 'w') as f:
f.write('127.0.0.1\n')
f.write('127.0.0.2\n')
for i in range(num_parts):
f.write(f'127.0.0.{i + 1}\n')
cmd = 'python3 tools/dispatch_data.py'
cmd += f' --in-dir {in_dir}'
......@@ -195,19 +197,19 @@ def test_part_pipeline():
all_etypes = ['affiliated_with', 'writes', 'cites', 'rev_writes']
for etype in all_etypes:
assert len(meta_data['edge_map'][etype]) == num_chunks
assert len(meta_data['edge_map'][etype]) == num_parts
assert meta_data['etypes'].keys() == set(all_etypes)
assert meta_data['graph_name'] == 'mag240m'
all_ntypes = ['author', 'institution', 'paper']
for ntype in all_ntypes:
assert len(meta_data['node_map'][ntype]) == num_chunks
assert len(meta_data['node_map'][ntype]) == num_parts
assert meta_data['ntypes'].keys() == set(all_ntypes)
assert meta_data['num_edges'] == 4200
assert meta_data['num_nodes'] == 720
assert meta_data['num_parts'] == num_chunks
assert meta_data['num_edges'] == g.num_edges()
assert meta_data['num_nodes'] == g.num_nodes()
assert meta_data['num_parts'] == num_parts
for i in range(num_chunks):
for i in range(num_parts):
sub_dir = 'part-' + str(i)
assert meta_data[sub_dir]['node_feats'] == 'part{}/node_feat.dgl'.format(i)
assert meta_data[sub_dir]['edge_feats'] == 'part{}/edge_feat.dgl'.format(i)
......@@ -251,5 +253,3 @@ def test_part_pipeline():
orig_eids = load_tensors(fname)
assert len(orig_eids.keys()) == 4
if __name__ == '__main__':
test_part_pipeline()
......@@ -4,6 +4,7 @@ import sys
import argparse
import logging
import json
from partition_algo.base import load_partition_meta
INSTALL_DIR = os.path.abspath(os.path.join(__file__, '..'))
LAUNCH_SCRIPT = "distgraphlaunch.py"
......@@ -33,16 +34,29 @@ def get_launch_cmd(args) -> str:
def submit_jobs(args) -> str:
wrapper_command = os.path.join(INSTALL_DIR, LAUNCH_SCRIPT)
# Read the JSON file and get the remaining arguments here.
schema_path = "metadata.json"
with open(os.path.join(args.in_dir, schema_path)) as schema:
schema_map = json.load(schema)
num_parts = len(schema_map["num_nodes_per_chunk"][0])
graph_name = schema_map["graph_name"]
# retrieve num_parts
num_chunks = len(schema_map["num_nodes_per_chunk"][0])
num_parts = num_chunks
partition_path = os.path.join(args.partitions_dir, "partition_meta.json")
if os.path.isfile(partition_path):
part_meta = load_partition_meta(partition_path)
num_parts = part_meta.num_parts
if num_parts > num_chunks:
raise Exception('Number of partitions should be less than or equal to the number of chunks.')
# verify ip_config
with open(args.ip_config, 'r') as f:
num_ips = len(f.readlines())
assert num_ips == num_parts, \
f'The number of lines in {args.ip_config} should be equal to num_parts[{num_parts}].'
argslist = ""
argslist += "--world-size {} ".format(num_parts)
argslist += "--partitions-dir {} ".format(os.path.abspath(args.partitions_dir))
......
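To make the new control flow easier to follow, here is a hedged, standalone sketch of how dispatch_data.py now resolves num_parts: it defaults to one partition per chunk based on metadata.json, then overrides that value from partition_meta.json when the partition algorithm produced one. The helper name below is illustrative, not part of dispatch_data.py.

import json
import os

from partition_algo.base import load_partition_meta

def resolve_num_parts(in_dir, partitions_dir):
    # Default: one partition per chunk, the behavior before this change.
    with open(os.path.join(in_dir, 'metadata.json')) as f:
        schema_map = json.load(f)
    num_chunks = len(schema_map['num_nodes_per_chunk'][0])
    num_parts = num_chunks
    # Override with the value dumped by the partition algorithm, if present.
    meta_path = os.path.join(partitions_dir, 'partition_meta.json')
    if os.path.isfile(meta_path):
        num_parts = load_partition_meta(meta_path).num_parts
    if num_parts > num_chunks:
        raise ValueError('Number of partitions should be less than or equal to the number of chunks.')
    return num_parts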
......@@ -104,10 +104,11 @@ def create_dgl_object(schema, part_id, node_data, edge_data, edgeid_offset,
"""
#create auxiliary data structures from the schema object
memory_snapshot("CreateDGLObj_Begin", part_id)
ntid_dict, global_nid_ranges = get_idranges(schema[constants.STR_NODE_TYPE],
schema[constants.STR_NUM_NODES_PER_CHUNK])
_, global_nid_ranges = get_idranges(schema[constants.STR_NODE_TYPE],
schema[constants.STR_NUM_NODES_PER_CHUNK])
memory_snapshot("CreateDGLObj_Begin", part_id)
etid_dict, global_eid_ranges = get_idranges(schema[constants.STR_EDGE_TYPE],
_, global_eid_ranges = get_idranges(schema[constants.STR_EDGE_TYPE],
schema[constants.STR_NUM_EDGES_PER_CHUNK])
id_map = dgl.distributed.id_map.IdMap(global_nid_ranges)
......@@ -119,7 +120,6 @@ def create_dgl_object(schema, part_id, node_data, edge_data, edgeid_offset,
ntypes_map = {e: i for i, e in enumerate(ntypes)}
etypes = [(key, global_eid_ranges[key][0, 0]) for key in global_eid_ranges]
etypes.sort(key=lambda e: e[1])
etype_offset_np = np.array([e[1] for e in etypes])
etypes = [e[0] for e in etypes]
etypes_map = {e.split(":")[1]: i for i, e in enumerate(etypes)}
......
......@@ -97,7 +97,8 @@ def gen_node_data(rank, world_size, id_lookup, ntid_ntype_map, schema_map):
}
type_nid_dict, global_nid_dict = get_idranges(schema_map[constants.STR_NODE_TYPE],
schema_map[constants.STR_NUM_NODES_PER_CHUNK])
schema_map[constants.STR_NUM_NODES_PER_CHUNK],
num_chunks=world_size)
for ntype_id, ntype_name in ntid_ntype_map.items():
type_start, type_end = type_nid_dict[ntype_name][0][0], type_nid_dict[ntype_name][-1][1]
......@@ -290,7 +291,7 @@ def exchange_node_features(rank, world_size, node_feature_tids, ntype_gnid_map,
return own_node_features, own_global_nids
def exchange_graph_data(rank, world_size, node_features, node_feat_tids, edge_data,
id_lookup, ntypes_ntypeid_map, ntypes_gnid_range_map, ntid_ntype_map, schema_map):
id_lookup, ntypes_gnid_range_map, ntid_ntype_map, schema_map):
"""
Wrapper function which is used to shuffle graph data on all the processes.
......@@ -556,8 +557,8 @@ def gen_dist_partitions(rank, world_size, params):
#and return the aggregated data
ntypes_gnid_range_map = get_gnid_range_map(node_tids)
node_data, rcvd_node_features, rcvd_global_nids, edge_data = \
exchange_graph_data(rank, world_size, node_features, node_feat_tids, \
edge_data, id_lookup, ntypes_ntypeid_map, ntypes_gnid_range_map, \
exchange_graph_data(rank, world_size, node_features, node_feat_tids,
edge_data, id_lookup, ntypes_gnid_range_map,
ntypeid_ntypes_map, schema_map)
gc.collect()
logging.info(f'[Rank: {rank}] Done with data shuffling...')
......
......@@ -110,18 +110,18 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
#where key: feature_name, value: dictionary in which keys are "format", "data"
node_feature_tids[ntype_name] = []
for feat_name, feat_data in ntype_feature_data.items():
assert len(feat_data[constants.STR_DATA]) == world_size
assert feat_data[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_NUMPY
my_feat_data_fname = feat_data[constants.STR_DATA][rank] #this will be just the file name
if (os.path.isabs(my_feat_data_fname)):
logging.info(f'Loading numpy from {my_feat_data_fname}')
node_features[ntype_name+'/'+feat_name] = \
torch.from_numpy(np.load(my_feat_data_fname))
else:
numpy_path = os.path.join(input_dir, my_feat_data_fname)
logging.info(f'Loading numpy from {numpy_path}')
node_features[ntype_name+'/'+feat_name] = \
torch.from_numpy(np.load(numpy_path))
num_chunks = len(feat_data[constants.STR_DATA])
read_list = np.array_split(np.arange(num_chunks), world_size)
nfeat = []
for idx in read_list[rank]:
nfeat_file = feat_data[constants.STR_DATA][idx]
if not os.path.isabs(nfeat_file):
nfeat_file = os.path.join(input_dir, nfeat_file)
logging.info(f'Loading node feature[{feat_name}] of ntype[{ntype_name}] from {nfeat_file}')
nfeat.append(np.load(nfeat_file))
nfeat = np.concatenate(nfeat)
node_features[ntype_name + '/' + feat_name] = torch.from_numpy(nfeat)
node_feature_tids[ntype_name].append([feat_name, -1, -1])
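The chunk-to-rank assignment above relies on np.array_split handing each rank a contiguous, nearly even slice of the chunk files, so every chunk is read exactly once even when there are more chunks than ranks. A small illustration with made-up sizes:

import numpy as np

num_chunks, world_size = 8, 3
read_list = np.array_split(np.arange(num_chunks), world_size)
# read_list == [array([0, 1, 2]), array([3, 4, 5]), array([6, 7])]
for rank, chunk_ids in enumerate(read_list):
    # Rank `rank` loads and concatenates the chunk files at these indices.
    print(rank, chunk_ids.tolist())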
......@@ -152,7 +152,8 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
#read my nodes for each node type
node_tids, ntype_gnid_offset = get_idranges(schema_map[constants.STR_NODE_TYPE],
schema_map[constants.STR_NUM_NODES_PER_CHUNK])
schema_map[constants.STR_NUM_NODES_PER_CHUNK],
num_chunks=world_size)
for ntype_name in schema_map[constants.STR_NODE_TYPE]:
if ntype_name in node_feature_tids:
for item in node_feature_tids[ntype_name]:
......@@ -211,8 +212,9 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
#read my edges for each edge type
etype_names = schema_map[constants.STR_EDGE_TYPE]
etype_name_idmap = {e : idx for idx, e in enumerate(etype_names)}
edge_tids, _ = get_idranges(schema_map[constants.STR_EDGE_TYPE],
schema_map[constants.STR_NUM_EDGES_PER_CHUNK])
edge_tids, _ = get_idranges(schema_map[constants.STR_EDGE_TYPE],
schema_map[constants.STR_NUM_EDGES_PER_CHUNK],
num_chunks=world_size)
edge_datadict = {}
edge_data = schema_map[constants.STR_EDGES]
......@@ -226,26 +228,38 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
assert etype_info[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_CSV
edge_info = etype_info[constants.STR_DATA]
assert len(edge_info) == world_size
#edgetype strings are in canonical format, src_node_type:edge_type:dst_node_type
tokens = etype_name.split(":")
assert len(tokens) == 3
src_ntype_name = tokens[0]
rel_name = tokens[1]
dst_ntype_name = tokens[2]
logging.info(f'Reading csv files from {edge_info[rank]}')
data_df = csv.read_csv(edge_info[rank], read_options=pyarrow.csv.ReadOptions(autogenerate_column_names=True),
parse_options=pyarrow.csv.ParseOptions(delimiter=' '))
num_chunks = len(edge_info)
read_list = np.array_split(np.arange(num_chunks), world_size)
src_ids = []
dst_ids = []
for idx in read_list[rank]:
edge_file = edge_info[idx]
if not os.path.isabs(edge_file):
edge_file = os.path.join(input_dir, edge_file)
logging.info(f'Loading edges of etype[{etype_name}] from {edge_file}')
data_df = csv.read_csv(edge_file,
read_options=pyarrow.csv.ReadOptions(autogenerate_column_names=True),
parse_options=pyarrow.csv.ParseOptions(delimiter=' '))
src_ids.append(data_df['f0'].to_numpy())
dst_ids.append(data_df['f1'].to_numpy())
src_ids = np.concatenate(src_ids)
dst_ids = np.concatenate(dst_ids)
#currently these are just type_edge_ids... which will be converted to global ids
edge_datadict[constants.GLOBAL_SRC_ID].append(data_df['f0'].to_numpy() + ntype_gnid_offset[src_ntype_name][0, 0])
edge_datadict[constants.GLOBAL_DST_ID].append(data_df['f1'].to_numpy() + ntype_gnid_offset[dst_ntype_name][0, 0])
edge_datadict[constants.GLOBAL_SRC_ID].append(src_ids + ntype_gnid_offset[src_ntype_name][0, 0])
edge_datadict[constants.GLOBAL_DST_ID].append(dst_ids + ntype_gnid_offset[dst_ntype_name][0, 0])
edge_datadict[constants.GLOBAL_TYPE_EID].append(np.arange(edge_tids[etype_name][rank][0],\
edge_tids[etype_name][rank][1] ,dtype=np.int64))
edge_datadict[constants.ETYPE_ID].append(etype_name_idmap[etype_name] * \
np.ones(shape=(data_df['f0'].to_numpy().shape), dtype=np.int64))
np.ones(shape=(src_ids.shape), dtype=np.int64))
#stitch together to create the final data on the local machine
for col in [constants.GLOBAL_SRC_ID, constants.GLOBAL_DST_ID, constants.GLOBAL_TYPE_EID, constants.ETYPE_ID]:
......
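The per-type source and destination IDs read from the edge CSVs are shifted into the global node-ID space by adding the first global ID of the corresponding node type (the ntype_gnid_offset[...][0, 0] term above). A small worked illustration with made-up ranges:

import numpy as np

# Hypothetical global node-ID ranges, shaped [1, 2] as returned by get_idranges():
# authors occupy global IDs [0, 100), papers occupy [100, 700).
ntype_gnid_offset = {
    'author': np.array([[0, 100]]),
    'paper': np.array([[100, 700]]),
}

# Per-type IDs read from a chunk of 'author:writes:paper' edges.
src_type_ids = np.array([3, 7])    # author IDs within the author type
dst_type_ids = np.array([10, 42])  # paper IDs within the paper type

global_src = src_type_ids + ntype_gnid_offset['author'][0, 0]  # -> [3, 7]
global_dst = dst_type_ids + ntype_gnid_offset['paper'][0, 0]   # -> [110, 142]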
......@@ -85,15 +85,9 @@ def get_ntype_featnames(ntype_name, schema_map):
list :
a list of feature names for a given node_type
"""
ntype_featdict = schema_map[constants.STR_NODE_DATA]
if (ntype_name in ntype_featdict):
featnames = []
ntype_info = ntype_featdict[ntype_name]
for k, v in ntype_info.items():
featnames.append(k)
return featnames
else:
return []
node_data = schema_map[constants.STR_NODE_DATA]
feats = node_data.get(ntype_name, {})
return [feat for feat in feats]
def get_node_types(schema_map):
"""
......@@ -393,7 +387,7 @@ def write_dgl_objects(graph_obj, node_features, edge_features,
orig_eids_file = os.path.join(part_dir, 'orig_eids.dgl')
dgl.data.utils.save_tensors(orig_eids_file, orig_eids)
def get_idranges(names, counts):
def get_idranges(names, counts, num_chunks=None):
"""
Utility function to compute type_id/global_id ranges for both nodes and edges.
......@@ -403,6 +397,11 @@ def get_idranges(names, counts):
list of node/edge types as strings
counts : list of lists
each list contains no. of nodes/edges in a given chunk
num_chunks : int, optional
In the distributed partition pipeline, ID ranges are grouped into chunks.
In some scenarios, we'd like to merge ID ranges into a specific number
of chunks. This parameter indicates the expected number of chunks.
If not specified, no merge is applied.
Returns:
--------
......@@ -418,21 +417,33 @@ def get_idranges(names, counts):
gnid_end = gnid_start
tid_dict = {}
gid_dict = {}
orig_num_chunks = 0
for idx, typename in enumerate(names):
type_counts = counts[idx]
tid_start = np.cumsum([0] + type_counts[:-1])
tid_end = np.cumsum(type_counts)
tid_ranges = list(zip(tid_start, tid_end))
type_start = tid_ranges[0][0]
type_end = tid_ranges[-1][1]
gnid_end += tid_ranges[-1][1]
tid_dict[typename] = tid_ranges
gid_dict[typename] = np.array([gnid_start, gnid_end]).reshape([1,2])
gnid_start = gnid_end
orig_num_chunks = len(tid_start)
if num_chunks is None:
return tid_dict, gid_dict
assert num_chunks <= orig_num_chunks, \
'Specified number of chunks should be less than or equal to the original number of ID ranges.'
chunk_list = np.array_split(np.arange(orig_num_chunks), num_chunks)
for typename in tid_dict:
orig_tid_ranges = tid_dict[typename]
tid_ranges = []
for idx in chunk_list:
tid_ranges.append((orig_tid_ranges[idx[0]][0], orig_tid_ranges[idx[-1]][-1]))
tid_dict[typename] = tid_ranges
return tid_dict, gid_dict
......
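A worked illustration of the new merging behavior in get_idranges, with made-up counts: three chunks of one type holding 4, 5 and 6 items yield per-chunk type-ID ranges (0, 4), (4, 9) and (9, 15); requesting num_chunks=2 groups the original chunks with np.array_split and keeps each group's first start and last end.

import numpy as np

counts = [4, 5, 6]                           # items per chunk for one type
tid_start = np.cumsum([0] + counts[:-1])     # -> [0, 4, 9]
tid_end = np.cumsum(counts)                  # -> [4, 9, 15]
tid_ranges = list(zip(tid_start, tid_end))   # -> [(0, 4), (4, 9), (9, 15)]

# Merge 3 per-chunk ranges into 2 chunks, mirroring get_idranges(..., num_chunks=2).
chunk_list = np.array_split(np.arange(len(tid_ranges)), 2)  # -> [[0, 1], [2]]
merged = [(int(tid_ranges[idx[0]][0]), int(tid_ranges[idx[-1]][-1]))
          for idx in chunk_list]
print(merged)  # -> [(0, 9), (9, 15)]: fewer, wider ranges, one per output chunk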
from typing import Optional
import pydantic as dt
import json
from dgl import DGLError
class PartitionMeta(dt.BaseModel):
""" Metadata that describes the partition assignment results.
Regardless of the choice of partitioning algorithm, a metadata JSON file
is created in the output directory to record meta information about the
partition assignment (version, number of partitions, algorithm name).
To generate a metadata JSON:
>>> part_meta = PartitionMeta(version='1.0.0', num_parts=4, algo_name='random')
>>> with open('metadata.json', 'w') as f:
... json.dump(part_meta.dict(), f)
To read a metadata JSON:
>>> with open('metadata.json') as f:
... part_meta = PartitionMeta(**(json.load(f)))
"""
# version of metadata JSON.
version: Optional[str] = '1.0.0'
# number of partitions.
num_parts: int
# name of partition algorithm.
algo_name: str
def dump_partition_meta(part_meta, meta_file):
""" Dump partition metadata into json file.
Parameters
----------
part_meta : PartitionMeta
The partition metadata.
meta_file : str
The target file to save data.
"""
with open(meta_file, 'w') as f:
json.dump(part_meta.dict(), f, sort_keys=True, indent=4)
def load_partition_meta(meta_file):
""" Load partition metadata and do sanity check.
Parameters
----------
meta_file : str
The path of the partition metadata file.
Returns
-------
PartitionMeta
The partition metadata.
"""
with open(meta_file) as f:
try:
part_meta = PartitionMeta(**(json.load(f)))
except dt.ValidationError as e:
raise DGLError(
f"Invalid partition metadata JSON. Error details: {e.json()}")
if part_meta.version != '1.0.0':
raise DGLError(
f"Invalid version[{part_meta.version}]. Supported versions: '1.0.0'")
if part_meta.num_parts <= 0:
raise DGLError(
f"num_parts[{part_meta.num_parts}] should be greater than 0.")
if part_meta.algo_name not in ['random', 'metis']:
raise DGLError(
f"algo_name[{part_meta.num_parts}] is not supported.")
return part_meta
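A quick round-trip example of the helpers above, assuming tools/ is on PYTHONPATH so that partition_algo.base is importable, as in dispatch_data.py:

import os
import tempfile

from partition_algo.base import (PartitionMeta, dump_partition_meta,
                                 load_partition_meta)

with tempfile.TemporaryDirectory() as root:
    meta_file = os.path.join(root, 'partition_meta.json')
    dump_partition_meta(
        PartitionMeta(version='1.0.0', num_parts=4, algo_name='random'),
        meta_file)
    # load_partition_meta() validates version, num_parts and algo_name.
    part_meta = load_partition_meta(meta_file)
    assert part_meta.num_parts == 4 and part_meta.algo_name == 'random'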
......@@ -8,6 +8,7 @@ import argparse
from utils import setdir
from utils import array_readwriter
from base import PartitionMeta, dump_partition_meta
def _random_partition(metadata, num_parts):
num_nodes_per_type = [sum(_) for _ in metadata['num_nodes_per_chunk']]
......@@ -26,9 +27,12 @@ def random_partition(metadata, num_parts, output_path):
mapping files named "<node-type>.txt" (e.g. "author.txt", "paper.txt" and
"institution.txt" for OGB-MAG240M). Each file contains one line per node representing
the partition ID the node belongs to.
In addition, a partition metadata JSON file, which includes the version, the number of partitions and the algorithm name, is dumped.
"""
with setdir(output_path):
_random_partition(metadata, num_parts)
part_meta = PartitionMeta(version='1.0.0', num_parts=num_parts, algo_name='random')
dump_partition_meta(part_meta, 'partition_meta.json')
# Run with PYTHONPATH=${GIT_ROOT_DIR}/tools
# where ${GIT_ROOT_DIR} is the directory to the DGL git repository.
......
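For reference, a hedged sketch of what the output directory should contain after random partitioning of the synthetic MAG graph used in the test; the JSON keys follow the PartitionMeta fields and dump_partition_meta's formatting.

# Expected layout of the --out_dir passed to random_partition.py (illustrative):
#
#   parted_data/
#     author.txt            # one line per author node: its partition ID
#     institution.txt
#     paper.txt
#     partition_meta.json   # {"algo_name": "random", "num_parts": 3, "version": "1.0.0"}
#
# The dispatch step later picks num_parts up from partition_meta.json.
import json
with open('parted_data/partition_meta.json') as f:
    print(json.load(f)['num_parts'])  # -> 3 in this illustration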