"vscode:/vscode.git/clone" did not exist on "32b93da875e95b8033fe3c493eaaa7bbc9a14048"
Unverified Commit 2e8ae9f9 authored by Mufei Li, committed by GitHub

[Dist][CI] Unit test for the new distributed partitioning pipeline (#4394)



* chunked graph data format

* Update

* Update

* Update task_distributed_test.sh

* Update

* Update

* Revert "Update"

This reverts commit 03c461870f19375fb03125b061fc853ab555577f.

* Update

* Update

* ssh-keygen

* CI

* install openssh

* openssh

* Update

* CI

* Update

* Update
Co-authored-by: Ubuntu <ubuntu@ip-172-31-53-142.us-west-2.compute.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-16-87.us-west-2.compute.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-20-21.us-west-2.compute.internal>
Co-authored-by: Ubuntu <ubuntu@ip-172-31-9-26.ap-northeast-1.compute.internal>
parent b039ea99
@@ -30,7 +30,7 @@ export CUDA_VISIBLE_DEVICES=-1
 conda activate ${DGLBACKEND}-ci
-python3 -m pip install pytest psutil pyyaml pydantic pandas rdflib ogb filelock || fail "pip install"
+python3 -m pip install pytest psutil pyyaml pydantic pandas rdflib ogb filelock pyarrow || fail "pip install"
 export PYTHONUNBUFFERED=1
 export OMP_NUM_THREADS=1
import argparse
import dgl
import json
import numpy as np
import os
import sys
import tempfile
import torch

from dgl.data.utils import load_tensors, load_graphs
from chunk_graph import chunk_graph

def test_part_pipeline():
    # Step0: prepare chunked graph data format

    # A synthetic mini MAG240
    num_institutions = 20
    num_authors = 100
    num_papers = 600

    def rand_edges(num_src, num_dst, num_edges):
        eids = np.random.choice(num_src * num_dst, num_edges, replace=False)
        src = torch.from_numpy(eids // num_dst)
        dst = torch.from_numpy(eids % num_dst)

        return src, dst

    num_cite_edges = 2000
    num_write_edges = 1000
    num_affiliate_edges = 200

    # Structure
    data_dict = {
        ('paper', 'cites', 'paper'): rand_edges(num_papers, num_papers, num_cite_edges),
        ('author', 'writes', 'paper'): rand_edges(num_authors, num_papers, num_write_edges),
        ('author', 'affiliated_with', 'institution'): rand_edges(num_authors, num_institutions, num_affiliate_edges)
    }
    src, dst = data_dict[('author', 'writes', 'paper')]
    data_dict[('paper', 'rev_writes', 'author')] = (dst, src)
    g = dgl.heterograph(data_dict)

    # paper feat, label, year
    num_paper_feats = 3
    paper_feat = np.random.randn(num_papers, num_paper_feats)
    num_classes = 4
    paper_label = np.random.choice(num_classes, num_papers)
    paper_year = np.random.choice(2022, num_papers)

    # edge features
    cite_count = np.random.choice(10, num_cite_edges)
    write_year = np.random.choice(2022, num_write_edges)

    # Save features
    with tempfile.TemporaryDirectory() as root_dir:
        print('root_dir', root_dir)
        input_dir = os.path.join(root_dir, 'data_test')
        os.makedirs(input_dir)
        for sub_d in ['paper', 'cites', 'writes']:
            os.makedirs(os.path.join(input_dir, sub_d))

        paper_feat_path = os.path.join(input_dir, 'paper/feat.npy')
        with open(paper_feat_path, 'wb') as f:
            np.save(f, paper_feat)

        paper_label_path = os.path.join(input_dir, 'paper/label.npy')
        with open(paper_label_path, 'wb') as f:
            np.save(f, paper_label)

        paper_year_path = os.path.join(input_dir, 'paper/year.npy')
        with open(paper_year_path, 'wb') as f:
            np.save(f, paper_year)

        cite_count_path = os.path.join(input_dir, 'cites/count.npy')
        with open(cite_count_path, 'wb') as f:
            np.save(f, cite_count)

        write_year_path = os.path.join(input_dir, 'writes/year.npy')
        with open(write_year_path, 'wb') as f:
            np.save(f, write_year)

        output_dir = os.path.join(root_dir, 'chunked-data')
        num_chunks = 2
        chunk_graph(
            g,
            'mag240m',
            {'paper':
                {
                    'feat': paper_feat_path,
                    'label': paper_label_path,
                    'year': paper_year_path
                }
            },
            {
                'cites': {'count': cite_count_path},
                'writes': {'year': write_year_path},
                # you can put the same data file if they indeed share the features.
                'rev_writes': {'year': write_year_path}
            },
            num_chunks=num_chunks,
            output_path=output_dir)

        # check metadata.json
        json_file = os.path.join(output_dir, 'metadata.json')
        assert os.path.isfile(json_file)
        with open(json_file, 'rb') as f:
            meta_data = json.load(f)
        assert meta_data['graph_name'] == 'mag240m'
        assert len(meta_data['num_nodes_per_chunk'][0]) == num_chunks

        # check edge_index
        output_edge_index_dir = os.path.join(output_dir, 'edge_index')
        for utype, etype, vtype in data_dict.keys():
            fname = ':'.join([utype, etype, vtype])
            for i in range(num_chunks):
                chunk_f_name = os.path.join(output_edge_index_dir, fname + str(i) + '.txt')
                assert os.path.isfile(chunk_f_name)
                with open(chunk_f_name, 'r') as f:
                    header = f.readline()
                    num1, num2 = header.rstrip().split(' ')
                    assert isinstance(int(num1), int)
                    assert isinstance(int(num2), int)

        # check node_data
        output_node_data_dir = os.path.join(output_dir, 'node_data', 'paper')
        for feat in ['feat', 'label', 'year']:
            for i in range(num_chunks):
                chunk_f_name = '{}-{}.npy'.format(feat, i)
                chunk_f_name = os.path.join(output_node_data_dir, chunk_f_name)
                assert os.path.isfile(chunk_f_name)
                feat_array = np.load(chunk_f_name)
                assert feat_array.shape[0] == num_papers // num_chunks

        # check edge_data
        num_edges = {
            'paper:cites:paper': num_cite_edges,
            'author:writes:paper': num_write_edges,
            'paper:rev_writes:author': num_write_edges
        }
        output_edge_data_dir = os.path.join(output_dir, 'edge_data')
        for etype, feat in [
            ['paper:cites:paper', 'count'],
            ['author:writes:paper', 'year'],
            ['paper:rev_writes:author', 'year']
        ]:
            output_edge_sub_dir = os.path.join(output_edge_data_dir, etype)
            for i in range(num_chunks):
                chunk_f_name = '{}-{}.npy'.format(feat, i)
                chunk_f_name = os.path.join(output_edge_sub_dir, chunk_f_name)
                assert os.path.isfile(chunk_f_name)
                feat_array = np.load(chunk_f_name)
                assert feat_array.shape[0] == num_edges[etype] // num_chunks

        # Step1: graph partition
        in_dir = os.path.join(root_dir, 'chunked-data')
        output_dir = os.path.join(root_dir, '2parts')
        os.system('python tools/partition_algo/random_partition.py '
                  '--metadata {}/metadata.json --output_path {} --num_partitions {}'.format(
                      in_dir, output_dir, num_chunks))
        for ntype in ['author', 'institution', 'paper']:
            fname = os.path.join(output_dir, '{}.txt'.format(ntype))
            with open(fname, 'r') as f:
                header = f.readline().rstrip()
                assert isinstance(int(header), int)

        # Step2: data dispatch
        partition_dir = os.path.join(root_dir, '2parts')
        out_dir = os.path.join(root_dir, 'partitioned')
        ip_config = os.path.join(root_dir, 'ip_config.txt')
        with open(ip_config, 'w') as f:
            f.write('127.0.0.1\n')
            f.write('127.0.0.2\n')
        os.system('python tools/dispatch_data.py '
                  '--in-dir {} --partitions-dir {} --out-dir {} --ip-config {}'.format(
                      in_dir, partition_dir, out_dir, ip_config))

        # check metadata.json
        meta_fname = os.path.join(out_dir, 'metadata.json')
        with open(meta_fname, 'rb') as f:
            meta_data = json.load(f)

        all_etypes = ['affiliated_with', 'writes', 'cites', 'rev_writes']
        for etype in all_etypes:
            assert len(meta_data['edge_map'][etype]) == num_chunks
        assert meta_data['etypes'].keys() == set(all_etypes)
        assert meta_data['graph_name'] == 'mag240m'

        all_ntypes = ['author', 'institution', 'paper']
        for ntype in all_ntypes:
            assert len(meta_data['node_map'][ntype]) == num_chunks
        assert meta_data['ntypes'].keys() == set(all_ntypes)
        assert meta_data['num_edges'] == 4200
        assert meta_data['num_nodes'] == 720
        assert meta_data['num_parts'] == num_chunks

        for i in range(num_chunks):
            sub_dir = 'part-' + str(i)
            assert meta_data[sub_dir]['node_feats'] == 'part{}/node_feat.dgl'.format(i)
            assert meta_data[sub_dir]['edge_feats'] == 'part{}/edge_feat.dgl'.format(i)
            assert meta_data[sub_dir]['part_graph'] == 'part{}/graph.dgl'.format(i)

            # check data
            sub_dir = os.path.join(out_dir, 'part' + str(i))

            # graph.dgl
            fname = os.path.join(sub_dir, 'graph.dgl')
            assert os.path.isfile(fname)
            g_list, data_dict = load_graphs(fname)
            g = g_list[0]
            assert isinstance(g, dgl.DGLGraph)

            # node_feat.dgl
            fname = os.path.join(sub_dir, 'node_feat.dgl')
            assert os.path.isfile(fname)
            tensor_dict = load_tensors(fname)
            all_tensors = ['paper/feat', 'paper/label', 'paper/year']
            assert tensor_dict.keys() == set(all_tensors)
            for key in all_tensors:
                assert isinstance(tensor_dict[key], torch.Tensor)

            # edge_feat.dgl
            fname = os.path.join(sub_dir, 'edge_feat.dgl')
            assert os.path.isfile(fname)
            tensor_dict = load_tensors(fname)

if __name__ == '__main__':
    test_part_pipeline()
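
For reference, a minimal sketch of the two metadata.json fields the test asserts on. The list-of-lists layout of num_nodes_per_chunk follows from the assertions and from dispatch_data.py reading num_nodes_per_chunk[0]; the per-type counts and the ordering of node types below are illustrative assumptions, not taken from the source, and the real file written by chunk_graph contains more keys.

num_chunks = 2
# Hedged sketch: only the fields asserted above, with illustrative values.
meta_data = {
    'graph_name': 'mag240m',
    'num_nodes_per_chunk': [
        [300, 300],  # e.g. 600 papers split across 2 chunks (illustrative)
        [50, 50],    # e.g. 100 authors (illustrative)
        [10, 10],    # e.g. 20 institutions (illustrative)
    ],
}
assert meta_data['graph_name'] == 'mag240m'
assert len(meta_data['num_nodes_per_chunk'][0]) == num_chunks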
@@ -111,6 +111,7 @@ def _chunk_graph(g, name, ndata_paths, edata_paths, num_chunks, output_path):
         reader_fmt_meta = writer_fmt_meta = {"name": "numpy"}
         arr = array_readwriter.get_array_parser(**reader_fmt_meta).read(path)
         edata_key_meta['format'] = writer_fmt_meta
+        etype = tuple(etypestr.split(':'))
         edata_key_meta['data'] = chunk_numpy_array(
             arr, writer_fmt_meta, num_edges_per_chunk_dict[etype],
             key + '-%d.npy')
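
The added line converts the 'src:rel:dst' string under which edge features are listed into the tuple key used by num_edges_per_chunk_dict. A minimal sketch of that mapping, assuming 2 chunks and an even split of the test's 1000 'writes' edges (the counts are illustrative):

# string form used in file and directory names of the chunked format
etypestr = 'author:writes:paper'
# tuple form used as the dictionary key
etype = tuple(etypestr.split(':'))
assert etype == ('author', 'writes', 'paper')

# hypothetical per-chunk counts: 1000 'writes' edges split across 2 chunks
num_edges_per_chunk_dict = {etype: [500, 500]}
assert num_edges_per_chunk_dict[etype] == [500, 500]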
@@ -34,8 +34,8 @@ def submit_jobs(args) -> str:
     wrapper_command = os.path.join(INSTALL_DIR, LAUNCH_SCRIPT)
     #read the json file and get the remaining argument here.
-    schema_path = os.path.join(args.in_dir, "metadata.json")
-    with open(schema_path) as schema:
+    schema_path = "metadata.json"
+    with open(os.path.join(args.in_dir, schema_path)) as schema:
         schema_map = json.load(schema)
     num_parts = len(schema_map["num_nodes_per_chunk"][0])
@@ -43,12 +43,12 @@ def submit_jobs(args) -> str:
     argslist = ""
     argslist += "--world-size {} ".format(num_parts)
-    argslist += "--partitions-dir {} ".format(args.partitions_dir)
-    argslist += "--input-dir {} ".format(args.in_dir)
+    argslist += "--partitions-dir {} ".format(os.path.abspath(args.partitions_dir))
+    argslist += "--input-dir {} ".format(os.path.abspath(args.in_dir))
     argslist += "--graph-name {} ".format(graph_name)
     argslist += "--schema {} ".format(schema_path)
     argslist += "--num-parts {} ".format(num_parts)
-    argslist += "--output {} ".format(args.out_dir)
+    argslist += "--output {} ".format(os.path.abspath(args.out_dir))
     # (BarclayII) Is it safe to assume all the workers have the Python executable at the same path?
     pipeline_cmd = os.path.join(INSTALL_DIR, PIPELINE_SCRIPT)
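
The updated launcher resolves the directory arguments to absolute paths before forwarding them, presumably so the spawned pipeline processes do not depend on the launcher's working directory, while the schema path is now kept relative to --in-dir. A small sketch of how the two kinds of paths resolve (directory names are hypothetical):

import os

in_dir = 'chunked-data'          # hypothetical relative --in-dir
schema_path = 'metadata.json'    # now stored relative to in_dir

# the launcher still opens the schema locally, relative to --in-dir...
local_schema = os.path.join(in_dir, schema_path)   # 'chunked-data/metadata.json'
# ...but forwards working-directory-independent paths to the pipeline
print(os.path.abspath(in_dir))                     # e.g. '/home/ubuntu/chunked-data'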
@@ -35,11 +35,11 @@ def random_partition(metadata, num_parts, output_path):
 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        'metadata', type=str, help='input metadata file of the chunked graph format')
+        '--metadata', type=str, help='input metadata file of the chunked graph format')
     parser.add_argument(
-        'output_path', type=str, help='output directory')
+        '--output_path', type=str, help='output directory')
     parser.add_argument(
-        'num_partitions', type=int, help='number of partitions')
+        '--num_partitions', type=int, help='number of partitions')
     logging.basicConfig(level='INFO')
     args = parser.parse_args()
     with open(args.metadata) as f:
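
Since the script now takes named flags instead of positional arguments, an invocation looks like Step 1 of the test above; a minimal usage sketch with illustrative paths:

import os

# Same flags as Step 1 of the test; 'chunked-data' and '2parts' are illustrative paths.
cmd = ('python tools/partition_algo/random_partition.py '
       '--metadata chunked-data/metadata.json '
       '--output_path 2parts '
       '--num_partitions 2')
os.system(cmd)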