Unverified Commit 2dd114ed authored by Da Zheng, committed by GitHub

[Distributed] Measure peak memory size in graph partitioning (#3633)



* measure peak memory size.

* fix lint.
Co-authored-by: Ubuntu <ubuntu@ip-172-31-30-164.us-west-2.compute.internal>
parent d03138e2
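This commit instruments each stage of `partition_graph` (homogeneous conversion, METIS assignment, halo partitioning, saving) with a wall-clock timer and a readout of the process's peak memory. A minimal standalone sketch of that pattern, using the `get_peak_mem` helper the commit adds; the `log_step` wrapper is hypothetical, for illustration only:

```python
import time

from dgl.partition import get_peak_mem  # helper added by this commit

def log_step(name, step_fn, *args, **kwargs):
    """Run one partitioning step, then print its wall time and peak memory.
    `log_step` is not part of the commit; it just factors out the pattern."""
    start = time.time()
    result = step_fn(*args, **kwargs)
    print('{} takes {:.3f}s, peak mem: {:.3f} GB'.format(
        name, time.time() - start, get_peak_mem()))
    return result
```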
@@ -10,7 +10,7 @@ from ..base import NID, EID, NTYPE, ETYPE, dgl_warning
 from ..convert import to_homogeneous
 from ..random import choice as random_choice
 from ..data.utils import load_graphs, save_graphs, load_tensors, save_tensors
-from ..transform import metis_partition_assignment, partition_graph_with_halo
+from ..partition import metis_partition_assignment, partition_graph_with_halo, get_peak_mem
 from .graph_partition_book import BasicPartitionBook, RangePartitionBook

 def _get_inner_node_mask(graph, ntype_id):
@@ -537,17 +537,23 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
                     "For heterogeneous graphs, reshuffle must be enabled.")

     if num_parts == 1:
+        start = time.time()
         sim_g, balance_ntypes = get_homogeneous(g, balance_ntypes)
+        print('Converting to homogeneous graph takes {:.3f}s, peak mem: {:.3f} GB'.format(
+            time.time() - start, get_peak_mem()))
         assert num_trainers_per_machine >= 1
         if num_trainers_per_machine > 1:
             # First partition the whole graph to each trainer and save the trainer ids in
             # the node feature "trainer_id".
+            start = time.time()
             node_parts = metis_partition_assignment(
                 sim_g, num_parts * num_trainers_per_machine,
                 balance_ntypes=balance_ntypes,
                 balance_edges=balance_edges,
                 mode='k-way')
             _set_trainer_ids(g, sim_g, node_parts)
+            print('Assigning nodes to METIS partitions takes {:.3f}s, peak mem: {:.3f} GB'.format(
+                time.time() - start, get_peak_mem()))

         node_parts = F.zeros((sim_g.number_of_nodes(),), F.int64, F.cpu())
         parts = {0: sim_g.clone()}
@@ -563,9 +569,13 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
         parts[0].ndata['inner_node'] = F.ones((sim_g.number_of_nodes(),), F.int8, F.cpu())
         parts[0].edata['inner_edge'] = F.ones((sim_g.number_of_edges(),), F.int8, F.cpu())
     elif part_method in ('metis', 'random'):
+        start = time.time()
         sim_g, balance_ntypes = get_homogeneous(g, balance_ntypes)
+        print('Converting to homogeneous graph takes {:.3f}s, peak mem: {:.3f} GB'.format(
+            time.time() - start, get_peak_mem()))
         if part_method == 'metis':
             assert num_trainers_per_machine >= 1
+            start = time.time()
             if num_trainers_per_machine > 1:
                 # First partition the whole graph to each trainer and save the trainer ids in
                 # the node feature "trainer_id".
@@ -583,10 +593,15 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
                 node_parts = metis_partition_assignment(sim_g, num_parts,
                                                         balance_ntypes=balance_ntypes,
                                                         balance_edges=balance_edges)
+            print('Assigning nodes to METIS partitions takes {:.3f}s, peak mem: {:.3f} GB'.format(
+                time.time() - start, get_peak_mem()))
         else:
             node_parts = random_choice(num_parts, sim_g.number_of_nodes())

+        start = time.time()
         parts, orig_nids, orig_eids = partition_graph_with_halo(sim_g, node_parts, num_hops,
                                                                 reshuffle=reshuffle)
+        print('Splitting the graph into partitions takes {:.3f}s, peak mem: {:.3f} GB'.format(
+            time.time() - start, get_peak_mem()))
         if return_mapping:
             orig_nids, orig_eids = _get_orig_ids(g, sim_g, reshuffle, orig_nids, orig_eids)
         else:
@@ -833,10 +848,11 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
             save_tensors(edge_feat_file, edge_feats)

         save_graphs(part_graph_file, [part])
+    print('Save partitions: {:.3f} seconds, peak memory: {:.3f} GB'.format(
+        time.time() - start, get_peak_mem()))

     with open('{}/{}.json'.format(out_path, graph_name), 'w') as outfile:
         json.dump(part_metadata, outfile, sort_keys=True, indent=4)
-    print('Save partitions: {:.3f} seconds'.format(time.time() - start))

     num_cuts = sim_g.number_of_edges() - tot_num_inner_edges
     if num_parts == 1:
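The hunks below touch the graph-partition utilities module, which gains the `get_peak_mem` helper itself. It reads the `VmPeak` field of `/proc/self/status`, which Linux reports in kB, so two divisions by 1024 convert it to GB. A small illustration of that parsing (the sample line and value are made up):

```python
import re

# Illustrative /proc/self/status entry; the kernel reports VmPeak in kB.
line = 'VmPeak:\t12345678 kB'
mem_kb = int(re.findall(r'\d+', line)[0])
print('{:.3f} GB'.format(mem_kb / 1024 / 1024))  # prints 11.774 GB
```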
"""Module for graph partition utilities."""
import os
import re
import time
import numpy as np
@@ -230,6 +232,21 @@ def partition_graph_with_halo(g, node_part, extra_cached_hops, reshuffle=False):
     else:
         return subg_dict, None, None

+def get_peak_mem():
+    ''' Get the peak memory size.
+
+    Returns
+    -------
+    float
+        The peak memory size in GB.
+    '''
+    if not os.path.exists('/proc/self/status'):
+        return 0.0
+    for line in open('/proc/self/status', 'r'):
+        if 'VmPeak' in line:
+            mem = re.findall(r'\d+', line)[0]
+            return int(mem) / 1024 / 1024
+    return 0.0

 def metis_partition_assignment(g, k, balance_ntypes=None, balance_edges=False, mode="k-way"):
     ''' This assigns nodes to different partitions with Metis partitioning algorithm.
@@ -271,8 +288,8 @@ def metis_partition_assignment(g, k, balance_ntypes=None, balance_edges=False, m
     start = time.time()
     sym_gidx = _CAPI_DGLMakeSymmetric_Hetero(g._graph)
     sym_g = DGLHeteroGraph(gidx=sym_gidx)
-    print('Convert a graph into a bidirected graph: {:.3f} seconds'.format(
-        time.time() - start))
+    print('Convert a graph into a bidirected graph: {:.3f} seconds, peak memory: {:.3f} GB'.format(
+        time.time() - start, get_peak_mem()))
     vwgt = []
     # To balance the node types in each partition, we can take advantage of the vertex weights
     # in Metis. When vertex weights are provided, Metis will tries to generate partitions with
@@ -310,15 +327,16 @@ def metis_partition_assignment(g, k, balance_ntypes=None, balance_edges=False, m
         shape = (np.prod(F.shape(vwgt),),)
         vwgt = F.reshape(vwgt, shape)
         vwgt = F.to_dgl_nd(vwgt)
-        print(
-            'Construct multi-constraint weights: {:.3f} seconds'.format(time.time() - start))
     else:
         vwgt = F.zeros((0,), F.int64, F.cpu())
         vwgt = F.to_dgl_nd(vwgt)
+    print('Construct multi-constraint weights: {:.3f} seconds, peak memory: {:.3f} GB'.format(
+        time.time() - start, get_peak_mem()))

     start = time.time()
     node_part = _CAPI_DGLMetisPartition_Hetero(sym_g._graph, k, vwgt, mode)
-    print('Metis partitioning: {:.3f} seconds'.format(time.time() - start))
+    print('Metis partitioning: {:.3f} seconds, peak memory: {:.3f} GB'.format(
+        time.time() - start, get_peak_mem()))
     if len(node_part) == 0:
         return None
     else:
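With these changes, every major stage of a `partition_graph` run prints one timing/peak-memory line. A hedged usage sketch; the toy graph and the printed values are illustrative only:

```python
import dgl
from dgl.distributed import partition_graph

# Toy input purely for illustration.
g = dgl.rand_graph(1000, 5000)
partition_graph(g, graph_name='toy', num_parts=2,
                out_path='tmp/partitioned',
                part_method='metis', balance_edges=True)
# Console output now includes per-stage lines such as (values illustrative):
#   Converting to homogeneous graph takes 0.001s, peak mem: 0.532 GB
#   Metis partitioning: 0.015 seconds, peak memory: 0.540 GB
#   Save partitions: 0.021 seconds, peak memory: 0.551 GB
```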