Unverified Commit 2dd114ed authored by Da Zheng, committed by GitHub

[Distributed] Measure peak memory size in graph partitioning (#3633)



* measure peak memory size.

* fix lint.
Co-authored-by: Ubuntu <ubuntu@ip-172-31-30-164.us-west-2.compute.internal>
parent d03138e2
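This commit instruments each stage of `partition_graph` (homogeneous conversion, METIS assignment, halo partitioning, saving) with a wall-clock timer and a readout of the process's peak memory. A minimal standalone sketch of that pattern, using the `get_peak_mem` helper the commit adds; the `log_step` wrapper is hypothetical, for illustration only:

```python
import time

from dgl.partition import get_peak_mem  # helper added by this commit

def log_step(name, step_fn, *args, **kwargs):
    """Run one partitioning step, then print its wall time and peak memory.
    `log_step` is not part of the commit; it just factors out the pattern."""
    start = time.time()
    result = step_fn(*args, **kwargs)
    print('{} takes {:.3f}s, peak mem: {:.3f} GB'.format(
        name, time.time() - start, get_peak_mem()))
    return result
```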
@@ -10,7 +10,7 @@ from ..base import NID, EID, NTYPE, ETYPE, dgl_warning
 from ..convert import to_homogeneous
 from ..random import choice as random_choice
 from ..data.utils import load_graphs, save_graphs, load_tensors, save_tensors
-from ..transform import metis_partition_assignment, partition_graph_with_halo
+from ..partition import metis_partition_assignment, partition_graph_with_halo, get_peak_mem
 from .graph_partition_book import BasicPartitionBook, RangePartitionBook

 def _get_inner_node_mask(graph, ntype_id):
@@ -537,17 +537,23 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
                     "For heterogeneous graphs, reshuffle must be enabled.")

     if num_parts == 1:
+        start = time.time()
         sim_g, balance_ntypes = get_homogeneous(g, balance_ntypes)
+        print('Converting to homogeneous graph takes {:.3f}s, peak mem: {:.3f} GB'.format(
+            time.time() - start, get_peak_mem()))
         assert num_trainers_per_machine >= 1
         if num_trainers_per_machine > 1:
             # First partition the whole graph to each trainer and save the trainer ids in
             # the node feature "trainer_id".
+            start = time.time()
             node_parts = metis_partition_assignment(
                 sim_g, num_parts * num_trainers_per_machine,
                 balance_ntypes=balance_ntypes,
                 balance_edges=balance_edges,
                 mode='k-way')
             _set_trainer_ids(g, sim_g, node_parts)
+            print('Assigning nodes to METIS partitions takes {:.3f}s, peak mem: {:.3f} GB'.format(
+                time.time() - start, get_peak_mem()))

         node_parts = F.zeros((sim_g.number_of_nodes(),), F.int64, F.cpu())
         parts = {0: sim_g.clone()}
@@ -563,9 +569,13 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
         parts[0].ndata['inner_node'] = F.ones((sim_g.number_of_nodes(),), F.int8, F.cpu())
         parts[0].edata['inner_edge'] = F.ones((sim_g.number_of_edges(),), F.int8, F.cpu())
     elif part_method in ('metis', 'random'):
+        start = time.time()
         sim_g, balance_ntypes = get_homogeneous(g, balance_ntypes)
+        print('Converting to homogeneous graph takes {:.3f}s, peak mem: {:.3f} GB'.format(
+            time.time() - start, get_peak_mem()))
         if part_method == 'metis':
             assert num_trainers_per_machine >= 1
+            start = time.time()
             if num_trainers_per_machine > 1:
                 # First partition the whole graph to each trainer and save the trainer ids in
                 # the node feature "trainer_id".
@@ -583,10 +593,15 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
                 node_parts = metis_partition_assignment(sim_g, num_parts,
                                                         balance_ntypes=balance_ntypes,
                                                         balance_edges=balance_edges)
+            print('Assigning nodes to METIS partitions takes {:.3f}s, peak mem: {:.3f} GB'.format(
+                time.time() - start, get_peak_mem()))
         else:
             node_parts = random_choice(num_parts, sim_g.number_of_nodes())

+        start = time.time()
         parts, orig_nids, orig_eids = partition_graph_with_halo(sim_g, node_parts, num_hops,
                                                                 reshuffle=reshuffle)
+        print('Splitting the graph into partitions takes {:.3f}s, peak mem: {:.3f} GB'.format(
+            time.time() - start, get_peak_mem()))
         if return_mapping:
             orig_nids, orig_eids = _get_orig_ids(g, sim_g, reshuffle, orig_nids, orig_eids)
         else:
@@ -833,10 +848,11 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
             save_tensors(edge_feat_file, edge_feats)

         save_graphs(part_graph_file, [part])
+    print('Save partitions: {:.3f} seconds, peak memory: {:.3f} GB'.format(
+        time.time() - start, get_peak_mem()))

     with open('{}/{}.json'.format(out_path, graph_name), 'w') as outfile:
         json.dump(part_metadata, outfile, sort_keys=True, indent=4)
-    print('Save partitions: {:.3f} seconds'.format(time.time() - start))

     num_cuts = sim_g.number_of_edges() - tot_num_inner_edges
     if num_parts == 1:
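The hunks below touch the graph-partition utilities module, which gains the `get_peak_mem` helper itself. It reads the `VmPeak` field of `/proc/self/status`, which Linux reports in kB, so two divisions by 1024 convert it to GB. A small illustration of that parsing (the sample line and value are made up):

```python
import re

# Illustrative /proc/self/status entry; the kernel reports VmPeak in kB.
line = 'VmPeak:\t12345678 kB'
mem_kb = int(re.findall(r'\d+', line)[0])
print('{:.3f} GB'.format(mem_kb / 1024 / 1024))  # prints 11.774 GB
```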
"""Module for graph partition utilities."""
import os
import re
import time
import numpy as np
@@ -230,6 +232,21 @@ def partition_graph_with_halo(g, node_part, extra_cached_hops, reshuffle=False):
     else:
         return subg_dict, None, None

+def get_peak_mem():
+    ''' Get the peak memory size.
+
+    Returns
+    -------
+    float
+        The peak memory size in GB.
+    '''
+    if not os.path.exists('/proc/self/status'):
+        return 0.0
+    for line in open('/proc/self/status', 'r'):
+        if 'VmPeak' in line:
+            mem = re.findall(r'\d+', line)[0]
+            return int(mem) / 1024 / 1024
+    return 0.0

 def metis_partition_assignment(g, k, balance_ntypes=None, balance_edges=False, mode="k-way"):
     ''' This assigns nodes to different partitions with Metis partitioning algorithm.
@@ -271,8 +288,8 @@ def metis_partition_assignment(g, k, balance_ntypes=None, balance_edges=False, m
     start = time.time()
     sym_gidx = _CAPI_DGLMakeSymmetric_Hetero(g._graph)
     sym_g = DGLHeteroGraph(gidx=sym_gidx)
-    print('Convert a graph into a bidirected graph: {:.3f} seconds'.format(
-        time.time() - start))
+    print('Convert a graph into a bidirected graph: {:.3f} seconds, peak memory: {:.3f} GB'.format(
+        time.time() - start, get_peak_mem()))
     vwgt = []
     # To balance the node types in each partition, we can take advantage of the vertex weights
     # in Metis. When vertex weights are provided, Metis will tries to generate partitions with
@@ -310,15 +327,16 @@ def metis_partition_assignment(g, k, balance_ntypes=None, balance_edges=False, m
         shape = (np.prod(F.shape(vwgt),),)
         vwgt = F.reshape(vwgt, shape)
         vwgt = F.to_dgl_nd(vwgt)
-        print(
-            'Construct multi-constraint weights: {:.3f} seconds'.format(time.time() - start))
     else:
         vwgt = F.zeros((0,), F.int64, F.cpu())
         vwgt = F.to_dgl_nd(vwgt)
+    print('Construct multi-constraint weights: {:.3f} seconds, peak memory: {:.3f} GB'.format(
+        time.time() - start, get_peak_mem()))

     start = time.time()
     node_part = _CAPI_DGLMetisPartition_Hetero(sym_g._graph, k, vwgt, mode)
-    print('Metis partitioning: {:.3f} seconds'.format(time.time() - start))
+    print('Metis partitioning: {:.3f} seconds, peak memory: {:.3f} GB'.format(
+        time.time() - start, get_peak_mem()))
     if len(node_part) == 0:
         return None
     else:
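With these changes, every major stage of a `partition_graph` run prints one timing/peak-memory line. A hedged usage sketch; the toy graph and the printed values are illustrative only:

```python
import dgl
from dgl.distributed import partition_graph

# Toy input purely for illustration.
g = dgl.rand_graph(1000, 5000)
partition_graph(g, graph_name='toy', num_parts=2,
                out_path='tmp/partitioned',
                part_method='metis', balance_edges=True)
# Console output now includes per-stage lines such as (values illustrative):
#   Converting to homogeneous graph takes 0.001s, peak mem: 0.532 GB
#   Metis partitioning: 0.015 seconds, peak memory: 0.540 GB
#   Save partitions: 0.021 seconds, peak memory: 0.551 GB
```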