Unverified Commit 1990e797 authored by Rhett Ying's avatar Rhett Ying Committed by GitHub
Browse files

[Dist] Reduce startup overhead: sort etypes and save in specified formats (#4735)

* [Dist] reduce startup overhead: enable to save in specified formats

* [Dist] reduce startup overhead: sort partitions when generating

* sort csc/csr only when multiple etypes

* refine
parent e682fa74
......@@ -10,7 +10,7 @@ import numpy as np
from ..heterograph import DGLHeteroGraph
from ..convert import heterograph as dgl_heterograph
from ..convert import graph as dgl_graph
from ..transforms import compact_graphs, sort_csr_by_tag, sort_csc_by_tag
from ..transforms import compact_graphs
from .. import heterograph_index
from .. import backend as F
from ..base import NID, EID, ETYPE, ALL, is_all
......@@ -345,14 +345,6 @@ class DistGraphServer(KVServer):
# Create the graph formats specified the users.
self.client_g = self.client_g.formats(graph_format)
self.client_g.create_formats_()
# Sort underlying matrix beforehand to avoid runtime overhead during sampling.
if len(etypes) > 1:
if 'csr' in graph_format:
self.client_g = sort_csr_by_tag(
self.client_g, tag=self.client_g.edata[ETYPE], tag_type='edge')
if 'csc' in graph_format:
self.client_g = sort_csc_by_tag(
self.client_g, tag=self.client_g.edata[ETYPE], tag_type='edge')
if not disable_shared_mem:
self.client_g = _copy_graph_to_shared_mem(self.client_g, graph_name, graph_format)
......
......@@ -9,6 +9,7 @@ from .. import backend as F
from ..base import NID, EID, NTYPE, ETYPE, dgl_warning
from ..convert import to_homogeneous
from ..random import choice as random_choice
from ..transforms import sort_csr_by_tag, sort_csc_by_tag
from ..data.utils import load_graphs, save_graphs, load_tensors, save_tensors
from ..partition import metis_partition_assignment, partition_graph_with_halo, get_peak_mem
from .graph_partition_book import BasicPartitionBook, RangePartitionBook
......@@ -23,8 +24,10 @@ RESERVED_FIELD_DTYPE = {
ETYPE: F.int32
}
def _save_graphs(filename, g_list):
'''Format data types in graphs before saving
def _save_graphs(filename, g_list, formats=None, sort_etypes=False):
'''Preprocess partitions before saving:
1. format data types.
2. sort csc/csr by tag.
'''
for g in g_list:
for k, dtype in RESERVED_FIELD_DTYPE.items():
......@@ -32,7 +35,14 @@ def _save_graphs(filename, g_list):
g.ndata[k] = F.astype(g.ndata[k], dtype)
if k in g.edata:
g.edata[k] = F.astype(g.edata[k], dtype)
save_graphs(filename , g_list)
for g in g_list:
if (not sort_etypes) or (formats is None):
continue
if 'csr' in formats:
g = sort_csr_by_tag(g, tag=g.edata[ETYPE], tag_type='edge')
if 'csc' in formats:
g = sort_csc_by_tag(g, tag=g.edata[ETYPE], tag_type='edge')
save_graphs(filename , g_list, formats=formats)
def _get_inner_node_mask(graph, ntype_id):
if NTYPE in graph.ndata:
......@@ -368,7 +378,8 @@ def _set_trainer_ids(g, sim_g, node_parts):
def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method="metis",
reshuffle=True, balance_ntypes=None, balance_edges=False, return_mapping=False,
num_trainers_per_machine=1, objtype='cut'):
num_trainers_per_machine=1, objtype='cut',
graph_formats=None):
''' Partition a graph for distributed training and store the partitions on files.
The partitioning occurs in three steps: 1) run a partition algorithm (e.g., Metis) to
......@@ -549,6 +560,11 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
objtype : str, "cut" or "vol"
Set the objective as edge-cut minimization or communication volume minimization. This
argument is used by the Metis algorithm.
graph_formats : str or list[str]
Save partitions in specified formats. It could be any combination of ``coo``,
``csc`` and ``csr``. If not specified, save one format only according to what
format is available. If multiple formats are available, selection priority
from high to low is ``coo``, ``csc``, ``csr``.
Returns
-------
......@@ -573,6 +589,9 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
... g, node_feats, edge_feats, gpb, graph_name, ntypes_list, etypes_list,
... ) = dgl.distributed.load_partition('output/test.json', 0)
'''
# 'coo' is required for partition
assert 'coo' in np.concatenate(list(g.formats().values())), \
"'coo' format should be allowed for partitioning graph."
def get_homogeneous(g, balance_ntypes):
if g.is_homogeneous:
sim_g = to_homogeneous(g)
......@@ -930,7 +949,9 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
save_tensors(node_feat_file, node_feats)
save_tensors(edge_feat_file, edge_feats)
_save_graphs(part_graph_file, [part])
sort_etypes = len(g.etypes) > 1
_save_graphs(part_graph_file, [part], formats=graph_formats,
sort_etypes=sort_etypes)
print('Save partitions: {:.3f} seconds, peak memory: {:.3f} GB'.format(
time.time() - start, get_peak_mem()))
......
......@@ -447,7 +447,7 @@ def check_rpc_hetero_sampling_empty_shuffle(tmpdir, num_server):
assert block.number_of_edges() == 0
assert len(block.etypes) == len(g.etypes)
def check_rpc_hetero_etype_sampling_shuffle(tmpdir, num_server, etype_sorted=False):
def check_rpc_hetero_etype_sampling_shuffle(tmpdir, num_server, graph_formats=None):
generate_ip_config("rpc_ip_config.txt", num_server, num_server)
g = create_random_hetero(dense=True)
......@@ -455,7 +455,8 @@ def check_rpc_hetero_etype_sampling_shuffle(tmpdir, num_server, etype_sorted=Fal
num_hops = 1
orig_nid_map, orig_eid_map = partition_graph(g, 'test_sampling', num_parts, tmpdir,
num_hops=num_hops, part_method='metis', reshuffle=True, return_mapping=True)
num_hops=num_hops, part_method='metis', reshuffle=True, return_mapping=True,
graph_formats=graph_formats)
pserver_list = []
ctx = mp.get_context('spawn')
......@@ -466,6 +467,9 @@ def check_rpc_hetero_etype_sampling_shuffle(tmpdir, num_server, etype_sorted=Fal
pserver_list.append(p)
fanout = 3
etype_sorted = False
if graph_formats is not None:
etype_sorted = 'csc' in graph_formats or 'csr' in graph_formats
block, gpb = start_hetero_etype_sample_client(0, tmpdir, num_server > 1, fanout,
nodes={'n3': [0, 10, 99, 66, 124, 208]},
etype_sorted=etype_sorted)
......@@ -768,7 +772,9 @@ def test_rpc_sampling_shuffle(num_server):
check_rpc_hetero_sampling_shuffle(Path(tmpdirname), num_server)
check_rpc_hetero_sampling_empty_shuffle(Path(tmpdirname), num_server)
check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server)
check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server, etype_sorted=True)
check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server, ['csc'])
check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server, ['csr'])
check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server, ['csc', 'coo'])
check_rpc_hetero_etype_sampling_empty_shuffle(Path(tmpdirname), num_server)
check_rpc_bipartite_sampling_empty(Path(tmpdirname), num_server)
check_rpc_bipartite_sampling_shuffle(Path(tmpdirname), num_server)
......
This diff is collapsed.
......@@ -26,6 +26,15 @@ def _verify_partition_data_types(part_g):
if k in part_g.edata:
assert part_g.edata[k].dtype == dtype
def _verify_partition_formats(part_g, formats):
# Verify saved graph formats
if formats is None:
assert "coo" in part_g.formats()["created"]
else:
formats = formats.split(',')
for format in formats:
assert format in part_g.formats()["created"]
def _verify_graph_feats(
g, gpb, part, node_feats, edge_feats, orig_nids, orig_eids
......@@ -139,9 +148,12 @@ def test_chunk_graph(num_chunks):
assert feat_array.shape[0] == num_edges[etype] // num_chunks
@pytest.mark.parametrize("num_chunks", [1, 2, 3, 4, 8])
@pytest.mark.parametrize("num_parts", [1, 2, 3, 4, 8])
def test_part_pipeline(num_chunks, num_parts):
@pytest.mark.parametrize("num_chunks", [1, 3, 8])
@pytest.mark.parametrize("num_parts", [1, 3, 8])
@pytest.mark.parametrize(
"graph_formats", [None, "csc", "coo,csc", "coo,csc,csr"]
)
def test_part_pipeline(num_chunks, num_parts, graph_formats):
if num_chunks < num_parts:
# num_parts should less/equal than num_chunks
return
......@@ -182,6 +194,7 @@ def test_part_pipeline(num_chunks, num_parts):
cmd += " --process-group-timeout 60"
cmd += " --save-orig-nids"
cmd += " --save-orig-eids"
cmd += f" --graph-formats {graph_formats}" if graph_formats else ""
os.system(cmd)
# read original node/edge IDs
......@@ -207,6 +220,7 @@ def test_part_pipeline(num_chunks, num_parts):
part_config, i
)
_verify_partition_data_types(part_g)
_verify_partition_formats(part_g, graph_formats)
_verify_graph_feats(
g, gpb, part_g, node_feats, edge_feats, orig_nids, orig_eids
)
......@@ -76,6 +76,7 @@ def submit_jobs(args) -> str:
argslist += "--log-level {} ".format(args.log_level)
argslist += "--save-orig-nids " if args.save_orig_nids else ""
argslist += "--save-orig-eids " if args.save_orig_eids else ""
argslist += f"--graph-formats {args.graph_formats} " if args.graph_formats else ""
# (BarclayII) Is it safe to assume all the workers have the Python executable at the same path?
pipeline_cmd = os.path.join(INSTALL_DIR, PIPELINE_SCRIPT)
......@@ -149,6 +150,15 @@ def main():
action="store_true",
help="Save original edge IDs into files",
)
parser.add_argument(
"--graph-formats",
type=str,
default=None,
help="Save partitions in specified formats. It could be any combination(joined with ``,``) "
"of ``coo``, ``csc`` and ``csr``. If not specified, save one format only according to "
"what format is available. If multiple formats are available, selection priority "
"from high to low is ``coo``, ``csc``, ``csr``.",
)
args, udf_command = parser.parse_known_args()
......
......@@ -58,6 +58,8 @@ if __name__ == "__main__":
help='Save original node IDs into files')
parser.add_argument('--save-orig-eids', action='store_true',
help='Save original edge IDs into files')
parser.add_argument('--graph-formats', default=None, type=str,
help='Save partitions in specified formats.')
params = parser.parse_args()
#invoke the pipeline function
......
......@@ -697,8 +697,12 @@ def gen_dist_partitions(rank, world_size, params):
orig_nids, orig_eids = create_dgl_object(schema_map, rank, node_data, \
edge_data, num_edges, params.save_orig_nids, params.save_orig_eids)
memory_snapshot("CreateDGLObjectsComplete: ", rank)
graph_formats = None
if params.graph_formats:
graph_formats = params.graph_formats.split(',')
sort_etypes = len(etypes_map) > 1
write_dgl_objects(graph_obj, rcvd_node_features, rcvd_edge_features, params.output, \
rank, orig_nids, orig_eids)
rank, orig_nids, orig_eids, graph_formats, sort_etypes)
memory_snapshot("DiskWriteDGLObjectsComplete: ", rank)
#get the meta-data
......
......@@ -379,7 +379,7 @@ def write_edge_features(edge_features, edge_file):
"""
dgl.data.utils.save_tensors(edge_file, edge_features)
def write_graph_dgl(graph_file, graph_obj):
def write_graph_dgl(graph_file, graph_obj, formats, sort_etypes):
"""
Utility function to serialize graph dgl objects
......@@ -389,11 +389,16 @@ def write_graph_dgl(graph_file, graph_obj):
graph dgl object, as created in convert_partition.py, which is to be serialized
graph_file : string
File name in which graph object is serialized
formats : str or list[str]
Save graph in specified formats.
sort_etypes : bool
Whether to sort etypes in csc/csr.
"""
dgl.distributed.partition._save_graphs(graph_file, [graph_obj])
dgl.distributed.partition._save_graphs(graph_file, [graph_obj],
formats, sort_etypes)
def write_dgl_objects(graph_obj, node_features, edge_features,
output_dir, part_id, orig_nids, orig_eids):
output_dir, part_id, orig_nids, orig_eids, formats, sort_etypes):
"""
Wrapper function to write graph, node/edge feature, original node/edge IDs.
......@@ -413,11 +418,15 @@ def write_dgl_objects(graph_obj, node_features, edge_features,
original node IDs
orig_eids : dict
original edge IDs
formats : str or list[str]
Save graph in formats.
sort_etypes : bool
Whether to sort etypes in csc/csr.
"""
part_dir = output_dir + '/part' + str(part_id)
os.makedirs(part_dir, exist_ok=True)
write_graph_dgl(os.path.join(part_dir ,'graph.dgl'), graph_obj)
write_graph_dgl(os.path.join(part_dir ,'graph.dgl'), graph_obj,
formats, sort_etypes)
if node_features != None:
write_node_features(node_features, os.path.join(part_dir, "node_feat.dgl"))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment