Unverified Commit 1990e797 authored by Rhett Ying's avatar Rhett Ying Committed by GitHub
Browse files

[Dist] Reduce startup overhead: sort etypes and save in specified formats (#4735)

* [Dist] reduce startup overhead: enable to save in specified formats

* [Dist] reduce startup overhead: sort partitions when generating

* sort csc/csr only whenmultiple etypes

* refine
parent e682fa74
...@@ -10,7 +10,7 @@ import numpy as np ...@@ -10,7 +10,7 @@ import numpy as np
from ..heterograph import DGLHeteroGraph from ..heterograph import DGLHeteroGraph
from ..convert import heterograph as dgl_heterograph from ..convert import heterograph as dgl_heterograph
from ..convert import graph as dgl_graph from ..convert import graph as dgl_graph
from ..transforms import compact_graphs, sort_csr_by_tag, sort_csc_by_tag from ..transforms import compact_graphs
from .. import heterograph_index from .. import heterograph_index
from .. import backend as F from .. import backend as F
from ..base import NID, EID, ETYPE, ALL, is_all from ..base import NID, EID, ETYPE, ALL, is_all
...@@ -345,14 +345,6 @@ class DistGraphServer(KVServer): ...@@ -345,14 +345,6 @@ class DistGraphServer(KVServer):
# Create the graph formats specified the users. # Create the graph formats specified the users.
self.client_g = self.client_g.formats(graph_format) self.client_g = self.client_g.formats(graph_format)
self.client_g.create_formats_() self.client_g.create_formats_()
# Sort underlying matrix beforehand to avoid runtime overhead during sampling.
if len(etypes) > 1:
if 'csr' in graph_format:
self.client_g = sort_csr_by_tag(
self.client_g, tag=self.client_g.edata[ETYPE], tag_type='edge')
if 'csc' in graph_format:
self.client_g = sort_csc_by_tag(
self.client_g, tag=self.client_g.edata[ETYPE], tag_type='edge')
if not disable_shared_mem: if not disable_shared_mem:
self.client_g = _copy_graph_to_shared_mem(self.client_g, graph_name, graph_format) self.client_g = _copy_graph_to_shared_mem(self.client_g, graph_name, graph_format)
......
...@@ -9,6 +9,7 @@ from .. import backend as F ...@@ -9,6 +9,7 @@ from .. import backend as F
from ..base import NID, EID, NTYPE, ETYPE, dgl_warning from ..base import NID, EID, NTYPE, ETYPE, dgl_warning
from ..convert import to_homogeneous from ..convert import to_homogeneous
from ..random import choice as random_choice from ..random import choice as random_choice
from ..transforms import sort_csr_by_tag, sort_csc_by_tag
from ..data.utils import load_graphs, save_graphs, load_tensors, save_tensors from ..data.utils import load_graphs, save_graphs, load_tensors, save_tensors
from ..partition import metis_partition_assignment, partition_graph_with_halo, get_peak_mem from ..partition import metis_partition_assignment, partition_graph_with_halo, get_peak_mem
from .graph_partition_book import BasicPartitionBook, RangePartitionBook from .graph_partition_book import BasicPartitionBook, RangePartitionBook
...@@ -23,8 +24,10 @@ RESERVED_FIELD_DTYPE = { ...@@ -23,8 +24,10 @@ RESERVED_FIELD_DTYPE = {
ETYPE: F.int32 ETYPE: F.int32
} }
def _save_graphs(filename, g_list): def _save_graphs(filename, g_list, formats=None, sort_etypes=False):
'''Format data types in graphs before saving '''Preprocess partitions before saving:
1. format data types.
2. sort csc/csr by tag.
''' '''
for g in g_list: for g in g_list:
for k, dtype in RESERVED_FIELD_DTYPE.items(): for k, dtype in RESERVED_FIELD_DTYPE.items():
...@@ -32,7 +35,14 @@ def _save_graphs(filename, g_list): ...@@ -32,7 +35,14 @@ def _save_graphs(filename, g_list):
g.ndata[k] = F.astype(g.ndata[k], dtype) g.ndata[k] = F.astype(g.ndata[k], dtype)
if k in g.edata: if k in g.edata:
g.edata[k] = F.astype(g.edata[k], dtype) g.edata[k] = F.astype(g.edata[k], dtype)
save_graphs(filename , g_list) for g in g_list:
if (not sort_etypes) or (formats is None):
continue
if 'csr' in formats:
g = sort_csr_by_tag(g, tag=g.edata[ETYPE], tag_type='edge')
if 'csc' in formats:
g = sort_csc_by_tag(g, tag=g.edata[ETYPE], tag_type='edge')
save_graphs(filename , g_list, formats=formats)
def _get_inner_node_mask(graph, ntype_id): def _get_inner_node_mask(graph, ntype_id):
if NTYPE in graph.ndata: if NTYPE in graph.ndata:
...@@ -368,7 +378,8 @@ def _set_trainer_ids(g, sim_g, node_parts): ...@@ -368,7 +378,8 @@ def _set_trainer_ids(g, sim_g, node_parts):
def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method="metis", def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method="metis",
reshuffle=True, balance_ntypes=None, balance_edges=False, return_mapping=False, reshuffle=True, balance_ntypes=None, balance_edges=False, return_mapping=False,
num_trainers_per_machine=1, objtype='cut'): num_trainers_per_machine=1, objtype='cut',
graph_formats=None):
''' Partition a graph for distributed training and store the partitions on files. ''' Partition a graph for distributed training and store the partitions on files.
The partitioning occurs in three steps: 1) run a partition algorithm (e.g., Metis) to The partitioning occurs in three steps: 1) run a partition algorithm (e.g., Metis) to
...@@ -549,6 +560,11 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method= ...@@ -549,6 +560,11 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
objtype : str, "cut" or "vol" objtype : str, "cut" or "vol"
Set the objective as edge-cut minimization or communication volume minimization. This Set the objective as edge-cut minimization or communication volume minimization. This
argument is used by the Metis algorithm. argument is used by the Metis algorithm.
graph_formats : str or list[str]
Save partitions in specified formats. It could be any combination of ``coo``,
``csc`` and ``csr``. If not specified, save one format only according to what
format is available. If multiple formats are available, selection priority
from high to low is ``coo``, ``csc``, ``csr``.
Returns Returns
------- -------
...@@ -573,6 +589,9 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method= ...@@ -573,6 +589,9 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
... g, node_feats, edge_feats, gpb, graph_name, ntypes_list, etypes_list, ... g, node_feats, edge_feats, gpb, graph_name, ntypes_list, etypes_list,
... ) = dgl.distributed.load_partition('output/test.json', 0) ... ) = dgl.distributed.load_partition('output/test.json', 0)
''' '''
# 'coo' is required for partition
assert 'coo' in np.concatenate(list(g.formats().values())), \
"'coo' format should be allowed for partitioning graph."
def get_homogeneous(g, balance_ntypes): def get_homogeneous(g, balance_ntypes):
if g.is_homogeneous: if g.is_homogeneous:
sim_g = to_homogeneous(g) sim_g = to_homogeneous(g)
...@@ -930,7 +949,9 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method= ...@@ -930,7 +949,9 @@ def partition_graph(g, graph_name, num_parts, out_path, num_hops=1, part_method=
save_tensors(node_feat_file, node_feats) save_tensors(node_feat_file, node_feats)
save_tensors(edge_feat_file, edge_feats) save_tensors(edge_feat_file, edge_feats)
_save_graphs(part_graph_file, [part]) sort_etypes = len(g.etypes) > 1
_save_graphs(part_graph_file, [part], formats=graph_formats,
sort_etypes=sort_etypes)
print('Save partitions: {:.3f} seconds, peak memory: {:.3f} GB'.format( print('Save partitions: {:.3f} seconds, peak memory: {:.3f} GB'.format(
time.time() - start, get_peak_mem())) time.time() - start, get_peak_mem()))
......
...@@ -447,7 +447,7 @@ def check_rpc_hetero_sampling_empty_shuffle(tmpdir, num_server): ...@@ -447,7 +447,7 @@ def check_rpc_hetero_sampling_empty_shuffle(tmpdir, num_server):
assert block.number_of_edges() == 0 assert block.number_of_edges() == 0
assert len(block.etypes) == len(g.etypes) assert len(block.etypes) == len(g.etypes)
def check_rpc_hetero_etype_sampling_shuffle(tmpdir, num_server, etype_sorted=False): def check_rpc_hetero_etype_sampling_shuffle(tmpdir, num_server, graph_formats=None):
generate_ip_config("rpc_ip_config.txt", num_server, num_server) generate_ip_config("rpc_ip_config.txt", num_server, num_server)
g = create_random_hetero(dense=True) g = create_random_hetero(dense=True)
...@@ -455,7 +455,8 @@ def check_rpc_hetero_etype_sampling_shuffle(tmpdir, num_server, etype_sorted=Fal ...@@ -455,7 +455,8 @@ def check_rpc_hetero_etype_sampling_shuffle(tmpdir, num_server, etype_sorted=Fal
num_hops = 1 num_hops = 1
orig_nid_map, orig_eid_map = partition_graph(g, 'test_sampling', num_parts, tmpdir, orig_nid_map, orig_eid_map = partition_graph(g, 'test_sampling', num_parts, tmpdir,
num_hops=num_hops, part_method='metis', reshuffle=True, return_mapping=True) num_hops=num_hops, part_method='metis', reshuffle=True, return_mapping=True,
graph_formats=graph_formats)
pserver_list = [] pserver_list = []
ctx = mp.get_context('spawn') ctx = mp.get_context('spawn')
...@@ -466,6 +467,9 @@ def check_rpc_hetero_etype_sampling_shuffle(tmpdir, num_server, etype_sorted=Fal ...@@ -466,6 +467,9 @@ def check_rpc_hetero_etype_sampling_shuffle(tmpdir, num_server, etype_sorted=Fal
pserver_list.append(p) pserver_list.append(p)
fanout = 3 fanout = 3
etype_sorted = False
if graph_formats is not None:
etype_sorted = 'csc' in graph_formats or 'csr' in graph_formats
block, gpb = start_hetero_etype_sample_client(0, tmpdir, num_server > 1, fanout, block, gpb = start_hetero_etype_sample_client(0, tmpdir, num_server > 1, fanout,
nodes={'n3': [0, 10, 99, 66, 124, 208]}, nodes={'n3': [0, 10, 99, 66, 124, 208]},
etype_sorted=etype_sorted) etype_sorted=etype_sorted)
...@@ -768,7 +772,9 @@ def test_rpc_sampling_shuffle(num_server): ...@@ -768,7 +772,9 @@ def test_rpc_sampling_shuffle(num_server):
check_rpc_hetero_sampling_shuffle(Path(tmpdirname), num_server) check_rpc_hetero_sampling_shuffle(Path(tmpdirname), num_server)
check_rpc_hetero_sampling_empty_shuffle(Path(tmpdirname), num_server) check_rpc_hetero_sampling_empty_shuffle(Path(tmpdirname), num_server)
check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server) check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server)
check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server, etype_sorted=True) check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server, ['csc'])
check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server, ['csr'])
check_rpc_hetero_etype_sampling_shuffle(Path(tmpdirname), num_server, ['csc', 'coo'])
check_rpc_hetero_etype_sampling_empty_shuffle(Path(tmpdirname), num_server) check_rpc_hetero_etype_sampling_empty_shuffle(Path(tmpdirname), num_server)
check_rpc_bipartite_sampling_empty(Path(tmpdirname), num_server) check_rpc_bipartite_sampling_empty(Path(tmpdirname), num_server)
check_rpc_bipartite_sampling_shuffle(Path(tmpdirname), num_server) check_rpc_bipartite_sampling_shuffle(Path(tmpdirname), num_server)
......
This diff is collapsed.
...@@ -26,6 +26,15 @@ def _verify_partition_data_types(part_g): ...@@ -26,6 +26,15 @@ def _verify_partition_data_types(part_g):
if k in part_g.edata: if k in part_g.edata:
assert part_g.edata[k].dtype == dtype assert part_g.edata[k].dtype == dtype
def _verify_partition_formats(part_g, formats):
# Verify saved graph formats
if formats is None:
assert "coo" in part_g.formats()["created"]
else:
formats = formats.split(',')
for format in formats:
assert format in part_g.formats()["created"]
def _verify_graph_feats( def _verify_graph_feats(
g, gpb, part, node_feats, edge_feats, orig_nids, orig_eids g, gpb, part, node_feats, edge_feats, orig_nids, orig_eids
...@@ -139,9 +148,12 @@ def test_chunk_graph(num_chunks): ...@@ -139,9 +148,12 @@ def test_chunk_graph(num_chunks):
assert feat_array.shape[0] == num_edges[etype] // num_chunks assert feat_array.shape[0] == num_edges[etype] // num_chunks
@pytest.mark.parametrize("num_chunks", [1, 2, 3, 4, 8]) @pytest.mark.parametrize("num_chunks", [1, 3, 8])
@pytest.mark.parametrize("num_parts", [1, 2, 3, 4, 8]) @pytest.mark.parametrize("num_parts", [1, 3, 8])
def test_part_pipeline(num_chunks, num_parts): @pytest.mark.parametrize(
"graph_formats", [None, "csc", "coo,csc", "coo,csc,csr"]
)
def test_part_pipeline(num_chunks, num_parts, graph_formats):
if num_chunks < num_parts: if num_chunks < num_parts:
# num_parts should less/equal than num_chunks # num_parts should less/equal than num_chunks
return return
...@@ -182,6 +194,7 @@ def test_part_pipeline(num_chunks, num_parts): ...@@ -182,6 +194,7 @@ def test_part_pipeline(num_chunks, num_parts):
cmd += " --process-group-timeout 60" cmd += " --process-group-timeout 60"
cmd += " --save-orig-nids" cmd += " --save-orig-nids"
cmd += " --save-orig-eids" cmd += " --save-orig-eids"
cmd += f" --graph-formats {graph_formats}" if graph_formats else ""
os.system(cmd) os.system(cmd)
# read original node/edge IDs # read original node/edge IDs
...@@ -207,6 +220,7 @@ def test_part_pipeline(num_chunks, num_parts): ...@@ -207,6 +220,7 @@ def test_part_pipeline(num_chunks, num_parts):
part_config, i part_config, i
) )
_verify_partition_data_types(part_g) _verify_partition_data_types(part_g)
_verify_partition_formats(part_g, graph_formats)
_verify_graph_feats( _verify_graph_feats(
g, gpb, part_g, node_feats, edge_feats, orig_nids, orig_eids g, gpb, part_g, node_feats, edge_feats, orig_nids, orig_eids
) )
...@@ -76,6 +76,7 @@ def submit_jobs(args) -> str: ...@@ -76,6 +76,7 @@ def submit_jobs(args) -> str:
argslist += "--log-level {} ".format(args.log_level) argslist += "--log-level {} ".format(args.log_level)
argslist += "--save-orig-nids " if args.save_orig_nids else "" argslist += "--save-orig-nids " if args.save_orig_nids else ""
argslist += "--save-orig-eids " if args.save_orig_eids else "" argslist += "--save-orig-eids " if args.save_orig_eids else ""
argslist += f"--graph-formats {args.graph_formats} " if args.graph_formats else ""
# (BarclayII) Is it safe to assume all the workers have the Python executable at the same path? # (BarclayII) Is it safe to assume all the workers have the Python executable at the same path?
pipeline_cmd = os.path.join(INSTALL_DIR, PIPELINE_SCRIPT) pipeline_cmd = os.path.join(INSTALL_DIR, PIPELINE_SCRIPT)
...@@ -149,6 +150,15 @@ def main(): ...@@ -149,6 +150,15 @@ def main():
action="store_true", action="store_true",
help="Save original edge IDs into files", help="Save original edge IDs into files",
) )
parser.add_argument(
"--graph-formats",
type=str,
default=None,
help="Save partitions in specified formats. It could be any combination(joined with ``,``) "
"of ``coo``, ``csc`` and ``csr``. If not specified, save one format only according to "
"what format is available. If multiple formats are available, selection priority "
"from high to low is ``coo``, ``csc``, ``csr``.",
)
args, udf_command = parser.parse_known_args() args, udf_command = parser.parse_known_args()
......
...@@ -58,6 +58,8 @@ if __name__ == "__main__": ...@@ -58,6 +58,8 @@ if __name__ == "__main__":
help='Save original node IDs into files') help='Save original node IDs into files')
parser.add_argument('--save-orig-eids', action='store_true', parser.add_argument('--save-orig-eids', action='store_true',
help='Save original edge IDs into files') help='Save original edge IDs into files')
parser.add_argument('--graph-formats', default=None, type=str,
help='Save partitions in specified formats.')
params = parser.parse_args() params = parser.parse_args()
#invoke the pipeline function #invoke the pipeline function
......
...@@ -697,8 +697,12 @@ def gen_dist_partitions(rank, world_size, params): ...@@ -697,8 +697,12 @@ def gen_dist_partitions(rank, world_size, params):
orig_nids, orig_eids = create_dgl_object(schema_map, rank, node_data, \ orig_nids, orig_eids = create_dgl_object(schema_map, rank, node_data, \
edge_data, num_edges, params.save_orig_nids, params.save_orig_eids) edge_data, num_edges, params.save_orig_nids, params.save_orig_eids)
memory_snapshot("CreateDGLObjectsComplete: ", rank) memory_snapshot("CreateDGLObjectsComplete: ", rank)
graph_formats = None
if params.graph_formats:
graph_formats = params.graph_formats.split(',')
sort_etypes = len(etypes_map) > 1
write_dgl_objects(graph_obj, rcvd_node_features, rcvd_edge_features, params.output, \ write_dgl_objects(graph_obj, rcvd_node_features, rcvd_edge_features, params.output, \
rank, orig_nids, orig_eids) rank, orig_nids, orig_eids, graph_formats, sort_etypes)
memory_snapshot("DiskWriteDGLObjectsComplete: ", rank) memory_snapshot("DiskWriteDGLObjectsComplete: ", rank)
#get the meta-data #get the meta-data
......
...@@ -379,7 +379,7 @@ def write_edge_features(edge_features, edge_file): ...@@ -379,7 +379,7 @@ def write_edge_features(edge_features, edge_file):
""" """
dgl.data.utils.save_tensors(edge_file, edge_features) dgl.data.utils.save_tensors(edge_file, edge_features)
def write_graph_dgl(graph_file, graph_obj): def write_graph_dgl(graph_file, graph_obj, formats, sort_etypes):
""" """
Utility function to serialize graph dgl objects Utility function to serialize graph dgl objects
...@@ -389,11 +389,16 @@ def write_graph_dgl(graph_file, graph_obj): ...@@ -389,11 +389,16 @@ def write_graph_dgl(graph_file, graph_obj):
graph dgl object, as created in convert_partition.py, which is to be serialized graph dgl object, as created in convert_partition.py, which is to be serialized
graph_file : string graph_file : string
File name in which graph object is serialized File name in which graph object is serialized
formats : str or list[str]
Save graph in specified formats.
sort_etypes : bool
Whether to sort etypes in csc/csr.
""" """
dgl.distributed.partition._save_graphs(graph_file, [graph_obj]) dgl.distributed.partition._save_graphs(graph_file, [graph_obj],
formats, sort_etypes)
def write_dgl_objects(graph_obj, node_features, edge_features, def write_dgl_objects(graph_obj, node_features, edge_features,
output_dir, part_id, orig_nids, orig_eids): output_dir, part_id, orig_nids, orig_eids, formats, sort_etypes):
""" """
Wrapper function to write graph, node/edge feature, original node/edge IDs. Wrapper function to write graph, node/edge feature, original node/edge IDs.
...@@ -413,11 +418,15 @@ def write_dgl_objects(graph_obj, node_features, edge_features, ...@@ -413,11 +418,15 @@ def write_dgl_objects(graph_obj, node_features, edge_features,
original node IDs original node IDs
orig_eids : dict orig_eids : dict
original edge IDs original edge IDs
formats : str or list[str]
Save graph in formats.
sort_etypes : bool
Whether to sort etypes in csc/csr.
""" """
part_dir = output_dir + '/part' + str(part_id) part_dir = output_dir + '/part' + str(part_id)
os.makedirs(part_dir, exist_ok=True) os.makedirs(part_dir, exist_ok=True)
write_graph_dgl(os.path.join(part_dir ,'graph.dgl'), graph_obj) write_graph_dgl(os.path.join(part_dir ,'graph.dgl'), graph_obj,
formats, sort_etypes)
if node_features != None: if node_features != None:
write_node_features(node_features, os.path.join(part_dir, "node_feat.dgl")) write_node_features(node_features, os.path.join(part_dir, "node_feat.dgl"))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment