Unverified Commit c8ea9fa4 authored by kylasa, committed by GitHub

[Dist] Flexible pipeline - Initial commit (#4733)

* Flexible pipeline - Initial commit

1. Implementation of flexible pipeline feature.
2. With this implementation, the pipeline now supports multiple partitions per process, and it assumes that num_partitions is always a multiple of num_processes.

* Update test_dist_part.py

* Code changes to address review comments

* Code refactoring of exchange_features function into two functions for better readability

* Updating test_dist_part to fix merge issues with the master branch

* Corrected variable names.

* Fixed code refactoring issues.

* Provide missing function arguments to the exchange_feature function

* Provide the missing function argument to fix the error.

* Provide missing function argument to 'get_shuffle_nids' function.

* Repositioned a variable within its scope.

* Removed a tab that was causing an indentation problem

* Fix an issue with the CI test framework that was the root cause of the CI test failures.

1. We now read files specific to each partition-id and store that data separately in the local process, keyed by the local_part_id.
2. Similarly, the node and edge feature type_ids are tracked under the same keys.
3. These two changes let us retrieve the appropriate feature data during the feature exchange and send it to the correct destination process (see the sketch after this message).

* Correct the parametrization for the CI unit test cases.

* Addressing Rui's code review comments.

* Addressing code review comments.
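For illustration, here is a minimal, self-contained sketch of the two ideas described above: cyclic ownership of partitions by processes (num_partitions must be a multiple of num_processes) and per-partition keying of feature data by local_part_id. The helpers owned_partitions and feature_key are hypothetical names that only mirror the behaviour described in this commit message; they are not part of the pipeline code.

import numpy as np

def owned_partitions(rank, world_size, num_parts):
    # Hypothetical helper: cyclic assignment of partition ids to ranks,
    # valid only when num_parts is a multiple of world_size.
    assert num_parts % world_size == 0, "num_parts must be a multiple of world_size"
    return [part_id for part_id in range(num_parts) if part_id % world_size == rank]

def feature_key(ntype_name, feat_name, part_id, world_size):
    # Hypothetical helper: per-partition key under which a feature tensor is
    # stored locally, e.g. "paper/feat/1" for the second local partition.
    local_part_id = part_id // world_size
    return f"{ntype_name}/{feat_name}/{local_part_id}"

# Example: 6 partitions across 3 processes; rank 1 owns partitions 1 and 4.
rank, world_size, num_parts = 1, 3, 6
node_features = {}
for part_id in owned_partitions(rank, world_size, num_parts):
    node_features[feature_key("paper", "feat", part_id, world_size)] = np.zeros((4, 8))
print(sorted(node_features))  # ['paper/feat/0', 'paper/feat/1']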
parent ee5f0967
@@ -136,11 +136,15 @@ def test_chunk_graph(num_chunks):
test_data(sub_dir, feat, data, g.num_edges(c_etype) // num_chunks) test_data(sub_dir, feat, data, g.num_edges(c_etype) // num_chunks)
def _test_pipeline(num_chunks, num_parts, graph_formats=None): def _test_pipeline(num_chunks, num_parts, world_size, graph_formats=None):
if num_chunks < num_parts: if num_chunks < num_parts:
# num_parts should less/equal than num_chunks # num_parts should less/equal than num_chunks
return return
if num_parts % world_size != 0:
# num_parts should be a multiple of world_size
return
with tempfile.TemporaryDirectory() as root_dir: with tempfile.TemporaryDirectory() as root_dir:
g = create_chunked_dataset(root_dir, num_chunks) g = create_chunked_dataset(root_dir, num_chunks)
@@ -161,12 +165,12 @@ def _test_pipeline(num_chunks, num_parts, graph_formats=None):
assert isinstance(int(header), int) assert isinstance(int(header), int)
# Step2: data dispatch # Step2: data dispatch
partition_dir = os.path.join(root_dir, "parted_data") partition_dir = os.path.join(root_dir, 'parted_data')
out_dir = os.path.join(root_dir, "partitioned") out_dir = os.path.join(root_dir, 'partitioned')
ip_config = os.path.join(root_dir, "ip_config.txt") ip_config = os.path.join(root_dir, 'ip_config.txt')
with open(ip_config, "w") as f: with open(ip_config, 'w') as f:
for i in range(num_parts): for i in range(world_size):
f.write(f"127.0.0.{i + 1}\n") f.write(f'127.0.0.{i + 1}\n')
cmd = "python3 tools/dispatch_data.py" cmd = "python3 tools/dispatch_data.py"
cmd += f" --in-dir {in_dir}" cmd += f" --in-dir {in_dir}"
@@ -209,15 +213,14 @@ def _test_pipeline(num_chunks, num_parts, graph_formats=None):
) )
@pytest.mark.parametrize("num_chunks", [1, 3, 4, 8]) @pytest.mark.parametrize("num_chunks, num_parts, world_size", [[8, 4, 2], [9, 6, 3], [11, 11, 1], [11, 4, 2], [5, 3, 1]])
@pytest.mark.parametrize("num_parts", [1, 3, 4, 8]) def test_pipeline_basics(num_chunks, num_parts, world_size):
def test_pipeline_basics(num_chunks, num_parts): _test_pipeline(num_chunks, num_parts, world_size)
_test_pipeline(num_chunks, num_parts)
@pytest.mark.parametrize( @pytest.mark.parametrize(
"graph_formats", [None, "csc", "coo,csc", "coo,csc,csr"] "graph_formats", [None, "csc", "coo,csc", "coo,csc,csr"]
) )
def test_pipeline_formats(graph_formats): def test_pipeline_formats(graph_formats):
_test_pipeline(4, 4, graph_formats) _test_pipeline(4, 4, 4, graph_formats)
@@ -59,14 +59,12 @@ def submit_jobs(args) -> str:
with open(args.ip_config, "r") as f: with open(args.ip_config, "r") as f:
num_ips = len(f.readlines()) num_ips = len(f.readlines())
assert ( assert (
num_ips == num_parts num_parts % num_ips == 0
), f"The number of lines[{args.ip_config}] should be equal to num_parts[{num_parts}]." ), f"The num_parts[{args.num_parts}] should be a multiple of number of lines(ip addresses)[{args.ip_config}]."
argslist = "" argslist = ""
argslist += "--world-size {} ".format(num_parts) argslist += "--world-size {} ".format(num_ips)
argslist += "--partitions-dir {} ".format( argslist += "--partitions-dir {} ".format(os.path.abspath(args.partitions_dir))
os.path.abspath(args.partitions_dir)
)
argslist += "--input-dir {} ".format(os.path.abspath(args.in_dir)) argslist += "--input-dir {} ".format(os.path.abspath(args.in_dir))
argslist += "--graph-name {} ".format(graph_name) argslist += "--graph-name {} ".format(graph_name)
argslist += "--schema {} ".format(schema_path) argslist += "--schema {} ".format(schema_path)
......
@@ -110,8 +110,7 @@ def create_dgl_object(schema, part_id, node_data, edge_data, edgeid_offset,
#create auxiliary data structures from the schema object #create auxiliary data structures from the schema object
memory_snapshot("CreateDGLObj_Begin", part_id) memory_snapshot("CreateDGLObj_Begin", part_id)
_, global_nid_ranges = get_idranges(schema[constants.STR_NODE_TYPE], _, global_nid_ranges = get_idranges(schema[constants.STR_NODE_TYPE],
schema[constants.STR_NUM_NODES_PER_CHUNK]) schema[constants.STR_NUM_NODES_PER_CHUNK])
_, global_eid_ranges = get_idranges(schema[constants.STR_EDGE_TYPE], _, global_eid_ranges = get_idranges(schema[constants.STR_EDGE_TYPE],
schema[constants.STR_NUM_EDGES_PER_CHUNK]) schema[constants.STR_NUM_EDGES_PER_CHUNK])
......
@@ -7,10 +7,10 @@ import torch
from pyarrow import csv from pyarrow import csv
import constants import constants
from utils import get_idranges from utils import get_idranges, map_partid_rank
def get_dataset(input_dir, graph_name, rank, world_size, schema_map): def get_dataset(input_dir, graph_name, rank, world_size, num_parts, schema_map):
""" """
Function to read the multiple file formatted dataset. Function to read the multiple file formatted dataset.
@@ -24,6 +24,8 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
rank of the current process rank of the current process
world_size : int world_size : int
total number of process in the current execution total number of process in the current execution
num_parts : int
total number of output graph partitions
schema_map : dictionary schema_map : dictionary
this is the dictionary created by reading the graph metadata json file this is the dictionary created by reading the graph metadata json file
for the input graph dataset for the input graph dataset
@@ -105,34 +107,7 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
Data read from each of the node features file is a multi-dimensional tensor data and is read Data read from each of the node features file is a multi-dimensional tensor data and is read
in numpy format, which is also the storage format of node features on the permanent storage. in numpy format, which is also the storage format of node features on the permanent storage.
'''
#iterate over the "node_data" dictionary in the schema_map
#read the node features if exists
#also keep track of the type_nids for which the node_features are read.
dataset_features = schema_map[constants.STR_NODE_DATA]
if((dataset_features is not None) and (len(dataset_features) > 0)):
for ntype_name, ntype_feature_data in dataset_features.items():
#ntype_feature_data is a dictionary
#where key: feature_name, value: dictionary in which keys are "format", "data"
node_feature_tids[ntype_name] = []
for feat_name, feat_data in ntype_feature_data.items():
assert feat_data[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_NUMPY
num_chunks = len(feat_data[constants.STR_DATA])
read_list = np.array_split(np.arange(num_chunks), world_size)
nfeat = []
for idx in read_list[rank]:
nfeat_file = feat_data[constants.STR_DATA][idx]
if not os.path.isabs(nfeat_file):
nfeat_file = os.path.join(input_dir, nfeat_file)
logging.info(f'Loading node feature[{feat_name}] of ntype[{ntype_name}] from {nfeat_file}')
nfeat.append(np.load(nfeat_file))
nfeat = np.concatenate(nfeat)
node_features[ntype_name + '/' + feat_name] = torch.from_numpy(nfeat)
node_feature_tids[ntype_name].append([feat_name, -1, -1])
'''
"node_type" : ["ntype0-name", "ntype1-name", ....], #m node types "node_type" : ["ntype0-name", "ntype1-name", ....], #m node types
"num_nodes_per_chunk" : [ "num_nodes_per_chunk" : [
[a0, a1, ...a<p-1>], #p partitions [a0, a1, ...a<p-1>], #p partitions
@@ -154,25 +129,66 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
which are owned by that particular rank. And using the "num_nodes_per_chunk" information each which are owned by that particular rank. And using the "num_nodes_per_chunk" information each
process can easily compute any nodes per-type node_id and global node_id. process can easily compute any nodes per-type node_id and global node_id.
The node-ids are treated as int64's in order to support billions of nodes in the input graph. The node-ids are treated as int64's in order to support billions of nodes in the input graph.
''' '''
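To make the id bookkeeping described in the docstring above concrete, here is a simplified, self-contained sketch of how per-type id ranges and global-id offsets can be derived from "num_nodes_per_chunk". It is only an approximation of the idea behind get_idranges, not its actual implementation; simple_idranges is a hypothetical name.

import numpy as np

def simple_idranges(ntype_names, num_nodes_per_chunk, num_parts):
    # Returns, per ntype: the (start, end) type-nid range of each partition,
    # plus the global-nid offset at which that ntype begins.
    type_nid_ranges, gnid_offsets = {}, {}
    global_offset = 0
    for ntype, chunk_counts in zip(ntype_names, num_nodes_per_chunk):
        # Group the per-chunk counts into num_parts contiguous groups.
        groups = np.array_split(np.array(chunk_counts, dtype=np.int64), num_parts)
        ends = np.cumsum([g.sum() for g in groups])
        starts = np.concatenate(([0], ends[:-1]))
        type_nid_ranges[ntype] = list(zip(starts.tolist(), ends.tolist()))
        gnid_offsets[ntype] = global_offset
        global_offset += int(ends[-1])
    return type_nid_ranges, gnid_offsets

# Example: 2 ntypes, 4 chunks each, grouped into 2 partitions.
ranges, offsets = simple_idranges(
    ["user", "item"], [[10, 10, 10, 10], [5, 5, 5, 5]], num_parts=2)
print(ranges["user"])  # [(0, 20), (20, 40)]
print(offsets)         # {'user': 0, 'item': 40}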
#read my nodes for each node type #read my nodes for each node type
node_tids, ntype_gnid_offset = get_idranges(schema_map[constants.STR_NODE_TYPE], node_tids, ntype_gnid_offset = get_idranges(schema_map[constants.STR_NODE_TYPE],
schema_map[constants.STR_NUM_NODES_PER_CHUNK], schema_map[constants.STR_NUM_NODES_PER_CHUNK],
num_chunks=world_size) num_chunks=num_parts)
for ntype_name in schema_map[constants.STR_NODE_TYPE]:
if ntype_name in node_feature_tids: #iterate over the "node_data" dictionary in the schema_map
for item in node_feature_tids[ntype_name]: #read the node features if exists
item[1] = node_tids[ntype_name][rank][0] #also keep track of the type_nids for which the node_features are read.
item[2] = node_tids[ntype_name][rank][1] dataset_features = schema_map[constants.STR_NODE_DATA]
if((dataset_features is not None) and (len(dataset_features) > 0)):
for ntype_name, ntype_feature_data in dataset_features.items():
for feat_name, feat_data in ntype_feature_data.items():
assert feat_data[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_NUMPY
# It is guaranteed that num_chunks is always greater
# than or equal to num_partitions.
num_chunks = len(feat_data[constants.STR_DATA])
read_list = np.array_split(np.arange(num_chunks), num_parts)
for local_part_id in range(num_parts):
if map_partid_rank(local_part_id, world_size) == rank:
nfeat = []
nfeat_tids = []
for idx in read_list[local_part_id]:
nfeat_file = feat_data[constants.STR_DATA][idx]
if not os.path.isabs(nfeat_file):
nfeat_file = os.path.join(input_dir, nfeat_file)
logging.info(f'Loading node feature[{feat_name}] of ntype[{ntype_name}] from {nfeat_file}')
nfeat.append(np.load(nfeat_file))
nfeat = np.concatenate(nfeat) if len(nfeat) != 0 else np.array([])
node_features[ntype_name+"/"+feat_name+"/"+str(local_part_id//world_size)] = torch.from_numpy(nfeat)
nfeat_tids.append(node_tids[ntype_name][local_part_id])
node_feature_tids[ntype_name+"/"+feat_name+"/"+str(local_part_id//world_size)] = nfeat_tids
#done building node_features locally. #done building node_features locally.
if len(node_features) <= 0: if len(node_features) <= 0:
logging.info(f'[Rank: {rank}] This dataset does not have any node features') logging.info(f'[Rank: {rank}] This dataset does not have any node features')
else: else:
for k, v in node_features.items(): assert len(node_features) == len(node_feature_tids)
logging.info(f'[Rank: {rank}] node feature name: {k}, feature data shape: {v.size()}')
# Note that the keys in the node_features dictionary are as follows:
# `ntype_name/feat_name/local_part_id`.
# where ntype_name and feat_name are self-explanatory, and
# local_part_id identifies the partition within the current
# process and takes the values 0, 1, 2, ....
for feat_name, feat_info in node_features.items():
logging.info(f'[Rank: {rank}] node feature name: {feat_name}, feature data shape: {feat_info.size()}')
tokens = feat_name.split("/")
assert len(tokens) == 3
# Get the range of type ids mapped to the current node feature.
tids = node_feature_tids[feat_name]
# The length of that type-id range gives the expected number of rows
# for this feature tensor.
count = tids[0][1] - tids[0][0]
assert count == feat_info.size()[0]
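The loop above validates that each per-partition feature tensor has exactly as many rows as its type-id range. A tiny sketch of that check with made-up tensors and tid ranges (the keys follow the "<ntype_name>/<feat_name>/<local_part_id>" convention noted above):

import torch

# Hypothetical per-partition feature tensors, keyed "<ntype>/<feat>/<local_part_id>".
node_features = {
    "user/feat/0": torch.zeros(20, 8),
    "user/feat/1": torch.zeros(15, 8),
}
# Matching type-nid ranges, one (start, end) pair per key.
node_feature_tids = {
    "user/feat/0": [(0, 20)],
    "user/feat/1": [(20, 35)],
}

for key, feat in node_features.items():
    ntype, feat_name, local_part_id = key.split("/")
    start, end = node_feature_tids[key][0]
    assert end - start == feat.size(0), f"row count mismatch for {key}"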
''' '''
Reading edge features now. Reading edge features now.
@@ -214,50 +230,48 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
edge_features = {} edge_features = {}
edge_feature_tids = {} edge_feature_tids = {}
# Read edges for each edge type that are processed by the current process.
edge_tids, _ = get_idranges(schema_map[constants.STR_EDGE_TYPE],
schema_map[constants.STR_NUM_EDGES_PER_CHUNK], num_parts)
# Iterate over the "edge_data" dictionary in the schema_map. # Iterate over the "edge_data" dictionary in the schema_map.
# Read the edge features if exists. # Read the edge features if exists.
# Also keep track of the type_eids for which the edge_features are read. # Also keep track of the type_eids for which the edge_features are read.
dataset_features = schema_map[constants.STR_EDGE_DATA] dataset_features = schema_map[constants.STR_EDGE_DATA]
if dataset_features and (len(dataset_features) > 0): if dataset_features and (len(dataset_features) > 0):
for etype_name, etype_feature_data in dataset_features.items(): for etype_name, etype_feature_data in dataset_features.items():
#etype_feature_data is a dictionary
#where key: feature_name, value: dictionary in which keys are "format", "data"
edge_feature_tids[etype_name] = []
for feat_name, feat_data in etype_feature_data.items(): for feat_name, feat_data in etype_feature_data.items():
assert feat_data[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_NUMPY assert feat_data[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_NUMPY
num_chunks = len(feat_data[constants.STR_DATA]) num_chunks = len(feat_data[constants.STR_DATA])
read_list = np.array_split(np.arange(num_chunks), world_size) read_list = np.array_split(np.arange(num_chunks), num_parts)
efeat = [] for local_part_id in range(num_parts):
for idx in read_list[rank]: if map_partid_rank(local_part_id, world_size) == rank:
efeat_file = feat_data[constants.STR_DATA][idx] efeats = []
if not os.path.isabs(efeat_file): efeat_tids = []
efeat_file = os.path.join(input_dir, efeat_file) for idx in read_list[local_part_id]:
logging.info( feature_fname = feat_data[constants.STR_DATA][idx]
f'Loading edge feature[{feat_name}] of etype[{etype_name}] from {efeat_file}' if (os.path.isabs(feature_fname)):
) logging.info(f'Loading numpy from {feature_fname}')
efeat.append(np.load(efeat_file)) efeats.append(torch.from_numpy(np.load(feature_fname)))
efeat = np.concatenate(efeat) else:
edge_features[etype_name + '/' + feat_name] = torch.from_numpy(efeat) numpy_path = os.path.join(input_dir, feature_fname)
logging.info(f'Loading numpy from {numpy_path}')
edge_feature_tids[etype_name].append([feat_name, -1, -1]) efeats.append(torch.from_numpy(np.load(numpy_path)))
efeat_tids.append(edge_tids[etype_name][local_part_id])
# Read edges for each node types that are processed by the currnet process. edge_features[etype_name+'/'+feat_name+"/"+str(local_part_id//world_size)] = torch.from_numpy(np.concatenate(efeats))
edge_tids, _ = get_idranges(schema_map[constants.STR_EDGE_TYPE], edge_feature_tids[etype_name+"/"+feat_name+"/"+str(local_part_id//world_size)] = efeat_tids
schema_map[constants.STR_NUM_EDGES_PER_CHUNK],
num_chunks=world_size)
for etype_name in schema_map[constants.STR_EDGE_TYPE]:
if etype_name in edge_feature_tids:
for item in edge_feature_tids[etype_name]:
item[1] = edge_tids[etype_name][rank][0]
item[2] = edge_tids[etype_name][rank][1]
# Done with building node_features locally. # Done with building node_features locally.
if len(edge_features) <= 0: if len(edge_features) <= 0:
logging.info(f'[Rank: {rank}] This dataset does not have any edge features') logging.info(f'[Rank: {rank}] This dataset does not have any edge features')
else: else:
for k, v in edge_features.items(): assert len(edge_features) == len(edge_feature_tids)
logging.info(f'[Rank: {rank}] edge feature name: {k}, feature data shape: {v.size()}')
for k, v in edge_features.items():
logging.info(f'[Rank: {rank}] edge feature name: {k}, feature data shape: {v.shape}')
tids = edge_feature_tids[k]
count = tids[0][1] - tids[0][0]
assert count == v.size()[0]
''' '''
Code below is used to read edges from the input dataset with the help of the metadata json file Code below is used to read edges from the input dataset with the help of the metadata json file
@@ -306,7 +320,7 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
etype_name_idmap = {e : idx for idx, e in enumerate(etype_names)} etype_name_idmap = {e : idx for idx, e in enumerate(etype_names)}
edge_tids, _ = get_idranges(schema_map[constants.STR_EDGE_TYPE], edge_tids, _ = get_idranges(schema_map[constants.STR_EDGE_TYPE],
schema_map[constants.STR_NUM_EDGES_PER_CHUNK], schema_map[constants.STR_NUM_EDGES_PER_CHUNK],
num_chunks=world_size) num_chunks=num_parts)
edge_datadict = {} edge_datadict = {}
edge_data = schema_map[constants.STR_EDGES] edge_data = schema_map[constants.STR_EDGES]
@@ -329,10 +343,16 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
dst_ntype_name = tokens[2] dst_ntype_name = tokens[2]
num_chunks = len(edge_info) num_chunks = len(edge_info)
read_list = np.array_split(np.arange(num_chunks), world_size) read_list = np.array_split(np.arange(num_chunks), num_parts)
src_ids = [] src_ids = []
dst_ids = [] dst_ids = []
for idx in read_list[rank]:
curr_partids = []
for part_id in range(num_parts):
if map_partid_rank(part_id, world_size) == rank:
curr_partids.append(read_list[part_id])
for idx in np.concatenate(curr_partids):
edge_file = edge_info[idx] edge_file = edge_info[idx]
if not os.path.isabs(edge_file): if not os.path.isabs(edge_file):
edge_file = os.path.join(input_dir, edge_file) edge_file = os.path.join(input_dir, edge_file)
@@ -355,10 +375,13 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
#currently these are just type_edge_ids... which will be converted to global ids #currently these are just type_edge_ids... which will be converted to global ids
edge_datadict[constants.GLOBAL_SRC_ID].append(src_ids + ntype_gnid_offset[src_ntype_name][0, 0]) edge_datadict[constants.GLOBAL_SRC_ID].append(src_ids + ntype_gnid_offset[src_ntype_name][0, 0])
edge_datadict[constants.GLOBAL_DST_ID].append(dst_ids + ntype_gnid_offset[dst_ntype_name][0, 0]) edge_datadict[constants.GLOBAL_DST_ID].append(dst_ids + ntype_gnid_offset[dst_ntype_name][0, 0])
edge_datadict[constants.GLOBAL_TYPE_EID].append(np.arange(edge_tids[etype_name][rank][0],\
edge_tids[etype_name][rank][1] ,dtype=np.int64))
edge_datadict[constants.ETYPE_ID].append(etype_name_idmap[etype_name] * \ edge_datadict[constants.ETYPE_ID].append(etype_name_idmap[etype_name] * \
np.ones(shape=(src_ids.shape), dtype=np.int64)) np.ones(shape=(src_ids.shape), dtype=np.int64))
for local_part_id in range(num_parts):
if (map_partid_rank(local_part_id, world_size) == rank):
edge_datadict[constants.GLOBAL_TYPE_EID].append(np.arange(edge_tids[etype_name][local_part_id][0],\
edge_tids[etype_name][local_part_id][1] ,dtype=np.int64))
#stitch together to create the final data on the local machine #stitch together to create the final data on the local machine
for col in [constants.GLOBAL_SRC_ID, constants.GLOBAL_DST_ID, constants.GLOBAL_TYPE_EID, constants.ETYPE_ID]: for col in [constants.GLOBAL_SRC_ID, constants.GLOBAL_DST_ID, constants.GLOBAL_TYPE_EID, constants.ETYPE_ID]:
@@ -368,6 +391,7 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
assert edge_datadict[constants.GLOBAL_DST_ID].shape == edge_datadict[constants.GLOBAL_TYPE_EID].shape assert edge_datadict[constants.GLOBAL_DST_ID].shape == edge_datadict[constants.GLOBAL_TYPE_EID].shape
assert edge_datadict[constants.GLOBAL_TYPE_EID].shape == edge_datadict[constants.ETYPE_ID].shape assert edge_datadict[constants.GLOBAL_TYPE_EID].shape == edge_datadict[constants.ETYPE_ID].shape
logging.info(f'[Rank: {rank}] Done reading edge_file: {len(edge_datadict)}, {edge_datadict[constants.GLOBAL_SRC_ID].shape}') logging.info(f'[Rank: {rank}] Done reading edge_file: {len(edge_datadict)}, {edge_datadict[constants.GLOBAL_SRC_ID].shape}')
logging.info(f'Rank: {rank} edge_feat_tids: {edge_feature_tids}')
return node_tids, node_features, node_feature_tids, edge_datadict, edge_tids, edge_features, edge_feature_tids return node_tids, node_features, node_feature_tids, edge_datadict, edge_tids, edge_features, edge_feature_tids
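For clarity, a small standalone sketch of the id conversion used while assembling edge_datadict above: per-type source/destination node ids become global node ids by adding the owning ntype's global-id offset. The offsets and ids below are invented for illustration; in the pipeline the offsets come from get_idranges.

import numpy as np

# Hypothetical per-ntype global-id offsets (first global nid of each ntype).
ntype_gnid_offset = {"user": 0, "item": 1000}

# Per-type source/destination ids of a handful of edges, as read from file.
src_type_nids = np.array([3, 7, 9], dtype=np.int64)   # ntype "user"
dst_type_nids = np.array([0, 2, 5], dtype=np.int64)   # ntype "item"

# Convert per-type node ids to global node ids by adding the ntype offset,
# mirroring how GLOBAL_SRC_ID / GLOBAL_DST_ID are built above.
global_src = src_type_nids + ntype_gnid_offset["user"]
global_dst = dst_type_nids + ntype_gnid_offset["item"]
print(global_src, global_dst)  # [3 7 9] [1000 1002 1005]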
@@ -7,6 +7,7 @@ import copy
from pyarrow import csv from pyarrow import csv
from gloo_wrapper import alltoallv_cpu from gloo_wrapper import alltoallv_cpu
from utils import map_partid_rank
class DistLookupService: class DistLookupService:
@@ -100,7 +101,7 @@ class DistLookupService:
self.ntype_count = np.array(ntype_count, dtype=np.int64) self.ntype_count = np.array(ntype_count, dtype=np.int64)
self.rank = rank self.rank = rank
self.world_size = world_size self.world_size = world_size
def get_partition_ids(self, global_nids): def get_partition_ids(self, global_nids):
''' '''
@@ -237,7 +238,7 @@ class DistLookupService:
# Now the owner_ids (partition-ids) which corresponding to the global_nids. # Now the owner_ids (partition-ids) which corresponding to the global_nids.
return owner_ids return owner_ids
def get_shuffle_nids(self, global_nids, my_global_nids, my_shuffle_global_nids): def get_shuffle_nids(self, global_nids, my_global_nids, my_shuffle_global_nids, world_size):
''' '''
This function is used to retrieve shuffle_global_nids for a given set of incoming This function is used to retrieve shuffle_global_nids for a given set of incoming
global_nids. Note that global_nids are of random order and will contain duplicates global_nids. Note that global_nids are of random order and will contain duplicates
@@ -267,6 +268,8 @@ class DistLookupService:
This process has the node <-> partition id mapping This process has the node <-> partition id mapping
my_shuffle_global_nids : numpy ndarray my_shuffle_global_nids : numpy ndarray
array of shuffle_global_nids which are assigned by the current process/rank array of shuffle_global_nids which are assigned by the current process/rank
world_size : int
total no. of processes in the MPI_WORLD
Returns: Returns:
-------- --------
@@ -278,6 +281,21 @@ class DistLookupService:
# Get the owner_ids (partition-ids or rank). # Get the owner_ids (partition-ids or rank).
owner_ids = self.get_partition_ids(global_nids) owner_ids = self.get_partition_ids(global_nids)
# These owner_ids, which are also partition ids of the nodes in the
# input graph, are in the range 0 - (num_partitions - 1).
# These ids are generated using some kind of graph partitioning method.
# The distributed lookup service, as used by the graph partitioning
# pipeline, stores ntype-ids (also type_nids) and their mapping to the
# associated partition-id.
# These ids are split into `num_process` chunks and the processes in the
# dist. lookup service are assigned ownership of these chunks.
# The pipeline also enforces the following constraint between its input
# parameters num_partitions and num_processes: num_partitions is an
# integer multiple of num_processes, which means each individual node in
# the cluster runs an equal number of processes.
owner_ids = map_partid_rank(owner_ids, world_size)
# Ask these owners to supply for the shuffle_global_nids. # Ask these owners to supply for the shuffle_global_nids.
send_list = [] send_list = []
id_list = [] id_list = []
......
@@ -59,7 +59,7 @@ def get_shuffle_global_nids(rank, world_size, global_nids_ranks, node_data):
ret_val = np.column_stack([global_nids, shuffle_global_nids]) ret_val = np.column_stack([global_nids, shuffle_global_nids])
return ret_val return ret_val
def lookup_shuffle_global_nids_edges(rank, world_size, edge_data, id_lookup, node_data): def lookup_shuffle_global_nids_edges(rank, world_size, num_parts, edge_data, id_lookup, node_data):
''' '''
This function is a helper function used to lookup shuffle-global-nids for a given set of This function is a helper function used to lookup shuffle-global-nids for a given set of
global-nids using a distributed lookup service. global-nids using a distributed lookup service.
@@ -70,6 +70,8 @@ def lookup_shuffle_global_nids_edges(rank, world_size, edge_data, id_lookup, nod
rank of the process rank of the process
world_size : integer world_size : integer
total number of processes used in the process group total number of processes used in the process group
num_parts : integer
total number of output graph partitions
edge_data : dictionary edge_data : dictionary
edge_data is a dicitonary with keys as column names and values as numpy arrays representing edge_data is a dicitonary with keys as column names and values as numpy arrays representing
all the edges present in the current graph partition all the edges present in the current graph partition
@@ -93,40 +95,49 @@ def lookup_shuffle_global_nids_edges(rank, world_size, edge_data, id_lookup, nod
MILLION = 1000 * 1000 MILLION = 1000 * 1000
BATCH_SIZE = 250 * MILLION BATCH_SIZE = 250 * MILLION
memory_snapshot("GlobalToShuffleIDMapBegin: ", rank) memory_snapshot("GlobalToShuffleIDMapBegin: ", rank)
node_list = edge_data[constants.GLOBAL_SRC_ID]
local_nids = []
# Determine the no. of times each process has to send alltoall messages. local_shuffle_nids = []
all_sizes = allgather_sizes([node_list.shape[0]], world_size, return_sizes=True) for local_part_id in range(num_parts//world_size):
max_count = np.amax(all_sizes) local_nids.append(node_data[constants.GLOBAL_NID+"/"+str(local_part_id)])
num_splits = max_count // BATCH_SIZE + 1 local_shuffle_nids.append(node_data[constants.SHUFFLE_GLOBAL_NID+"/"+str(local_part_id)])
# Split the message into batches and send. local_nids = np.concatenate(local_nids)
splits = np.array_split(node_list, num_splits) local_shuffle_nids = np.concatenate(local_shuffle_nids)
shuffle_mappings = []
for item in splits: for local_part_id in range(num_parts//world_size):
shuffle_ids = id_lookup.get_shuffle_nids(item, node_list = edge_data[constants.GLOBAL_SRC_ID+"/"+str(local_part_id)]
node_data[constants.GLOBAL_NID],
node_data[constants.SHUFFLE_GLOBAL_NID]) # Determine the no. of times each process has to send alltoall messages.
shuffle_mappings.append(shuffle_ids) all_sizes = allgather_sizes([node_list.shape[0]], world_size, num_parts, return_sizes=True)
max_count = np.amax(all_sizes)
shuffle_ids = np.concatenate(shuffle_mappings) num_splits = max_count // BATCH_SIZE + 1
assert shuffle_ids.shape[0] == node_list.shape[0]
edge_data[constants.SHUFFLE_GLOBAL_SRC_ID] = shuffle_ids # Split the message into batches and send.
splits = np.array_split(node_list, num_splits)
# Destination end points of edges are owned by the current node and therefore shuffle_mappings = []
# should have corresponding SHUFFLE_GLOBAL_NODE_IDs. for item in splits:
# Here retrieve SHUFFLE_GLOBAL_NODE_IDs for the destination end points of local edges. shuffle_ids = id_lookup.get_shuffle_nids(item, local_nids, local_shuffle_nids, world_size)
uniq_ids, inverse_idx = np.unique(edge_data[constants.GLOBAL_DST_ID], return_inverse=True) shuffle_mappings.append(shuffle_ids)
common, idx1, idx2 = np.intersect1d(uniq_ids, node_data[constants.GLOBAL_NID], assume_unique=True, return_indices=True)
assert len(common) == len(uniq_ids) shuffle_ids = np.concatenate(shuffle_mappings)
assert shuffle_ids.shape[0] == node_list.shape[0]
edge_data[constants.SHUFFLE_GLOBAL_DST_ID] = node_data[constants.SHUFFLE_GLOBAL_NID][idx2][inverse_idx] edge_data[constants.SHUFFLE_GLOBAL_SRC_ID+"/"+str(local_part_id)] = shuffle_ids
assert len(edge_data[constants.SHUFFLE_GLOBAL_DST_ID]) == len(edge_data[constants.GLOBAL_DST_ID])
# Destination end points of edges are owned by the current node and therefore
# should have corresponding SHUFFLE_GLOBAL_NODE_IDs.
# Here retrieve SHUFFLE_GLOBAL_NODE_IDs for the destination end points of local edges.
uniq_ids, inverse_idx = np.unique(edge_data[constants.GLOBAL_DST_ID+"/"+str(local_part_id)], return_inverse=True)
common, idx1, idx2 = np.intersect1d(uniq_ids, node_data[constants.GLOBAL_NID+"/"+str(local_part_id)], assume_unique=True, return_indices=True)
assert len(common) == len(uniq_ids)
edge_data[constants.SHUFFLE_GLOBAL_DST_ID+"/"+str(local_part_id)] = node_data[constants.SHUFFLE_GLOBAL_NID+"/"+str(local_part_id)][idx2][inverse_idx]
assert len(edge_data[constants.SHUFFLE_GLOBAL_DST_ID+"/"+str(local_part_id)]) == len(edge_data[constants.GLOBAL_DST_ID+"/"+str(local_part_id)])
memory_snapshot("GlobalToShuffleIDMap_AfterLookupServiceCalls: ", rank) memory_snapshot("GlobalToShuffleIDMap_AfterLookupServiceCalls: ", rank)
return edge_data return edge_data
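The function above resolves shuffle ids in batches so that a single all-to-all never exceeds BATCH_SIZE entries. Below is a minimal single-process sketch of that batching pattern, with a plain dictionary standing in for the distributed lookup service (id_lookup.get_shuffle_nids); the numbers are made up.

import numpy as np

BATCH_SIZE = 4  # tiny value for illustration; the pipeline uses 250 million

def lookup_in_batches(global_nids, mapping):
    # `mapping` stands in for id_lookup.get_shuffle_nids(); it maps
    # global nids to shuffle-global nids.
    num_splits = global_nids.shape[0] // BATCH_SIZE + 1
    results = []
    for batch in np.array_split(global_nids, num_splits):
        results.append(np.array([mapping[int(nid)] for nid in batch], dtype=np.int64))
    return np.concatenate(results)

global_nids = np.arange(10, dtype=np.int64)
mapping = {int(n): int(n) + 100 for n in global_nids}   # fake shuffle ids
shuffle_ids = lookup_in_batches(global_nids, mapping)
assert shuffle_ids.shape[0] == global_nids.shape[0]
print(shuffle_ids[:3])  # [100 101 102]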
def assign_shuffle_global_nids_nodes(rank, world_size, node_data): def assign_shuffle_global_nids_nodes(rank, world_size, num_parts, node_data):
""" """
Utility function to assign shuffle global ids to nodes at a given rank Utility function to assign shuffle global ids to nodes at a given rank
node_data gets converted from [ntype, global_type_nid, global_nid] node_data gets converted from [ntype, global_type_nid, global_nid]
@@ -144,25 +155,27 @@ def assign_shuffle_global_nids_nodes(rank, world_size, node_data):
rank of the process rank of the process
world_size : integer world_size : integer
total number of processes used in the process group total number of processes used in the process group
ntype_counts: list of tuples num_parts : integer
list of tuples (x,y), where x=ntype and y=no. of nodes whose shuffle_global_nids are needed total number of output graph partitions
node_data : dictionary node_data : dictionary
node_data is a dictionary with keys as column names and values as numpy arrays node_data is a dictionary with keys as column names and values as numpy arrays
""" """
# Compute prefix sum to determine node-id offsets # Compute prefix sum to determine node-id offsets
prefix_sum_nodes = allgather_sizes([node_data[constants.GLOBAL_NID].shape[0]], world_size) local_row_counts = []
for local_part_id in range(num_parts//world_size):
local_row_counts.append(node_data[constants.GLOBAL_NID+"/"+str(local_part_id)].shape[0])
# assigning node-ids from localNodeStartId to (localNodeEndId - 1) # Perform allgather to compute the local offsets.
# Assuming here that the nodeDataArr is sorted based on the nodeType. prefix_sum_nodes = allgather_sizes(local_row_counts, world_size, num_parts)
shuffle_global_nid_start = prefix_sum_nodes[rank]
shuffle_global_nid_end = prefix_sum_nodes[rank + 1]
# add a column with global-ids (after data shuffle) for local_part_id in range(num_parts//world_size):
shuffle_global_nids = np.arange(shuffle_global_nid_start, shuffle_global_nid_end, dtype=np.int64) shuffle_global_nid_start = prefix_sum_nodes[rank + (local_part_id*world_size)]
node_data[constants.SHUFFLE_GLOBAL_NID] = shuffle_global_nids shuffle_global_nid_end = prefix_sum_nodes[rank + 1 + (local_part_id*world_size)]
shuffle_global_nids = np.arange(shuffle_global_nid_start, shuffle_global_nid_end, dtype=np.int64)
node_data[constants.SHUFFLE_GLOBAL_NID+"/"+str(local_part_id)] = shuffle_global_nids
def assign_shuffle_global_nids_edges(rank, world_size, edge_data): def assign_shuffle_global_nids_edges(rank, world_size, num_parts, edge_data):
""" """
Utility function to assign shuffle_global_eids to edges Utility function to assign shuffle_global_eids to edges
edge_data gets converted from [global_src_nid, global_dst_nid, global_type_eid, etype] edge_data gets converted from [global_src_nid, global_dst_nid, global_type_eid, etype]
@@ -174,8 +187,8 @@ def assign_shuffle_global_nids_edges(rank, world_size, edge_data):
rank of the current process rank of the current process
world_size : integer world_size : integer
total count of processes in execution total count of processes in execution
etype_counts : list of tuples num_parts : integer
list of tuples (x,y), x = rank, y = no. of edges total number of output graph partitions
edge_data : numpy ndarray edge_data : numpy ndarray
edge data as read from xxx_edges.txt file edge data as read from xxx_edges.txt file
@@ -187,12 +200,17 @@ def assign_shuffle_global_nids_edges(rank, world_size, edge_data):
""" """
#get prefix sum of edge counts per rank to locate the starting point #get prefix sum of edge counts per rank to locate the starting point
#from which global-ids to edges are assigned in the current rank #from which global-ids to edges are assigned in the current rank
prefix_sum_edges = allgather_sizes([edge_data[constants.GLOBAL_SRC_ID].shape[0]], world_size) local_row_counts = []
shuffle_global_eid_start = prefix_sum_edges[rank] for local_part_id in range(num_parts//world_size):
shuffle_global_eid_end = prefix_sum_edges[rank + 1] local_row_counts.append(edge_data[constants.GLOBAL_SRC_ID+"/"+str(local_part_id)].shape[0])
# assigning edge-ids from localEdgeStart to (localEdgeEndId - 1) shuffle_global_eid_offset = []
# Assuming here that the edge_data is sorted by edge_type prefix_sum_edges = allgather_sizes(local_row_counts, world_size, num_parts)
shuffle_global_eids = np.arange(shuffle_global_eid_start, shuffle_global_eid_end, dtype=np.int64) for local_part_id in range(num_parts//world_size):
edge_data[constants.SHUFFLE_GLOBAL_EID] = shuffle_global_eids shuffle_global_eid_start = prefix_sum_edges[rank + (local_part_id*world_size)]
return shuffle_global_eid_start shuffle_global_eid_end = prefix_sum_edges[rank + 1 + (local_part_id*world_size)]
shuffle_global_eids = np.arange(shuffle_global_eid_start, shuffle_global_eid_end, dtype=np.int64)
edge_data[constants.SHUFFLE_GLOBAL_EID+"/"+str(local_part_id)] = shuffle_global_eids
shuffle_global_eid_offset.append(shuffle_global_eid_start)
return shuffle_global_eid_offset
@@ -2,7 +2,7 @@ import numpy as np
import torch import torch
import torch.distributed as dist import torch.distributed as dist
def allgather_sizes(send_data, world_size, return_sizes=False): def allgather_sizes(send_data, world_size, num_parts, return_sizes=False):
""" """
Perform all gather on list lengths, used to compute prefix sums Perform all gather on list lengths, used to compute prefix sums
to determine the offsets on each ranks. This is used to allocate to determine the offsets on each ranks. This is used to allocate
@@ -14,6 +14,8 @@ def allgather_sizes(send_data, world_size, return_sizes=False):
Data on which allgather is performed. Data on which allgather is performed.
world_size : integer world_size : integer
No. of processes configured for execution No. of processes configured for execution
num_parts : integer
No. of output graph partitions
return_sizes : bool return_sizes : bool
Boolean flag to indicate whether to return raw sizes from each process Boolean flag to indicate whether to return raw sizes from each process
or perform prefix sum on the raw sizes. or perform prefix sum on the raw sizes.
@@ -24,6 +26,9 @@ def allgather_sizes(send_data, world_size, return_sizes=False):
array with the prefix sum array with the prefix sum
""" """
# Assert on the world_size, num_parts
assert (num_parts % world_size) == 0
#compute the length of the local data #compute the length of the local data
send_length = len(send_data) send_length = len(send_data)
out_tensor = torch.as_tensor(send_data, dtype=torch.int64) out_tensor = torch.as_tensor(send_data, dtype=torch.int64)
@@ -38,11 +43,16 @@ def allgather_sizes(send_data, world_size, return_sizes=False):
return torch.cat(in_tensor).numpy() return torch.cat(in_tensor).numpy()
#gather sizes in on array to return to the invoking function #gather sizes in on array to return to the invoking function
rank_sizes = np.zeros(world_size + 1, dtype=np.int64) rank_sizes = np.zeros(num_parts + 1, dtype=np.int64)
part_counts = torch.cat(in_tensor).numpy()
count = rank_sizes[0] count = rank_sizes[0]
for i, t in enumerate(in_tensor): idx = 1
count += t.item() for local_part_id in range(num_parts//world_size):
rank_sizes[i+1] = count for r in range(world_size):
count += part_counts[r*(num_parts//world_size) + local_part_id]
rank_sizes[idx] = count
idx += 1
return rank_sizes return rank_sizes
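Because each rank now contributes num_parts // world_size counts, the gathered counts must be re-ordered into global partition-id order (partition id = local_part_id * world_size + rank under the cyclic mapping) before the prefix sum is taken. The sketch below reproduces that re-ordering in a single process, with the torch.distributed all-gather replaced by a pre-built list of per-rank counts; the helper name is hypothetical.

import numpy as np

def prefix_sums_in_partition_order(per_rank_counts, world_size, num_parts):
    # per_rank_counts[r] holds rank r's counts for its local partitions,
    # i.e. for global partitions r, r + world_size, r + 2*world_size, ...
    assert num_parts % world_size == 0
    parts_per_rank = num_parts // world_size
    part_counts = np.concatenate(per_rank_counts)   # what all_gather would produce
    rank_sizes = np.zeros(num_parts + 1, dtype=np.int64)
    idx, count = 1, 0
    for local_part_id in range(parts_per_rank):
        for r in range(world_size):
            # global partition id = local_part_id * world_size + r
            count += part_counts[r * parts_per_rank + local_part_id]
            rank_sizes[idx] = count
            idx += 1
    return rank_sizes

# world_size=2, num_parts=4: rank 0 owns partitions 0 and 2, rank 1 owns 1 and 3.
prefix = prefix_sums_in_partition_order([[10, 30], [20, 40]], world_size=2, num_parts=4)
print(prefix)  # [  0  10  30  60 100] -> offsets of partitions 0, 1, 2, 3
# A consumer indexes it as prefix[rank + local_part_id * world_size], e.g.
# rank 1, local partition 1 (= global partition 3) starts at prefix[3] = 60.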
......
@@ -188,7 +188,7 @@ def get_gnid_range_map(node_tids):
return ntypes_gid_range return ntypes_gid_range
def write_metadata_json(metadata_list, output_dir, graph_name): def write_metadata_json(input_list, output_dir, graph_name, world_size, num_parts):
""" """
Merge json schema's from each of the rank's on rank-0. Merge json schema's from each of the rank's on rank-0.
This utility function, to be used on rank-0, to create aggregated json file. This utility function, to be used on rank-0, to create aggregated json file.
@@ -202,6 +202,14 @@ def write_metadata_json(metadata_list, output_dir, graph_name):
graph-name : string graph-name : string
a string specifying the graph name a string specifying the graph name
""" """
# Preprocess the input_list, a list of dictionaries.
# Each dictionary contains num_parts/world_size metadata json objects,
# which correspond to the local partitions on the respective rank.
metadata_list = []
for local_part_id in range(num_parts//world_size):
for idx in range(world_size):
metadata_list.append(input_list[idx]["local-part-id-"+str(local_part_id*world_size + idx)])
#Initialize global metadata #Initialize global metadata
graph_metadata = {} graph_metadata = {}
@@ -238,7 +246,7 @@ def write_metadata_json(metadata_list, output_dir, graph_name):
_dump_part_config(f'{output_dir}/metadata.json', graph_metadata) _dump_part_config(f'{output_dir}/metadata.json', graph_metadata)
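The preprocessing at the top of write_metadata_json flattens the per-rank dictionaries back into global partition-id order. A tiny sketch with two ranks and four partitions; the dictionary contents are invented, but the "local-part-id-<global_part_id>" keys follow the code above.

world_size, num_parts = 2, 4

# input_list[rank] maps "local-part-id-<global_part_id>" to that partition's metadata.
input_list = [
    {"local-part-id-0": {"part": 0}, "local-part-id-2": {"part": 2}},  # from rank 0
    {"local-part-id-1": {"part": 1}, "local-part-id-3": {"part": 3}},  # from rank 1
]

metadata_list = []
for local_part_id in range(num_parts // world_size):
    for rank in range(world_size):
        metadata_list.append(input_list[rank]["local-part-id-" + str(local_part_id * world_size + rank)])

print([m["part"] for m in metadata_list])  # [0, 1, 2, 3]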
def augment_edge_data(edge_data, lookup_service, edge_tids, rank, world_size): def augment_edge_data(edge_data, lookup_service, edge_tids, rank, world_size, num_parts):
""" """
Add partition-id (rank which owns an edge) column to the edge_data. Add partition-id (rank which owns an edge) column to the edge_data.
@@ -256,6 +264,8 @@ def augment_edge_data(edge_data, lookup_service, edge_tids, rank, world_size):
rank of the current process rank of the current process
world_size : integer world_size : integer
total no. of process participating in the communication primitives total no. of process participating in the communication primitives
num_parts : integer
total no. of partitions requested for the input graph
Returns: Returns:
-------- --------
@@ -269,16 +279,18 @@ def augment_edge_data(edge_data, lookup_service, edge_tids, rank, world_size):
offset = 0 offset = 0
for etype_name, tid_range in edge_tids.items(): for etype_name, tid_range in edge_tids.items():
assert int(tid_range[0][0]) == 0 assert int(tid_range[0][0]) == 0
assert len(tid_range) == world_size assert len(tid_range) == num_parts
etype_offset[etype_name] = offset + int(tid_range[0][0]) etype_offset[etype_name] = offset + int(tid_range[0][0])
offset += int(tid_range[-1][1]) offset += int(tid_range[-1][1])
global_eids = [] global_eids = []
for etype_name, tid_range in edge_tids.items(): for etype_name, tid_range in edge_tids.items():
global_eid_start = etype_offset[etype_name] for idx in range(num_parts):
begin = global_eid_start + int(tid_range[rank][0]) if map_partid_rank(idx, world_size) == rank:
end = global_eid_start + int(tid_range[rank][1]) global_eid_start = etype_offset[etype_name]
global_eids.append(np.arange(begin, end, dtype=np.int64)) begin = global_eid_start + int(tid_range[idx][0])
end = global_eid_start + int(tid_range[idx][1])
global_eids.append(np.arange(begin, end, dtype=np.int64))
global_eids = np.concatenate(global_eids) global_eids = np.concatenate(global_eids)
assert global_eids.shape[0] == edge_data[constants.ETYPE_ID].shape[0] assert global_eids.shape[0] == edge_data[constants.ETYPE_ID].shape[0]
edge_data[constants.GLOBAL_EID] = global_eids edge_data[constants.GLOBAL_EID] = global_eids
@@ -528,3 +540,22 @@ def memory_snapshot(tag, rank):
mem_string = f'{total:.0f} (MB) total, {peak:.0f} (MB) peak, {used:.0f} (MB) used, {avail:.0f} (MB) avail' mem_string = f'{total:.0f} (MB) total, {peak:.0f} (MB) peak, {used:.0f} (MB) used, {avail:.0f} (MB) avail'
logging.debug(f'[Rank: {rank} MEMORY_SNAPSHOT] {mem_string} - {tag}') logging.debug(f'[Rank: {rank} MEMORY_SNAPSHOT] {mem_string} - {tag}')
def map_partid_rank(partid, world_size):
"""Auxiliary function to map a given partition id to one of the rank in the
MPI_WORLD processes. The range of partition ids is assumed to equal or a
multiple of the total size of MPI_WORLD. In this implementation, we use
a cyclical mapping procedure to convert partition ids to ranks.
Parameters:
-----------
partid : int
partition id, as read from node id to partition id mappings.
Returns:
--------
int :
rank of the process, which will be responsible for the given partition
id.
"""
return partid % world_size
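A short usage note for map_partid_rank, assuming the function defined above is in scope: because the mapping is a plain modulo, it works on scalar partition ids as well as on numpy arrays of partition ids (as in get_shuffle_nids).

import numpy as np

# Scalar: partition 5 with 2 processes is handled by rank 1.
assert map_partid_rank(5, world_size=2) == 1

# Vectorized: an array of owner (partition) ids maps elementwise to ranks.
owner_ids = np.array([0, 1, 2, 3, 4, 5])
print(map_partid_rank(owner_ids, world_size=2))  # [0 1 0 1 0 1]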