Unverified Commit cf19254a authored by Rhett Ying, committed by GitHub

[Dist] enable to partition many chunks into less partitions via pipeline (#4620)

* [Dist] enable to partition many chunks into less partitions via pipeline

* refine

* add meta file for num_parts, add more tests, refine docstring

* remove args.num_parts

* create pydantic class for partition metadata

* refine

* rename json file
Parent commit: 6c1500d4
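For context, here is a minimal sketch of the pipeline this change targets, assuming chunked graph data already produced by chunk_graph; the paths and counts are illustrative, and the dispatch command is abbreviated just as it is in the test below.

import os

num_chunks = 8   # number of chunks the graph was split into
num_parts = 3    # desired number of partitions; may now be smaller than num_chunks

in_dir = 'chunked-data'    # chunked graph data (illustrative path)
part_dir = 'parted_data'   # partition assignments plus partition_meta.json

# Step 1: partition assignment. random_partition.py now also dumps
# partition_meta.json recording num_parts and the algorithm name.
os.system('python3 tools/partition_algo/random_partition.py '
          f'--in_dir {in_dir} --out_dir {part_dir} --num_partitions {num_parts}')

# Step 2: data dispatch. dispatch_data.py reads num_parts back from
# partition_meta.json instead of assuming one partition per chunk.
cmd = f'python3 tools/dispatch_data.py --in-dir {in_dir}'
# ... remaining flags (partitions dir, output dir, ip config) as built up in the test.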
import argparse
import dgl
import json
import numpy as np
import os
import sys
import tempfile
import torch
import pytest, unittest
import dgl
from dgl.data.utils import load_tensors, load_graphs
from chunk_graph import chunk_graph
def test_part_pipeline():
@pytest.mark.parametrize("num_chunks", [1, 2, 3, 4, 8])
@pytest.mark.parametrize("num_parts", [1, 2, 3, 4, 8])
def test_part_pipeline(num_chunks, num_parts):
if num_chunks < num_parts:
# num_parts should be less than or equal to num_chunks
return
# Step0: prepare chunked graph data format
# A synthetic mini MAG240
num_institutions = 20
num_authors = 100
num_papers = 600
num_institutions = 1200
num_authors = 1200
num_papers = 1200
def rand_edges(num_src, num_dst, num_edges):
eids = np.random.choice(num_src * num_dst, num_edges, replace=False)
......@@ -26,9 +29,9 @@ def test_part_pipeline():
return src, dst
num_cite_edges = 2000
num_write_edges = 1000
num_affiliate_edges = 200
num_cite_edges = 24 * 1000
num_write_edges = 12 * 1000
num_affiliate_edges = 2400
# Structure
data_dict = {
......@@ -85,7 +88,6 @@ def test_part_pipeline():
np.save(f, write_year)
output_dir = os.path.join(root_dir, 'chunked-data')
num_chunks = 2
chunk_graph(
g,
'mag240m',
......@@ -159,10 +161,10 @@ def test_part_pipeline():
# Step1: graph partition
in_dir = os.path.join(root_dir, 'chunked-data')
output_dir = os.path.join(root_dir, '2parts')
output_dir = os.path.join(root_dir, 'parted_data')
os.system('python3 tools/partition_algo/random_partition.py '\
'--in_dir {} --out_dir {} --num_partitions {}'.format(
in_dir, output_dir, num_chunks))
in_dir, output_dir, num_parts))
for ntype in ['author', 'institution', 'paper']:
fname = os.path.join(output_dir, '{}.txt'.format(ntype))
with open(fname, 'r') as f:
......@@ -170,12 +172,12 @@ def test_part_pipeline():
assert isinstance(int(header), int)
# Step2: data dispatch
partition_dir = os.path.join(root_dir, '2parts')
partition_dir = os.path.join(root_dir, 'parted_data')
out_dir = os.path.join(root_dir, 'partitioned')
ip_config = os.path.join(root_dir, 'ip_config.txt')
with open(ip_config, 'w') as f:
f.write('127.0.0.1\n')
f.write('127.0.0.2\n')
for i in range(num_parts):
f.write(f'127.0.0.{i + 1}\n')
cmd = 'python3 tools/dispatch_data.py'
cmd += f' --in-dir {in_dir}'
......@@ -195,19 +197,19 @@ def test_part_pipeline():
all_etypes = ['affiliated_with', 'writes', 'cites', 'rev_writes']
for etype in all_etypes:
assert len(meta_data['edge_map'][etype]) == num_chunks
assert len(meta_data['edge_map'][etype]) == num_parts
assert meta_data['etypes'].keys() == set(all_etypes)
assert meta_data['graph_name'] == 'mag240m'
all_ntypes = ['author', 'institution', 'paper']
for ntype in all_ntypes:
assert len(meta_data['node_map'][ntype]) == num_chunks
assert len(meta_data['node_map'][ntype]) == num_parts
assert meta_data['ntypes'].keys() == set(all_ntypes)
assert meta_data['num_edges'] == 4200
assert meta_data['num_nodes'] == 720
assert meta_data['num_parts'] == num_chunks
assert meta_data['num_edges'] == g.num_edges()
assert meta_data['num_nodes'] == g.num_nodes()
assert meta_data['num_parts'] == num_parts
for i in range(num_chunks):
for i in range(num_parts):
sub_dir = 'part-' + str(i)
assert meta_data[sub_dir]['node_feats'] == 'part{}/node_feat.dgl'.format(i)
assert meta_data[sub_dir]['edge_feats'] == 'part{}/edge_feat.dgl'.format(i)
......@@ -251,5 +253,3 @@ def test_part_pipeline():
orig_eids = load_tensors(fname)
assert len(orig_eids.keys()) == 4
if __name__ == '__main__':
test_part_pipeline()
......@@ -4,6 +4,7 @@ import sys
import argparse
import logging
import json
from partition_algo.base import load_partition_meta
INSTALL_DIR = os.path.abspath(os.path.join(__file__, '..'))
LAUNCH_SCRIPT = "distgraphlaunch.py"
......@@ -33,16 +34,29 @@ def get_launch_cmd(args) -> str:
def submit_jobs(args) -> str:
wrapper_command = os.path.join(INSTALL_DIR, LAUNCH_SCRIPT)
# Read the JSON file and get the remaining arguments here.
schema_path = "metadata.json"
with open(os.path.join(args.in_dir, schema_path)) as schema:
schema_map = json.load(schema)
num_parts = len(schema_map["num_nodes_per_chunk"][0])
graph_name = schema_map["graph_name"]
# retrieve num_parts
num_chunks = len(schema_map["num_nodes_per_chunk"][0])
num_parts = num_chunks
partition_path = os.path.join(args.partitions_dir, "partition_meta.json")
if os.path.isfile(partition_path):
part_meta = load_partition_meta(partition_path)
num_parts = part_meta.num_parts
if num_parts > num_chunks:
raise Exception('Number of partitions should be less than or equal to the number of chunks.')
# verify ip_config
with open(args.ip_config, 'r') as f:
num_ips = len(f.readlines())
assert num_ips == num_parts, \
f'The number of lines in {args.ip_config} should be equal to num_parts[{num_parts}].'
argslist = ""
argslist += "--world-size {} ".format(num_parts)
argslist += "--partitions-dir {} ".format(os.path.abspath(args.partitions_dir))
......
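To make the new control flow easier to follow, here is a hedged, standalone sketch of how dispatch_data.py now resolves num_parts: it defaults to one partition per chunk based on metadata.json, then overrides that value from partition_meta.json when the partition algorithm produced one. The helper name below is illustrative, not part of dispatch_data.py.

import json
import os

from partition_algo.base import load_partition_meta

def resolve_num_parts(in_dir, partitions_dir):
    # Default: one partition per chunk, the behavior before this change.
    with open(os.path.join(in_dir, 'metadata.json')) as f:
        schema_map = json.load(f)
    num_chunks = len(schema_map['num_nodes_per_chunk'][0])
    num_parts = num_chunks
    # Override with the value dumped by the partition algorithm, if present.
    meta_path = os.path.join(partitions_dir, 'partition_meta.json')
    if os.path.isfile(meta_path):
        num_parts = load_partition_meta(meta_path).num_parts
    if num_parts > num_chunks:
        raise ValueError('Number of partitions should be less than or equal to the number of chunks.')
    return num_parts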
......@@ -104,10 +104,11 @@ def create_dgl_object(schema, part_id, node_data, edge_data, edgeid_offset,
"""
#create auxiliary data structures from the schema object
memory_snapshot("CreateDGLObj_Begin", part_id)
ntid_dict, global_nid_ranges = get_idranges(schema[constants.STR_NODE_TYPE],
schema[constants.STR_NUM_NODES_PER_CHUNK])
_, global_nid_ranges = get_idranges(schema[constants.STR_NODE_TYPE],
schema[constants.STR_NUM_NODES_PER_CHUNK])
memory_snapshot("CreateDGLObj_Begin", part_id)
etid_dict, global_eid_ranges = get_idranges(schema[constants.STR_EDGE_TYPE],
_, global_eid_ranges = get_idranges(schema[constants.STR_EDGE_TYPE],
schema[constants.STR_NUM_EDGES_PER_CHUNK])
id_map = dgl.distributed.id_map.IdMap(global_nid_ranges)
......@@ -119,7 +120,6 @@ def create_dgl_object(schema, part_id, node_data, edge_data, edgeid_offset,
ntypes_map = {e: i for i, e in enumerate(ntypes)}
etypes = [(key, global_eid_ranges[key][0, 0]) for key in global_eid_ranges]
etypes.sort(key=lambda e: e[1])
etype_offset_np = np.array([e[1] for e in etypes])
etypes = [e[0] for e in etypes]
etypes_map = {e.split(":")[1]: i for i, e in enumerate(etypes)}
......
......@@ -97,7 +97,8 @@ def gen_node_data(rank, world_size, id_lookup, ntid_ntype_map, schema_map):
}
type_nid_dict, global_nid_dict = get_idranges(schema_map[constants.STR_NODE_TYPE],
schema_map[constants.STR_NUM_NODES_PER_CHUNK])
schema_map[constants.STR_NUM_NODES_PER_CHUNK],
num_chunks=world_size)
for ntype_id, ntype_name in ntid_ntype_map.items():
type_start, type_end = type_nid_dict[ntype_name][0][0], type_nid_dict[ntype_name][-1][1]
......@@ -290,7 +291,7 @@ def exchange_node_features(rank, world_size, node_feature_tids, ntype_gnid_map,
return own_node_features, own_global_nids
def exchange_graph_data(rank, world_size, node_features, node_feat_tids, edge_data,
id_lookup, ntypes_ntypeid_map, ntypes_gnid_range_map, ntid_ntype_map, schema_map):
id_lookup, ntypes_gnid_range_map, ntid_ntype_map, schema_map):
"""
Wrapper function which is used to shuffle graph data on all the processes.
......@@ -556,8 +557,8 @@ def gen_dist_partitions(rank, world_size, params):
#and return the aggregated data
ntypes_gnid_range_map = get_gnid_range_map(node_tids)
node_data, rcvd_node_features, rcvd_global_nids, edge_data = \
exchange_graph_data(rank, world_size, node_features, node_feat_tids, \
edge_data, id_lookup, ntypes_ntypeid_map, ntypes_gnid_range_map, \
exchange_graph_data(rank, world_size, node_features, node_feat_tids,
edge_data, id_lookup, ntypes_gnid_range_map,
ntypeid_ntypes_map, schema_map)
gc.collect()
logging.info(f'[Rank: {rank}] Done with data shuffling...')
......
......@@ -110,18 +110,18 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
#where key: feature_name, value: dictionary in which keys are "format", "data"
node_feature_tids[ntype_name] = []
for feat_name, feat_data in ntype_feature_data.items():
assert len(feat_data[constants.STR_DATA]) == world_size
assert feat_data[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_NUMPY
my_feat_data_fname = feat_data[constants.STR_DATA][rank] #this will be just the file name
if (os.path.isabs(my_feat_data_fname)):
logging.info(f'Loading numpy from {my_feat_data_fname}')
node_features[ntype_name+'/'+feat_name] = \
torch.from_numpy(np.load(my_feat_data_fname))
else:
numpy_path = os.path.join(input_dir, my_feat_data_fname)
logging.info(f'Loading numpy from {numpy_path}')
node_features[ntype_name+'/'+feat_name] = \
torch.from_numpy(np.load(numpy_path))
num_chunks = len(feat_data[constants.STR_DATA])
read_list = np.array_split(np.arange(num_chunks), world_size)
nfeat = []
for idx in read_list[rank]:
nfeat_file = feat_data[constants.STR_DATA][idx]
if not os.path.isabs(nfeat_file):
nfeat_file = os.path.join(input_dir, nfeat_file)
logging.info(f'Loading node feature[{feat_name}] of ntype[{ntype_name}] from {nfeat_file}')
nfeat.append(np.load(nfeat_file))
nfeat = np.concatenate(nfeat)
node_features[ntype_name + '/' + feat_name] = torch.from_numpy(nfeat)
node_feature_tids[ntype_name].append([feat_name, -1, -1])
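The chunk-to-rank assignment above relies on np.array_split handing each rank a contiguous, nearly even slice of the chunk files, so every chunk is read exactly once even when there are more chunks than ranks. A small illustration with made-up sizes:

import numpy as np

num_chunks, world_size = 8, 3
read_list = np.array_split(np.arange(num_chunks), world_size)
# read_list == [array([0, 1, 2]), array([3, 4, 5]), array([6, 7])]
for rank, chunk_ids in enumerate(read_list):
    # Rank `rank` loads and concatenates the chunk files at these indices.
    print(rank, chunk_ids.tolist())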
......@@ -152,7 +152,8 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
#read my nodes for each node type
node_tids, ntype_gnid_offset = get_idranges(schema_map[constants.STR_NODE_TYPE],
schema_map[constants.STR_NUM_NODES_PER_CHUNK])
schema_map[constants.STR_NUM_NODES_PER_CHUNK],
num_chunks=world_size)
for ntype_name in schema_map[constants.STR_NODE_TYPE]:
if ntype_name in node_feature_tids:
for item in node_feature_tids[ntype_name]:
......@@ -211,8 +212,9 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
#read my edges for each edge type
etype_names = schema_map[constants.STR_EDGE_TYPE]
etype_name_idmap = {e : idx for idx, e in enumerate(etype_names)}
edge_tids, _ = get_idranges(schema_map[constants.STR_EDGE_TYPE],
schema_map[constants.STR_NUM_EDGES_PER_CHUNK])
edge_tids, _ = get_idranges(schema_map[constants.STR_EDGE_TYPE],
schema_map[constants.STR_NUM_EDGES_PER_CHUNK],
num_chunks=world_size)
edge_datadict = {}
edge_data = schema_map[constants.STR_EDGES]
......@@ -226,26 +228,38 @@ def get_dataset(input_dir, graph_name, rank, world_size, schema_map):
assert etype_info[constants.STR_FORMAT][constants.STR_NAME] == constants.STR_CSV
edge_info = etype_info[constants.STR_DATA]
assert len(edge_info) == world_size
#edgetype strings are in canonical format, src_node_type:edge_type:dst_node_type
tokens = etype_name.split(":")
assert len(tokens) == 3
src_ntype_name = tokens[0]
rel_name = tokens[1]
dst_ntype_name = tokens[2]
logging.info(f'Reading csv files from {edge_info[rank]}')
data_df = csv.read_csv(edge_info[rank], read_options=pyarrow.csv.ReadOptions(autogenerate_column_names=True),
parse_options=pyarrow.csv.ParseOptions(delimiter=' '))
num_chunks = len(edge_info)
read_list = np.array_split(np.arange(num_chunks), world_size)
src_ids = []
dst_ids = []
for idx in read_list[rank]:
edge_file = edge_info[idx]
if not os.path.isabs(edge_file):
edge_file = os.path.join(input_dir, edge_file)
logging.info(f'Loading edges of etype[{etype_name}] from {edge_file}')
data_df = csv.read_csv(edge_file,
read_options=pyarrow.csv.ReadOptions(autogenerate_column_names=True),
parse_options=pyarrow.csv.ParseOptions(delimiter=' '))
src_ids.append(data_df['f0'].to_numpy())
dst_ids.append(data_df['f1'].to_numpy())
src_ids = np.concatenate(src_ids)
dst_ids = np.concatenate(dst_ids)
#currently these are just type_edge_ids... which will be converted to global ids
edge_datadict[constants.GLOBAL_SRC_ID].append(data_df['f0'].to_numpy() + ntype_gnid_offset[src_ntype_name][0, 0])
edge_datadict[constants.GLOBAL_DST_ID].append(data_df['f1'].to_numpy() + ntype_gnid_offset[dst_ntype_name][0, 0])
edge_datadict[constants.GLOBAL_SRC_ID].append(src_ids + ntype_gnid_offset[src_ntype_name][0, 0])
edge_datadict[constants.GLOBAL_DST_ID].append(dst_ids + ntype_gnid_offset[dst_ntype_name][0, 0])
edge_datadict[constants.GLOBAL_TYPE_EID].append(np.arange(edge_tids[etype_name][rank][0],\
edge_tids[etype_name][rank][1] ,dtype=np.int64))
edge_datadict[constants.ETYPE_ID].append(etype_name_idmap[etype_name] * \
np.ones(shape=(data_df['f0'].to_numpy().shape), dtype=np.int64))
np.ones(shape=(src_ids.shape), dtype=np.int64))
#stitch together to create the final data on the local machine
for col in [constants.GLOBAL_SRC_ID, constants.GLOBAL_DST_ID, constants.GLOBAL_TYPE_EID, constants.ETYPE_ID]:
......
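The per-type source and destination IDs read from the edge CSVs are shifted into the global node-ID space by adding the first global ID of the corresponding node type (the ntype_gnid_offset[...][0, 0] term above). A small worked illustration with made-up ranges:

import numpy as np

# Hypothetical global node-ID ranges, shaped [1, 2] as returned by get_idranges():
# authors occupy global IDs [0, 100), papers occupy [100, 700).
ntype_gnid_offset = {
    'author': np.array([[0, 100]]),
    'paper': np.array([[100, 700]]),
}

# Per-type IDs read from a chunk of 'author:writes:paper' edges.
src_type_ids = np.array([3, 7])    # author IDs within the author type
dst_type_ids = np.array([10, 42])  # paper IDs within the paper type

global_src = src_type_ids + ntype_gnid_offset['author'][0, 0]  # -> [3, 7]
global_dst = dst_type_ids + ntype_gnid_offset['paper'][0, 0]   # -> [110, 142]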
......@@ -85,15 +85,9 @@ def get_ntype_featnames(ntype_name, schema_map):
list :
a list of feature names for a given node_type
"""
ntype_featdict = schema_map[constants.STR_NODE_DATA]
if (ntype_name in ntype_featdict):
featnames = []
ntype_info = ntype_featdict[ntype_name]
for k, v in ntype_info.items():
featnames.append(k)
return featnames
else:
return []
node_data = schema_map[constants.STR_NODE_DATA]
feats = node_data.get(ntype_name, {})
return [feat for feat in feats]
def get_node_types(schema_map):
"""
......@@ -393,7 +387,7 @@ def write_dgl_objects(graph_obj, node_features, edge_features,
orig_eids_file = os.path.join(part_dir, 'orig_eids.dgl')
dgl.data.utils.save_tensors(orig_eids_file, orig_eids)
def get_idranges(names, counts):
def get_idranges(names, counts, num_chunks=None):
"""
Utility function to compute type_id/global_id ranges for both nodes and edges.
......@@ -403,6 +397,11 @@ def get_idranges(names, counts):
list of node/edge types as strings
counts : list of lists
each list contains no. of nodes/edges in a given chunk
num_chunks : int, optional
In the distributed partition pipeline, ID ranges are grouped into chunks.
In some scenarios, we'd like to merge ID ranges into a specific number
of chunks. This parameter indicates the expected number of chunks.
If not specified, no merge is applied.
Returns:
--------
......@@ -418,21 +417,33 @@ def get_idranges(names, counts):
gnid_end = gnid_start
tid_dict = {}
gid_dict = {}
orig_num_chunks = 0
for idx, typename in enumerate(names):
type_counts = counts[idx]
tid_start = np.cumsum([0] + type_counts[:-1])
tid_end = np.cumsum(type_counts)
tid_ranges = list(zip(tid_start, tid_end))
type_start = tid_ranges[0][0]
type_end = tid_ranges[-1][1]
gnid_end += tid_ranges[-1][1]
tid_dict[typename] = tid_ranges
gid_dict[typename] = np.array([gnid_start, gnid_end]).reshape([1,2])
gnid_start = gnid_end
orig_num_chunks = len(tid_start)
if num_chunks is None:
return tid_dict, gid_dict
assert num_chunks <= orig_num_chunks, \
'Specified number of chunks should be less than or equal to the original number of ID ranges.'
chunk_list = np.array_split(np.arange(orig_num_chunks), num_chunks)
for typename in tid_dict:
orig_tid_ranges = tid_dict[typename]
tid_ranges = []
for idx in chunk_list:
tid_ranges.append((orig_tid_ranges[idx[0]][0], orig_tid_ranges[idx[-1]][-1]))
tid_dict[typename] = tid_ranges
return tid_dict, gid_dict
......
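A worked illustration of the new merging behavior in get_idranges, with made-up counts: three chunks of one type holding 4, 5 and 6 items yield per-chunk type-ID ranges (0, 4), (4, 9) and (9, 15); requesting num_chunks=2 groups the original chunks with np.array_split and keeps each group's first start and last end.

import numpy as np

counts = [4, 5, 6]                           # items per chunk for one type
tid_start = np.cumsum([0] + counts[:-1])     # -> [0, 4, 9]
tid_end = np.cumsum(counts)                  # -> [4, 9, 15]
tid_ranges = list(zip(tid_start, tid_end))   # -> [(0, 4), (4, 9), (9, 15)]

# Merge 3 per-chunk ranges into 2 chunks, mirroring get_idranges(..., num_chunks=2).
chunk_list = np.array_split(np.arange(len(tid_ranges)), 2)  # -> [[0, 1], [2]]
merged = [(int(tid_ranges[idx[0]][0]), int(tid_ranges[idx[-1]][-1]))
          for idx in chunk_list]
print(merged)  # -> [(0, 9), (9, 15)]: fewer, wider ranges, one per output chunk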
from typing import Optional
import pydantic as dt
import json
from dgl import DGLError
class PartitionMeta(dt.BaseModel):
""" Metadata that describes the partition assignment results.
Regardless of the choice of partitioning algorithm, a metadata JSON file
is created in the output directory to record meta information about the
partition assignment (version, number of partitions, algorithm name).
To generate a metadata JSON:
>>> part_meta = PartitionMeta(version='1.0.0', num_parts=4, algo_name='random')
>>> with open('metadata.json', 'w') as f:
... json.dump(part_meta.dict(), f)
To read a metadata JSON:
>>> with open('metadata.json') as f:
... part_meta = PartitionMeta(**(json.load(f)))
"""
# version of metadata JSON.
version: Optional[str] = '1.0.0'
# number of partitions.
num_parts: int
# name of partition algorithm.
algo_name: str
def dump_partition_meta(part_meta, meta_file):
""" Dump partition metadata into json file.
Parameters
----------
part_meta : PartitionMeta
The partition metadata.
meta_file : str
The target file to save data.
"""
with open(meta_file, 'w') as f:
json.dump(part_meta.dict(), f, sort_keys=True, indent=4)
def load_partition_meta(meta_file):
""" Load partition metadata and do sanity check.
Parameters
----------
meta_file : str
The path of the partition metadata file.
Returns
-------
PartitionMeta
The partition metadata.
"""
with open(meta_file) as f:
try:
part_meta = PartitionMeta(**(json.load(f)))
except dt.ValidationError as e:
raise DGLError(
f"Invalid partition metadata JSON. Error details: {e.json()}")
if part_meta.version != '1.0.0':
raise DGLError(
f"Invalid version[{part_meta.version}]. Supported versions: '1.0.0'")
if part_meta.num_parts <= 0:
raise DGLError(
f"num_parts[{part_meta.num_parts}] should be greater than 0.")
if part_meta.algo_name not in ['random', 'metis']:
raise DGLError(
f"algo_name[{part_meta.num_parts}] is not supported.")
return part_meta
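A quick round-trip example of the helpers above, assuming tools/ is on PYTHONPATH so that partition_algo.base is importable, as in dispatch_data.py:

import os
import tempfile

from partition_algo.base import (PartitionMeta, dump_partition_meta,
                                 load_partition_meta)

with tempfile.TemporaryDirectory() as root:
    meta_file = os.path.join(root, 'partition_meta.json')
    dump_partition_meta(
        PartitionMeta(version='1.0.0', num_parts=4, algo_name='random'),
        meta_file)
    # load_partition_meta() validates version, num_parts and algo_name.
    part_meta = load_partition_meta(meta_file)
    assert part_meta.num_parts == 4 and part_meta.algo_name == 'random'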
......@@ -8,6 +8,7 @@ import argparse
from utils import setdir
from utils import array_readwriter
from base import PartitionMeta, dump_partition_meta
def _random_partition(metadata, num_parts):
num_nodes_per_type = [sum(_) for _ in metadata['num_nodes_per_chunk']]
......@@ -26,9 +27,12 @@ def random_partition(metadata, num_parts, output_path):
mapping files named "<node-type>.txt" (e.g. "author.txt", "paper.txt" and
"institution.txt" for OGB-MAG240M). Each file contains one line per node representing
the partition ID the node belongs to.
In addition, a partition metadata JSON file, which includes the version, the number of partitions and the algorithm name, is dumped.
"""
with setdir(output_path):
_random_partition(metadata, num_parts)
part_meta = PartitionMeta(version='1.0.0', num_parts=num_parts, algo_name='random')
dump_partition_meta(part_meta, 'partition_meta.json')
# Run with PYTHONPATH=${GIT_ROOT_DIR}/tools
# where ${GIT_ROOT_DIR} is the directory to the DGL git repository.
......
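For reference, a hedged sketch of what the output directory should contain after random partitioning of the synthetic MAG graph used in the test; the JSON keys follow the PartitionMeta fields and dump_partition_meta's formatting.

# Expected layout of the --out_dir passed to random_partition.py (illustrative):
#
#   parted_data/
#     author.txt            # one line per author node: its partition ID
#     institution.txt
#     paper.txt
#     partition_meta.json   # {"algo_name": "random", "num_parts": 3, "version": "1.0.0"}
#
# The dispatch step later picks num_parts up from partition_meta.json.
import json
with open('parted_data/partition_meta.json') as f:
    print(json.load(f)['num_parts'])  # -> 3 in this illustration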