Unverified Commit 4be4b134 authored by Da Zheng's avatar Da Zheng Committed by GitHub
Browse files

[Distributed] add copy_partitions.py (#1866)



* fix bugs.

* eval on both validation and testing.

* add script.

* update.

* update launch.

* make train_dist.py independent.

* update readme.

* update readme.

* update readme.

* update readme.

* generate undirected graph.

* rename conf_file to part_config

* use rsync

* make train_dist independent.
Co-authored-by: default avatarUbuntu <ubuntu@ip-172-31-19-1.us-west-2.compute.internal>
Co-authored-by: default avatarUbuntu <ubuntu@ip-172-31-19-115.us-west-2.compute.internal>
Co-authored-by: default avatarxiang song(charlie.song) <classicxsong@gmail.com>
parent 8b64037e
...@@ -21,22 +21,33 @@ python3 partition_graph.py --dataset ogb-product --num_parts 4 --balance_train - ...@@ -21,22 +21,33 @@ python3 partition_graph.py --dataset ogb-product --num_parts 4 --balance_train -
### Step 2: copy the partitioned data to the cluster ### Step 2: copy the partitioned data to the cluster
When copying data to the cluster, we recommend users to copy the partitioned data to NFS so that all worker machines DGL provides a script for copying partitioned data to the cluster. The command below copies partition data
will be able to access the partitioned data. to the machines in the cluster. The configuration of the cluster is defined by `ip_config.txt`,
The data is copied to `~/graphsage/ogb-product` on each of the remote machines. `--part_config`
specifies the location of the partitioned data on the local machine (a user only needs to specify
the location of the partition configuration file).
```bash
python3 ~/dgl/tools/copy_partitions.py --ip_config ip_config.txt \
--workspace ~/graphsage --rel_data_path ogb-product \
--part_config data/ogb-product.json
```
### Step 3: Launch distributed jobs **Note**: users need to make sure that the master node has right permission to ssh to all the other nodes.
First make sure that the master node has the right permission to ssh to all the other nodes. Change the `ip_config.txt` file by using your own instance IP. Users need to copy the training script to the workspace directory on remote machines as well.
### Step 3: Launch distributed jobs
Then run script: DGL provides a script to launch the training job in the cluster. `part_config` and `ip_config`
specify paths relative to the workspace directory.
```bash ```bash
python3 ~/dgl/tools/launch.py \ python3 ~/dgl/tools/launch.py \
--workspace ~/dgl/examples/pytorch/graphsage/experimental \ --workspace ~/graphsage/ \
--num_client 4 \ --num_client 4 \
--conf_path data/ogb-product.json \ --part_config ogb-product/ogb-product.json \
--ip_config ip_config.txt \ --ip_config ip_config.txt \
"python3 train_dist.py --graph-name ogb-product --ip_config ip_config.txt --num-epochs 30 --batch-size 1000 --lr 0.1 --num-client 4" "python3 train_dist.py --graph-name ogb-product --ip_config ip_config.txt --num-epochs 30 --batch-size 1000"
``` ```
## Distributed code runs in the standalone mode ## Distributed code runs in the standalone mode
......
...@@ -12,8 +12,12 @@ if __name__ == '__main__': ...@@ -12,8 +12,12 @@ if __name__ == '__main__':
help='datasets: reddit, ogb-product, ogb-paper100M') help='datasets: reddit, ogb-product, ogb-paper100M')
argparser.add_argument('--num_parts', type=int, default=4, argparser.add_argument('--num_parts', type=int, default=4,
help='number of partitions') help='number of partitions')
argparser.add_argument('--part_method', type=str, default='metis',
help='the partition method')
argparser.add_argument('--balance_train', action='store_true', argparser.add_argument('--balance_train', action='store_true',
help='balance the training size in each partition.') help='balance the training size in each partition.')
argparser.add_argument('--undirected', action='store_true',
help='turn the graph into an undirected graph.')
argparser.add_argument('--balance_edges', action='store_true', argparser.add_argument('--balance_edges', action='store_true',
help='balance the number of edges in each partition.') help='balance the number of edges in each partition.')
args = argparser.parse_args() args = argparser.parse_args()
...@@ -34,6 +38,14 @@ if __name__ == '__main__': ...@@ -34,6 +38,14 @@ if __name__ == '__main__':
balance_ntypes = g.ndata['train_mask'] balance_ntypes = g.ndata['train_mask']
else: else:
balance_ntypes = None balance_ntypes = None
if args.undirected:
sym_g = dgl.to_bidirected_stale(g, readonly=True)
for key in g.ndata:
sym_g.ndata[key] = g.ndata[key]
g = sym_g
dgl.distributed.partition_graph(g, args.dataset, args.num_parts, 'data', dgl.distributed.partition_graph(g, args.dataset, args.num_parts, 'data',
part_method=args.part_method,
balance_ntypes=balance_ntypes, balance_ntypes=balance_ntypes,
balance_edges=args.balance_edges) balance_edges=args.balance_edges)
...@@ -21,8 +21,6 @@ import torch.multiprocessing as mp ...@@ -21,8 +21,6 @@ import torch.multiprocessing as mp
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from pyinstrument import Profiler from pyinstrument import Profiler
from train_sampling import run, SAGE, compute_acc, evaluate, load_subtensor
class NeighborSampler(object): class NeighborSampler(object):
def __init__(self, g, fanouts, sample_neighbors): def __init__(self, g, fanouts, sample_neighbors):
self.g = g self.g = g
...@@ -43,11 +41,29 @@ class NeighborSampler(object): ...@@ -43,11 +41,29 @@ class NeighborSampler(object):
blocks.insert(0, block) blocks.insert(0, block)
return blocks return blocks
class DistSAGE(SAGE): class DistSAGE(nn.Module):
def __init__(self, in_feats, n_hidden, n_classes, n_layers, def __init__(self, in_feats, n_hidden, n_classes, n_layers,
activation, dropout): activation, dropout):
super(DistSAGE, self).__init__(in_feats, n_hidden, n_classes, n_layers, super().__init__()
activation, dropout) self.n_layers = n_layers
self.n_hidden = n_hidden
self.n_classes = n_classes
self.layers = nn.ModuleList()
self.layers.append(dglnn.SAGEConv(in_feats, n_hidden, 'mean'))
for i in range(1, n_layers - 1):
self.layers.append(dglnn.SAGEConv(n_hidden, n_hidden, 'mean'))
self.layers.append(dglnn.SAGEConv(n_hidden, n_classes, 'mean'))
self.dropout = nn.Dropout(dropout)
self.activation = activation
def forward(self, blocks, x):
h = x
for l, (layer, block) in enumerate(zip(self.layers, blocks)):
h = layer(block, h)
if l != len(self.layers) - 1:
h = self.activation(h)
h = self.dropout(h)
return h
def inference(self, g, x, batch_size, device): def inference(self, g, x, batch_size, device):
""" """
...@@ -100,9 +116,40 @@ class DistSAGE(SAGE): ...@@ -100,9 +116,40 @@ class DistSAGE(SAGE):
g.barrier() g.barrier()
return y return y
def compute_acc(pred, labels):
    """
    Return the fraction of predictions that match the labels.

    pred : 2-D tensor of scores; the predicted class of a row is its argmax along dim 1.
    labels : tensor of target classes, cast to int64 before the comparison.
    """
    predicted = th.argmax(pred, dim=1)
    hits = (predicted == labels.long()).float()
    return hits.sum() / len(pred)
def evaluate(model, g, inputs, labels, val_nid, test_nid, batch_size, device):
    """
    Evaluate the model on the validation and test sets.

    model : the model; its ``inference`` method is run over the graph.
    g : The entire graph.
    inputs : The features of all the nodes.
    labels : The labels of all the nodes.
    val_nid : the node Ids for validation.
    test_nid : the node Ids for testing.
    batch_size : Number of nodes to compute at the same time.
    device : The device handed to ``model.inference``.

    Returns a pair ``(validation accuracy, test accuracy)``.
    """
    # Inference runs in eval mode without gradient tracking; the model is
    # switched back to training mode before the accuracies are computed.
    model.eval()
    with th.no_grad():
        pred = model.inference(g, inputs, batch_size, device)
    model.train()

    val_acc = compute_acc(pred[val_nid], labels[val_nid])
    test_acc = compute_acc(pred[test_nid], labels[test_nid])
    return val_acc, test_acc
def load_subtensor(g, seeds, input_nodes, device):
    """
    Copy the features and labels of a set of nodes onto ``device``.

    The features are read for ``input_nodes`` and the labels for ``seeds``;
    both come from ``g.ndata``. Returns ``(batch_inputs, batch_labels)``.
    """
    node_data = g.ndata
    return (node_data['features'][input_nodes].to(device),
            node_data['labels'][seeds].to(device))
def run(args, device, data): def run(args, device, data):
# Unpack data # Unpack data
train_nid, val_nid, in_feats, n_classes, g = data train_nid, val_nid, test_nid, in_feats, n_classes, g = data
# Create sampler # Create sampler
sampler = NeighborSampler(g, [int(fanout) for fanout in args.fan_out.split(',')], sampler = NeighborSampler(g, [int(fanout) for fanout in args.fan_out.split(',')],
dgl.distributed.sample_neighbors) dgl.distributed.sample_neighbors)
...@@ -204,9 +251,10 @@ def run(args, device, data): ...@@ -204,9 +251,10 @@ def run(args, device, data):
if epoch % args.eval_every == 0 and epoch != 0: if epoch % args.eval_every == 0 and epoch != 0:
start = time.time() start = time.time()
eval_acc = evaluate(model.module, g, g.ndata['features'], val_acc, test_acc = evaluate(model.module, g, g.ndata['features'],
g.ndata['labels'], val_nid, args.batch_size_eval, device) g.ndata['labels'], val_nid, test_nid, args.batch_size_eval, device)
print('Part {}, Eval Acc {:.4f}, time: {:.4f}'.format(g.rank(), eval_acc, time.time() - start)) print('Part {}, Val Acc {:.4f}, Test Acc {:.4f}, time: {:.4f}'.format(g.rank(), val_acc, test_acc,
time.time() - start))
profiler.stop() profiler.stop()
print(profiler.output_text(unicode=True, color=True)) print(profiler.output_text(unicode=True, color=True))
...@@ -217,7 +265,7 @@ def run(args, device, data): ...@@ -217,7 +265,7 @@ def run(args, device, data):
def main(args): def main(args):
if not args.standalone: if not args.standalone:
th.distributed.init_process_group(backend='gloo') th.distributed.init_process_group(backend='gloo')
g = dgl.distributed.DistGraph(args.ip_config, args.graph_name, conf_file=args.conf_path) g = dgl.distributed.DistGraph(args.ip_config, args.graph_name, part_config=args.conf_path)
print('rank:', g.rank()) print('rank:', g.rank())
pb = g.get_partition_book() pb = g.get_partition_book()
...@@ -236,7 +284,7 @@ def main(args): ...@@ -236,7 +284,7 @@ def main(args):
# Pack data # Pack data
in_feats = g.ndata['features'].shape[1] in_feats = g.ndata['features'].shape[1]
data = train_nid, val_nid, in_feats, n_classes, g data = train_nid, val_nid, test_nid, in_feats, n_classes, g
run(args, device, data) run(args, device, data)
print("parent ends") print("parent ends")
......
...@@ -207,17 +207,17 @@ class DistGraphServer(KVServer): ...@@ -207,17 +207,17 @@ class DistGraphServer(KVServer):
Path of IP configuration file. Path of IP configuration file.
num_clients : int num_clients : int
Total number of client nodes. Total number of client nodes.
conf_file : string part_config : string
The path of the config file generated by the partition tool. The path of the config file generated by the partition tool.
disable_shared_mem : bool disable_shared_mem : bool
Disable shared memory. Disable shared memory.
''' '''
def __init__(self, server_id, ip_config, num_clients, conf_file, disable_shared_mem=False): def __init__(self, server_id, ip_config, num_clients, part_config, disable_shared_mem=False):
super(DistGraphServer, self).__init__(server_id=server_id, ip_config=ip_config, super(DistGraphServer, self).__init__(server_id=server_id, ip_config=ip_config,
num_clients=num_clients) num_clients=num_clients)
self.ip_config = ip_config self.ip_config = ip_config
# Load graph partition data. # Load graph partition data.
self.client_g, node_feats, edge_feats, self.gpb, graph_name = load_partition(conf_file, self.client_g, node_feats, edge_feats, self.gpb, graph_name = load_partition(part_config,
server_id) server_id)
print('load ' + graph_name) print('load ' + graph_name)
if not disable_shared_mem: if not disable_shared_mem:
...@@ -286,16 +286,16 @@ class DistGraph: ...@@ -286,16 +286,16 @@ class DistGraph:
The name of the graph. This name has to be the same as the one used in DistGraphServer. The name of the graph. This name has to be the same as the one used in DistGraphServer.
gpb : PartitionBook gpb : PartitionBook
The partition book object The partition book object
conf_file : str part_config : str
The partition config file. It's used in the standalone mode. The partition config file. It's used in the standalone mode.
''' '''
def __init__(self, ip_config, graph_name, gpb=None, conf_file=None): def __init__(self, ip_config, graph_name, gpb=None, part_config=None):
if os.environ.get('DGL_DIST_MODE', 'standalone') == 'standalone': if os.environ.get('DGL_DIST_MODE', 'standalone') == 'standalone':
assert conf_file is not None, \ assert part_config is not None, \
'When running in the standalone model, the partition config file is required' 'When running in the standalone model, the partition config file is required'
self._client = SA_KVClient() self._client = SA_KVClient()
# Load graph partition data. # Load graph partition data.
g, node_feats, edge_feats, self._gpb, _ = load_partition(conf_file, 0) g, node_feats, edge_feats, self._gpb, _ = load_partition(part_config, 0)
assert self._gpb.num_partitions() == 1, \ assert self._gpb.num_partitions() == 1, \
'The standalone mode can only work with the graph data with one partition' 'The standalone mode can only work with the graph data with one partition'
if self._gpb is None: if self._gpb is None:
......
...@@ -241,7 +241,7 @@ def test_standalone(): ...@@ -241,7 +241,7 @@ def test_standalone():
g.edata['features'] = F.unsqueeze(F.arange(0, g.number_of_edges()), 1) g.edata['features'] = F.unsqueeze(F.arange(0, g.number_of_edges()), 1)
partition_graph(g, graph_name, num_parts, '/tmp/dist_graph') partition_graph(g, graph_name, num_parts, '/tmp/dist_graph')
dist_g = DistGraph("kv_ip_config.txt", graph_name, dist_g = DistGraph("kv_ip_config.txt", graph_name,
conf_file='/tmp/dist_graph/{}.json'.format(graph_name)) part_config='/tmp/dist_graph/{}.json'.format(graph_name))
check_dist_graph(dist_g, g.number_of_nodes(), g.number_of_edges()) check_dist_graph(dist_g, g.number_of_nodes(), g.number_of_edges())
def test_split(): def test_split():
......
...@@ -136,7 +136,7 @@ def check_standalone_sampling(tmpdir): ...@@ -136,7 +136,7 @@ def check_standalone_sampling(tmpdir):
partition_graph(g, 'test_sampling', num_parts, tmpdir, partition_graph(g, 'test_sampling', num_parts, tmpdir,
num_hops=num_hops, part_method='metis', reshuffle=False) num_hops=num_hops, part_method='metis', reshuffle=False)
dist_graph = DistGraph(None, "test_sampling", conf_file=tmpdir / 'test_sampling.json') dist_graph = DistGraph(None, "test_sampling", part_config=tmpdir / 'test_sampling.json')
sampled_graph = sample_neighbors(dist_graph, [0, 10, 99, 66, 1024, 2008], 3) sampled_graph = sample_neighbors(dist_graph, [0, 10, 99, 66, 1024, 2008], 3)
src, dst = sampled_graph.edges() src, dst = sampled_graph.edges()
......
"""Copy the partitions to a cluster of machines."""
import os
import stat
import sys
import subprocess
import argparse
import signal
import logging
import json
import copy
def copy_file(file_name, ip, workspace):
    """Copy a local file into a directory on a remote machine with rsync over ssh.

    Parameters
    ----------
    file_name : str
        Path of the file on the local machine.
    ip : str
        Address of the remote machine.
    workspace : str
        Destination directory on the remote machine. A trailing ``/`` is
        appended, so the file keeps its local base name inside that directory.

    Raises
    ------
    subprocess.CalledProcessError
        If rsync exits with a non-zero status.
    """
    dest = ip + ':' + workspace + '/'
    print('copy {} to {}'.format(file_name, dest))
    # Pass the arguments as a list instead of a shell string so that paths
    # containing spaces or shell metacharacters reach rsync unmodified.
    cmd = ['rsync', '-e', 'ssh -o StrictHostKeyChecking=no', '-arvc', file_name, dest]
    subprocess.check_call(cmd)
def exec_cmd(ip, cmd):
    """Run a shell command on a remote machine through ssh.

    Parameters
    ----------
    ip : str
        Address of the remote machine.
    cmd : str
        Command line handed to ssh as a single argument; it is interpreted
        by the shell on the remote side.

    Raises
    ------
    subprocess.CalledProcessError
        If ssh (or the remote command) exits with a non-zero status.
    """
    # An argument list avoids the local shell entirely, so a quote character
    # inside ``cmd`` can no longer break the hand-built quoting.
    subprocess.check_call(['ssh', '-o', 'StrictHostKeyChecking=no', ip, cmd])
def _read_hosts(ip_config):
    """Return the address (first whitespace-separated field) of every non-blank line."""
    hosts = []
    with open(ip_config) as f:
        for line in f:
            fields = line.split()
            # Tolerate blank lines and repeated spaces/tabs between fields.
            if fields:
                hosts.append(fields[0])
    return hosts


def _build_remote_metadata(part_metadata, workspace, rel_data_path):
    """Return a copy of the partition metadata with paths rewritten for the remote layout."""
    remote_metadata = copy.deepcopy(part_metadata)
    for key in ('node_map', 'edge_map'):
        local_map = part_metadata[key]
        # A list means the map is stored inline in the config, not in a file.
        if isinstance(local_map, list):
            continue
        if not local_map.endswith('.npy'):
            raise ValueError(
                '{} should be stored in a NumPy array.'.format(key.replace('_', ' ')))
        # copy_file keeps the local base name, so the config must point at it.
        remote_metadata[key] = '{}/{}/{}'.format(workspace, rel_data_path,
                                                 os.path.basename(local_map))
    for part_id in range(part_metadata['num_parts']):
        part_files = remote_metadata['part-{}'.format(part_id)]
        part_files['edge_feats'] = '{}/part{}/edge_feat.dgl'.format(rel_data_path, part_id)
        part_files['node_feats'] = '{}/part{}/node_feat.dgl'.format(rel_data_path, part_id)
        part_files['part_graph'] = '{}/part{}/graph.dgl'.format(rel_data_path, part_id)
    return remote_metadata


def main():
    """Copy the partitions described by ``--part_config`` to the hosts in ``--ip_config``.

    Host ``i`` (the i-th non-blank line of the IP config) receives the IP
    config, a rewritten partition config, the node/edge map files (when they
    are stored as files) and the three data files of partition ``i``.

    Raises
    ------
    ValueError
        If the number of partitions differs from the number of hosts, or a
        node/edge map file does not have the ``.npy`` suffix.
    subprocess.CalledProcessError
        Propagated from ``exec_cmd`` / ``copy_file`` when ssh or rsync fails.
    """
    import tempfile

    parser = argparse.ArgumentParser(description='Copy data to the servers.')
    parser.add_argument('--workspace', type=str, required=True,
                        help='Path of user directory of distributed tasks. \
                        This is used to specify a destination location where \
                        data are copied to on remote machines.')
    parser.add_argument('--rel_data_path', type=str, required=True,
                        help='Relative path in workspace to store the partition data.')
    parser.add_argument('--part_config', type=str, required=True,
                        help='The partition config file. The path is on the local machine.')
    parser.add_argument('--ip_config', type=str, required=True,
                        help='The file of IP configuration for servers. \
                        The path is on the local machine.')
    args = parser.parse_args()

    hosts = _read_hosts(args.ip_config)

    with open(args.part_config) as conf_f:
        part_metadata = json.load(conf_f)
    num_parts = part_metadata['num_parts']
    if num_parts != len(hosts):
        raise ValueError(
            'The number of partitions needs to be the same as the number of hosts.')
    graph_name = part_metadata['graph_name']

    # We need to update the partition config file so that the paths match
    # the layout on the remote machines.
    remote_metadata = _build_remote_metadata(part_metadata, args.workspace,
                                             args.rel_data_path)
    remote_data_path = '{}/{}'.format(args.workspace, args.rel_data_path)

    # The rewritten config lives in a private temporary directory (removed on
    # exit) but keeps the name <graph_name>.json, which rsync preserves.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_part_config = os.path.join(tmp_dir, '{}.json'.format(graph_name))
        with open(tmp_part_config, 'w') as outfile:
            json.dump(remote_metadata, outfile, sort_keys=True, indent=4)

        for part_id, ip in enumerate(hosts):
            exec_cmd(ip, 'mkdir -p {}'.format(remote_data_path))

            copy_file(args.ip_config, ip, args.workspace)
            copy_file(tmp_part_config, ip, remote_data_path)
            for key in ('node_map', 'edge_map'):
                local_map = part_metadata[key]
                if not isinstance(local_map, list):
                    # The destination is the data directory, not the file path:
                    # copy_file appends '/' and would otherwise create a
                    # directory named after the map file.
                    copy_file(local_map, ip, remote_data_path)

            remote_part_path = '{}/part{}'.format(remote_data_path, part_id)
            exec_cmd(ip, 'mkdir -p {}'.format(remote_part_path))

            part_files = part_metadata['part-{}'.format(part_id)]
            copy_file(part_files['node_feats'], ip, remote_part_path)
            copy_file(part_files['edge_feats'], ip, remote_part_path)
            copy_file(part_files['part_graph'], ip, remote_part_path)
def signal_handler(signal, frame):
    """Log that copying stops and terminate the process with exit status 0.

    Both arguments (signal number and current stack frame) are ignored.
    """
    logging.info('Stop copying')
    raise SystemExit(0)
if __name__ == '__main__':
    # Script entry point: set up INFO-level logging with a timestamped format,
    # install the SIGINT handler, then run the copy.
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)
    signal.signal(signal.SIGINT, signal_handler)
    main()
...@@ -39,7 +39,7 @@ def submit_jobs(args, udf_command): ...@@ -39,7 +39,7 @@ def submit_jobs(args, udf_command):
# launch server tasks # launch server tasks
server_cmd = 'DGL_ROLE=server' server_cmd = 'DGL_ROLE=server'
server_cmd = server_cmd + ' ' + 'DGL_NUM_CLIENT=' + str(args.num_client) server_cmd = server_cmd + ' ' + 'DGL_NUM_CLIENT=' + str(args.num_client)
server_cmd = server_cmd + ' ' + 'DGL_CONF_PATH=' + str(args.conf_path) server_cmd = server_cmd + ' ' + 'DGL_CONF_PATH=' + str(args.part_config)
server_cmd = server_cmd + ' ' + 'DGL_IP_CONFIG=' + str(args.ip_config) server_cmd = server_cmd + ' ' + 'DGL_IP_CONFIG=' + str(args.ip_config)
for i in range(len(hosts)*server_count_per_machine): for i in range(len(hosts)*server_count_per_machine):
ip, _ = hosts[int(i / server_count_per_machine)] ip, _ = hosts[int(i / server_count_per_machine)]
...@@ -50,7 +50,7 @@ def submit_jobs(args, udf_command): ...@@ -50,7 +50,7 @@ def submit_jobs(args, udf_command):
# launch client tasks # launch client tasks
client_cmd = 'DGL_DIST_MODE="distributed" DGL_ROLE=client' client_cmd = 'DGL_DIST_MODE="distributed" DGL_ROLE=client'
client_cmd = client_cmd + ' ' + 'DGL_NUM_CLIENT=' + str(args.num_client) client_cmd = client_cmd + ' ' + 'DGL_NUM_CLIENT=' + str(args.num_client)
client_cmd = client_cmd + ' ' + 'DGL_CONF_PATH=' + str(args.conf_path) client_cmd = client_cmd + ' ' + 'DGL_CONF_PATH=' + str(args.part_config)
client_cmd = client_cmd + ' ' + 'DGL_IP_CONFIG=' + str(args.ip_config) client_cmd = client_cmd + ' ' + 'DGL_IP_CONFIG=' + str(args.ip_config)
if os.environ.get('OMP_NUM_THREADS') is not None: if os.environ.get('OMP_NUM_THREADS') is not None:
client_cmd = client_cmd + ' ' + 'OMP_NUM_THREADS=' + os.environ.get('OMP_NUM_THREADS') client_cmd = client_cmd + ' ' + 'OMP_NUM_THREADS=' + os.environ.get('OMP_NUM_THREADS')
...@@ -87,8 +87,8 @@ def main(): ...@@ -87,8 +87,8 @@ def main():
the contents of current directory will be rsyncd') the contents of current directory will be rsyncd')
parser.add_argument('--num_client', type=int, parser.add_argument('--num_client', type=int,
help='Total number of client processes in the cluster') help='Total number of client processes in the cluster')
parser.add_argument('--conf_path', type=str, parser.add_argument('--part_config', type=str,
help='The file (in workspace) of the partition config file') help='The file (in workspace) of the partition config')
parser.add_argument('--ip_config', type=str, parser.add_argument('--ip_config', type=str,
help='The file (in workspace) of IP configuration for server processes') help='The file (in workspace) of IP configuration for server processes')
args, udf_command = parser.parse_known_args() args, udf_command = parser.parse_known_args()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment