Unverified commit e57c6e35, authored by Jinjing Zhou, committed by GitHub

[Fix] Fix lint resource usage & Fix Docs (#3032)

* fix

* remove nvidiasmi

* fix

* fix docs

* fix

* fix
parent 55e7796a
@@ -506,7 +506,7 @@ def skip_if_not_4gpu():
     def _wrapper(func):
         if GPU_COUNT != 4:
             # skip if not enabled
-            print("Skip {}".format(func.benchmark_name))
+            print("Skip {}".format(func.__name__))
             func.benchmark_name = "skip_" + func.__name__
         return func
     return _wrapper
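
For context, the decorator touched by this hunk is only partially shown. A minimal sketch of the fixed version (the `GPU_COUNT` global and the exact indentation are assumptions based on the visible lines):

    def skip_if_not_4gpu():
        def _wrapper(func):
            if GPU_COUNT != 4:
                # skip if not enabled; func.__name__ is always defined, whereas
                # func.benchmark_name is only assigned on the following line
                print("Skip {}".format(func.__name__))
                func.benchmark_name = "skip_" + func.__name__
            return func
        return _wrapper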
@@ -12,8 +12,6 @@ pip install --upgrade pip
 pip install asv
 pip uninstall -y dgl
-nvidia-smi
 export DGL_BENCH_DEVICE=$DEVICE
 echo "DGL_BENCH_DEVICE=$DGL_BENCH_DEVICE"
 pushd $ROOT/benchmarks
@@ -10,5 +10,5 @@ spec:
     tty: true
     resources:
       requests:
-        cpu: 4
+        cpu: 1
   serviceAccountName: dglciuser
\ No newline at end of file
@@ -8,174 +8,222 @@ read the tutorial of multi-GPU training first. This tutorial is developed on top
multi-GPU training by providing extra steps for partitioning a graph, modifying the training script
and setting up the environment for distributed training.

Partition a graph
-----------------

In this tutorial, we will use the `OGBN products graph <https://ogb.stanford.edu/docs/nodeprop/#ogbn-products>`_
as an example to illustrate graph partitioning. Let's first load the graph into a DGL graph.
Here we store the node labels as node data in the DGL graph.

.. code-block:: python

    import dgl
    import torch as th
    from ogb.nodeproppred import DglNodePropPredDataset

    data = DglNodePropPredDataset(name='ogbn-products')
    graph, labels = data[0]
    labels = labels[:, 0]
    graph.ndata['labels'] = labels

We need to split the data into training/validation/test sets during the graph partitioning.
Because this is a node classification task, the training/validation/test sets contain node IDs.
We recommend converting them to boolean arrays, in which True indicates that the node ID is in the set.
In this way, we can store them as node data. After the partitioning,
the boolean arrays will be stored with the graph partitions.

.. code-block:: python

    splitted_idx = data.get_idx_split()
    train_nid, val_nid, test_nid = splitted_idx['train'], splitted_idx['valid'], splitted_idx['test']
    train_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    train_mask[train_nid] = True
    val_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    val_mask[val_nid] = True
    test_mask = th.zeros((graph.number_of_nodes(),), dtype=th.bool)
    test_mask[test_nid] = True
    graph.ndata['train_mask'] = train_mask
    graph.ndata['val_mask'] = val_mask
    graph.ndata['test_mask'] = test_mask

Then we call the `partition_graph` function to partition the graph with
`METIS <http://glaros.dtc.umn.edu/gkhome/metis/metis/overview>`_ and save the partitioned results
in the specified folder. **Note**: `partition_graph` runs on a single machine with a single thread.
You can go to `our user guide <https://docs.dgl.ai/en/latest/guide/distributed-preprocessing.html#distributed-partitioning>`_
for more information on distributed graph partitioning.

The code below shows an example of invoking the partitioning algorithm and generating four partitions.
The partitioned results are stored in a folder called `4part_data`. While partitioning a graph,
we allow users to specify how to balance the partitions. By default, the algorithm balances the number
of nodes in each partition as much as possible. However, this balancing strategy is not sufficient
for distributed GNN training because some partitions may have many more training nodes, or many more
edges, than others. As such, `partition_graph` provides two additional arguments, `balance_ntypes` and
`balance_edges`, to enforce more balancing criteria. For example, we can use the training mask
to balance the number of training nodes in each partition, as shown in the example below. We can also turn on
the `balance_edges` flag to ensure that all partitions have roughly the same number of edges.

.. code-block:: python

    dgl.distributed.partition_graph(graph, graph_name='ogbn-products', num_parts=4,
                                    out_path='4part_data',
                                    balance_ntypes=graph.ndata['train_mask'],
                                    balance_edges=True)

When partitioning a graph, DGL shuffles node IDs and edge IDs so that nodes/edges assigned to
a partition have contiguous IDs. This is necessary for DGL to maintain the mappings between global
node/edge IDs and partition IDs. If a user needs to map the shuffled node/edge IDs back to their original IDs,
they can turn on the `return_mapping` flag of `partition_graph`, which returns one vector for the node ID mapping
and one for the edge ID mapping. Below is an example of using the ID mapping to save the node embeddings after
distributed training. This is a common use case when users want to use the trained node embeddings
in their downstream tasks. Let's assume that the trained node embeddings are stored in the `node_emb` tensor,
which is indexed by the shuffled node IDs. We shuffle the embeddings back and store them in
the `orig_node_emb` tensor, which is indexed by the original node IDs.

.. code-block:: python

    nmap, emap = dgl.distributed.partition_graph(graph, graph_name='ogbn-products',
                                                 num_parts=4,
                                                 out_path='4part_data',
                                                 balance_ntypes=graph.ndata['train_mask'],
                                                 balance_edges=True,
                                                 return_mapping=True)
    orig_node_emb = th.zeros(node_emb.shape, dtype=node_emb.dtype)
    orig_node_emb[nmap] = node_emb
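
As an optional sanity check (not part of the original tutorial), one can load a single partition back from the output folder and inspect it. The snippet below is a hypothetical example; the exact contents of the tuple returned by `dgl.distributed.load_partition` vary across DGL versions.

.. code-block:: python

    # Hypothetical check: load partition 0 back from the '4part_data' folder.
    # The first element of the returned tuple is the partition's graph structure.
    part_data = dgl.distributed.load_partition('4part_data/ogbn-products.json', 0)
    local_g = part_data[0]
    print('partition 0 holds {} nodes'.format(local_g.number_of_nodes()))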

Distributed training script
---------------------------

The distributed training script is very similar to the multi-GPU training script, with just a few modifications.
It also relies on the PyTorch distributed component to exchange gradients and update model parameters.
The distributed training script only contains the code of the trainers.

Initialize network communication
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Distributed GNN training requires access to the partitioned graph structure and node/edge features,
as well as aggregation of the gradients of model parameters from multiple trainers. DGL's distributed
component is responsible for accessing the distributed graph structure and the distributed node and
edge features, while PyTorch distributed is responsible for exchanging the gradients of model parameters.
As such, we need to initialize both the DGL and PyTorch distributed components at the beginning of the training script.

We need to call DGL's initialize function to initialize the trainers' network communication and
connect with DGL's servers at the very beginning of the distributed training script. This function
has an argument that accepts the path to the cluster configuration file.

.. code-block:: python

    import dgl
    import torch as th

    dgl.distributed.initialize(ip_config='ip_config.txt')

The configuration file `ip_config.txt` has the following format:

.. code-block:: shell

    ip_addr1 [port1]
    ip_addr2 [port2]

Each row is a machine. The first column is the IP address and the second column is the port for
connecting to the DGL server on the machine. The port is optional and the default port is 30050.

After initializing DGL's network communication, a user can initialize PyTorch's distributed communication.

.. code-block:: python

    th.distributed.init_process_group(backend='gloo')

Reference to the distributed graph
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

DGL's servers load the graph partitions automatically. After the servers load the partitions,
trainers connect to the servers and can start referencing the distributed graph in the cluster as below.

.. code-block:: python

    g = dgl.distributed.DistGraph('ogbn-products')

As shown in the code, we refer to a distributed graph by its name. This is the name passed
to the `partition_graph` function, as shown in the section above.
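
Since the rest of the script reads node data such as `g.ndata['feat']` and `g.ndata['labels']` through this handle, a quick sanity check right after connecting can be helpful. The snippet below is an illustrative addition, not part of the original tutorial:

.. code-block:: python

    # The DistGraph handle can be queried much like a regular DGLGraph.
    print('total nodes:', g.number_of_nodes())
    print('total edges:', g.number_of_edges())
    # Node data created before partitioning is exposed as distributed tensors.
    print('feature shape:', g.ndata['feat'].shape)
    print('label dtype:', g.ndata['labels'].dtype)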

Get training and validation node IDs
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For distributed training, each trainer trains on its own set of training nodes.
The training nodes of the entire graph are stored in a distributed tensor as the `train_mask` node data,
which was constructed before we partitioned the graph. Each trainer can invoke `node_split` to get its set
of training nodes. The `node_split` function splits the full training set evenly and returns
training nodes, the majority of which are stored in the local partition, to ensure good data locality.

.. code-block:: python

    train_nid = dgl.distributed.node_split(g.ndata['train_mask'])

We can split the validation nodes in the same way as above. In this case, each trainer gets
a different set of validation nodes.

.. code-block:: python

    valid_nid = dgl.distributed.node_split(g.ndata['val_mask'])

Define a GNN model
^^^^^^^^^^^^^^^^^^

For distributed training, we define a GNN model in exactly the same way as in
`mini-batch training <https://doc.dgl.ai/guide/minibatch.html#>`_ or
`full-graph training <https://doc.dgl.ai/guide/training-node.html#guide-training-node-classification>`_.
The code below defines the GraphSage model.

.. code-block:: python

    import torch.nn as nn
    import torch.nn.functional as F
    import dgl.nn as dglnn
    import torch.optim as optim

    class SAGE(nn.Module):
        def __init__(self, in_feats, n_hidden, n_classes, n_layers):
            super().__init__()
            self.n_layers = n_layers

@@ -194,45 +242,60 @@ class SAGE(nn.Module):

            x = F.relu(x)
        return x

    num_hidden = 256
    num_labels = len(th.unique(g.ndata['labels'][0:g.number_of_nodes()]))
    num_layers = 2
    lr = 0.001
    model = SAGE(g.ndata['feat'].shape[1], num_hidden, num_labels, num_layers)
    loss_fcn = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

For distributed training, we need to convert the model into a distributed model with
PyTorch's `DistributedDataParallel`.

.. code-block:: python

    model = th.nn.parallel.DistributedDataParallel(model)

Distributed mini-batch sampler
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

We can use the same `NodeDataLoader` to create a distributed mini-batch sampler for
node classification.

.. code-block:: python

    sampler = dgl.dataloading.MultiLayerNeighborSampler([25, 10])
    train_dataloader = dgl.dataloading.NodeDataLoader(
        g, train_nid, sampler, batch_size=1024,
        shuffle=True, drop_last=False)
    valid_dataloader = dgl.dataloading.NodeDataLoader(
        g, valid_nid, sampler, batch_size=1024,
        shuffle=False, drop_last=False)

Training loop
^^^^^^^^^^^^^

The training loop for distributed training is also exactly the same as in single-process training.

.. code-block:: python

    import sklearn.metrics
    import numpy as np

    for epoch in range(10):
        # Loop over the dataloader to sample mini-batches.
        losses = []
        for step, (input_nodes, seeds, blocks) in enumerate(train_dataloader):

@@ -261,110 +324,112 @@ for epoch in range(10):

        accuracy = sklearn.metrics.accuracy_score(labels, predictions)
        print('Epoch {}: Validation Accuracy {}'.format(epoch, accuracy))
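
The diff elides most of the loop body. Purely as an illustration of what a mini-batch step with the names used above might look like, here is a hedged sketch (not the file's actual code; the feature/label fetching and the model's forward signature are assumptions):

.. code-block:: python

    # Hypothetical sketch of one training step; the elided code may differ.
    for step, (input_nodes, seeds, blocks) in enumerate(train_dataloader):
        batch_inputs = g.ndata['feat'][input_nodes]   # pull input features from the DistGraph
        batch_labels = g.ndata['labels'][seeds]       # pull labels of the seed nodes
        batch_pred = model(blocks, batch_inputs)      # forward through the DDP-wrapped model
        loss = loss_fcn(batch_pred, batch_labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        losses.append(loss.detach().item())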

Set up distributed training environment
---------------------------------------

After partitioning a graph and preparing the training script, we now need to set up
the distributed training environment and launch the training job. Basically, we need to
create a cluster of machines and upload both the training script and the partitioned data
to each machine in the cluster. A recommended way to share the training script and
the partitioned data across the cluster is to use NFS (Network File System).

For users who are not familiar with NFS, below is a short tutorial on setting up NFS
in an existing cluster.

NFS server side setup (ubuntu only)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

First, install the essential packages on the storage server:

.. code-block:: shell

    sudo apt-get install nfs-kernel-server

Below we assume the user account is ubuntu and we create a workspace directory in the home directory.

.. code-block:: shell

    mkdir -p /home/ubuntu/workspace

We assume that all servers are in a subnet with the IP range 192.168.0.0 to 192.168.255.255.
We need to add the following line to `/etc/exports`:

.. code-block:: shell

    /home/ubuntu/workspace 192.168.0.0/16(rw,sync,no_subtree_check)

Then restart NFS; the setup on the server side is finished.

.. code-block:: shell

    sudo systemctl restart nfs-kernel-server

For configuration details, please refer to the NFS ArchWiki (https://wiki.archlinux.org/index.php/NFS).

NFS client side setup (ubuntu only)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

To use NFS, clients also need to install the essential packages:

.. code-block:: shell

    sudo apt-get install nfs-common

You can either mount the NFS folder manually

.. code-block:: shell

    mkdir -p /home/ubuntu/workspace
    sudo mount -t nfs <nfs-server-ip>:/home/ubuntu/workspace /home/ubuntu/workspace

or add the following line to `/etc/fstab` so the folder will be mounted automatically:

.. code-block:: shell

    <nfs-server-ip>:/home/ubuntu/workspace /home/ubuntu/workspace nfs defaults 0 0

Then run

.. code-block:: shell

    mount -a

Now go to `/home/ubuntu/workspace` and save the training script and the partitioned data in the folder.

SSH Access
^^^^^^^^^^

The launch script accesses the machines in the cluster via SSH. Users should follow the instructions
in `this document <https://linuxize.com/post/how-to-setup-passwordless-ssh-login/>`_ to set up
passwordless SSH login on every machine in the cluster. After setting up passwordless SSH,
users need to authenticate the connection to each machine and add their key fingerprints to `~/.ssh/known_hosts`.
This can be done automatically when we SSH to a machine for the first time.

Launch the distributed training job
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

After everything is ready, we can now use the launch script provided by DGL to launch the distributed
training job in the cluster. We can run the launch script on any machine in the cluster.

.. code-block:: shell

    python3 ~/workspace/dgl/tools/launch.py \
      --workspace ~/workspace/ \
      --num_trainers 1 \
      --num_samplers 0 \
      --num_servers 1 \
      --part_config 4part_data/ogbn-products.json \
      --ip_config ip_config.txt \
      "python3 train_dist.py"

If we split the graph into four partitions as demonstrated at the beginning of the tutorial, the cluster
has to have four machines. The command above launches one trainer and one server on each machine in the
cluster. `ip_config.txt` lists the IP addresses of all machines in the cluster as follows:

.. code-block:: shell

    ip_addr1
    ip_addr2
    ip_addr3
    ip_addr4
'''
\ No newline at end of file