"...source/git@developer.sourcefind.cn:OpenDAS/torchaudio.git" did not exist on "613cb669e99ea8294c9ae6c54f9eb8553762bc5c"
Unverified commit e36c5db6, authored by Da Zheng and committed by GitHub

[Distributed] Simplify distributed API (#2775)



* remove num_workers.

* remove num_workers.

* remove num_workers.

* remove num-servers.

* update error message.

* update docstring.

* fix docs.

* fix tests.

* fix test.

* fix.

* print messages in test.

* fix.

* fix test.

* fix.
Co-authored-by: Ubuntu <ubuntu@ip-172-31-9-132.us-west-1.compute.internal>
parent 97863ab8
@@ -23,7 +23,7 @@ Typically, the initialization APIs should be invoked in the following order:

.. code:: python

-    dgl.distributed.initialize('ip_config.txt', num_workers=4)
+    dgl.distributed.initialize('ip_config.txt')
    th.distributed.init_process_group(backend='gloo')

**Note**: If the training script contains user-defined functions (UDFs) that have to be invoked on
......
@@ -26,7 +26,7 @@ are the same as :ref:`mini-batch training <guide-minibatch>`.

    import dgl
    import torch as th

-    dgl.distributed.initialize('ip_config.txt', num_servers, num_workers)
+    dgl.distributed.initialize('ip_config.txt')
    th.distributed.init_process_group(backend='gloo')
    g = dgl.distributed.DistGraph('graph_name', 'part_config.json')
    pb = g.get_partition_book()
......
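Taken together, the updated documentation describes the following call order in a trainer process. Below is a minimal sketch assembled from the snippets above; `'ip_config.txt'`, `'graph_name'`, and `'part_config.json'` are placeholder names taken from the docs:

```python
import dgl
import torch as th

# Worker/server counts are no longer passed here; when the script is started
# by the DGL launch tool they are read from the DGL_NUM_SAMPLER and
# DGL_NUM_SERVER environment variables instead.
dgl.distributed.initialize('ip_config.txt')
th.distributed.init_process_group(backend='gloo')

g = dgl.distributed.DistGraph('graph_name', 'part_config.json')
pb = g.get_partition_book()
```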
@@ -21,7 +21,7 @@ Initialization of the DGL distributed module

.. code:: python

-    dgl.distributed.initialize('ip_config.txt', num_workers=4)
+    dgl.distributed.initialize('ip_config.txt')
    th.distributed.init_process_group(backend='gloo')

**Note**: If the training script contains user-defined functions (UDFs) that need to be invoked on the servers (see the DistTensor and DistEmbedding sections below for details),
......
@@ -20,7 +20,7 @@ DGL adopts a fully distributed approach that spreads both data and computation across a group of

    import dgl
    import torch as th

-    dgl.distributed.initialize('ip_config.txt', num_servers, num_workers)
+    dgl.distributed.initialize('ip_config.txt')
    th.distributed.init_process_group(backend='gloo')
    g = dgl.distributed.DistGraph('graph_name', 'part_config.json')
    pb = g.get_partition_book()
......
@@ -124,7 +124,7 @@ python3 ~/workspace/dgl/tools/launch.py \
  --num_servers 1 \
  --part_config data/ogb-product.json \
  --ip_config ip_config.txt \
- "python3 train_dist.py --graph_name ogb-product --ip_config ip_config.txt --num_servers 1 --num_epochs 30 --batch_size 1000 --num_workers 4"
+ "python3 train_dist.py --graph_name ogb-product --ip_config ip_config.txt --num_epochs 30 --batch_size 1000"
```

To run unsupervised training:

@@ -137,7 +137,7 @@ python3 ~/workspace/dgl/tools/launch.py \
  --num_servers 1 \
  --part_config data/ogb-product.json \
  --ip_config ip_config.txt \
- "python3 train_dist_unsupervised.py --graph_name ogb-product --ip_config ip_config.txt --num_servers 1 --num_epochs 3 --batch_size 1000 --num_workers 4"
+ "python3 train_dist_unsupervised.py --graph_name ogb-product --ip_config ip_config.txt --num_epochs 3 --batch_size 1000"
```
By default, this code runs on CPU. If you have GPU support, you can add a `--num_gpus` argument to the user command:

@@ -150,7 +150,7 @@ python3 ~/workspace/dgl/tools/launch.py \
  --num_servers 1 \
  --part_config data/ogb-product.json \
  --ip_config ip_config.txt \
- "python3 train_dist.py --graph_name ogb-product --ip_config ip_config.txt --num_servers 1 --num_epochs 30 --batch_size 1000 --num_workers 4 --num_gpus 4"
+ "python3 train_dist.py --graph_name ogb-product --ip_config ip_config.txt --num_epochs 30 --batch_size 1000 --num_gpus 4"
```

**Note:** If you are using conda or other virtual environments on the remote machines, you need to replace `python3` in the command string (i.e. the last argument) with the path to the Python interpreter in that environment.
......
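With the simplified API, the number of sampler processes per trainer is controlled only by the launcher's `--num_samplers` flag; the launch tool exports it to each trainer as `DGL_NUM_SAMPLER` (and the per-machine server count as `DGL_NUM_SERVER`). A small illustrative sketch of how a trainer could inspect these values if it needs them; the defaults below mirror the behaviour when the script is run outside the launcher:

```python
import os

# Set by tools/launch.py for every trainer process it starts.
num_samplers = int(os.environ.get('DGL_NUM_SAMPLER', 0))
num_servers = int(os.environ.get('DGL_NUM_SERVER', 1))
print('sampler processes per trainer:', num_samplers,
      '| servers per machine:', num_servers)
```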
@@ -251,7 +251,7 @@ def run(args, device, data):
                                time.time() - start))

def main(args):
-    dgl.distributed.initialize(args.ip_config, args.num_servers, num_workers=args.num_workers)
+    dgl.distributed.initialize(args.ip_config)
    if not args.standalone:
        th.distributed.init_process_group(backend='gloo')
    g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)

@@ -288,7 +288,6 @@ if __name__ == '__main__':
    parser.add_argument('--ip_config', type=str, help='The file for IP configuration')
    parser.add_argument('--part_config', type=str, help='The path to the partition config file')
    parser.add_argument('--num_clients', type=int, help='The number of clients')
-    parser.add_argument('--num_servers', type=int, default=1, help='The number of servers')
    parser.add_argument('--n_classes', type=int, help='the number of classes')
    parser.add_argument('--num_gpus', type=int, default=-1,
                        help="the number of GPU device. Use -1 for CPU training")

@@ -302,15 +301,9 @@ if __name__ == '__main__':
    parser.add_argument('--eval_every', type=int, default=5)
    parser.add_argument('--lr', type=float, default=0.003)
    parser.add_argument('--dropout', type=float, default=0.5)
-    parser.add_argument('--num_workers', type=int, default=4,
-                        help="Number of sampling processes. Use 0 for no extra process.")
    parser.add_argument('--local_rank', type=int, help='get rank of the process')
    parser.add_argument('--standalone', action='store_true', help='run in the standalone mode')
    args = parser.parse_args()
-    assert args.num_workers == int(os.environ.get('DGL_NUM_SAMPLER')), \
-        'The num_workers should be the same value with DGL_NUM_SAMPLER.'
-    assert args.num_servers == int(os.environ.get('DGL_NUM_SERVER')), \
-        'The num_servers should be the same value with DGL_NUM_SERVER.'
    print(args)
    main(args)
@@ -421,7 +421,7 @@ def run(args, device, data):
    th.save(pred, 'emb.pt')

def main(args):
-    dgl.distributed.initialize(args.ip_config, args.num_servers, num_workers=args.num_workers)
+    dgl.distributed.initialize(args.ip_config)
    if not args.standalone:
        th.distributed.init_process_group(backend='gloo')
    g = dgl.distributed.DistGraph(args.graph_name, part_config=args.part_config)

@@ -458,7 +458,6 @@ if __name__ == '__main__':
    parser.add_argument('--id', type=int, help='the partition id')
    parser.add_argument('--ip_config', type=str, help='The file for IP configuration')
    parser.add_argument('--part_config', type=str, help='The path to the partition config file')
-    parser.add_argument('--num_servers', type=int, default=1, help='Server count on each machine.')
    parser.add_argument('--n_classes', type=int, help='the number of classes')
    parser.add_argument('--num_gpus', type=int, default=-1,
                        help="the number of GPU device. Use -1 for CPU training")

@@ -472,8 +471,6 @@ if __name__ == '__main__':
    parser.add_argument('--eval_every', type=int, default=5)
    parser.add_argument('--lr', type=float, default=0.003)
    parser.add_argument('--dropout', type=float, default=0.5)
-    parser.add_argument('--num_workers', type=int, default=0,
-                        help="Number of sampling processes. Use 0 for no extra process.")
    parser.add_argument('--local_rank', type=int, help='get rank of the process')
    parser.add_argument('--standalone', action='store_true', help='run in the standalone mode')
    parser.add_argument('--num_negs', type=int, default=1)

@@ -482,10 +479,6 @@ if __name__ == '__main__':
    parser.add_argument('--remove_edge', default=False, action='store_true',
                        help="whether to remove edges during sampling")
    args = parser.parse_args()
-    assert args.num_workers == int(os.environ.get('DGL_NUM_SAMPLER')), \
-        'The num_workers should be the same value with DGL_NUM_SAMPLER.'
-    assert args.num_servers == int(os.environ.get('DGL_NUM_SERVER')), \
-        'The num_servers should be the same value with DGL_NUM_SERVER.'
    print(args)
    main(args)
@@ -120,7 +120,7 @@ python3 ~/workspace/dgl/tools/launch.py \
  --num_samplers 4 \
  --part_config data/ogbn-mag.json \
  --ip_config ip_config.txt \
- "python3 entity_classify_dist.py --graph-name ogbn-mag --dataset ogbn-mag --fanout='25,25' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --low-mem --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 3 --layer-norm --ip-config ip_config.txt --num-workers 4 --num-servers 1 --sparse-embedding --sparse-lr 0.06 --num_gpus 1"
+ "python3 entity_classify_dist.py --graph-name ogbn-mag --dataset ogbn-mag --fanout='25,25' --batch-size 1024 --n-hidden 64 --lr 0.01 --eval-batch-size 1024 --low-mem --dropout 0.5 --use-self-loop --n-bases 2 --n-epochs 3 --layer-norm --ip-config ip_config.txt --sparse-embedding --sparse-lr 0.06 --num_gpus 1"
```

We can get the performance score at the second epoch:
......
@@ -497,7 +497,7 @@ def run(args, device, data):
                                time.time() - start))

def main(args):
-    dgl.distributed.initialize(args.ip_config, args.num_servers, num_workers=args.num_workers)
+    dgl.distributed.initialize(args.ip_config)
    if not args.standalone:
        th.distributed.init_process_group(backend='gloo')

@@ -532,8 +532,6 @@ if __name__ == '__main__':
    parser.add_argument('--id', type=int, help='the partition id')
    parser.add_argument('--ip-config', type=str, help='The file for IP configuration')
    parser.add_argument('--conf-path', type=str, help='The path to the partition config file')
-    parser.add_argument('--num-client', type=int, help='The number of clients')
-    parser.add_argument('--num-servers', type=int, default=1, help='Server count on each machine.')
    # rgcn related
    parser.add_argument('--num_gpus', type=int, default=-1,

@@ -569,8 +567,6 @@ if __name__ == '__main__':
    parser.add_argument("--eval-batch-size", type=int, default=128,
                        help="Mini-batch size. ")
    parser.add_argument('--log-every', type=int, default=20)
-    parser.add_argument("--num-workers", type=int, default=1,
-                        help="Number of workers for distributed dataloader.")
    parser.add_argument("--low-mem", default=False, action='store_true',
                        help="Whether use low mem RelGraphCov")
    parser.add_argument("--sparse-embedding", action='store_true',
......
@@ -59,10 +59,10 @@ def initialize(ip_config, num_servers=1, num_workers=0,
    ip_config: str
        File path of ip_config file
    num_servers : int
-        The number of server processes on each machine
+        The number of server processes on each machine. This argument is deprecated in DGL 0.7.0.
    num_workers: int
        Number of worker process on each machine. The worker processes are used
-        for distributed sampling.
+        for distributed sampling. This argument is deprecated in DGL 0.7.0.
    max_queue_size : int
        Maximal size (bytes) of client queue buffer (~20 GB on default).

@@ -101,6 +101,15 @@ def initialize(ip_config, num_servers=1, num_workers=0,
        serv.start()
        sys.exit()
    else:
+        if os.environ.get('DGL_NUM_SAMPLER') is not None:
+            num_workers = int(os.environ.get('DGL_NUM_SAMPLER'))
+        else:
+            num_workers = 0
+        if os.environ.get('DGL_NUM_SERVER') is not None:
+            num_servers = int(os.environ.get('DGL_NUM_SERVER'))
+        else:
+            num_servers = 1
        rpc.reset()
        ctx = mp.get_context("spawn")
        global SAMPLER_POOL
......
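For reference, the fallback added to `initialize()` amounts to the following resolution order. This is only a sketch of the logic in the hunk above, not the DGL source; it assumes the launch tool has exported the two environment variables for each trainer:

```python
import os

def _resolve_process_counts():
    # Prefer the values exported by the launch script; otherwise fall back to
    # the old defaults: 0 sampler workers and 1 server per machine.
    num_workers = int(os.environ.get('DGL_NUM_SAMPLER', 0))
    num_servers = int(os.environ.get('DGL_NUM_SERVER', 1))
    return num_workers, num_servers
```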
@@ -68,7 +68,8 @@ def rand_init(shape, dtype):
def run_client(graph_name, part_id, server_count, num_clients, num_nodes, num_edges):
    time.sleep(5)
-    dgl.distributed.initialize("kv_ip_config.txt", server_count)
+    os.environ['DGL_NUM_SERVER'] = str(server_count)
+    dgl.distributed.initialize("kv_ip_config.txt")
    gpb, graph_name, _, _ = load_partition_book('/tmp/dist_graph/{}.json'.format(graph_name),
                                                part_id, None)
    g = DistGraph(graph_name, gpb=gpb)

@@ -240,7 +241,8 @@ def check_server_client(shared_mem, num_servers, num_clients):
def run_client_hetero(graph_name, part_id, server_count, num_clients, num_nodes, num_edges):
    time.sleep(5)
-    dgl.distributed.initialize("kv_ip_config.txt", server_count)
+    os.environ['DGL_NUM_SERVER'] = str(server_count)
+    dgl.distributed.initialize("kv_ip_config.txt")
    gpb, graph_name, _, _ = load_partition_book('/tmp/dist_graph/{}.json'.format(graph_name),
                                                part_id, None)
    g = DistGraph(graph_name, gpb=gpb)
......
@@ -26,7 +26,7 @@ def start_sample_client(rank, tmpdir, disable_shared_mem):
    gpb = None
    if disable_shared_mem:
        _, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_sampling.json', rank)
-    dgl.distributed.initialize("rpc_ip_config.txt", 1)
+    dgl.distributed.initialize("rpc_ip_config.txt")
    dist_graph = DistGraph("test_sampling", gpb=gpb)
    try:
        sampled_graph = sample_neighbors(dist_graph, [0, 10, 99, 66, 1024, 2008], 3)

@@ -40,7 +40,7 @@ def start_find_edges_client(rank, tmpdir, disable_shared_mem, eids):
    gpb = None
    if disable_shared_mem:
        _, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_find_edges.json', rank)
-    dgl.distributed.initialize("rpc_ip_config.txt", 1)
+    dgl.distributed.initialize("rpc_ip_config.txt")
    dist_graph = DistGraph("test_find_edges", gpb=gpb)
    try:
        u, v = find_edges(dist_graph, eids)

@@ -195,7 +195,7 @@ def start_hetero_sample_client(rank, tmpdir, disable_shared_mem):
    gpb = None
    if disable_shared_mem:
        _, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_sampling.json', rank)
-    dgl.distributed.initialize("rpc_ip_config.txt", 1)
+    dgl.distributed.initialize("rpc_ip_config.txt")
    dist_graph = DistGraph("test_sampling", gpb=gpb)
    assert 'feat' in dist_graph.nodes['n1'].data
    assert 'feat' not in dist_graph.nodes['n2'].data

@@ -302,7 +302,7 @@ def check_standalone_sampling(tmpdir, reshuffle):
                    num_hops=num_hops, part_method='metis', reshuffle=reshuffle)
    os.environ['DGL_DIST_MODE'] = 'standalone'
-    dgl.distributed.initialize("rpc_ip_config.txt", 1)
+    dgl.distributed.initialize("rpc_ip_config.txt")
    dist_graph = DistGraph("test_sampling", part_config=tmpdir / 'test_sampling.json')
    sampled_graph = sample_neighbors(dist_graph, [0, 10, 99, 66, 1024, 2008], 3)

@@ -325,7 +325,7 @@ def test_standalone_sampling():
def start_in_subgraph_client(rank, tmpdir, disable_shared_mem, nodes):
    gpb = None
-    dgl.distributed.initialize("rpc_ip_config.txt", 1)
+    dgl.distributed.initialize("rpc_ip_config.txt")
    if disable_shared_mem:
        _, _, _, gpb, _, _, _ = load_partition(tmpdir / 'test_in_subgraph.json', rank)
    dist_graph = DistGraph("test_in_subgraph", gpb=gpb)
......
@@ -45,10 +45,10 @@ def start_server(rank, tmpdir, disable_shared_mem, num_clients):
    g.start()

-def start_dist_dataloader(rank, tmpdir, num_server, num_workers, drop_last):
+def start_dist_dataloader(rank, tmpdir, num_server, drop_last):
    import dgl
    import torch as th
-    dgl.distributed.initialize("mp_ip_config.txt", 1, num_workers=num_workers)
+    dgl.distributed.initialize("mp_ip_config.txt")
    gpb = None
    disable_shared_mem = num_server > 0
    if disable_shared_mem:

@@ -122,7 +122,7 @@ def test_standalone(tmpdir):
    os.environ['DGL_DIST_MODE'] = 'standalone'
    try:
-        start_dist_dataloader(0, tmpdir, 1, 2, True)
+        start_dist_dataloader(0, tmpdir, 1, True)
    except Exception as e:
        print(e)
    dgl.distributed.exit_client() # this is needed since there's two test here in one process

@@ -159,8 +159,9 @@ def test_dist_dataloader(tmpdir, num_server, num_workers, drop_last, reshuffle):
    time.sleep(3)
    os.environ['DGL_DIST_MODE'] = 'distributed'
+    os.environ['DGL_NUM_SAMPLER'] = str(num_workers)
    ptrainer = ctx.Process(target=start_dist_dataloader, args=(
-        0, tmpdir, num_server, num_workers, drop_last))
+        0, tmpdir, num_server, drop_last))
    ptrainer.start()
    time.sleep(1)

@@ -171,7 +172,7 @@ def test_dist_dataloader(tmpdir, num_server, num_workers, drop_last, reshuffle):
def start_node_dataloader(rank, tmpdir, num_server, num_workers):
    import dgl
    import torch as th
-    dgl.distributed.initialize("mp_ip_config.txt", 1, num_workers=num_workers)
+    dgl.distributed.initialize("mp_ip_config.txt")
    gpb = None
    disable_shared_mem = num_server > 1
    if disable_shared_mem:

@@ -252,6 +253,7 @@ def test_dataloader(tmpdir, num_server, num_workers, dataloader_type):
    time.sleep(3)
    os.environ['DGL_DIST_MODE'] = 'distributed'
+    os.environ['DGL_NUM_SAMPLER'] = str(num_workers)
    ptrainer_list = []
    if dataloader_type == 'node':
        p = ctx.Process(target=start_node_dataloader, args=(
......
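The same pattern applies to any test or driver that spawns trainer processes itself: set the environment variables before the child calls `initialize()`, since the spawned process inherits them. A minimal sketch under that assumption (the config file name and counts are illustrative, and a running server group is still required for the call to complete):

```python
import os
import multiprocessing as mp

def trainer():
    import dgl
    # Picks up DGL_NUM_SAMPLER / DGL_NUM_SERVER from the inherited environment.
    dgl.distributed.initialize('mp_ip_config.txt')
    # ... build a DistGraph / dataloader here ...

if __name__ == '__main__':
    os.environ['DGL_NUM_SAMPLER'] = '4'
    os.environ['DGL_NUM_SERVER'] = '1'
    ctx = mp.get_context('spawn')
    p = ctx.Process(target=trainer)
    p.start()
    p.join()
```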
@@ -151,7 +151,7 @@ def start_server_mul_role(server_id, num_clients, num_servers):
def start_client(num_clients, num_servers):
    os.environ['DGL_DIST_MODE'] = 'distributed'
    # Note: connect to server first !
-    dgl.distributed.initialize(ip_config='kv_ip_config.txt', num_servers=num_servers)
+    dgl.distributed.initialize(ip_config='kv_ip_config.txt')
    # Init kvclient
    kvclient = dgl.distributed.KVClient(ip_config='kv_ip_config.txt', num_servers=num_servers)
    kvclient.map_shared_data(partition_book=gpb)

@@ -278,10 +278,10 @@ def start_client(num_clients, num_servers):
    data_tensor = data_tensor * num_clients
    assert_array_equal(F.asnumpy(res), F.asnumpy(data_tensor))

-def start_client_mul_role(i, num_workers, num_servers):
+def start_client_mul_role(i):
    os.environ['DGL_DIST_MODE'] = 'distributed'
    # Initialize creates kvstore !
-    dgl.distributed.initialize(ip_config='kv_ip_mul_config.txt', num_servers=num_servers, num_workers=num_workers)
+    dgl.distributed.initialize(ip_config='kv_ip_mul_config.txt')
    if i == 0: # block one trainer
        time.sleep(5)
    kvclient = dgl.distributed.kvstore.get_kvstore()

@@ -305,6 +305,7 @@ def test_kv_store():
    ctx = mp.get_context('spawn')
    pserver_list = []
    pclient_list = []
+    os.environ['DGL_NUM_SERVER'] = str(num_servers)
    for i in range(num_servers):
        pserver = ctx.Process(target=start_server, args=(i, num_clients, num_servers))
        pserver.start()

@@ -332,12 +333,14 @@ def test_kv_multi_role():
    ctx = mp.get_context('spawn')
    pserver_list = []
    pclient_list = []
+    os.environ['DGL_NUM_SAMPLER'] = str(num_samplers)
+    os.environ['DGL_NUM_SERVER'] = str(num_servers)
    for i in range(num_servers):
        pserver = ctx.Process(target=start_server_mul_role, args=(i, num_clients, num_servers))
        pserver.start()
        pserver_list.append(pserver)
    for i in range(num_trainers):
-        pclient = ctx.Process(target=start_client_mul_role, args=(i, num_samplers, num_servers))
+        pclient = ctx.Process(target=start_client_mul_role, args=(i,))
        pclient.start()
        pclient_list.append(pclient)
    for i in range(num_trainers):
......
@@ -140,6 +140,11 @@ def main():
    udf_command = str(udf_command[0])
    if 'python' not in udf_command:
        raise RuntimeError("DGL launching script can only support Python executable file.")
+    if sys.version_info.major and sys.version_info.minor >= 8:
+        if args.num_samplers > 0:
+            print('WARNING! DGL does not support multiple sampler processes in Python>=3.8. '
+                  + 'Set the number of sampler processes to 0.')
+            args.num_samplers = 0
    submit_jobs(args, udf_command)

def signal_handler(signal, frame):
......
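As a side note, the Python-version guard above can also be written as a tuple comparison on `sys.version_info`. The helper below is hypothetical and not part of this commit; it only expresses the same check in a self-contained form:

```python
import sys

def clamp_num_samplers(num_samplers):
    """Return a sampler count that is safe for the running Python version."""
    if sys.version_info >= (3, 8) and num_samplers > 0:
        print('WARNING! DGL does not support multiple sampler processes in '
              'Python>=3.8. Setting the number of sampler processes to 0.')
        return 0
    return num_samplers
```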