Unverified commit d57ff78d, authored by Chao Ma and committed by GitHub
Browse files

[Small Fix] Change default message queue size of communicator to 2GB (#1140)

* API change of kvstore

* add demo for kvstore

* update

* remove duplicated log

* change queue size

* update

* update

* update

* update

* update

* update

* update

* update

* update

* fix lint

* change name

* update

* fix lint

* update

* update

* update

* update

* change message queue size to a python argument

* change default queue size to 2GB

* OMP_NUM_THREADS=1
parent 17aab812
...@@ -39,7 +39,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_ns --dataset cora --self-loop --nu ...@@ -39,7 +39,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_ns --dataset cora --self-loop --nu
Sampler side: Sampler side:
``` ```
DGLBACKEND=mxnet python3 sampler.py --model gcn_ns --dataset cora --self-loop --num-neighbors 2 --batch-size 1000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=mxnet python3 sampler.py --model gcn_ns --dataset cora --self-loop --num-neighbors 2 --batch-size 1000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
#### citeseer #### citeseer
...@@ -53,7 +53,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_ns --dataset citeseer --self-loop ...@@ -53,7 +53,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_ns --dataset citeseer --self-loop
Sampler side: Sampler side:
``` ```
DGLBACKEND=mxnet python3 sampler.py --model gcn_ns --dataset citeseer --self-loop --num-neighbors 2 --batch-size 1000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=mxnet python3 sampler.py --model gcn_ns --dataset citeseer --self-loop --num-neighbors 2 --batch-size 1000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
#### pubmed #### pubmed
...@@ -67,7 +67,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_ns --dataset pubmed --self-loop -- ...@@ -67,7 +67,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_ns --dataset pubmed --self-loop --
Sampler side: Sampler side:
``` ```
DGLBACKEND=mxnet python3 sampler.py --model gcn_ns --dataset pubmed --self-loop --num-neighbors 3 --batch-size 1000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=mxnet python3 sampler.py --model gcn_ns --dataset pubmed --self-loop --num-neighbors 3 --batch-size 1000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
#### reddit #### reddit
...@@ -81,7 +81,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_ns --dataset reddit-self-loop --nu ...@@ -81,7 +81,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_ns --dataset reddit-self-loop --nu
Sampler side: Sampler side:
``` ```
DGLBACKEND=mxnet python3 sampler.py --model gcn_ns --dataset reddit-self-loop --num-neighbors 2 --batch-size 1000 --ip 127.0.0.1:2049 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=mxnet python3 sampler.py --model gcn_ns --dataset reddit-self-loop --num-neighbors 2 --batch-size 1000 --ip 127.0.0.1:2049 --num-sampler 1
``` ```
### Control Variate & Skip Connection ### Control Variate & Skip Connection
...@@ -97,7 +97,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_cv --dataset cora --self-loop --nu ...@@ -97,7 +97,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_cv --dataset cora --self-loop --nu
Sampler side: Sampler side:
``` ```
DGLBACKEND=mxnet python3 sampler.py --model gcn_cv --dataset cora --self-loop --num-neighbors 1 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=mxnet python3 sampler.py --model gcn_cv --dataset cora --self-loop --num-neighbors 1 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
#### citeseer #### citeseer
...@@ -111,7 +111,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_cv --dataset citeseer --self-loop ...@@ -111,7 +111,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_cv --dataset citeseer --self-loop
Sampler Side: Sampler Side:
``` ```
DGLBACKEND=mxnet python3 sampler.py --model gcn_cv --dataset citeseer --self-loop --num-neighbors 1 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=mxnet python3 sampler.py --model gcn_cv --dataset citeseer --self-loop --num-neighbors 1 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
#### pubmed #### pubmed
...@@ -123,7 +123,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_cv --dataset pubmed --self-loop -- ...@@ -123,7 +123,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_cv --dataset pubmed --self-loop --
Sampler Side: Sampler Side:
``` ```
DGLBACKEND=mxnet python3 sampler.py --model gcn_cv --dataset pubmed --self-loop --num-neighbors 1 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=mxnet python3 sampler.py --model gcn_cv --dataset pubmed --self-loop --num-neighbors 1 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
#### reddit #### reddit
...@@ -137,7 +137,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_cv --dataset reddit-self-loop --nu ...@@ -137,7 +137,7 @@ DGLBACKEND=mxnet python3 train.py --model gcn_cv --dataset reddit-self-loop --nu
Sampler Side: Sampler Side:
``` ```
DGLBACKEND=mxnet python3 sampler.py --model gcn_cv --dataset reddit-self-loop --num-neighbors 1 --batch-size 10000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=mxnet python3 sampler.py --model gcn_cv --dataset reddit-self-loop --num-neighbors 1 --batch-size 10000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
### Control Variate & GraphSAGE-mean ### Control Variate & GraphSAGE-mean
...@@ -155,5 +155,5 @@ DGLBACKEND=mxnet python3 train.py --model graphsage_cv --batch-size 1000 --test- ...@@ -155,5 +155,5 @@ DGLBACKEND=mxnet python3 train.py --model graphsage_cv --batch-size 1000 --test-
Sampler side: Sampler side:
``` ```
DGLBACKEND=mxnet python3 sampler.py --model graphsage_cv --batch-size 1000 --dataset reddit --num-neighbors 1 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=mxnet python3 sampler.py --model graphsage_cv --batch-size 1000 --dataset reddit --num-neighbors 1 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
...@@ -39,7 +39,7 @@ DGLBACKEND=pytorch python3 gcn_ns_sc_train.py --dataset cora --self-loop --num-n ...@@ -39,7 +39,7 @@ DGLBACKEND=pytorch python3 gcn_ns_sc_train.py --dataset cora --self-loop --num-n
Sampler side: Sampler side:
``` ```
DGLBACKEND=pytorch python3 sampler.py --model gcn_ns --dataset cora --self-loop --num-neighbors 2 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=pytorch python3 sampler.py --model gcn_ns --dataset cora --self-loop --num-neighbors 2 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
#### citeseer #### citeseer
...@@ -53,7 +53,7 @@ DGLBACKEND=pytorch python3 gcn_ns_sc_train.py --dataset citeseer --self-loop --n ...@@ -53,7 +53,7 @@ DGLBACKEND=pytorch python3 gcn_ns_sc_train.py --dataset citeseer --self-loop --n
Sampler side: Sampler side:
``` ```
DGLBACKEND=pytorch python3 sampler.py --model gcn_ns --dataset citeseer --self-loop --num-neighbors 2 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=pytorch python3 sampler.py --model gcn_ns --dataset citeseer --self-loop --num-neighbors 2 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
#### pubmed #### pubmed
...@@ -67,7 +67,7 @@ DGLBACKEND=pytorch python3 gcn_ns_sc_train.py --dataset pubmed --self-loop --num ...@@ -67,7 +67,7 @@ DGLBACKEND=pytorch python3 gcn_ns_sc_train.py --dataset pubmed --self-loop --num
Sampler side: Sampler side:
``` ```
DGLBACKEND=pytorch python3 sampler.py --model gcn_ns --dataset pubmed --self-loop --num-neighbors 3 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=pytorch python3 sampler.py --model gcn_ns --dataset pubmed --self-loop --num-neighbors 3 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
### Control Variate & Skip Connection ### Control Variate & Skip Connection
...@@ -83,7 +83,7 @@ DGLBACKEND=pytorch python3 gcn_cv_sc_train.py --dataset cora --self-loop --num-n ...@@ -83,7 +83,7 @@ DGLBACKEND=pytorch python3 gcn_cv_sc_train.py --dataset cora --self-loop --num-n
Sampler side: Sampler side:
``` ```
DGLBACKEND=pytorch python3 sampler.py --model gcn_cv --dataset cora --self-loop --num-neighbors 1 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=pytorch python3 sampler.py --model gcn_cv --dataset cora --self-loop --num-neighbors 1 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
#### citeseer #### citeseer
...@@ -97,7 +97,7 @@ DGLBACKEND=pytorch python3 gcn_cv_sc_train.py --dataset citeseer --self-loop --n ...@@ -97,7 +97,7 @@ DGLBACKEND=pytorch python3 gcn_cv_sc_train.py --dataset citeseer --self-loop --n
Sampler side: Sampler side:
``` ```
DGLBACKEND=pytorch python3 sampler.py --model gcn_cv --dataset citeseer --self-loop --num-neighbors 1 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=pytorch python3 sampler.py --model gcn_cv --dataset citeseer --self-loop --num-neighbors 1 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
#### pubmed #### pubmed
...@@ -111,5 +111,5 @@ DGLBACKEND=pytorch python3 gcn_cv_sc_train.py --dataset pubmed --self-loop --num ...@@ -111,5 +111,5 @@ DGLBACKEND=pytorch python3 gcn_cv_sc_train.py --dataset pubmed --self-loop --num
Sampler side: Sampler side:
``` ```
DGLBACKEND=pytorch python3 sampler.py --model gcn_cv --dataset pubmed --self-loop --num-neighbors 1 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1 OMP_NUM_THREADS=1 DGLBACKEND=pytorch python3 sampler.py --model gcn_cv --dataset pubmed --self-loop --num-neighbors 1 --batch-size 1000000 --ip 127.0.0.1:50051 --num-sampler 1
``` ```
...@@ -76,7 +76,7 @@ def start_server(server_id, ip_config, num_client, ndata, edata, ndata_g2l=None, ...@@ -76,7 +76,7 @@ def start_server(server_id, ip_config, num_client, ndata, edata, ndata_g2l=None,
edata_g2l : dict of tensor (mx.ndarray or torch.tensor) edata_g2l : dict of tensor (mx.ndarray or torch.tensor)
global2local mapping of edge data global2local mapping of edge data
msg_queue_size : int msg_queue_size : int
Size of message queue Size of message queue (2GB by default)
""" """
assert server_id >= 0, 'server_id (%d) cannot be a negative number.' % server_id assert server_id >= 0, 'server_id (%d) cannot be a negative number.' % server_id
assert len(ip_config) > 0, 'ip_config cannot be empty.' assert len(ip_config) > 0, 'ip_config cannot be empty.'
...@@ -123,7 +123,7 @@ def start_client(ip_config, ndata_partition_book, edata_partition_book, close_sh ...@@ -123,7 +123,7 @@ def start_client(ip_config, ndata_partition_book, edata_partition_book, close_sh
close_shared_mem : bool close_shared_mem : bool
Close local shared-memory tensor access. Close local shared-memory tensor access.
msg_queue_size : int msg_queue_size : int
Size of message queue Size of message queue (2GB by default)
Returns Returns
------- -------
...@@ -171,7 +171,7 @@ class KVServer(object): ...@@ -171,7 +171,7 @@ class KVServer(object):
num_client : int num_client : int
Total number of clients connecting to server. Total number of clients connecting to server.
msg_queue_size : int msg_queue_size : int
Size of message queue Size of message queue (2GB by default)
net_type : str net_type : str
networking type, e.g., 'socket' (default) or 'mpi' (do not support yet). networking type, e.g., 'socket' (default) or 'mpi' (do not support yet).
""" """
...@@ -478,7 +478,7 @@ class KVClient(object): ...@@ -478,7 +478,7 @@ class KVClient(object):
close_shared_mem : bool close_shared_mem : bool
DO NOT use shared-memory access on local machine. DO NOT use shared-memory access on local machine.
msg_queue_size : int msg_queue_size : int
Size of message queue. Size of message queue (2GB by default).
net_type : str net_type : str
networking type, e.g., 'socket' (default) or 'mpi'. networking type, e.g., 'socket' (default) or 'mpi'.
""" """
......
...@@ -31,7 +31,7 @@ def _network_wait(): ...@@ -31,7 +31,7 @@ def _network_wait():
""" """
time.sleep(_WAIT_TIME_SEC) time.sleep(_WAIT_TIME_SEC)
def _create_sender(net_type, msg_queue_size=2000*1024*1024*1024): def _create_sender(net_type, msg_queue_size=2*1024*1024*1024):
"""Create a Sender communicator via C api """Create a Sender communicator via C api
Parameters Parameters
...@@ -39,12 +39,12 @@ def _create_sender(net_type, msg_queue_size=2000*1024*1024*1024): ...@@ -39,12 +39,12 @@ def _create_sender(net_type, msg_queue_size=2000*1024*1024*1024):
net_type : str net_type : str
'socket' or 'mpi' 'socket' or 'mpi'
msg_queue_size : int msg_queue_size : int
message queue size message queue size (2GB by default)
""" """
assert net_type in ('socket', 'mpi'), 'Unknown network type.' assert net_type in ('socket', 'mpi'), 'Unknown network type.'
return _CAPI_DGLSenderCreate(net_type, msg_queue_size) return _CAPI_DGLSenderCreate(net_type, msg_queue_size)
def _create_receiver(net_type, msg_queue_size=2000*1024*1024*1024): def _create_receiver(net_type, msg_queue_size=2*1024*1024*1024):
"""Create a Receiver communicator via C api """Create a Receiver communicator via C api
Parameters Parameters
...@@ -52,7 +52,7 @@ def _create_receiver(net_type, msg_queue_size=2000*1024*1024*1024): ...@@ -52,7 +52,7 @@ def _create_receiver(net_type, msg_queue_size=2000*1024*1024*1024):
net_type : str net_type : str
'socket' or 'mpi' 'socket' or 'mpi'
msg_queue_size : int msg_queue_size : int
message queue size message queue size (2GB by default)
""" """
assert net_type in ('socket', 'mpi'), 'Unknown network type.' assert net_type in ('socket', 'mpi'), 'Unknown network type.'
return _CAPI_DGLReceiverCreate(net_type, msg_queue_size) return _CAPI_DGLReceiverCreate(net_type, msg_queue_size)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment