Unverified Commit b133abb8 authored by Chao Ma's avatar Chao Ma Committed by GitHub
Browse files

[KVStore] New kvstore used by DGL-KE (#1263)

* new kvstore

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* test warning

* update

* update

* udpate

* update

* update

* update

* update

* small fix

* small fix

* get group count

* update

* update

* make file

* update

* use addr

* get id

* partition book

* update

* partition

* barrier

* update

* loop count

* update

* update

* update

* update

* update

* update

* update

* update

* update

* add mxnet demo

* update ip

* update

* update

* update

* random

* update

* update

* update

* update

* update

* update

* fix lint

* fix lint

* fix lint
parent 49fe5b3c
## Usage of DGL distributed KVStore
This is a simple example that shows how to use the DGL distributed KVStore with MXNet locally. In this example, we start 4 servers and 4 clients. First, run the command:
./run_server.sh
When you see the following messages
start server 1 on 127.0.0.1:50051
start server 2 on 127.0.0.1:50052
start server 0 on 127.0.0.1:50050
start server 3 on 127.0.0.1:50053
you can start the clients with:
./run_client.sh
# This is a simple MXNet client demo that shows how to use the DGL distributed kvstore.
import dgl
import os
import argparse
import mxnet as mx
import time
import dgl
from dgl.contrib import KVClient
import mxnet as mx
partition = mx.nd.array([0,0,1,1,2,2,3,3], dtype='int64')
ID = []
ID.append(mx.nd.array([0,1], dtype='int64'))
ID.append(mx.nd.array([2,3], dtype='int64'))
......@@ -16,44 +21,60 @@ DATA.append(mx.nd.array([[2.,2.,2.,],[2.,2.,2.,]]))
DATA.append(mx.nd.array([[3.,3.,3.,],[3.,3.,3.,]]))
DATA.append(mx.nd.array([[4.,4.,4.,],[4.,4.,4.,]]))
edata_partition_book = {'edata':mx.nd.array([0,0,1,1,2,2,3,3], dtype='int64')}
ndata_partition_book = {'ndata':mx.nd.array([0,0,1,1,2,2,3,3], dtype='int64')}
def start_client():
time.sleep(3)
class ArgParser(argparse.ArgumentParser):
def __init__(self):
super(ArgParser, self).__init__()
client = dgl.contrib.start_client(ip_config='ip_config.txt',
ndata_partition_book=ndata_partition_book,
edata_partition_book=edata_partition_book,
close_shared_mem=True)
self.add_argument('--ip_config', type=str, default='ip_config.txt',
help='IP configuration file of kvstore.')
self.add_argument('--num_worker', type=int, default=2,
help='Number of worker (client nodes) on single-machine.')
tensor_edata = client.pull(name='edata', id_tensor=mx.nd.array([0,1,2,3,4,5,6,7], dtype='int64'))
tensor_ndata = client.pull(name='ndata', id_tensor=mx.nd.array([0,1,2,3,4,5,6,7], dtype='int64'))
def start_client(args):
"""Start client
"""
server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)
print(tensor_edata)
client.barrier()
my_client = KVClient(server_namebook=server_namebook)
print(tensor_ndata)
client.barrier()
my_client.connect()
client.push(name='edata', id_tensor=ID[client.get_id()], data_tensor=DATA[client.get_id()])
client.push(name='ndata', id_tensor=ID[client.get_id()], data_tensor=DATA[client.get_id()])
if my_client.get_id() % args.num_worker == 0:
my_client.set_partition_book(name='entity_embed', partition_book=partition)
else:
time.sleep(3)
my_client.set_partition_book(name='entity_embed')
client.barrier()
my_client.print()
tensor_edata = client.pull(name='edata', id_tensor=mx.nd.array([0,1,2,3,4,5,6,7], dtype='int64'))
tensor_ndata = client.pull(name='ndata', id_tensor=mx.nd.array([0,1,2,3,4,5,6,7], dtype='int64'))
my_client.barrier()
print(tensor_edata)
client.barrier()
print("send request...")
print(tensor_ndata)
client.barrier()
for i in range(4):
my_client.push(name='entity_embed', id_tensor=ID[i], data_tensor=DATA[i])
if client.get_id() == 0:
client.shut_down()
my_client.barrier()
if __name__ == '__main__':
if my_client.get_id() % args.num_worker == 0:
res = my_client.pull(name='entity_embed', id_tensor=mx.nd.array([0,1,2,3,4,5,6,7], dtype='int64'))
print(res)
start_client()
\ No newline at end of file
my_client.barrier()
my_client.push(name='entity_embed', id_tensor=ID[my_client.get_machine_id()], data_tensor=mx.nd.array([[0.,0.,0.],[0.,0.,0.]]))
my_client.barrier()
if my_client.get_id() % args.num_worker == 0:
res = my_client.pull(name='entity_embed', id_tensor=mx.nd.array([0,1,2,3,4,5,6,7], dtype='int64'))
print(res)
my_client.shut_down()
if __name__ == '__main__':
args = ArgParser().parse_args()
start_client(args)
\ No newline at end of file
0 127.0.0.1 50050
1 127.0.0.1 50051
2 127.0.0.1 50052
3 127.0.0.1 50053
\ No newline at end of file
0 172.31.6.94 30050 2
1 172.31.4.10 30050 2
2 172.31.11.99 30050 2
3 172.31.2.252 30050 2
\ No newline at end of file
DGLBACKEND=mxnet python3 client.py &
DGLBACKEND=mxnet python3 client.py &
DGLBACKEND=mxnet python3 client.py &
DGLBACKEND=mxnet python3 client.py
\ No newline at end of file
DGLBACKEND=mxnet python3 server.py --id 0 &
DGLBACKEND=mxnet python3 server.py --id 1 &
DGLBACKEND=mxnet python3 server.py --id 2 &
DGLBACKEND=mxnet python3 server.py --id 3
\ No newline at end of file
# This is a simple MXNet server demo that shows how to use the DGL distributed kvstore.
import dgl
import os
import argparse
import time
import dgl
from dgl.contrib import KVServer
import mxnet as mx
ndata_g2l = []
edata_g2l = []
g2l = []
g2l.append(mx.nd.array([0,1,0,0,0,0,0,0], dtype='int64'))
g2l.append(mx.nd.array([0,0,0,1,0,0,0,0], dtype='int64'))
g2l.append(mx.nd.array([0,0,0,0,0,1,0,0], dtype='int64'))
g2l.append(mx.nd.array([0,0,0,0,0,0,0,1], dtype='int64'))
data = []
data.append(mx.nd.array([[4.,4.,4.],[4.,4.,4.]]))
data.append(mx.nd.array([[3.,3.,3.],[3.,3.,3.]]))
data.append(mx.nd.array([[2.,2.,2.],[2.,2.,2.]]))
data.append(mx.nd.array([[1.,1.,1.],[1.,1.,1.]]))
ndata_g2l.append({'ndata':mx.nd.array([0,1,0,0,0,0,0,0], dtype='int64')})
ndata_g2l.append({'ndata':mx.nd.array([0,0,0,1,0,0,0,0], dtype='int64')})
ndata_g2l.append({'ndata':mx.nd.array([0,0,0,0,0,1,0,0], dtype='int64')})
ndata_g2l.append({'ndata':mx.nd.array([0,0,0,0,0,0,0,1], dtype='int64')})
class ArgParser(argparse.ArgumentParser):
def __init__(self):
super(ArgParser, self).__init__()
edata_g2l.append({'edata':mx.nd.array([0,1,0,0,0,0,0,0], dtype='int64')})
edata_g2l.append({'edata':mx.nd.array([0,0,0,1,0,0,0,0], dtype='int64')})
edata_g2l.append({'edata':mx.nd.array([0,0,0,0,0,1,0,0], dtype='int64')})
edata_g2l.append({'edata':mx.nd.array([0,0,0,0,0,0,0,1], dtype='int64')})
self.add_argument('--server_id', type=int, default=0,
help='Unique ID of each server.')
self.add_argument('--ip_config', type=str, default='ip_config.txt',
help='IP configuration file of kvstore.')
self.add_argument('--num_client', type=int, default=1,
help='Total number of client nodes.')
DATA = []
DATA.append(mx.nd.array([[4.,4.,4.,],[4.,4.,4.,]]))
DATA.append(mx.nd.array([[3.,3.,3.,],[3.,3.,3.,]]))
DATA.append(mx.nd.array([[2.,2.,2.,],[2.,2.,2.,]]))
DATA.append(mx.nd.array([[1.,1.,1.,],[1.,1.,1.,]]))
def start_server(args):
dgl.contrib.start_server(
server_id=args.id,
ip_config='ip_config.txt',
num_client=4,
ndata={'ndata':DATA[args.id]},
edata={'edata':DATA[args.id]},
ndata_g2l=ndata_g2l[args.id],
edata_g2l=edata_g2l[args.id])
"""Start kvstore service
"""
server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)
my_server = KVServer(server_id=args.server_id, server_namebook=server_namebook, num_client=args.num_client)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='kvstore')
parser.add_argument("--id", type=int, default=0, help="node ID")
args = parser.parse_args()
if my_server.get_id() % my_server.get_group_count() == 0: # master server
my_server.set_global2local(name='entity_embed', global2local=g2l[my_server.get_machine_id()])
my_server.init_data(name='entity_embed', data_tensor=data[my_server.get_machine_id()])
else:
time.sleep(3)
my_server.set_global2local(name='entity_embed')
my_server.init_data(name='entity_embed')
my_server.print()
my_server.start()
if __name__ == '__main__':
args = ArgParser().parse_args()
start_server(args)
\ No newline at end of file
## Usage of DGL distributed KVStore
This is a simple example that shows how to use the DGL distributed KVStore with PyTorch locally. In this example, we start 4 servers and 4 clients. First, run the command:
./run_server.sh
When you see the following messages
start server 1 on 127.0.0.1:40051
start server 2 on 127.0.0.1:40052
start server 0 on 127.0.0.1:40050
start server 3 on 127.0.0.1:40053
you can start the clients with:
./run_client.sh
\ No newline at end of file
# This is a simple PyTorch client demo that shows how to use the DGL distributed kvstore.
import dgl
import os
import argparse
import torch as th
import time
import dgl
from dgl.contrib import KVClient
import torch as th
partition = th.tensor([0,0,1,1,2,2,3,3])
ID = []
ID.append(th.tensor([0,1]))
ID.append(th.tensor([2,3]))
......@@ -16,43 +21,60 @@ DATA.append(th.tensor([[2.,2.,2.,],[2.,2.,2.,]]))
DATA.append(th.tensor([[3.,3.,3.,],[3.,3.,3.,]]))
DATA.append(th.tensor([[4.,4.,4.,],[4.,4.,4.,]]))
edata_partition_book = {'edata':th.tensor([0,0,1,1,2,2,3,3])}
ndata_partition_book = {'ndata':th.tensor([0,0,1,1,2,2,3,3])}
def start_client():
time.sleep(3)
class ArgParser(argparse.ArgumentParser):
def __init__(self):
super(ArgParser, self).__init__()
client = dgl.contrib.start_client(ip_config='ip_config.txt',
ndata_partition_book=ndata_partition_book,
edata_partition_book=edata_partition_book,
close_shared_mem=True)
self.add_argument('--ip_config', type=str, default='ip_config.txt',
help='IP configuration file of kvstore.')
self.add_argument('--num_worker', type=int, default=2,
help='Number of worker (client nodes) on single-machine.')
tensor_edata = client.pull(name='edata', id_tensor=th.tensor([0,1,2,3,4,5,6,7]))
tensor_ndata = client.pull(name='ndata', id_tensor=th.tensor([0,1,2,3,4,5,6,7]))
print(tensor_edata)
client.barrier()
def start_client(args):
"""Start client
"""
server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)
print(tensor_ndata)
client.barrier()
my_client = KVClient(server_namebook=server_namebook)
client.push(name='edata', id_tensor=ID[client.get_id()], data_tensor=DATA[client.get_id()])
client.push(name='ndata', id_tensor=ID[client.get_id()], data_tensor=DATA[client.get_id()])
my_client.connect()
client.barrier()
if my_client.get_id() % args.num_worker == 0:
my_client.set_partition_book(name='entity_embed', partition_book=partition)
else:
time.sleep(3)
my_client.set_partition_book(name='entity_embed')
tensor_edata = client.pull(name='edata', id_tensor=th.tensor([0,1,2,3,4,5,6,7]))
tensor_ndata = client.pull(name='ndata', id_tensor=th.tensor([0,1,2,3,4,5,6,7]))
my_client.print()
print(tensor_edata)
client.barrier()
my_client.barrier()
print(tensor_ndata)
client.barrier()
print("send request...")
if client.get_id() == 0:
client.shut_down()
for i in range(4):
my_client.push(name='entity_embed', id_tensor=ID[i], data_tensor=DATA[i])
if __name__ == '__main__':
my_client.barrier()
start_client()
\ No newline at end of file
if my_client.get_id() % args.num_worker == 0:
res = my_client.pull(name='entity_embed', id_tensor=th.tensor([0,1,2,3,4,5,6,7]))
print(res)
my_client.barrier()
my_client.push(name='entity_embed', id_tensor=ID[my_client.get_machine_id()], data_tensor=th.tensor([[0.,0.,0.],[0.,0.,0.]]))
my_client.barrier()
if my_client.get_id() % args.num_worker == 0:
res = my_client.pull(name='entity_embed', id_tensor=th.tensor([0,1,2,3,4,5,6,7]))
print(res)
my_client.shut_down()
if __name__ == '__main__':
args = ArgParser().parse_args()
start_client(args)
\ No newline at end of file
0 127.0.0.1 50050
1 127.0.0.1 50051
2 127.0.0.1 50052
3 127.0.0.1 50053
\ No newline at end of file
0 172.31.6.94 30050 2
1 172.31.4.10 30050 2
2 172.31.11.99 30050 2
3 172.31.2.252 30050 2
\ No newline at end of file
python3 client.py &
python3 client.py &
python3 client.py &
python3 client.py
\ No newline at end of file
python3 server.py --id 0 &
python3 server.py --id 1 &
python3 server.py --id 2 &
python3 server.py --id 3
\ No newline at end of file
# This is a simple PyTorch server demo that shows how to use the DGL distributed kvstore.
import dgl
import os
import argparse
import time
import dgl
from dgl.contrib import KVServer
import torch as th
ndata_g2l = []
edata_g2l = []
g2l = []
g2l.append(th.tensor([0,1,0,0,0,0,0,0]))
g2l.append(th.tensor([0,0,0,1,0,0,0,0]))
g2l.append(th.tensor([0,0,0,0,0,1,0,0]))
g2l.append(th.tensor([0,0,0,0,0,0,0,1]))
data = []
data.append(th.tensor([[4.,4.,4.],[4.,4.,4.]]))
data.append(th.tensor([[3.,3.,3.],[3.,3.,3.]]))
data.append(th.tensor([[2.,2.,2.],[2.,2.,2.]]))
data.append(th.tensor([[1.,1.,1.],[1.,1.,1.]]))
ndata_g2l.append({'ndata':th.tensor([0,1,0,0,0,0,0,0])})
ndata_g2l.append({'ndata':th.tensor([0,0,0,1,0,0,0,0])})
ndata_g2l.append({'ndata':th.tensor([0,0,0,0,0,1,0,0])})
ndata_g2l.append({'ndata':th.tensor([0,0,0,0,0,0,0,1])})
class ArgParser(argparse.ArgumentParser):
def __init__(self):
super(ArgParser, self).__init__()
edata_g2l.append({'edata':th.tensor([0,1,0,0,0,0,0,0])})
edata_g2l.append({'edata':th.tensor([0,0,0,1,0,0,0,0])})
edata_g2l.append({'edata':th.tensor([0,0,0,0,0,1,0,0])})
edata_g2l.append({'edata':th.tensor([0,0,0,0,0,0,0,1])})
self.add_argument('--server_id', type=int, default=0,
help='Unique ID of each server.')
self.add_argument('--ip_config', type=str, default='ip_config.txt',
help='IP configuration file of kvstore.')
self.add_argument('--num_client', type=int, default=1,
help='Total number of client nodes.')
DATA = []
DATA.append(th.tensor([[4.,4.,4.,],[4.,4.,4.,]]))
DATA.append(th.tensor([[3.,3.,3.,],[3.,3.,3.,]]))
DATA.append(th.tensor([[2.,2.,2.,],[2.,2.,2.,]]))
DATA.append(th.tensor([[1.,1.,1.,],[1.,1.,1.,]]))
def start_server(args):
dgl.contrib.start_server(
server_id=args.id,
ip_config='ip_config.txt',
num_client=4,
ndata={'ndata':DATA[args.id]},
edata={'edata':DATA[args.id]},
ndata_g2l=ndata_g2l[args.id],
edata_g2l=edata_g2l[args.id])
"""Start kvstore service
"""
server_namebook = dgl.contrib.read_ip_config(filename=args.ip_config)
my_server = KVServer(server_id=args.server_id, server_namebook=server_namebook, num_client=args.num_client)
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='kvstore')
parser.add_argument("--id", type=int, default=0, help="node ID")
args = parser.parse_args()
if my_server.get_id() % my_server.get_group_count() == 0: # master server
my_server.set_global2local(name='entity_embed', global2local=g2l[my_server.get_machine_id()])
my_server.init_data(name='entity_embed', data_tensor=data[my_server.get_machine_id()])
else:
time.sleep(3)
my_server.set_global2local(name='entity_embed')
my_server.init_data(name='entity_embed')
my_server.print()
my_server.start()
if __name__ == '__main__':
args = ArgParser().parse_args()
start_server(args)
\ No newline at end of file
from . import sampling
from . import graph_store
from .dis_kvstore import KVClient, KVServer
from .dis_kvstore import read_ip_config
from .dis_kvstore import start_server, start_client
\ No newline at end of file
from .dis_kvstore import read_ip_config
\ No newline at end of file
This diff is collapsed.
......@@ -23,6 +23,7 @@ def _network_wait():
"""
time.sleep(_WAIT_TIME_SEC)
def _create_sender(net_type, msg_queue_size=2*1024*1024*1024):
"""Create a Sender communicator via C api
......@@ -36,6 +37,7 @@ def _create_sender(net_type, msg_queue_size=2*1024*1024*1024):
assert net_type in ('socket', 'mpi'), 'Unknown network type.'
return _CAPI_DGLSenderCreate(net_type, msg_queue_size)
def _create_receiver(net_type, msg_queue_size=2*1024*1024*1024):
"""Create a Receiver communicator via C api
......@@ -49,6 +51,7 @@ def _create_receiver(net_type, msg_queue_size=2*1024*1024*1024):
assert net_type in ('socket', 'mpi'), 'Unknown network type.'
return _CAPI_DGLReceiverCreate(net_type, msg_queue_size)
def _finalize_sender(sender):
"""Finalize Sender communicator
......@@ -59,11 +62,13 @@ def _finalize_sender(sender):
"""
_CAPI_DGLFinalizeSender(sender)
def _finalize_receiver(receiver):
"""Finalize Receiver communicator via C api
Parameters
----------
receiver : ctypes.c_void_p
C Receiver handle
"""
_CAPI_DGLFinalizeReceiver(receiver)
def _add_receiver_addr(sender, ip_addr, port, recv_id):
"""Add Receiver IP address to namebook
......@@ -81,6 +86,7 @@ def _add_receiver_addr(sender, ip_addr, port, recv_id):
assert recv_id >= 0, 'recv_id cannot be a negative number.'
_CAPI_DGLSenderAddReceiver(sender, ip_addr, int(port), int(recv_id))
def _sender_connect(sender):
"""Connect to all the Receiver
......@@ -91,8 +97,9 @@ def _sender_connect(sender):
"""
_CAPI_DGLSenderConnect(sender)
def _receiver_wait(receiver, ip_addr, port, num_sender):
"""Wait all Sender to connect..
"""Wait all Sender to connect.
Parameters
----------
......@@ -186,7 +193,8 @@ class KVMsgType(Enum):
BARRIER = 6
IP_ID = 7
KVStoreMsg = namedtuple("KVStoreMsg", "type rank name id data, c_ptr")
KVStoreMsg = namedtuple("KVStoreMsg", "type rank name id data c_ptr")
"""Message of DGL kvstore
Data Field
......@@ -201,6 +209,8 @@ id : tensor (mx.ndarray or torch.tensor)
data vector storing the global IDs
data : tensor (mx.ndarray or torch.tensor)
data matrix with the same row size of id
c_ptr : void*
c pointer of message
"""
def _send_kv_msg(sender, msg, recv_id):
......@@ -249,6 +259,7 @@ def _send_kv_msg(sender, msg, recv_id):
tensor_id,
data)
def _recv_kv_msg(receiver):
"""Receive kvstore message.
......@@ -256,7 +267,6 @@ def _recv_kv_msg(receiver):
----------
receiver : ctypes.c_void_p
C Receiver handle
Return
------
KVStoreMsg
......@@ -319,4 +329,3 @@ def _clear_kv_msg(garbage_msg):
if msg.c_ptr is not None:
_CAPI_DeleteKVMsg(msg.c_ptr)
garbage_msg = []
\ No newline at end of file
......@@ -24,7 +24,6 @@ using namespace dgl::runtime;
namespace dgl {
namespace network {
static void NaiveDeleter(DLManagedTensor* managed_tensor) {
delete [] managed_tensor->dl_tensor.shape;
delete [] managed_tensor->dl_tensor.strides;
......@@ -607,7 +606,5 @@ DGL_REGISTER_GLOBAL("network._CAPI_DeleteKVMsg")
delete msg;
});
} // namespace network
} // namespace dgl
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment