Unverified Commit 8531ee6a authored by xiang song(charlie.song)'s avatar xiang song(charlie.song) Committed by GitHub
Browse files

Fix input data source (#1612)


Co-authored-by: default avatarUbuntu <ubuntu@ip-172-31-52-181.ec2.internal>
Co-authored-by: default avatarJinjing Zhou <VoVAllen@users.noreply.github.com>
parent 484bbcc8
...@@ -12,27 +12,30 @@ The implementation includes multi-processing training with CPU and mixed trainin ...@@ -12,27 +12,30 @@ The implementation includes multi-processing training with CPU and mixed trainin
- PyTorch 1.5.0 - PyTorch 1.5.0
- DGL 0.4.3 - DGL 0.4.3
## How to run the code
Format of a network file: ## Input data
Currently, we support two built-in datasets: youtube and blog. Use --data\_file youtube to select the youtube dataset and --data\_file blog to select the blog dataset.
The data is available at https://data.dgl.ai/dataset/DeepWalk/youtube.zip and https://data.dgl.ai/dataset/DeepWalk/blog.zip
The youtube.zip includes youtube-net.txt, youtube-vocab.txt and youtube-label.txt; the blog.zip includes blog-net.txt, blog-vocab.txt and blog-label.txt.
For other datasets, please pass the full path of the network file to the trainer through --data\_file. The network file should follow this format:
``` ```
1(node id) 2(node id) 1(node id) 2(node id)
1 3 1 3
1 4
2 4
... ...
``` ```
## How to run the code
To run the code: To run the code:
``` ```
python3 deepwalk.py --net_file net.txt --emb_file emb.txt --adam --mix --lr 0.2 --num_procs 4 --batch_size 100 --negative 5 python3 deepwalk.py --data_file youtube --output_emb_file emb.txt --adam --mix --lr 0.2 --gpus 0 1 2 3 --batch_size 100 --negative 5
``` ```
## How to save the embedding ## How to save the embedding
By default, the trained embedding is saved to the path given by --output\_emb\_file FILE\_NAME as a numpy object.
Functions: To save the trained embedding in raw format (txt format), please use the --save\_in\_txt argument.
```
SkipGramModel.save_embedding(dataset, file_name)
SkipGramModel.save_embedding_txt(dataset, file_name)
```
## Evaluation ## Evaluation
...@@ -60,4 +63,4 @@ Parameters. ...@@ -60,4 +63,4 @@ Parameters.
Speeding-up with mixed CPU & multi-GPU. The used parameters are the same as above. Speeding-up with mixed CPU & multi-GPU. The used parameters are the same as above.
| #GPUs | 1 | 2 | 4 | | #GPUs | 1 | 2 | 4 |
|----------|-------|-------|-------| |----------|-------|-------|-------|
| Time (s) |1419.64| 952.04|428.89 | | Time (s) |1419.64| 952.04|428.89 |
\ No newline at end of file
...@@ -17,14 +17,14 @@ class DeepwalkTrainer: ...@@ -17,14 +17,14 @@ class DeepwalkTrainer:
""" Initializing the trainer with the input arguments """ """ Initializing the trainer with the input arguments """
self.args = args self.args = args
self.dataset = DeepwalkDataset( self.dataset = DeepwalkDataset(
net_file=args.net_file, net_file=args.data_file,
map_file=args.map_file, map_file=args.map_file,
walk_length=args.walk_length, walk_length=args.walk_length,
window_size=args.window_size, window_size=args.window_size,
num_walks=args.num_walks, num_walks=args.num_walks,
batch_size=args.batch_size, batch_size=args.batch_size,
negative=args.negative, negative=args.negative,
num_procs=args.num_procs, gpus=args.gpus,
fast_neg=args.fast_neg, fast_neg=args.fast_neg,
) )
self.emb_size = len(self.dataset.net) self.emb_size = len(self.dataset.net)
...@@ -36,7 +36,6 @@ class DeepwalkTrainer: ...@@ -36,7 +36,6 @@ class DeepwalkTrainer:
""" """
choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix]) choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]" assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"
assert self.args.num_procs >= 1, "The number of process must be larger than 1"
choices = sum([self.args.sgd, self.args.adam, self.args.avg_sgd]) choices = sum([self.args.sgd, self.args.adam, self.args.avg_sgd])
assert choices == 1, "Must choose only *one* gradient descent strategy in [sgd, avg_sgd, adam]" assert choices == 1, "Must choose only *one* gradient descent strategy in [sgd, avg_sgd, adam]"
...@@ -63,17 +62,21 @@ class DeepwalkTrainer: ...@@ -63,17 +62,21 @@ class DeepwalkTrainer:
torch.set_num_threads(self.args.num_threads) torch.set_num_threads(self.args.num_threads)
if self.args.only_gpu: if self.args.only_gpu:
print("Run in 1 GPU") print("Run in 1 GPU")
self.emb_model.all_to_device(0) assert self.args.gpus[0] >= 0
self.emb_model.all_to_device(self.args.gpus[0])
elif self.args.mix: elif self.args.mix:
print("Mix CPU with %d GPU" % self.args.num_procs) print("Mix CPU with %d GPU" % len(self.args.gpus))
if self.args.num_procs == 1: if len(self.args.gpus) == 1:
self.emb_model.set_device(0) assert self.args.gpus[0] >= 0, 'mix CPU with GPU should have abaliable GPU'
self.emb_model.set_device(self.args.gpus[0])
else: else:
print("Run in %d CPU process" % self.args.num_procs) print("Run in CPU process")
self.args.gpus = [torch.device('cpu')]
def train(self): def train(self):
""" train the embedding """ """ train the embedding """
if self.args.num_procs > 1: if len(self.args.gpus) > 1:
self.fast_train_mp() self.fast_train_mp()
else: else:
self.fast_train() self.fast_train()
...@@ -86,9 +89,8 @@ class DeepwalkTrainer: ...@@ -86,9 +89,8 @@ class DeepwalkTrainer:
start_all = time.time() start_all = time.time()
ps = [] ps = []
np_ = self.args.num_procs for i in range(len(self.args.gpus)):
for i in range(np_): p = mp.Process(target=self.fast_train_sp, args=(self.args.gpus[i],))
p = mp.Process(target=self.fast_train_sp, args=(i,))
ps.append(p) ps.append(p)
p.start() p.start()
...@@ -96,7 +98,10 @@ class DeepwalkTrainer: ...@@ -96,7 +98,10 @@ class DeepwalkTrainer:
p.join() p.join()
print("Used time: %.2fs" % (time.time()-start_all)) print("Used time: %.2fs" % (time.time()-start_all))
self.emb_model.save_embedding(self.dataset, self.args.emb_file) if self.args.save_in_txt:
self.emb_model.save_embedding_txt(self.dataset, self.args.output_emb_file)
else:
self.emb_model.save_embedding(self.dataset, self.args.output_emb_file)
@thread_wrapped_func @thread_wrapped_func
def fast_train_sp(self, gpu_id): def fast_train_sp(self, gpu_id):
...@@ -198,14 +203,19 @@ class DeepwalkTrainer: ...@@ -198,14 +203,19 @@ class DeepwalkTrainer:
start = time.time() start = time.time()
print("Training used time: %.2fs" % (time.time()-start_all)) print("Training used time: %.2fs" % (time.time()-start_all))
self.emb_model.save_embedding(self.dataset, self.args.emb_file) if self.args.save_in_txt:
self.emb_model.save_embedding_txt(self.dataset, self.args.output_emb_file)
else:
self.emb_model.save_embedding(self.dataset, self.args.output_emb_file)
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser(description="DeepWalk") parser = argparse.ArgumentParser(description="DeepWalk")
parser.add_argument('--net_file', type=str, parser.add_argument('--data_file', type=str,
help="path of the txt network file") help="path of the txt network file, builtin dataset include youtube-net and blog-net")
parser.add_argument('--emb_file', type=str, default="emb.npy", parser.add_argument('--save_in_txt', default=False, action="store_true",
help='path of the npy embedding file') help='Whether save dat in txt format or npy')
parser.add_argument('--output_emb_file', type=str, default="emb.npy",
help='path of the output npy embedding file')
parser.add_argument('--map_file', type=str, default="nodeid_to_index.pickle", parser.add_argument('--map_file', type=str, default="nodeid_to_index.pickle",
help='path of the mapping dict that maps node ids to embedding index') help='path of the mapping dict that maps node ids to embedding index')
parser.add_argument('--dim', default=128, type=int, parser.add_argument('--dim', default=128, type=int,
...@@ -246,11 +256,11 @@ if __name__ == '__main__': ...@@ -246,11 +256,11 @@ if __name__ == '__main__':
help="average gradients of sgd for embedding updation") help="average gradients of sgd for embedding updation")
parser.add_argument('--num_threads', default=2, type=int, parser.add_argument('--num_threads', default=2, type=int,
help="number of threads used for each CPU-core/GPU") help="number of threads used for each CPU-core/GPU")
parser.add_argument('--num_procs', default=1, type=int, parser.add_argument('--gpus', type=int, default=[-1], nargs='+',
help="number of GPUs/CPUs when mixed training") help='a list of active gpu ids, e.g. 0')
args = parser.parse_args() args = parser.parse_args()
start_time = time.time() start_time = time.time()
trainer = DeepwalkTrainer(args) trainer = DeepwalkTrainer(args)
trainer.train() trainer.train()
print("Total used time: %.2f" % (time.time() - start_time)) print("Total used time: %.2f" % (time.time() - start_time))
\ No newline at end of file
import os
import numpy as np import numpy as np
import scipy.sparse as sp import scipy.sparse as sp
import pickle import pickle
import torch import torch
from torch.utils.data import DataLoader from torch.utils.data import DataLoader
from dgl.data.utils import download, _get_dgl_url, get_download_dir, extract_archive
import random import random
import time import time
import dgl import dgl
from utils import shuffle_walks from utils import shuffle_walks
np.random.seed(3141592653) #np.random.seed(3141592653)
def ReadTxtNet(file_path="", undirected=True): def ReadTxtNet(file_path="", undirected=True):
""" Read the txt network file. """ Read the txt network file.
...@@ -24,6 +26,15 @@ def ReadTxtNet(file_path="", undirected=True): ...@@ -24,6 +26,15 @@ def ReadTxtNet(file_path="", undirected=True):
node2id dict : a dict mapping the nodes to their embedding indices node2id dict : a dict mapping the nodes to their embedding indices
id2node dict : a dict mapping nodes embedding indices to the nodes id2node dict : a dict mapping nodes embedding indices to the nodes
""" """
if file_path == 'youtube' or file_path == 'blog':
name = file_path
dir = get_download_dir()
zip_file_path='{}/{}.zip'.format(dir, name)
download(_get_dgl_url(os.path.join('dataset/DeepWalk/', '{}.zip'.format(file_path))), path=zip_file_path)
extract_archive(zip_file_path,
'{}/{}'.format(dir, name))
file_path = "{}/{}/{}-net.txt".format(dir, name, name)
node2id = {} node2id = {}
id2node = {} id2node = {}
cid = 0 cid = 0
...@@ -97,7 +108,7 @@ class DeepwalkDataset: ...@@ -97,7 +108,7 @@ class DeepwalkDataset:
num_walks=10, num_walks=10,
batch_size=32, batch_size=32,
negative=5, negative=5,
num_procs=4, gpus=[0],
fast_neg=True, fast_neg=True,
): ):
""" This class has the following functions: """ This class has the following functions:
...@@ -121,7 +132,7 @@ class DeepwalkDataset: ...@@ -121,7 +132,7 @@ class DeepwalkDataset:
self.num_walks = num_walks self.num_walks = num_walks
self.batch_size = batch_size self.batch_size = batch_size
self.negative = negative self.negative = negative
self.num_procs = num_procs self.num_procs = len(gpus)
self.fast_neg = fast_neg self.fast_neg = fast_neg
self.net, self.node2id, self.id2node, self.sm = ReadTxtNet(net_file) self.net, self.node2id, self.id2node, self.sm = ReadTxtNet(net_file)
self.save_mapping(map_file) self.save_mapping(map_file)
...@@ -175,4 +186,4 @@ class DeepwalkSampler(object): ...@@ -175,4 +186,4 @@ class DeepwalkSampler(object):
def sample(self, seeds): def sample(self, seeds):
walks = dgl.contrib.sampling.random_walk(self.G, seeds, walks = dgl.contrib.sampling.random_walk(self.G, seeds,
1, self.walk_length-1) 1, self.walk_length-1)
return walks return walks
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment