Fix input data source (#1612)

Co-authored-by: Ubuntu <ubuntu@ip-172-31-52-181.ec2.internal> Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>

Fix input data source (#1612)
Co-authored-by: Ubuntu <ubuntu@ip-172-31-52-181.ec2.internal> Co-authored-by: Jinjing Zhou <VoVAllen@users.noreply.github.com>
8531ee6a · xiang song(charlie.song) · GitHub · 484bbcc8 · 8531ee6a · 8531ee6a
Unverified Commit 8531ee6a authored Jun 10, 2020 by xiang song(charlie.song) Committed by GitHub Jun 10, 2020
3 changed files
--- a/examples/pytorch/deepwalk/README.md
+++ b/examples/pytorch/deepwalk/README.md
@@ -12,27 +12,30 @@ The implementation includes multi-processing training with CPU and mixed trainin
 - PyTorch 1.5.0
 - DGL 0.4.3
-## How to run the code
-Format of a network file:
+## Input data
+Currently, we support two built-in datasets: youtube and blog. Use --data\_file youtube to select the youtube dataset and --data\_file blog to select the blog dataset.
+The data is available at https://data.dgl.ai/dataset/DeepWalk/youtube.zip and https://data.dgl.ai/dataset/DeepWalk/blog.zip
+The youtube.zip includes youtube-net.txt, youtube-vocab.txt and youtube-label.txt; the blog.zip includes blog-net.txt, blog-vocab.txt and blog-label.txt.
+For other datasets, pass the full path of the network file to the trainer through --data\_file. The network file should use the following format:
 ```
 1(node id) 2(node id)
 1 3
+1 4
+2 4
 ...
 ```
+## How to run the code
 To run the code:
 ```
-python3 deepwalk.py --net_file net.txt --emb_file emb.txt --adam --mix --lr 0.2 --num_procs 4 --batch_size 100 --negative 5
+python3 deepwalk.py --data_file youtube --output_emb_file emb.txt --adam --mix --lr 0.2 --gpus 0 1 2 3 --batch_size 100 --negative 5
 ```
 ## How to save the embedding
+By default, the trained embedding is saved under --output\_emb\_file FILE\_NAME as a numpy object.
-Functions:
+To save the trained embedding in raw format (txt format), please use the --save\_in\_txt argument.
-```
-SkipGramModel.save_embedding(dataset, file_name)
-SkipGramModel.save_embedding_txt(dataset, file_name)
-```
 ## Evaluation
@@ -60,4 +63,4 @@ Parameters.
 Speeding-up with mixed CPU & multi-GPU. The used parameters are the same as above.
 |  #GPUs   |   1   |   2   |   4   |
 |----------|-------|-------|-------|
 | Time (s) |1419.64| 952.04|428.89 |
\ No newline at end of file
--- a/examples/pytorch/deepwalk/deepwalk.py
+++ b/examples/pytorch/deepwalk/deepwalk.py
@@ -17,14 +17,14 @@ class DeepwalkTrainer:
        """ Initializing the trainer with the input arguments """
        self.args = args
        self.dataset = DeepwalkDataset(
-            net_file=args.net_file,
+            net_file=args.data_file,
            map_file=args.map_file,
            walk_length=args.walk_length,
            window_size=args.window_size,
            num_walks=args.num_walks,
            batch_size=args.batch_size,
            negative=args.negative,
-            num_procs=args.num_procs,
+            gpus=args.gpus,
            fast_neg=args.fast_neg,
            )
        self.emb_size = len(self.dataset.net)
@@ -36,7 +36,6 @@ class DeepwalkTrainer:
        """
        choices = sum([self.args.only_gpu, self.args.only_cpu, self.args.mix])
        assert choices == 1, "Must choose only *one* training mode in [only_cpu, only_gpu, mix]"
-        assert self.args.num_procs >= 1, "The number of process must be larger than 1"
        choices = sum([self.args.sgd, self.args.adam, self.args.avg_sgd])
        assert choices == 1, "Must choose only *one* gradient descent strategy in [sgd, avg_sgd, adam]"
@@ -63,17 +62,21 @@ class DeepwalkTrainer:
        torch.set_num_threads(self.args.num_threads)
        if self.args.only_gpu:
            print("Run in 1 GPU")
-            self.emb_model.all_to_device(0)
+            assert self.args.gpus[0] >= 0
+            self.emb_model.all_to_device(self.args.gpus[0])
        elif self.args.mix:
-            print("Mix CPU with %d GPU" % self.args.num_procs)
+            print("Mix CPU with %d GPU" % len(self.args.gpus))
-            if self.args.num_procs == 1:
+            if len(self.args.gpus) == 1:
-                self.emb_model.set_device(0)
+                assert self.args.gpus[0] >= 0, 'mix CPU with GPU should have abaliable GPU'
+                self.emb_model.set_device(self.args.gpus[0])
        else:
-            print("Run in %d CPU process" % self.args.num_procs)
+            print("Run in CPU process")
+            self.args.gpus = [torch.device('cpu')]
    def train(self):
        """ train the embedding """
-        if self.args.num_procs > 1:
+        if len(self.args.gpus) > 1:
            self.fast_train_mp()
        else:
            self.fast_train()
@@ -86,9 +89,8 @@ class DeepwalkTrainer:
        start_all = time.time()
        ps = []
-        np_ = self.args.num_procs
+        for i in range(len(self.args.gpus)):
-        for i in range(np_):
+            p = mp.Process(target=self.fast_train_sp, args=(self.args.gpus[i],))
-            p = mp.Process(target=self.fast_train_sp, args=(i,))
            ps.append(p)
            p.start()
@@ -96,7 +98,10 @@ class DeepwalkTrainer:
            p.join()
        print("Used time: %.2fs" % (time.time()-start_all))
-        self.emb_model.save_embedding(self.dataset, self.args.emb_file)
+        if self.args.save_in_txt:
+            self.emb_model.save_embedding_txt(self.dataset, self.args.output_emb_file)
+        else:
+            self.emb_model.save_embedding(self.dataset, self.args.output_emb_file)
    @thread_wrapped_func
    def fast_train_sp(self, gpu_id):
@@ -198,14 +203,19 @@ class DeepwalkTrainer:
                        start = time.time()
        print("Training used time: %.2fs" % (time.time()-start_all))
-        self.emb_model.save_embedding(self.dataset, self.args.emb_file)
+        if self.args.save_in_txt:
+            self.emb_model.save_embedding_txt(self.dataset, self.args.output_emb_file)
+        else:
+            self.emb_model.save_embedding(self.dataset, self.args.output_emb_file)
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="DeepWalk")
-    parser.add_argument('--net_file', type=str, 
+    parser.add_argument('--data_file', type=str, 
-            help="path of the txt network file")
+            help="path of the txt network file, builtin dataset include youtube-net and blog-net") 
-    parser.add_argument('--emb_file', type=str, default="emb.npy",
+    parser.add_argument('--save_in_txt', default=False, action="store_true",
-            help='path of the npy embedding file')
+            help='Whether save dat in txt format or npy')
+    parser.add_argument('--output_emb_file', type=str, default="emb.npy",
+            help='path of the output npy embedding file')
    parser.add_argument('--map_file', type=str, default="nodeid_to_index.pickle",
            help='path of the mapping dict that maps node ids to embedding index')
    parser.add_argument('--dim', default=128, type=int, 
@@ -246,11 +256,11 @@ if __name__ == '__main__':
            help="average gradients of sgd for embedding updation")
    parser.add_argument('--num_threads', default=2, type=int, 
            help="number of threads used for each CPU-core/GPU")
-    parser.add_argument('--num_procs', default=1, type=int, 
+    parser.add_argument('--gpus', type=int, default=[-1], nargs='+', 
-            help="number of GPUs/CPUs when mixed training")
+            help='a list of active gpu ids, e.g. 0')
    args = parser.parse_args()
    start_time = time.time()
    trainer = DeepwalkTrainer(args)
    trainer.train()
    print("Total used time: %.2f" % (time.time() - start_time))
\ No newline at end of file
--- a/examples/pytorch/deepwalk/reading_data.py
+++ b/examples/pytorch/deepwalk/reading_data.py
+import os
 import numpy as np
 import scipy.sparse as sp
 import pickle
 import torch
 from torch.utils.data import DataLoader
+from dgl.data.utils import download, _get_dgl_url, get_download_dir, extract_archive
 import random
 import time
 import dgl
 from utils import shuffle_walks
-np.random.seed(3141592653)
+#np.random.seed(3141592653)
 def ReadTxtNet(file_path="", undirected=True):
    """ Read the txt network file. 
@@ -24,6 +26,15 @@ def ReadTxtNet(file_path="", undirected=True):
    node2id dict : a dict mapping the nodes to their embedding indices 
    id2node dict : a dict mapping nodes embedding indices to the nodes
    """
+    if file_path == 'youtube' or file_path == 'blog':
+        name = file_path
+        dir = get_download_dir()
+        zip_file_path='{}/{}.zip'.format(dir, name)
+        download(_get_dgl_url(os.path.join('dataset/DeepWalk/', '{}.zip'.format(file_path))), path=zip_file_path)
+        extract_archive(zip_file_path,
+                        '{}/{}'.format(dir, name))
+        file_path = "{}/{}/{}-net.txt".format(dir, name, name)
    node2id = {}
    id2node = {}
    cid = 0
@@ -97,7 +108,7 @@ class DeepwalkDataset:
            num_walks=10,
            batch_size=32,
            negative=5,
-            num_procs=4,
+            gpus=[0],
            fast_neg=True,
            ):
        """ This class has the following functions:
@@ -121,7 +132,7 @@ class DeepwalkDataset:
        self.num_walks = num_walks
        self.batch_size = batch_size
        self.negative = negative
-        self.num_procs = num_procs
+        self.num_procs = len(gpus)
        self.fast_neg = fast_neg
        self.net, self.node2id, self.id2node, self.sm = ReadTxtNet(net_file)
        self.save_mapping(map_file)
@@ -175,4 +186,4 @@ class DeepwalkSampler(object):
    def sample(self, seeds):
        walks = dgl.contrib.sampling.random_walk(self.G, seeds, 
            1, self.walk_length-1)
        return walks
\ No newline at end of file