OpenDAS / dgl · commit f8184153 (unverified)

Authored Dec 28, 2019 by Da Zheng; committed by GitHub on Dec 28, 2019.

[KG] reduce memory consumption. (#902)

* reduce memory consumption.
* fix a bug.
* fix a bug.
* fix.

Parent: 655d7568
Changes: 2 changed files with 65 additions and 47 deletions (+65 -47)

  apps/kg/dataloader/KGDataset.py    +22  -7
  apps/kg/dataloader/sampler.py      +43  -40
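The change both files share: each dataset split is now held as three parallel int64 NumPy arrays (heads, rels, tails) instead of a Python list of (h, r, t) tuples. A rough, self-contained sketch of why the array layout is smaller on CPython (sizes are illustrative, not measurements from this commit):

    import sys
    import numpy as np

    n = 1000000
    # Old layout: a list of tuples. Each tuple is a heap object with its own
    # header plus a pointer per field, on top of the boxed ints themselves.
    triples = [(i, i % 100, i + 1) for i in range(n)]
    list_bytes = sys.getsizeof(triples) + sum(sys.getsizeof(t) for t in triples)

    # New layout: three flat int64 buffers, exactly 8 bytes per element.
    heads = np.fromiter((t[0] for t in triples), dtype=np.int64, count=n)
    rels = np.fromiter((t[1] for t in triples), dtype=np.int64, count=n)
    tails = np.fromiter((t[2] for t in triples), dtype=np.int64, count=n)
    array_bytes = heads.nbytes + rels.nbytes + tails.nbytes

    # On CPython this prints roughly 72 MB vs 24 MB, and the tuple figure
    # still excludes the per-int objects.
    print('tuples: ~%d MB' % (list_bytes // 2**20))
    print('arrays:  %d MB' % (array_bytes // 2**20))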
apps/kg/dataloader/KGDataset.py

 import os
+import numpy as np

 def _download_and_extract(url, path, filename):
     import shutil, zipfile
...
@@ -71,13 +72,20 @@ class KGDataset1:
     def read_triple(self, path, mode):
         # mode: train/valid/test
-        triples = []
+        heads = []
+        tails = []
+        rels = []
         with open(os.path.join(path, '{}.txt'.format(mode))) as f:
             for line in f:
                 h, r, t = line.strip().split('\t')
-                triples.append((self.entity2id[h], self.relation2id[r], self.entity2id[t]))
+                heads.append(self.entity2id[h])
+                rels.append(self.relation2id[r])
+                tails.append(self.entity2id[t])
+        heads = np.array(heads, dtype=np.int64)
+        tails = np.array(tails, dtype=np.int64)
+        rels = np.array(rels, dtype=np.int64)
-        return triples
+        return (heads, rels, tails)

 class KGDataset2:
...
@@ -115,16 +123,23 @@ class KGDataset2:
         self.test = self.read_triple(self.path, 'test')

     def read_triple(self, path, mode, skip_first_line=False):
-        triples = []
+        heads = []
+        tails = []
+        rels = []
         print('Reading {} triples....'.format(mode))
         with open(os.path.join(path, '{}.txt'.format(mode))) as f:
             if skip_first_line:
                 _ = f.readline()
             for line in f:
                 h, t, r = line.strip().split('\t')
-                triples.append((int(h), int(r), int(t)))
-        print('Finished. Read {} {} triples.'.format(len(triples), mode))
-        return triples
+                heads.append(int(h))
+                tails.append(int(t))
+                rels.append(int(r))
+        heads = np.array(heads, dtype=np.int64)
+        tails = np.array(tails, dtype=np.int64)
+        rels = np.array(rels, dtype=np.int64)
+        print('Finished. Read {} {} triples.'.format(len(heads), mode))
+        return (heads, rels, tails)

 def get_dataset(data_path, data_name, format_str):
...
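After this change a split is consumed by tuple-unpacking rather than by iterating over triples. A hypothetical caller (the dataset object and path are assumed):

    # The i-th triple is now (heads[i], rels[i], tails[i]).
    heads, rels, tails = dataset.read_triple(dataset.path, 'train')
    num_train = len(heads)   # replaces len(triples)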
apps/kg/dataloader/sampler.py

...
@@ -11,9 +11,9 @@ import time
 # This partitions a list of edges based on relations to make sure
 # each partition has roughly the same number of edges and relations.
 def RelationPartition(edges, n):
-    print('relation partition {} edges into {} parts'.format(len(edges), n))
-    rel = np.array([r for h, r, t in edges])
-    uniq, cnts = np.unique(rel, return_counts=True)
+    heads, rels, tails = edges
+    print('relation partition {} edges into {} parts'.format(len(heads), n))
+    uniq, cnts = np.unique(rels, return_counts=True)
     idx = np.flip(np.argsort(cnts))
     cnts = cnts[idx]
     uniq = uniq[idx]
...
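The per-relation counting now runs directly on the shared rels array, so no intermediate Python list of relation ids is materialized by a comprehension. A minimal sketch of that counting step on toy data:

    import numpy as np

    rels = np.array([2, 0, 2, 1, 2, 0], dtype=np.int64)
    uniq, cnts = np.unique(rels, return_counts=True)   # uniq=[0 1 2], cnts=[2 1 3]
    idx = np.flip(np.argsort(cnts))                    # most frequent relation first
    uniq, cnts = uniq[idx], cnts[idx]                  # uniq=[2 0 1], cnts=[3 2 1]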
@@ -30,35 +30,39 @@ def RelationPartition(edges, n):
         rel_cnts[idx] += 1
     for i, edge_cnt in enumerate(edge_cnts):
         print('part {} has {} edges and {} relations'.format(i, edge_cnt, rel_cnts[i]))
     parts = []
     for _ in range(n):
         parts.append([])
-    for h, r, t in edges:
-        idx = rel_dict[r]
-        parts[idx].append((h, r, t))
+    # let's store the edge index to each partition first.
+    for i, r in enumerate(rels):
+        part_idx = rel_dict[r]
+        parts[part_idx].append(i)
+    for i, part in enumerate(parts):
+        parts[i] = np.array(part, dtype=np.int64)
     return parts

 def RandomPartition(edges, n):
-    print('random partition {} edges into {} parts'.format(len(edges), n))
-    idx = np.random.permutation(len(edges))
+    heads, rels, tails = edges
+    print('random partition {} edges into {} parts'.format(len(heads), n))
+    idx = np.random.permutation(len(heads))
     part_size = int(math.ceil(len(idx) / n))
     parts = []
     for i in range(n):
         start = part_size * i
         end = min(part_size * (i + 1), len(idx))
-        parts.append([edges[i] for i in idx[start:end]])
+        parts.append(idx[start:end])
         print('part {} has {} edges'.format(i, len(parts[-1])))
     return parts

-def ConstructGraph(edges, n_entities, i, args):
-    pickle_name = 'graph_train_{}.pickle'.format(i)
+def ConstructGraph(edges, n_entities, args):
+    pickle_name = 'graph_train.pickle'
     if args.pickle_graph and os.path.exists(os.path.join(args.data_path, args.dataset, pickle_name)):
         with open(os.path.join(args.data_path, args.dataset, pickle_name), 'rb') as graph_file:
             g = pickle.load(graph_file)
             print('Load pickled graph.')
     else:
-        src = [t[0] for t in edges]
-        etype_id = [t[1] for t in edges]
-        dst = [t[2] for t in edges]
+        src, etype_id, dst = edges
         coo = sp.sparse.coo_matrix((np.ones(len(src)), (src, dst)), shape=[n_entities, n_entities])
         g = dgl.DGLGraph(coo, readonly=True, sort_csr=True)
         g.ndata['id'] = F.arange(0, g.number_of_nodes())
...
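Both partitioners now return arrays of edge indices into the shared (heads, rels, tails) arrays instead of per-partition copies of the edges; a worker recovers its edges by fancy indexing. A toy sketch (data invented for illustration):

    import numpy as np

    heads = np.array([0, 1, 2, 3], dtype=np.int64)
    rels = np.array([5, 5, 7, 7], dtype=np.int64)
    tails = np.array([1, 2, 3, 0], dtype=np.int64)

    # What RelationPartition would hand back for n=2 on this toy data:
    parts = [np.array([0, 1], dtype=np.int64),   # indices of the relation-5 edges
             np.array([2, 3], dtype=np.int64)]   # indices of the relation-7 edges

    # Reconstructing a partition's edges copies only that partition's rows;
    # the full triple list is never duplicated per worker.
    p = parts[0]
    part_heads, part_rels, part_tails = heads[p], rels[p], tails[p]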
@@ -71,26 +75,23 @@ def ConstructGraph(edges, n_entities, i, args):
 class TrainDataset(object):
     def __init__(self, dataset, args, weighting=False, ranks=64):
         triples = dataset.train
-        print('|Train|:', len(triples))
+        self.g = ConstructGraph(triples, dataset.n_entities, args)
+        num_train = len(triples[0])
+        print('|Train|:', num_train)
         if ranks > 1 and args.rel_part:
-            triples_list = RelationPartition(triples, ranks)
+            self.edge_parts = RelationPartition(triples, ranks)
         elif ranks > 1:
-            triples_list = RandomPartition(triples, ranks)
+            self.edge_parts = RandomPartition(triples, ranks)
         else:
-            triples_list = [triples]
-        self.graphs = []
-        for i, triples in enumerate(triples_list):
-            g = ConstructGraph(triples, dataset.n_entities, i, args)
-            if weighting:
-                # TODO: weight to be added
-                count = self.count_freq(triples)
-                subsampling_weight = np.vectorize(
-                    lambda h, r, t: np.sqrt(1 / (count[(h, r)] + count[(t, -r - 1)]))
-                )
-                weight = subsampling_weight(src, etype_id, dst)
-                g.edata['weight'] = F.zerocopy_from_numpy(weight)
-                # to be added
-            self.graphs.append(g)
+            self.edge_parts = [np.arange(num_train)]
+        if weighting:
+            # TODO: weight to be added
+            count = self.count_freq(triples)
+            subsampling_weight = np.vectorize(
+                lambda h, r, t: np.sqrt(1 / (count[(h, r)] + count[(t, -r - 1)]))
+            )
+            weight = subsampling_weight(src, etype_id, dst)
+            self.g.edata['weight'] = F.zerocopy_from_numpy(weight)

     def count_freq(self, triples, start=4):
         count = {}
...
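TrainDataset now builds one shared graph and keeps only per-rank index arrays, so memory no longer grows with the number of partitions. The weighting branch (still disabled by default and marked TODO) computes per-edge subsampling weights sqrt(1 / (count(h, r) + count(t, -r-1))); a toy illustration of that formula, with an invented two-entry frequency table standing in for what count_freq would build:

    import numpy as np

    # count_freq keys its dict by (head, rel) and (tail, -rel-1); here we
    # fake one entry of each kind, both at the floor count of 4 (start=4).
    count = {(0, 5): 4, (1, -6): 4}

    subsampling_weight = np.vectorize(
        lambda h, r, t: np.sqrt(1 / (count[(h, r)] + count[(t, -r - 1)])))

    src, etype_id, dst = np.array([0]), np.array([5]), np.array([1])
    print(subsampling_weight(src, etype_id, dst))   # [0.35355339] == 1/sqrt(8)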
@@ -109,7 +110,8 @@ class TrainDataset(object):
     def create_sampler(self, batch_size, neg_sample_size=2, mode='head', num_workers=5,
                        shuffle=True, exclude_positive=False, rank=0):
         EdgeSampler = getattr(dgl.contrib.sampling, 'EdgeSampler')
-        return EdgeSampler(self.graphs[rank],
+        return EdgeSampler(self.g,
+                           seed_edges=F.tensor(self.edge_parts[rank]),
                            batch_size=batch_size,
                            neg_sample_size=neg_sample_size,
                            negative_mode=mode,
...
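Each worker now samples from the one shared graph, restricted to its own partition via seed_edges. A hedged usage sketch: the setup of dataset and args is assumed from elsewhere in apps/kg, and the positive/negative iteration pattern follows how the training loop consumes the sampler rather than anything shown in this diff:

    # Worker `rank` draws batches only from edge_parts[rank] of the shared graph.
    train_data = TrainDataset(dataset, args, ranks=4)
    sampler = train_data.create_sampler(batch_size=1024,
                                        neg_sample_size=256,
                                        mode='head',
                                        rank=2)
    for pos_g, neg_g in sampler:   # positive and negative edge subgraphs
        break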
@@ -118,6 +120,7 @@ class TrainDataset(object):
                            exclude_positive=exclude_positive,
                            return_false_neg=False)

 class PBGNegEdgeSubgraph(dgl.subgraph.DGLSubGraph):
     def __init__(self, subg, num_chunks, chunk_size, neg_sample_size, neg_head):
...
class
EvalDataset
(
object
):
def
__init__
(
self
,
dataset
,
args
):
triples
=
dataset
.
train
+
dataset
.
valid
+
dataset
.
test
pickle_name
=
'graph_all.pickle'
if
args
.
pickle_graph
and
os
.
path
.
exists
(
os
.
path
.
join
(
args
.
data_path
,
args
.
dataset
,
pickle_name
)):
with
open
(
os
.
path
.
join
(
args
.
data_path
,
args
.
dataset
,
pickle_name
),
'rb'
)
as
graph_file
:
g
=
pickle
.
load
(
graph_file
)
print
(
'Load pickled graph.'
)
else
:
src
=
[
t
[
0
]
for
t
in
triples
]
etype_id
=
[
t
[
1
]
for
t
in
triples
]
dst
=
[
t
[
2
]
for
t
in
triples
]
coo
=
sp
.
sparse
.
coo_matrix
((
np
.
ones
(
len
(
src
)),
(
src
,
dst
)),
shape
=
[
dataset
.
n_entities
,
dataset
.
n_entities
])
src
=
np
.
concatenate
((
dataset
.
train
[
0
],
dataset
.
valid
[
0
],
dataset
.
test
[
0
]))
etype_id
=
np
.
concatenate
((
dataset
.
train
[
1
],
dataset
.
valid
[
1
],
dataset
.
test
[
1
]))
dst
=
np
.
concatenate
((
dataset
.
train
[
2
],
dataset
.
valid
[
2
],
dataset
.
test
[
2
]))
coo
=
sp
.
sparse
.
coo_matrix
((
np
.
ones
(
len
(
src
)),
(
src
,
dst
)),
shape
=
[
dataset
.
n_entities
,
dataset
.
n_entities
])
g
=
dgl
.
DGLGraph
(
coo
,
readonly
=
True
,
sort_csr
=
True
)
g
.
ndata
[
'id'
]
=
F
.
arange
(
0
,
g
.
number_of_nodes
())
g
.
edata
[
'id'
]
=
F
.
tensor
(
etype_id
,
F
.
int64
)
...
...
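Since each split is already a tuple of arrays, the evaluation graph's edge list is assembled with three np.concatenate calls instead of the old `train + valid + test` list addition, which copied every tuple reference into a fresh list. On toy arrays:

    import numpy as np

    # Toy splits in the new (heads, rels, tails) layout.
    train = (np.array([0, 1]), np.array([9, 9]), np.array([1, 2]))
    valid = (np.array([3]), np.array([8]), np.array([0]))
    test = (np.array([4]), np.array([8]), np.array([1]))

    src = np.concatenate((train[0], valid[0], test[0]))        # all heads
    etype_id = np.concatenate((train[1], valid[1], test[1]))   # all relation ids
    dst = np.concatenate((train[2], valid[2], test[2]))        # all tails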
@@ -222,9 +225,9 @@ class EvalDataset(object):
             pickle.dump(g, graph_file)
         self.g = g

-        self.num_train = len(dataset.train)
-        self.num_valid = len(dataset.valid)
-        self.num_test = len(dataset.test)
+        self.num_train = len(dataset.train[0])
+        self.num_valid = len(dataset.valid[0])
+        self.num_test = len(dataset.test[0])

         if args.eval_percent < 1:
             self.valid = np.random.randint(0, self.num_valid,
...