[KG][Optimization] Soft relation partition (#1252)

* Several optimizations on DGL-KG: 1. Sorted positive edges for sampling which can reduce random memory access during positive sampling 2. Asynchronous node embedding update 3. Balanced Relation Partition that gives balanced number of edges in each partition. When there is no cross partition relation, relation embedding can be pin into GPU memory 4. tunable neg_sample_size instead of fixed neg_sample_size * Fix test * Fix test and eval.py * Now TransR is OK * Fix single GPU with mix_cpu_gpu * Add app tests * Fix test script * fix mxnet * Fix sample * Add docstrings * Fix * Default value for num_workers * Add soft relation part * Upd * Some fix * upd * Now work * Fix TransR * Fix eval and add some doc string * triger * upd * Add some training scripts for freebase multi-gpu * upd * upd * upd

[KG][Optimization] Soft relation partition (#1252)
* Several optimizations on DGL-KG: 1. Sorted positive edges for sampling which can reduce random memory access during positive sampling 2. Asynchronous node embedding update 3. Balanced Relation Partition that gives balanced number of edges in each partition. When there is no cross partition relation, relation embedding can be pin into GPU memory 4. tunable neg_sample_size instead of fixed neg_sample_size * Fix test * Fix test and eval.py * Now TransR is OK * Fix single GPU with mix_cpu_gpu * Add app tests * Fix test script * fix mxnet * Fix sample * Add docstrings * Fix * Default value for num_workers * Add soft relation part * Upd * Some fix * upd * Now work * Fix TransR * Fix eval and add some doc string * triger * upd * Add some training scripts for freebase multi-gpu * upd * upd * upd
49fe5b3c · xiang song(charlie.song) · GitHub · 7a80faf1 · 49fe5b3c · 49fe5b3c
Unverified Commit 49fe5b3c authored Feb 16, 2020 by xiang song(charlie.song) Committed by GitHub Feb 15, 2020
8 changed files
--- a/apps/kg/config/best_config.sh
+++ b/apps/kg/config/best_config.sh
@@ -104,3 +104,20 @@ DGLBACKEND=pytorch python3 train.py --model ComplEx --dataset Freebase --batch_s
    --neg_sample_size 256 --hidden_dim 400 --gamma 500.0     --lr 0.1 --max_step 50000 \
    --batch_size_eval 128 --test -adv --eval_interval 300000 \
    --neg_sample_size_test 100000 --eval_percent 0.02 --num_proc 64
+
+# Freebase multi-gpu
+# TransE_l2 8 GPU
+DGLBACKEND=pytorch python3 train.py --model TransE_l2 --dataset Freebase --batch_size 1024 \
+    --neg_sample_size 256 --hidden_dim 400 --gamma 10 --lr 0.1 --batch_size_eval 1000 \
+    --valid --test -adv --mix_cpu_gpu --neg_deg_sample_eval --neg_sample_size_test 1000 \
+    --num_proc 8 --gpu 0 1 2 3 4 5 6 7 --num_worker 4 --regularization_coef 1e-9 \
+    --no_eval_filter --max_step 400000 --rel_part --eval_interval 100000 --log_interval 10000 \
+    --no_eval_filter --async_update --neg_deg_sample --force_sync_interval 1000
+
+# TransE_l2 16 GPU
+DGLBACKEND=pytorch python3 train.py --model TransE_l2 --dataset Freebase --batch_size 1024 \
+    --neg_sample_size 256 --hidden_dim 400 --gamma 10 --lr 0.1 --batch_size_eval 1000 \
+    --valid --test -adv --mix_cpu_gpu --neg_deg_sample_eval --neg_sample_size_test 1000 \
+    --num_proc 16 --gpu 0 1 2 3 4 5 6 7 --num_worker 4 --regularization_coef 1e-9 \
+    --no_eval_filter --max_step 200000 --soft_rel_part --eval_interval 100000 --log_interval 10000 \
+    --no_eval_filter --async_update --neg_deg_sample --force_sync_interval 1000
--- a/apps/kg/dataloader/sampler.py
+++ b/apps/kg/dataloader/sampler.py
@@ -8,6 +8,119 @@ import sys
 import pickle
 import time

+def SoftRelationPartition(edges, n, threshold=0.05):
+    """This partitions a list of edges to n partitions according to their
+    relation types. For any relation with number of edges larger than the
+    threshold, its edges will be evenly distributed into all partitions.
+    For any relation with number of edges smaller than the threshold, its
+    edges will be put into one single partition.
+
+    Algo:
+    For r in relations:
+        if r.size() > threadold
+            Evenly divide edges of r into n parts and put into each relation.
+        else
+            Find partition with fewest edges, and put edges of r into 
+            this partition.
+
+    Parameters
+    ----------
+    edges : (heads, rels, tails) triple
+        Edge list to partition
+    n : int
+        Number of partitions
+    threshold : float
+        The threshold of whether a relation is LARGE or SMALL
+        Default: 5%
+
+    Returns
+    -------
+    List of np.array
+        Edges of each partition
+    List of np.array
+        Edge types of each partition
+    bool
+        Whether there exists some relations belongs to multiple partitions
+    """
+    heads, rels, tails = edges
+    print('relation partition {} edges into {} parts'.format(len(heads), n))
+    uniq, cnts = np.unique(rels, return_counts=True)
+    idx = np.flip(np.argsort(cnts))
+    cnts = cnts[idx]
+    uniq = uniq[idx]
+    assert cnts[0] > cnts[-1]
+    edge_cnts = np.zeros(shape=(n,), dtype=np.int64)
+    rel_cnts = np.zeros(shape=(n,), dtype=np.int64)
+    rel_dict = {}
+    rel_parts = []
+    cross_rel_part = []
+    for _ in range(n):
+        rel_parts.append([])
+
+    large_threshold = int(len(rels) * threshold)
+    capacity_per_partition = int(len(rels) / n)
+    # ensure any relation larger than the partition capacity will be split
+    large_threshold = capacity_per_partition if capacity_per_partition < large_threshold \
+                      else large_threshold
+    num_cross_part = 0
+    for i in range(len(cnts)):
+        cnt = cnts[i]
+        r = uniq[i]
+        r_parts = []
+        if cnt > large_threshold:
+            avg_part_cnt = (cnt // n) + 1
+            num_cross_part += 1
+            for j in range(n):
+                part_cnt = avg_part_cnt if cnt > avg_part_cnt else cnt
+                r_parts.append([j, part_cnt])
+                rel_parts[j].append(r)
+                edge_cnts[j] += part_cnt
+                rel_cnts[j] += 1
+                cnt -= part_cnt
+            cross_rel_part.append(r)
+        else:
+            idx = np.argmin(edge_cnts)
+            r_parts.append([idx, cnt])
+            rel_parts[idx].append(r)
+            edge_cnts[idx] += cnt
+            rel_cnts[idx] += 1
+        rel_dict[r] = r_parts
+
+    for i, edge_cnt in enumerate(edge_cnts):
+        print('part {} has {} edges and {} relations'.format(i, edge_cnt, rel_cnts[i]))
+    print('{}/{} duplicated relation across partitions'.format(num_cross_part, len(cnts)))
+
+    parts = []
+    for i in range(n):
+        parts.append([])
+        rel_parts[i] = np.array(rel_parts[i])
+
+    for i, r in enumerate(rels):
+        r_part = rel_dict[r][0]
+        part_idx = r_part[0]
+        cnt = r_part[1]
+        parts[part_idx].append(i)
+        cnt -= 1
+        if cnt == 0:
+            rel_dict[r].pop(0)
+        else:
+            rel_dict[r][0][1] = cnt
+
+    for i, part in enumerate(parts):
+        parts[i] = np.array(part, dtype=np.int64)
+    shuffle_idx = np.concatenate(parts)
+    heads[:] = heads[shuffle_idx]
+    rels[:] = rels[shuffle_idx]
+    tails[:] = tails[shuffle_idx]
+
+    off = 0
+    for i, part in enumerate(parts):
+        parts[i] = np.arange(off, off + len(part))
+        off += len(part)
+    cross_rel_part = np.array(cross_rel_part)
+
+    return parts, rel_parts, num_cross_part > 0, cross_rel_part
+
 def BalancedRelationPartition(edges, n):
    """This partitions a list of edges based on relations to make sure
    each partition has roughly the same number of edges and relations.
@@ -184,7 +297,10 @@ class TrainDataset(object):
        num_train = len(triples[0])
        print('|Train|:', num_train)

-        if ranks > 1 and args.rel_part:
+        if ranks > 1 and args.soft_rel_part:
+            self.edge_parts, self.rel_parts, self.cross_part, self.cross_rels = \
+            SoftRelationPartition(triples, ranks)
+        elif ranks > 1 and args.rel_part:
            self.edge_parts, self.rel_parts, self.cross_part = \
                BalancedRelationPartition(triples, ranks)
        elif ranks > 1:

--- a/apps/kg/eval.py
+++ b/apps/kg/eval.py
@@ -101,6 +101,7 @@ def main(args):
    args.valid = False
    args.test = True
    args.strict_rel_part = False
+    args.soft_rel_part = False
    args.async_update = False
    args.batch_size_eval = args.batch_size


--- a/apps/kg/models/general_models.py
+++ b/apps/kg/models/general_models.py
@@ -82,7 +82,8 @@ class KEModel(object):
        self.rel_dim = rel_dim
        self.entity_dim = entity_dim
        self.strict_rel_part = args.strict_rel_part
-        if not self.strict_rel_part:
+        self.soft_rel_part = args.soft_rel_part
+        if not self.strict_rel_part and not self.soft_rel_part:
            self.relation_emb = ExternalEmbedding(args, n_relations, rel_dim,
                                                  F.cpu() if args.mix_cpu_gpu else device)
        else:
@@ -120,7 +121,7 @@ class KEModel(object):
        """Use torch.tensor.share_memory_() to allow cross process embeddings access.
        """
        self.entity_emb.share_memory()
-        if self.strict_rel_part:
+        if self.strict_rel_part or self.soft_rel_part:
            self.global_relation_emb.share_memory()
        else:
            self.relation_emb.share_memory()
@@ -139,7 +140,7 @@ class KEModel(object):
            Dataset name as prefix to the saved embeddings.
        """
        self.entity_emb.save(path, dataset+'_'+self.model_name+'_entity')
-        if self.strict_rel_part:
+        if self.strict_rel_part or self.soft_rel_part:
            self.global_relation_emb.save(path, dataset+'_'+self.model_name+'_relation')
        else:
            self.relation_emb.save(path, dataset+'_'+self.model_name+'_relation')   
@@ -165,8 +166,10 @@ class KEModel(object):
        """
        self.entity_emb.init(self.emb_init)
        self.score_func.reset_parameters()
-        if not self.strict_rel_part:
+        if (not self.strict_rel_part) and (not self.soft_rel_part):
            self.relation_emb.init(self.emb_init)
+        else:
+            self.global_relation_emb.init(self.emb_init)

    def predict_score(self, g):
        """Predict the positive score.
@@ -415,6 +418,11 @@ class KEModel(object):
            self.score_func.prepare_local_emb(local_projection_emb)
            self.score_func.reset_parameters()

+    def prepare_cross_rels(self, cross_rels):
+        self.relation_emb.setup_cross_rels(cross_rels, self.global_relation_emb)
+        if self.model_name == 'TransR':
+            self.score_func.prepare_cross_rels(cross_rels)
+
    def writeback_relation(self, rank=0, rel_parts=None):
        """ Writeback relation embeddings in a specific process to global relation embedding.
        Used in multi-process multi-gpu training model.
@@ -425,6 +433,8 @@ class KEModel(object):
            List of tensor stroing edge types of each partition.
        """
        idx = rel_parts[rank]
+        if self.soft_rel_part:
+            idx = self.relation_emb.get_noncross_idx(idx)
        self.global_relation_emb.emb[idx] = F.copy_to(self.relation_emb.emb, F.cpu())[idx]
        if self.model_name == 'TransR':
            self.score_func.writeback_local_emb(idx)

--- a/apps/kg/models/pytorch/score_fun.py
+++ b/apps/kg/models/pytorch/score_fun.py
@@ -157,6 +157,9 @@ class TransRScore(nn.Module):
        self.global_projection_emb = self.projection_emb
        self.projection_emb = projection_emb

+    def prepare_cross_rels(self, cross_rels):
+        self.projection_emb.setup_cross_rels(cross_rels, self.global_projection_emb)
+
    def writeback_local_emb(self, idx):
        self.global_projection_emb.emb[idx] = self.projection_emb.emb.cpu()[idx]


--- a/apps/kg/models/pytorch/tensor_models.py
+++ b/apps/kg/models/pytorch/tensor_models.py
@@ -117,11 +117,13 @@ class ExternalEmbedding:
    def __init__(self, args, num, dim, device):
        self.gpu = args.gpu
        self.args = args
+        self.num = num
        self.trace = []

        self.emb = th.empty(num, dim, dtype=th.float32, device=device)
        self.state_sum = self.emb.new().resize_(self.emb.size(0)).zero_()
        self.state_step = 0
+        self.has_cross_rel = False
        # queue used by asynchronous update
        self.async_q = None
        # asynchronous update process
@@ -138,6 +140,19 @@ class ExternalEmbedding:
        INIT.uniform_(self.emb, -emb_init, emb_init)
        INIT.zeros_(self.state_sum)

+    def setup_cross_rels(self, cross_rels, global_emb):
+        cpu_bitmap = th.zeros((self.num,), dtype=th.bool)
+        for i, rel in enumerate(cross_rels):
+            cpu_bitmap[rel] = 1
+        self.cpu_bitmap = cpu_bitmap
+        self.has_cross_rel = True
+        self.global_emb = global_emb
+
+    def get_noncross_idx(self, idx):
+        cpu_mask = self.cpu_bitmap[idx]
+        gpu_mask = ~cpu_mask
+        return idx[gpu_mask]
+
    def share_memory(self):
        """Use torch.tensor.share_memory_() to allow cross process tensor access
        """
@@ -158,6 +173,14 @@ class ExternalEmbedding:
            If False, do not trace the computation.
            Default: True
        """
+        if self.has_cross_rel:
+            cpu_idx = idx.cpu()
+            cpu_mask = self.cpu_bitmap[cpu_idx]
+            cpu_idx = cpu_idx[cpu_mask]
+            cpu_idx = th.unique(cpu_idx)
+            if cpu_idx.shape[0] != 0:
+                cpu_emb = self.global_emb.emb[cpu_idx]
+                self.emb[cpu_idx] = cpu_emb.cuda(gpu_id)
        s = self.emb[idx]
        if gpu_id >= 0:
            s = s.cuda(gpu_id)
@@ -202,6 +225,22 @@ class ExternalEmbedding:
                        grad_indices = grad_indices.to(device)
                    if device != grad_sum.device:
                        grad_sum = grad_sum.to(device)
+
+                    if self.has_cross_rel:
+                        cpu_mask = self.cpu_bitmap[grad_indices]
+                        cpu_idx = grad_indices[cpu_mask]
+                        if cpu_idx.shape[0] > 0:
+                            cpu_grad = grad_values[cpu_mask]
+                            cpu_sum = grad_sum[cpu_mask].cpu()
+                            cpu_idx = cpu_idx.cpu()
+                            self.global_emb.state_sum.index_add_(0, cpu_idx, cpu_sum)
+                            std = self.global_emb.state_sum[cpu_idx]
+                            if gpu_id >= 0:
+                                std = std.cuda(gpu_id)
+                            std_values = std.sqrt_().add_(1e-10).unsqueeze(1)
+                            tmp = (-clr * cpu_grad / std_values)
+                            tmp = tmp.cpu()
+                            self.global_emb.emb.index_add_(0, cpu_idx, tmp)
                    self.state_sum.index_add_(0, grad_indices, grad_sum)
                    std = self.state_sum[grad_indices]  # _sparse_mask
                    if gpu_id >= 0:

--- a/apps/kg/train.py
+++ b/apps/kg/train.py
@@ -112,6 +112,8 @@ class ArgParser(argparse.ArgumentParser):
                          help='number of process used')
        self.add_argument('--rel_part', action='store_true',
                          help='enable relation partitioning')
+        self.add_argument('--soft_rel_part', action='store_true',
+                          help='enable soft relation partition')
        self.add_argument('--nomp_thread_per_process', type=int, default=-1,
                          help='num of omp threads used per process in multi-process training')
        self.add_argument('--async_update', action='store_true',
@@ -170,7 +172,9 @@ def run(args, logger):

    num_workers = args.num_worker
    train_data = TrainDataset(dataset, args, ranks=args.num_proc)
+    # if there is no cross partition relaiton, we fall back to strict_rel_part
    args.strict_rel_part = args.mix_cpu_gpu and (train_data.cross_part == False)
+    args.soft_rel_part = args.mix_cpu_gpu and args.soft_rel_part and train_data.cross_part

    # Automatically set number of OMP threads for each process if it is not provided
    # The value for GPU is evaluated in AWS p3.16xlarge
@@ -322,7 +326,8 @@ def run(args, logger):

    # train
    start = time.time()
-    rel_parts = train_data.rel_parts if args.strict_rel_part else None
+    rel_parts = train_data.rel_parts if args.strict_rel_part or args.soft_rel_part else None
+    cross_rels = train_data.cross_rels if args.soft_rel_part else None
    if args.num_proc > 1:
        procs = []
        barrier = mp.Barrier(args.num_proc)
@@ -334,6 +339,7 @@ def run(args, logger):
                                                     valid_sampler,
                                                     i,
                                                     rel_parts,
+                                                     cross_rels,
                                                     barrier))
            procs.append(proc)
            proc.start()

--- a/apps/kg/train_pytorch.py
+++ b/apps/kg/train_pytorch.py
@@ -29,7 +29,7 @@ def load_model_from_checkpoint(logger, args, n_entities, n_relations, ckpt_path)
    model.load_emb(ckpt_path, args.dataset)
    return model

-def train(args, model, train_sampler, valid_samplers=None, rank=0, rel_parts=None, barrier=None):
+def train(args, model, train_sampler, valid_samplers=None, rank=0, rel_parts=None, cross_rels=None, barrier=None):
    logs = []
    for arg in vars(args):
        logging.info('{:20}:{}'.format(arg, getattr(args, arg)))
@@ -41,8 +41,10 @@ def train(args, model, train_sampler, valid_samplers=None, rank=0, rel_parts=Non

    if args.async_update:
        model.create_async_update()
-    if args.strict_rel_part:
+    if args.strict_rel_part or args.soft_rel_part:
        model.prepare_relation(th.device('cuda:' + str(gpu_id)))
+    if args.soft_rel_part:
+        model.prepare_cross_rels(cross_rels)

    start = time.time()
    sample_time = 0
@@ -90,20 +92,22 @@ def train(args, model, train_sampler, valid_samplers=None, rank=0, rel_parts=Non

        if args.valid and (step + 1) % args.eval_interval == 0 and step > 1 and valid_samplers is not None:
            valid_start = time.time()
-            if args.strict_rel_part:
+            if args.strict_rel_part or args.soft_rel_part:
                model.writeback_relation(rank, rel_parts)
            # forced sync for validation
            if barrier is not None:
                barrier.wait()
            test(args, model, valid_samplers, rank, mode='Valid')
            print('test:', time.time() - valid_start)
+            if args.soft_rel_part:
+                model.prepare_cross_rels(cross_rels)
            if barrier is not None:
                barrier.wait()

    print('train {} takes {:.3f} seconds'.format(rank, time.time() - start))
    if args.async_update:
        model.finish_async_update()
-    if args.strict_rel_part:
+    if args.strict_rel_part or args.soft_rel_part:
        model.writeback_relation(rank, rel_parts)

 def test(args, model, test_samplers, rank=0, mode='Test', queue=None):
@@ -112,7 +116,7 @@ def test(args, model, test_samplers, rank=0, mode='Test', queue=None):
    else:
        gpu_id = -1

-    if args.strict_rel_part:
+    if args.strict_rel_part or args.soft_rel_part:
        model.load_relation(th.device('cuda:' + str(gpu_id)))

    with th.no_grad():
@@ -135,10 +139,10 @@ def test(args, model, test_samplers, rank=0, mode='Test', queue=None):
    test_samplers[1] = test_samplers[1].reset()

 @thread_wrapped_func
-def train_mp(args, model, train_sampler, valid_samplers=None, rank=0, rel_parts=None, barrier=None):
+def train_mp(args, model, train_sampler, valid_samplers=None, rank=0, rel_parts=None, cross_rels=None, barrier=None):
    if args.num_proc > 1:
        th.set_num_threads(args.num_thread)
-    train(args, model, train_sampler, valid_samplers, rank, rel_parts, barrier)
+    train(args, model, train_sampler, valid_samplers, rank, rel_parts, cross_rels, barrier)

 @thread_wrapped_func
 def test_mp(args, model, test_samplers, rank=0, mode='Test', queue=None):