"""
Knowledge Graph Embedding Models.
1. TransE
2. DistMult
3. ComplEx
4. RotatE
5. pRotatE
6. TransH
7. TransR
8. TransD
9. RESCAL
"""
import os
import numpy as np

import torch as th
import torch.nn as nn
import torch.nn.functional as functional
import torch.nn.init as INIT

from .. import *

logsigmoid = functional.logsigmoid

def get_device(args):
    return th.device('cpu') if args.gpu < 0 else th.device('cuda:' + str(args.gpu))

norm = lambda x, p: x.norm(p=p)**p

get_scalar = lambda x: x.detach().item()

reshape = lambda arr, x, y: arr.view(x, y)

cuda = lambda arr, gpu: arr.cuda(gpu)

class ExternalEmbedding:
    def __init__(self, args, num, dim, device):
        self.gpu = args.gpu
        self.args = args
        self.trace = []

        self.emb = th.empty(num, dim, dtype=th.float32, device=device)
        self.state_sum = self.emb.new().resize_(self.emb.size(0)).zero_()
        self.state_step = 0

    def init(self, emb_init):
        INIT.uniform_(self.emb, -emb_init, emb_init)
        INIT.zeros_(self.state_sum)

    def share_memory(self):
        self.emb.share_memory_()
        self.state_sum.share_memory_()

    def __call__(self, idx, gpu_id=-1, trace=True):
        s = self.emb[idx]
        if self.gpu >= 0:
            s = s.cuda(self.gpu)
        # During the training, we need to trace the computation.
        # In this case, we need to record the computation path and compute the gradients.
        if trace:
            data = s.clone().detach().requires_grad_(True)
            self.trace.append((idx, data))
        else:
            data = s
        return data

    def update(self):
        self.state_step += 1
        with th.no_grad():
            for idx, data in self.trace:
                grad = data.grad.data

                clr = self.args.lr
                #clr = self.args.lr / (1 + (self.state_step - 1) * group['lr_decay'])

                # the update is non-linear so indices must be unique
                grad_indices = idx
                grad_values = grad

                grad_sum = (grad_values * grad_values).mean(1)
                device = self.state_sum.device
                if device != grad_indices.device:
                    grad_indices = grad_indices.to(device)
                if device != grad_sum.device:
                    grad_sum = grad_sum.to(device)
                self.state_sum.index_add_(0, grad_indices, grad_sum)
                std = self.state_sum[grad_indices]  # _sparse_mask
                std_values = std.sqrt_().add_(1e-10).unsqueeze(1)
                if self.gpu >= 0:
                    std_values = std_values.cuda(self.args.gpu)
                tmp = (-clr * grad_values / std_values)
                if tmp.device != device:
                    tmp = tmp.to(device)
                # TODO(zhengda) the overhead is here.
                self.emb.index_add_(0, grad_indices, tmp)
        self.trace = []

    def curr_emb(self):
        data = [data for _, data in self.trace]
        return th.cat(data, 0)

    def save(self, path, name):
        file_name = os.path.join(path, name+'.npy')
        np.save(file_name, self.emb.cpu().detach().numpy())

    def load(self, path, name):
        file_name = os.path.join(path, name+'.npy')
        self.emb = th.Tensor(np.load(file_name))