import time

import torch
import warpctc_pytorch as warp_ctc
from warpctc_pytorch import CTCLoss


def test_compare_basic(repeat_num=20):
    # Activations of shape (T=2, N=2, C=5): T time steps, N batch, C classes.
    probs = torch.FloatTensor([
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
    ]).contiguous()
    labels = torch.IntTensor([1, 2])        # flattened targets for the whole batch
    label_sizes = torch.IntTensor([2, 0])   # per-sample target lengths
    sizes = torch.IntTensor([2, 2])         # per-sample input lengths
    minibatch_size = probs.size(1)
    costs = torch.zeros(minibatch_size)
    grads = torch.zeros(probs.size())

    # 1. Run on CPU
    time_st = time.perf_counter()
    for i in range(repeat_num):
        probs_new = probs.clone()
        costs_new = costs.clone()
        grads_new = grads.clone()
        warp_ctc.cpu_ctc(probs_new, grads_new, labels, label_sizes,
                         sizes, minibatch_size, costs_new, 0)
        if i == 0:
            print('CPU_costs: %f' % costs_new.sum())
            print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(
                probs_new, grads_new, costs_new))
    time_used = (time.perf_counter() - time_st) / repeat_num
    print('CPU warp_ctc avg time per call: ', time_used)

    # 2. Run on GPU
    # gpu_ctc expects activations and gradients on the GPU, while labels,
    # lengths, and costs stay on the CPU (mirroring warpctc_pytorch's own
    # CTCLoss wrapper).
    time_st = time.perf_counter()
    for i in range(repeat_num):
        probs_new = probs.clone().cuda()
        grads_new = grads.clone().cuda()
        costs_new = costs.clone()
        warp_ctc.gpu_ctc(probs_new, grads_new, labels, label_sizes,
                         sizes, minibatch_size, costs_new, 0)
        if i == 0:
            print('GPU_costs_new: %f' % costs_new.sum())
            print('GPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(
                probs_new, grads_new, costs_new))
    time_used = (time.perf_counter() - time_st) / repeat_num
    print('GPU warp_ctc avg time per call: ', time_used)


def test_ctcloss_speed(test_cpu=True, test_gpu=True, repeat_num=100):
    criterion = CTCLoss(blank=0, size_average=False, length_average=False)
    # Test case modeled on the example in PyTorch's CTCLoss docs.
    # Targets are to be un-padded.
    T = 400  # Input sequence length
    C = 200  # Number of classes (including blank)
    N = 64   # Batch size

    # Initialize a random batch of input vectors of size (T, N, C).
    # Note: warp-ctc applies softmax internally, so these log-probs are
    # treated as raw activations.
    input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
    input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.int32)

    # Initialize a random batch of targets (0 = blank, 1:C = classes)
    target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.int32)
    target = torch.randint(low=1, high=C, size=(sum(target_lengths),), dtype=torch.int32)
    print('input shape: {}, target shape: {}'.format(input.shape, target.shape))

    # Test CPU
    if test_cpu:
        # warmup
        for _ in range(10):
            input_cpu = input.detach().requires_grad_()
            loss = criterion(input_cpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_st = time.perf_counter()
        for _ in range(repeat_num):
            input_cpu = input.detach().requires_grad_()
            loss = criterion(input_cpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_used = (time.perf_counter() - time_st) / repeat_num
        print('CPU warp_ctc avg time per call: ', time_used)

    # Test GPU
    if test_gpu:
        # warmup
        for _ in range(10):
            input_gpu = input.detach().cuda().requires_grad_()
            loss = criterion(input_gpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_st = time.perf_counter()
        for _ in range(repeat_num):
            input_gpu = input.detach().cuda().requires_grad_()
            loss = criterion(input_gpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_used = (time.perf_counter() - time_st) / repeat_num
        print('GPU warp_ctc avg time per call: ', time_used)


if __name__ == '__main__':
    print('torch.cuda.is_available() ', torch.cuda.is_available())
    # test_compare_basic()
    test_ctcloss_speed(test_cpu=True, test_gpu=True, repeat_num=100)
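

# --- Added sketch, not part of the original benchmark ---
# For a rough baseline against warp-ctc, the same T/C/N setup can be timed
# with PyTorch's built-in torch.nn.CTCLoss. The function name
# test_builtin_ctcloss_speed is our own; it is not wired into __main__ above
# and would need to be called there explicitly.
def test_builtin_ctcloss_speed(test_gpu=True, repeat_num=100):
    T, C, N = 400, 200, 64
    # torch.nn.CTCLoss expects log-probabilities as input; warp-ctc instead
    # takes raw activations and applies softmax internally.
    criterion = torch.nn.CTCLoss(blank=0, reduction='sum')
    input = torch.randn(T, N, C).log_softmax(2)
    input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
    target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long)
    target = torch.randint(low=1, high=C, size=(int(target_lengths.sum()),),
                           dtype=torch.long)

    device = 'cuda' if (test_gpu and torch.cuda.is_available()) else 'cpu'
    input = input.to(device)
    target = target.to(device)

    # warmup
    for _ in range(10):
        inp = input.detach().requires_grad_()
        loss = criterion(inp, target, input_lengths, target_lengths)
        loss.backward()
    if device == 'cuda':
        torch.cuda.synchronize()

    time_st = time.perf_counter()
    for _ in range(repeat_num):
        inp = input.detach().requires_grad_()
        loss = criterion(inp, target, input_lengths, target_lengths)
        loss.backward()
    if device == 'cuda':
        torch.cuda.synchronize()
    time_used = (time.perf_counter() - time_st) / repeat_num
    print('{} torch.nn.CTCLoss avg time per call: '.format(device.upper()), time_used)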