import time
import torch
import warpctc_pytorch as warp_ctc
from warpctc_pytorch import CTCLoss


def test_compare_basic(repeat_num=20):
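    """Compare the raw warp_ctc.cpu_ctc / gpu_ctc bindings on a tiny hand-made batch.

    Runs each binding repeat_num times on the same small activation tensor and prints
    the costs/gradients of the first call plus the average wall time per call.
    """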
    probs = torch.FloatTensor([
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
    ]).contiguous()
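    # probs above uses the (seq_len, batch, alphabet) layout warp-ctc expects:
    # T=2 frames, N=2 sequences, C=5 classes, with index 0 reserved for the blank.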
    labels = torch.IntTensor([1, 2])
    label_sizes = torch.IntTensor([2, 0])
    sizes = torch.IntTensor([2, 2])
    minibatch_size = probs.size(1)
    costs = torch.zeros(minibatch_size)
    grads = torch.zeros(probs.size())
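    # costs and grads are output buffers that the warp-ctc bindings fill in place,
    # so fresh clones are made inside each timing loop below.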

    time_st = time.perf_counter()
    # 1. Run on CPU
    for i in range(repeat_num):
        probs_new = probs.clone()
        costs_new = costs.clone()
        grads_new = grads.clone()
        warp_ctc.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
        if i == 0:
            print('CPU_costs: %f' % costs_new.sum())
            print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
    time_used = (time.perf_counter() - time_st) / repeat_num
    print('CPU warp_ctc time per call (s): ', time_used)

    time_st = time.perf_counter()
    # 2. Run on GPU
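    # gpu_ctc expects the activations and the gradient buffer on the GPU, while the
    # labels, the size tensors and the costs buffer stay on the CPU (this mirrors how
    # warpctc_pytorch's own CTCLoss wrapper calls the binding).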
    for i in range(repeat_num):
        probs_new = probs.clone().cuda()
        costs_new = costs.clone()
        grads_new = grads.clone().cuda()
        warp_ctc.gpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
        if i == 0:
            print('GPU_costs_new: %f' % costs_new.sum())
            print('GPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
    time_used = (time.perf_counter() - time_st) / repeat_num
    print('GPU warp_ctc time per call (s): ', time_used)


def test_ctcloss_speed(test_cpu=True, test_gpu=True, repeat_num=100):
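    """Benchmark the warpctc_pytorch CTCLoss module (forward + backward).

    Uses a random batch shaped like the PyTorch CTCLoss documentation example and
    reports the average time per iteration on CPU and/or GPU.
    """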
    criterion = CTCLoss(blank=0, size_average=False, length_average=False)

    # Test case, adapted from the PyTorch CTCLoss documentation example
    # Targets are to be un-padded
    T = 400  # Input sequence length
    C = 200  # Number of classes (including blank)
    N = 64  # Batch size

    # Initialize random batch of input vectors, for *size = (T,N,C)
    input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()

    input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.int32)

    # Initialize random batch of targets (0 = blank, 1:C = classes)
    target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.int32)
    target = torch.randint(low=1, high=C, size=(sum(target_lengths),), dtype=torch.int32)
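    # Note: CTC only admits an alignment when the input length is at least the target
    # length plus one extra frame per adjacent repeated label, so sampled targets whose
    # length is close to T may not have a valid alignment.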
    print('input shape: {}, target shape: {}'.format(input.shape, target.shape))

    # Test on CPU
    if test_cpu:
        # warmup
        for _ in range(10):
            input_cpu = input.detach().requires_grad_()
            loss = criterion(input_cpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()

        time_st = time.perf_counter()
        for _ in range(repeat_num):
            input_cpu = input.detach().requires_grad_()
            loss = criterion(input_cpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_used = (time.perf_counter() - time_st) / repeat_num
        print('CPU warp_ctc time per iteration (s): ', time_used)

    # Test on GPU
    if test_gpu:
        # warmup
        for _ in range(10):
            input_gpu = input.detach().cuda().requires_grad_()
            loss = criterion(input_gpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
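        # The timed loop that follows includes the per-iteration host-to-device copy of
        # the inputs, so the measurement covers the transfer as well as forward/backward.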

        time_st = time.perf_counter()
        for _ in range(repeat_num):
            input_gpu = input.detach().cuda().requires_grad_()
            loss = criterion(input_gpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_used = (time.perf_counter() - time_st) / repeat_num
        print('GPU warp_ctc time per iteration (s): ', time_used)


if __name__ == '__main__':
    print('torch.cuda.is_available() ', torch.cuda.is_available())
    # test_compare_basic()
    test_ctcloss_speed(test_cpu=True, test_gpu=True, repeat_num=100)