Commit cd398a3f authored by lishen's avatar lishen
Browse files

warpctc for dcu

parent f456860f
import torch
import warpctc_pytorch as warp_ctc
from warpctc_pytorch import CTCLoss
def test_empty_label(test_cpu=True, test_gpu=True):
......@@ -15,23 +16,52 @@ def test_empty_label(test_cpu=True, test_gpu=True):
if test_cpu:
costs = torch.zeros(minibatch_size)
warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('CPU_cost: %f' % costs.sum())
print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('CPU cost sum = %f' % costs.sum())
print('CPU probs={}\ngrads={}\ncosts={}\n\n'.format(probs, grads, costs))
if test_gpu:
probs = probs.clone().cuda()
grads = torch.zeros(probs.size()).cuda()
costs = torch.zeros(minibatch_size)
warp_ctc.gpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('GPU_cost: %f' % costs.sum())
print('GPU cost sum = %f' % costs.sum())
print(grads.view(grads.size(0) * grads.size(1), grads.size(2)))
print('GPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
print('GPU probs={}\ngrads={}\ncosts={}\n\n'.format(probs, grads, costs))
if __name__ == '__main__':
def test_ctcloss(test_cpu=True, test_gpu=True):
    """Smoke-test the warpctc CTCLoss wrapper on CPU and (optionally) GPU.

    Builds a tiny activation tensor (seq_len=2, batch=1, classes=5) with
    target sequence [1, 2], runs a forward + backward pass through CTCLoss,
    and prints inputs, gradients and cost for manual inspection.
    """
    criterion = CTCLoss(blank=0, size_average=False, length_average=False)
    # Shape (1, 2, 5) -> transpose to the (seq_len, batch, classes) layout
    # that CTCLoss expects, and make it contiguous for the C extension.
    acts = torch.FloatTensor(
        [[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]
    ).transpose(0, 1).contiguous()
    targets = torch.IntTensor([1, 2])
    act_lens = torch.IntTensor([2])
    target_lens = torch.IntTensor([2])
    print('probs shape ', acts.shape)
    print('labels shape ', targets.shape)
    print('label_sizes ', sum(target_lens))

    def _run(tag, to_device):
        # requires_grad_ tells autograd to accumulate gradients into .grad.
        inp = to_device(acts.clone()).requires_grad_(True)
        cost = criterion(inp, targets, act_lens, target_lens)
        cost.backward()
        print('{} probs={}\ngrads={}\ncosts={}\n\n'.format(tag, inp, inp.grad, cost))

    if test_cpu:
        _run('CPU', lambda t: t.cpu())
    if test_gpu:
        _run('GPU', lambda t: t.cuda())
def main():
    """Report CUDA availability, then run the CTCLoss smoke test.

    Fix: the stale unconditional ``test_empty_label(..., test_gpu=True)``
    call has been removed — it ran before the CUDA-availability check and
    crashed on CPU-only hosts. The GPU half of the test is now enabled
    only when a CUDA/ROCm device is actually present.
    """
    print('torch.cuda.is_available() ', torch.cuda.is_available())
    # Exercise the GPU path only when a device exists, so the script
    # still completes on CPU-only machines.
    test_gpu = torch.cuda.is_available()
    # test_empty_label(test_cpu=True, test_gpu=test_gpu)
    test_ctcloss(test_cpu=True, test_gpu=test_gpu)
# Example invocation pinning the run to device 1 on a ROCm/HIP system:
# HIP_VISIBLE_DEVICES=1 python3 test_gpu_new.py
if __name__ == '__main__':
    # Script entry point: run the tests only when executed directly.
    main()
import time
import torch
import warpctc_pytorch_change1 as warp_ctc_new
import warpctc_pytorch as warp_ctc
import time
from warpctc_pytorch import CTCLoss
def test_compare_cpu(repeat_num=20):
def test_compare_basic(repeat_num=20):
probs = torch.FloatTensor([
[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
[[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
......@@ -17,59 +17,89 @@ def test_compare_cpu(repeat_num=20):
grads = torch.zeros(probs.size())
time_st = time.perf_counter()
# 1.运行老版本 CPU
# 1.运行CPU
for i in range(repeat_num):
probs_old = probs.clone()
costs_old = costs.clone()
grads_old = grads.clone()
warp_ctc.cpu_ctc(probs_old, grads_old, labels, label_sizes, sizes, minibatch_size, costs_old, 0)
probs_new = probs.clone()
costs_new = costs.clone()
grads_new = grads.clone()
warp_ctc.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
if i == 0:
print('CPU_costs_old: %f' % costs_old.sum())
print('CPU probs_old={}\ngrads_old={}\ncosts_old={}'.format(probs_old, grads_old, costs_old))
print('CPU_costs: %f' % costs_new.sum())
print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
time_used = (time.perf_counter() - time_st) / repeat_num
print('CPU warp_ctc old version using time: ', time_used)
print('CPU warp_ctc using time: ', time_used)
time_st = time.perf_counter()
# 2.运行新版本 CPU
# 2.运行GPU
for i in range(repeat_num):
probs_new = probs.clone()
costs_new = costs.clone()
grads_new = grads.clone()
warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
probs_new = probs.clone().cuda()
costs_new = costs.clone().cuda()
grads_new = grads.clone().cuda()
warp_ctc.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
if i == 0:
print('CPU_costs_new: %f' % costs_new.sum())
print('CPU probs={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
print('GPU_costs_new: %f' % costs_new.sum())
print('GPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
time_used = (time.perf_counter() - time_st) / repeat_num
print('CPU warp_ctc new version using time: ', time_used)
print('GPU warp_ctc using time: ', time_used)
def test_compare_gpu():
    """Compare the new and old warpctc bindings on the same CUDA input.

    Runs ``warp_ctc_new.cpu_ctc`` and ``warp_ctc.cpu_ctc`` on clones of the
    same baseline tensor and prints costs/gradients from each for manual
    comparison.

    Fix: the old-version call previously received ``probs0`` (the shared
    baseline) instead of its clone ``probs``, so the baseline could be
    mutated in place, the clone was dead, and the final print showed the
    untouched clone rather than the tensor the kernel actually processed.
    It now receives ``probs``, mirroring the new-version path.
    """
    # Baseline activations, shape (seq_len=2, batch=2, classes=5), on GPU.
    probs0 = torch.FloatTensor([
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
    ]).contiguous().cuda()
    labels = torch.IntTensor([1, 2])
    # Second sample has an empty label (length 0).
    label_sizes = torch.IntTensor([2, 0])
    sizes = torch.IntTensor([2, 2])
    minibatch_size = probs0.size(1)
    # 1. Run the new-version binding.
    # NOTE(review): cpu_ctc is fed a CUDA tensor here — presumably the DCU
    # binding accepts it; confirm against the warpctc extension.
    probs_new = probs0.clone().cuda()
    costs_new = torch.zeros(minibatch_size)
    grads_new = torch.zeros(probs0.size())
    warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
    print('CPU_costs_new: %f' % costs_new.sum())
    print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
    # 2. Run the old-version binding on its own clone, keeping probs0 pristine.
    probs = probs0.clone().cuda()
    costs = torch.zeros(minibatch_size)
    grads = torch.zeros(probs0.size())
    warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
    print('CPU_cost: %f' % costs.sum())
    print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
def test_ctcloss_speed(test_cpu=True, test_gpu=True, repeat_num=100):
    """Benchmark warpctc CTCLoss forward+backward on CPU and/or GPU.

    Uses a random (T=400, N=64, C=200) batch modelled on the example in the
    PyTorch CTCLoss documentation, warms up for 10 iterations, then prints
    the mean per-iteration wall time over ``repeat_num`` iterations.

    Fix: ``torch.cuda.synchronize()`` was called unconditionally, including
    in the CPU branch — that raises on CPU-only builds/hosts. The sync is
    now guarded by ``torch.cuda.is_available()``. The duplicated
    warmup/timing code is factored into a private helper.
    """
    criterion = CTCLoss(blank=0, size_average=False, length_average=False)
    # Test case shapes, following the PyTorch CTCLoss docs example.
    T = 400  # input sequence length
    C = 200  # number of classes (including blank)
    N = 64   # batch size
    # Random batch of log-probabilities, size (T, N, C).
    input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
    input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.int32)
    # Un-padded random targets (0 = blank, 1:C = real classes).
    target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.int32)
    target = torch.randint(low=1, high=C, size=(sum(target_lengths),), dtype=torch.int32)
    print('input shape: {}, target shape: {}'.format(input.shape, target.shape))

    def _sync():
        # torch.cuda.synchronize() raises when no CUDA/ROCm device is
        # available, so only wait for the device when one exists.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    def _bench(make_input, label):
        # Warm up so one-time costs (allocator, kernel setup) are excluded
        # from the timed loop.
        for _ in range(10):
            inp = make_input()
            loss = criterion(inp, target, input_lengths, target_lengths)
            loss.backward()
        _sync()
        time_st = time.perf_counter()
        for _ in range(repeat_num):
            inp = make_input()
            loss = criterion(inp, target, input_lengths, target_lengths)
            loss.backward()
        _sync()
        time_used = (time.perf_counter() - time_st) / repeat_num
        print(label, time_used)

    if test_cpu:
        _bench(lambda: input.detach().requires_grad_(), 'CPU warp_ctc using time: ')
    if test_gpu:
        _bench(lambda: input.detach().cuda().requires_grad_(), 'GPU warp_ctc using time: ')
if __name__ == '__main__':
    print('torch.cuda.is_available() ', torch.cuda.is_available())
    # test_compare_basic()
    # Fix: stale diff-residue calls to the old test_compare_cpu()/
    # test_compare_gpu() entry points are dropped, and the GPU benchmark is
    # enabled only when a CUDA/ROCm device is present — an unconditional
    # test_gpu=True crashes on CPU-only hosts.
    test_ctcloss_speed(test_cpu=True, test_gpu=torch.cuda.is_available(), repeat_num=100)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment