import time

import torch
import warpctc_pytorch as warp_ctc
from warpctc_pytorch import CTCLoss


def test_compare_basic(repeat_num=20):
    # Activations of shape (T=2, N=2, C=5): T time steps, N batch, C classes.
    probs = torch.FloatTensor([
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
    ]).contiguous()
    labels = torch.IntTensor([1, 2])        # flattened targets for the whole batch
    label_sizes = torch.IntTensor([2, 0])   # per-sample target lengths
    sizes = torch.IntTensor([2, 2])         # per-sample input lengths
    minibatch_size = probs.size(1)
    costs = torch.zeros(minibatch_size)
    grads = torch.zeros(probs.size())

    # 1. Run on CPU
    time_st = time.perf_counter()
    for i in range(repeat_num):
        probs_new = probs.clone()
        costs_new = costs.clone()
        grads_new = grads.clone()
        warp_ctc.cpu_ctc(probs_new, grads_new, labels, label_sizes,
                         sizes, minibatch_size, costs_new, 0)
        if i == 0:
            print('CPU_costs: %f' % costs_new.sum())
            print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(
                probs_new, grads_new, costs_new))
    time_used = (time.perf_counter() - time_st) / repeat_num
    print('CPU warp_ctc avg time per call: ', time_used)

    # 2. Run on GPU
    # gpu_ctc expects activations and gradients on the GPU, while labels,
    # lengths, and costs stay on the CPU (mirroring warpctc_pytorch's own
    # CTCLoss wrapper).
    time_st = time.perf_counter()
    for i in range(repeat_num):
        probs_new = probs.clone().cuda()
        grads_new = grads.clone().cuda()
        costs_new = costs.clone()
        warp_ctc.gpu_ctc(probs_new, grads_new, labels, label_sizes,
                         sizes, minibatch_size, costs_new, 0)
        if i == 0:
            print('GPU_costs_new: %f' % costs_new.sum())
            print('GPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(
                probs_new, grads_new, costs_new))
    time_used = (time.perf_counter() - time_st) / repeat_num
    print('GPU warp_ctc avg time per call: ', time_used)


def test_ctcloss_speed(test_cpu=True, test_gpu=True, repeat_num=100):
    criterion = CTCLoss(blank=0, size_average=False, length_average=False)
    # Test case modeled on the example in PyTorch's CTCLoss docs.
    # Targets are to be un-padded.
    T = 400  # Input sequence length
    C = 200  # Number of classes (including blank)
    N = 64   # Batch size

    # Initialize a random batch of input vectors of size (T, N, C).
    # Note: warp-ctc applies softmax internally, so these log-probs are
    # treated as raw activations.
    input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
    input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.int32)

    # Initialize a random batch of targets (0 = blank, 1:C = classes)
    target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.int32)
    target = torch.randint(low=1, high=C, size=(sum(target_lengths),), dtype=torch.int32)
    print('input shape: {}, target shape: {}'.format(input.shape, target.shape))

    # Test CPU
    if test_cpu:
        # warmup
        for _ in range(10):
            input_cpu = input.detach().requires_grad_()
            loss = criterion(input_cpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_st = time.perf_counter()
        for _ in range(repeat_num):
            input_cpu = input.detach().requires_grad_()
            loss = criterion(input_cpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_used = (time.perf_counter() - time_st) / repeat_num
        print('CPU warp_ctc avg time per call: ', time_used)

    # Test GPU
    if test_gpu:
        # warmup
        for _ in range(10):
            input_gpu = input.detach().cuda().requires_grad_()
            loss = criterion(input_gpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_st = time.perf_counter()
        for _ in range(repeat_num):
            input_gpu = input.detach().cuda().requires_grad_()
            loss = criterion(input_gpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_used = (time.perf_counter() - time_st) / repeat_num
        print('GPU warp_ctc avg time per call: ', time_used)


if __name__ == '__main__':
    print('torch.cuda.is_available() ', torch.cuda.is_available())
    # test_compare_basic()
    test_ctcloss_speed(test_cpu=True, test_gpu=True, repeat_num=100)
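

# --- Added sketch, not part of the original benchmark ---
# For a rough baseline against warp-ctc, the same T/C/N setup can be timed
# with PyTorch's built-in torch.nn.CTCLoss. The function name
# test_builtin_ctcloss_speed is our own; it is not wired into __main__ above
# and would need to be called there explicitly.
def test_builtin_ctcloss_speed(test_gpu=True, repeat_num=100):
    T, C, N = 400, 200, 64
    # torch.nn.CTCLoss expects log-probabilities as input; warp-ctc instead
    # takes raw activations and applies softmax internally.
    criterion = torch.nn.CTCLoss(blank=0, reduction='sum')
    input = torch.randn(T, N, C).log_softmax(2)
    input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.long)
    target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.long)
    target = torch.randint(low=1, high=C, size=(int(target_lengths.sum()),),
                           dtype=torch.long)

    device = 'cuda' if (test_gpu and torch.cuda.is_available()) else 'cpu'
    input = input.to(device)
    target = target.to(device)

    # warmup
    for _ in range(10):
        inp = input.detach().requires_grad_()
        loss = criterion(inp, target, input_lengths, target_lengths)
        loss.backward()
    if device == 'cuda':
        torch.cuda.synchronize()

    time_st = time.perf_counter()
    for _ in range(repeat_num):
        inp = input.detach().requires_grad_()
        loss = criterion(inp, target, input_lengths, target_lengths)
        loss.backward()
    if device == 'cuda':
        torch.cuda.synchronize()
    time_used = (time.perf_counter() - time_st) / repeat_num
    print('{} torch.nn.CTCLoss avg time per call: '.format(device.upper()), time_used)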