Commit cd398a3f authored by lishen's avatar lishen
Browse files

warpctc for dcu

parent f456860f
import torch import torch
import warpctc_pytorch as warp_ctc import warpctc_pytorch as warp_ctc
from warpctc_pytorch import CTCLoss
def test_empty_label(test_cpu=True, test_gpu=True): def test_empty_label(test_cpu=True, test_gpu=True):
...@@ -15,23 +16,52 @@ def test_empty_label(test_cpu=True, test_gpu=True): ...@@ -15,23 +16,52 @@ def test_empty_label(test_cpu=True, test_gpu=True):
if test_cpu: if test_cpu:
costs = torch.zeros(minibatch_size) costs = torch.zeros(minibatch_size)
warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0) warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('CPU_cost: %f' % costs.sum()) print('CPU cost sum = %f' % costs.sum())
print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs)) print('CPU probs={}\ngrads={}\ncosts={}\n\n'.format(probs, grads, costs))
if test_gpu: if test_gpu:
probs = probs.clone().cuda() probs = probs.clone().cuda()
grads = torch.zeros(probs.size()).cuda() grads = torch.zeros(probs.size()).cuda()
costs = torch.zeros(minibatch_size) costs = torch.zeros(minibatch_size)
warp_ctc.gpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0) warp_ctc.gpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('GPU_cost: %f' % costs.sum()) print('GPU cost sum = %f' % costs.sum())
print(grads.view(grads.size(0) * grads.size(1), grads.size(2))) print(grads.view(grads.size(0) * grads.size(1), grads.size(2)))
print('GPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs)) print('GPU probs={}\ngrads={}\ncosts={}\n\n'.format(probs, grads, costs))
def test_ctcloss(test_cpu=True, test_gpu=True):
    """Forward/backward a tiny hand-built example through warp-ctc's CTCLoss wrapper.

    Prints the activations, their gradients and the loss for the CPU and/or
    GPU path so the two can be eyeballed for agreement.
    """
    criterion = CTCLoss(blank=0, size_average=False, length_average=False)
    # One sequence of length T=2 over C=5 classes, batch N=1; warp-ctc wants (T, N, C).
    frames = [[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]
    probs = torch.FloatTensor(frames).transpose(0, 1).contiguous()
    labels = torch.IntTensor([1, 2])
    probs_sizes = torch.IntTensor([2])
    label_sizes = torch.IntTensor([2])
    print('probs shape ', probs.shape)
    print('labels shape ', labels.shape)
    print('label_sizes ', sum(label_sizes))

    if test_cpu:
        # requires_grad_ tells autograd to accumulate gradients into .grad
        cpu_input = probs.clone().cpu().requires_grad_(True)
        cpu_cost = criterion(cpu_input, labels, probs_sizes, label_sizes)
        cpu_cost.backward()
        print('CPU probs={}\ngrads={}\ncosts={}\n\n'.format(cpu_input, cpu_input.grad, cpu_cost))

    if test_gpu:
        gpu_input = probs.clone().cuda().requires_grad_(True)
        gpu_cost = criterion(gpu_input, labels, probs_sizes, label_sizes)
        gpu_cost.backward()
        print('GPU probs={}\ngrads={}\ncosts={}\n\n'.format(gpu_input, gpu_input.grad, gpu_cost))
def main():
    """Report CUDA/ROCm availability, then run the CTCLoss smoke test."""
    cuda_ok = torch.cuda.is_available()
    print('torch.cuda.is_available() ', cuda_ok)
    # test_empty_label(test_cpu=True, test_gpu=cuda_ok)
    test_ctcloss(test_cpu=True, test_gpu=cuda_ok)
# Example invocation: HIP_VISIBLE_DEVICES=1 python3 test_gpu_new.py
if __name__ == '__main__':
    main()
import time
import torch import torch
import warpctc_pytorch_change1 as warp_ctc_new
import warpctc_pytorch as warp_ctc import warpctc_pytorch as warp_ctc
import time from warpctc_pytorch import CTCLoss
def test_compare_cpu(repeat_num=20): def test_compare_basic(repeat_num=20):
probs = torch.FloatTensor([ probs = torch.FloatTensor([
[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]], [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
[[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]] [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
...@@ -17,59 +17,89 @@ def test_compare_cpu(repeat_num=20): ...@@ -17,59 +17,89 @@ def test_compare_cpu(repeat_num=20):
grads = torch.zeros(probs.size()) grads = torch.zeros(probs.size())
time_st = time.perf_counter() time_st = time.perf_counter()
# 1.运行老版本 CPU # 1.运行CPU
for i in range(repeat_num): for i in range(repeat_num):
probs_old = probs.clone() probs_new = probs.clone()
costs_old = costs.clone() costs_new = costs.clone()
grads_old = grads.clone() grads_new = grads.clone()
warp_ctc.cpu_ctc(probs_old, grads_old, labels, label_sizes, sizes, minibatch_size, costs_old, 0) warp_ctc.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
if i == 0: if i == 0:
print('CPU_costs_old: %f' % costs_old.sum()) print('CPU_costs: %f' % costs_new.sum())
print('CPU probs_old={}\ngrads_old={}\ncosts_old={}'.format(probs_old, grads_old, costs_old)) print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
time_used = (time.perf_counter() - time_st) / repeat_num time_used = (time.perf_counter() - time_st) / repeat_num
print('CPU warp_ctc old version using time: ', time_used) print('CPU warp_ctc using time: ', time_used)
time_st = time.perf_counter() time_st = time.perf_counter()
# 2.运行新版本 CPU # 2.运行GPU
for i in range(repeat_num): for i in range(repeat_num):
probs_new = probs.clone() probs_new = probs.clone().cuda()
costs_new = costs.clone() costs_new = costs.clone().cuda()
grads_new = grads.clone() grads_new = grads.clone().cuda()
warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0) warp_ctc.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
if i == 0: if i == 0:
print('CPU_costs_new: %f' % costs_new.sum()) print('GPU_costs_new: %f' % costs_new.sum())
print('CPU probs={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new)) print('GPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
time_used = (time.perf_counter() - time_st) / repeat_num time_used = (time.perf_counter() - time_st) / repeat_num
print('CPU warp_ctc new version using time: ', time_used) print('GPU warp_ctc using time: ', time_used)
def test_ctcloss_speed(test_cpu=True, test_gpu=True, repeat_num=100):
    """Benchmark warp-ctc CTCLoss forward+backward on CPU and/or GPU.

    Args:
        test_cpu: run the CPU timing loop.
        test_gpu: run the GPU timing loop (requires a CUDA/ROCm device).
        repeat_num: number of timed iterations; the reported time is the mean.
    """
    criterion = CTCLoss(blank=0, size_average=False, length_average=False)

    # Test case modelled on PyTorch's CTCLoss example; targets are un-padded.
    T = 400  # Input sequence length
    C = 200  # Number of classes (including blank)
    N = 64   # Batch size

    # Random batch of input activations, shape (T, N, C).
    input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
    input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.int32)

    # Random batch of targets (0 = blank, 1:C = classes).
    target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.int32)
    target = torch.randint(low=1, high=C, size=(sum(target_lengths),), dtype=torch.int32)
    print('input shape: {}, target shape: {}'.format(input.shape, target.shape))

    def _sync():
        # BUG FIX: torch.cuda.synchronize() raises on a CPU-only machine, so the
        # original crashed even with test_gpu=False. Make it a no-op there; it is
        # still needed on GPU so pending kernels are included in the timing.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    if test_cpu:
        # warmup (excluded from timing)
        for _ in range(10):
            input_cpu = input.detach().requires_grad_()
            loss = criterion(input_cpu, target, input_lengths, target_lengths)
            loss.backward()
        _sync()
        time_st = time.perf_counter()
        for _ in range(repeat_num):
            input_cpu = input.detach().requires_grad_()
            loss = criterion(input_cpu, target, input_lengths, target_lengths)
            loss.backward()
        _sync()
        time_used = (time.perf_counter() - time_st) / repeat_num
        print('CPU warp_ctc using time: ', time_used)

    if test_gpu:
        # warmup (excluded from timing)
        for _ in range(10):
            input_gpu = input.detach().cuda().requires_grad_()
            loss = criterion(input_gpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_st = time.perf_counter()
        for _ in range(repeat_num):
            input_gpu = input.detach().cuda().requires_grad_()
            loss = criterion(input_gpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_used = (time.perf_counter() - time_st) / repeat_num
        print('GPU warp_ctc using time: ', time_used)
if __name__ == '__main__':
    cuda_ok = torch.cuda.is_available()
    print('torch.cuda.is_available() ', cuda_ok)
    # test_compare_basic()
    # BUG FIX: the original forced test_gpu=True, which crashes on a machine
    # without a CUDA/ROCm device; only request the GPU path when one exists.
    test_ctcloss_speed(test_cpu=True, test_gpu=cuda_ok, repeat_num=100)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment