Commit cd398a3f authored by lishen's avatar lishen
Browse files

warpctc for dcu

parent f456860f
import torch
import warpctc_pytorch as warp_ctc
from warpctc_pytorch import CTCLoss
def test_empty_label(test_cpu=True, test_gpu=True):
......@@ -15,23 +16,52 @@ def test_empty_label(test_cpu=True, test_gpu=True):
if test_cpu:
costs = torch.zeros(minibatch_size)
warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('CPU_cost: %f' % costs.sum())
print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('CPU cost sum = %f' % costs.sum())
print('CPU probs={}\ngrads={}\ncosts={}\n\n'.format(probs, grads, costs))
if test_gpu:
probs = probs.clone().cuda()
grads = torch.zeros(probs.size()).cuda()
costs = torch.zeros(minibatch_size)
warp_ctc.gpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
print('GPU_cost: %f' % costs.sum())
print('GPU cost sum = %f' % costs.sum())
print(grads.view(grads.size(0) * grads.size(1), grads.size(2)))
print('GPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
print('GPU probs={}\ngrads={}\ncosts={}\n\n'.format(probs, grads, costs))
if __name__ == '__main__':
def test_ctcloss(test_cpu=True, test_gpu=True):
    """Smoke-test the warpctc CTCLoss wrapper on CPU and (optionally) GPU.

    Builds a tiny activation tensor (seq_len=2, batch=1, classes=5) with
    target sequence [1, 2], runs a forward + backward pass through CTCLoss,
    and prints inputs, gradients and cost for manual inspection.
    """
    criterion = CTCLoss(blank=0, size_average=False, length_average=False)
    # Shape (1, 2, 5) -> transpose to the (seq_len, batch, classes) layout
    # that CTCLoss expects, and make it contiguous for the C extension.
    acts = torch.FloatTensor(
        [[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]
    ).transpose(0, 1).contiguous()
    targets = torch.IntTensor([1, 2])
    act_lens = torch.IntTensor([2])
    target_lens = torch.IntTensor([2])
    print('probs shape ', acts.shape)
    print('labels shape ', targets.shape)
    print('label_sizes ', sum(target_lens))

    def _run(tag, to_device):
        # requires_grad_ tells autograd to accumulate gradients into .grad.
        inp = to_device(acts.clone()).requires_grad_(True)
        cost = criterion(inp, targets, act_lens, target_lens)
        cost.backward()
        print('{} probs={}\ngrads={}\ncosts={}\n\n'.format(tag, inp, inp.grad, cost))

    if test_cpu:
        _run('CPU', lambda t: t.cpu())
    if test_gpu:
        _run('GPU', lambda t: t.cuda())
def main():
    """Report CUDA availability, then run the CTCLoss smoke test.

    Fix: the stale unconditional ``test_empty_label(..., test_gpu=True)``
    call has been removed — it ran before the CUDA-availability check and
    crashed on CPU-only hosts. The GPU half of the test is now enabled
    only when a CUDA/ROCm device is actually present.
    """
    print('torch.cuda.is_available() ', torch.cuda.is_available())
    # Exercise the GPU path only when a device exists, so the script
    # still completes on CPU-only machines.
    test_gpu = torch.cuda.is_available()
    # test_empty_label(test_cpu=True, test_gpu=test_gpu)
    test_ctcloss(test_cpu=True, test_gpu=test_gpu)
# Example invocation pinning the run to device 1 on a ROCm/HIP system:
# HIP_VISIBLE_DEVICES=1 python3 test_gpu_new.py
if __name__ == '__main__':
    # Script entry point: run the tests only when executed directly.
    main()
import time
import torch
import warpctc_pytorch_change1 as warp_ctc_new
import warpctc_pytorch as warp_ctc
import time
from warpctc_pytorch import CTCLoss
def test_compare_cpu(repeat_num=20):
def test_compare_basic(repeat_num=20):
probs = torch.FloatTensor([
[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
[[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
......@@ -17,59 +17,89 @@ def test_compare_cpu(repeat_num=20):
grads = torch.zeros(probs.size())
time_st = time.perf_counter()
# 1.运行老版本 CPU
# 1.运行CPU
for i in range(repeat_num):
probs_old = probs.clone()
costs_old = costs.clone()
grads_old = grads.clone()
warp_ctc.cpu_ctc(probs_old, grads_old, labels, label_sizes, sizes, minibatch_size, costs_old, 0)
probs_new = probs.clone()
costs_new = costs.clone()
grads_new = grads.clone()
warp_ctc.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
if i == 0:
print('CPU_costs_old: %f' % costs_old.sum())
print('CPU probs_old={}\ngrads_old={}\ncosts_old={}'.format(probs_old, grads_old, costs_old))
print('CPU_costs: %f' % costs_new.sum())
print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
time_used = (time.perf_counter() - time_st) / repeat_num
print('CPU warp_ctc old version using time: ', time_used)
print('CPU warp_ctc using time: ', time_used)
time_st = time.perf_counter()
# 2.运行新版本 CPU
# 2.运行GPU
for i in range(repeat_num):
probs_new = probs.clone()
costs_new = costs.clone()
grads_new = grads.clone()
warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
probs_new = probs.clone().cuda()
costs_new = costs.clone().cuda()
grads_new = grads.clone().cuda()
warp_ctc.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
if i == 0:
print('CPU_costs_new: %f' % costs_new.sum())
print('CPU probs={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
print('GPU_costs_new: %f' % costs_new.sum())
print('GPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
time_used = (time.perf_counter() - time_st) / repeat_num
print('CPU warp_ctc new version using time: ', time_used)
print('GPU warp_ctc using time: ', time_used)
def test_compare_gpu():
    """Compare the new and old warpctc bindings on the same CUDA input.

    Runs ``warp_ctc_new.cpu_ctc`` and ``warp_ctc.cpu_ctc`` on clones of the
    same baseline tensor and prints costs/gradients from each for manual
    comparison.

    Fix: the old-version call previously received ``probs0`` (the shared
    baseline) instead of its clone ``probs``, so the baseline could be
    mutated in place, the clone was dead, and the final print showed the
    untouched clone rather than the tensor the kernel actually processed.
    It now receives ``probs``, mirroring the new-version path.
    """
    # Baseline activations, shape (seq_len=2, batch=2, classes=5), on GPU.
    probs0 = torch.FloatTensor([
        [[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.6, 0.1, 0.1, 0.1, 0.1], [0.1, 0.1, 0.5, 0.2, 0.1]]
    ]).contiguous().cuda()
    labels = torch.IntTensor([1, 2])
    # Second sample has an empty label (length 0).
    label_sizes = torch.IntTensor([2, 0])
    sizes = torch.IntTensor([2, 2])
    minibatch_size = probs0.size(1)
    # 1. Run the new-version binding.
    # NOTE(review): cpu_ctc is fed a CUDA tensor here — presumably the DCU
    # binding accepts it; confirm against the warpctc extension.
    probs_new = probs0.clone().cuda()
    costs_new = torch.zeros(minibatch_size)
    grads_new = torch.zeros(probs0.size())
    warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes, minibatch_size, costs_new, 0)
    print('CPU_costs_new: %f' % costs_new.sum())
    print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(probs_new, grads_new, costs_new))
    # 2. Run the old-version binding on its own clone, keeping probs0 pristine.
    probs = probs0.clone().cuda()
    costs = torch.zeros(minibatch_size)
    grads = torch.zeros(probs0.size())
    warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes, minibatch_size, costs, 0)
    print('CPU_cost: %f' % costs.sum())
    print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
def test_ctcloss_speed(test_cpu=True, test_gpu=True, repeat_num=100):
    """Benchmark warpctc CTCLoss forward+backward on CPU and/or GPU.

    Uses a random (T=400, N=64, C=200) batch modelled on the example in the
    PyTorch CTCLoss documentation, warms up for 10 iterations, then prints
    the mean per-iteration wall time over ``repeat_num`` iterations.

    Fix: ``torch.cuda.synchronize()`` was called unconditionally, including
    in the CPU branch — that raises on CPU-only builds/hosts. The sync is
    now guarded by ``torch.cuda.is_available()``. The duplicated
    warmup/timing code is factored into a private helper.
    """
    criterion = CTCLoss(blank=0, size_average=False, length_average=False)
    # Test case shapes, following the PyTorch CTCLoss docs example.
    T = 400  # input sequence length
    C = 200  # number of classes (including blank)
    N = 64   # batch size
    # Random batch of log-probabilities, size (T, N, C).
    input = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
    input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.int32)
    # Un-padded random targets (0 = blank, 1:C = real classes).
    target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.int32)
    target = torch.randint(low=1, high=C, size=(sum(target_lengths),), dtype=torch.int32)
    print('input shape: {}, target shape: {}'.format(input.shape, target.shape))

    def _sync():
        # torch.cuda.synchronize() raises when no CUDA/ROCm device is
        # available, so only wait for the device when one exists.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    def _bench(make_input, label):
        # Warm up so one-time costs (allocator, kernel setup) are excluded
        # from the timed loop.
        for _ in range(10):
            inp = make_input()
            loss = criterion(inp, target, input_lengths, target_lengths)
            loss.backward()
        _sync()
        time_st = time.perf_counter()
        for _ in range(repeat_num):
            inp = make_input()
            loss = criterion(inp, target, input_lengths, target_lengths)
            loss.backward()
        _sync()
        time_used = (time.perf_counter() - time_st) / repeat_num
        print(label, time_used)

    if test_cpu:
        _bench(lambda: input.detach().requires_grad_(), 'CPU warp_ctc using time: ')
    if test_gpu:
        _bench(lambda: input.detach().cuda().requires_grad_(), 'GPU warp_ctc using time: ')
if __name__ == '__main__':
    print('torch.cuda.is_available() ', torch.cuda.is_available())
    # test_compare_basic()
    # Fix: stale diff-residue calls to the old test_compare_cpu()/
    # test_compare_gpu() entry points are dropped, and the GPU benchmark is
    # enabled only when a CUDA/ROCm device is present — an unconditional
    # test_gpu=True crashes on CPU-only hosts.
    test_ctcloss_speed(test_cpu=True, test_gpu=torch.cuda.is_available(), repeat_num=100)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment