Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Warpctc
Commits
cd398a3f
Commit
cd398a3f
authored
May 17, 2023
by
lishen
Browse files
warpctc for dcu
parent
f456860f
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
114 additions
and
54 deletions
+114
-54
pytorch_binding/tests/test_gpu.py
pytorch_binding/tests/test_gpu.py
+39
-9
pytorch_binding/tests/test_gpu_speed.py
pytorch_binding/tests/test_gpu_speed.py
+75
-45
No files found.
pytorch_binding/tests/test_gpu.py
View file @
cd398a3f
import
torch
import
warpctc_pytorch
as
warp_ctc
from
warpctc_pytorch
import
CTCLoss
def
test_empty_label
(
test_cpu
=
True
,
test_gpu
=
True
):
...
...
@@ -15,23 +16,52 @@ def test_empty_label(test_cpu=True, test_gpu=True):
if
test_cpu
:
costs
=
torch
.
zeros
(
minibatch_size
)
warp_ctc
.
cpu_ctc
(
probs
,
grads
,
labels
,
label_sizes
,
sizes
,
minibatch_size
,
costs
,
0
)
print
(
'CPU
_
cost
:
%f'
%
costs
.
sum
())
print
(
'CPU probs={}
\n
grads={}
\n
costs={}'
.
format
(
probs
,
grads
,
costs
))
warp_ctc
.
cpu_ctc
(
probs
,
grads
,
labels
,
label_sizes
,
sizes
,
minibatch_size
,
costs
,
0
)
print
(
'CPU
cost
sum =
%f'
%
costs
.
sum
())
print
(
'CPU probs={}
\n
grads={}
\n
costs={}
\n\n
'
.
format
(
probs
,
grads
,
costs
))
if
test_gpu
:
probs
=
probs
.
clone
().
cuda
()
grads
=
torch
.
zeros
(
probs
.
size
()).
cuda
()
costs
=
torch
.
zeros
(
minibatch_size
)
warp_ctc
.
gpu_ctc
(
probs
,
grads
,
labels
,
label_sizes
,
sizes
,
minibatch_size
,
costs
,
0
)
print
(
'GPU
_
cost
:
%f'
%
costs
.
sum
())
print
(
'GPU
cost
sum =
%f'
%
costs
.
sum
())
print
(
grads
.
view
(
grads
.
size
(
0
)
*
grads
.
size
(
1
),
grads
.
size
(
2
)))
print
(
'GPU probs={}
\n
grads={}
\n
costs={}'
.
format
(
probs
,
grads
,
costs
))
print
(
'GPU probs={}
\n
grads={}
\n
costs={}
\n\n
'
.
format
(
probs
,
grads
,
costs
))
if
__name__
==
'__main__'
:
def test_ctcloss(test_cpu=True, test_gpu=True):
    """Run the CTCLoss wrapper on a tiny fixed batch and print the inputs,
    gradients, and cost for the CPU and/or GPU paths.

    Args:
        test_cpu: exercise the CPU code path.
        test_gpu: exercise the CUDA/HIP code path (requires a device).
    """
    criterion = CTCLoss(blank=0, size_average=False, length_average=False)
    frame0 = [0.1, 0.6, 0.1, 0.1, 0.1]
    frame1 = [0.1, 0.1, 0.6, 0.1, 0.1]
    # Build (batch=1, time=2, classes=5) then swap to (time, batch, classes).
    probs = torch.FloatTensor([[frame0, frame1]]).transpose(0, 1).contiguous()
    labels = torch.IntTensor([1, 2])
    probs_sizes = torch.IntTensor([2])
    label_sizes = torch.IntTensor([2])
    print('probs shape ', probs.shape)
    print('labels shape ', labels.shape)
    print('label_sizes ', sum(label_sizes))
    if test_cpu:
        # requires_grad_ tells autograd to compute gradients for the input
        cpu_probs = probs.clone().cpu().requires_grad_(True)
        cpu_cost = criterion(cpu_probs, labels, probs_sizes, label_sizes)
        cpu_cost.backward()
        print('CPU probs={}\ngrads={}\ncosts={}\n\n'.format(
            cpu_probs, cpu_probs.grad, cpu_cost))
    if test_gpu:
        # requires_grad_ tells autograd to compute gradients for the input
        gpu_probs = probs.clone().cuda().requires_grad_(True)
        gpu_cost = criterion(gpu_probs, labels, probs_sizes, label_sizes)
        gpu_cost.backward()
        print('GPU probs={}\ngrads={}\ncosts={}\n\n'.format(
            gpu_probs, gpu_probs.grad, gpu_cost))
def main():
    """Entry point: report CUDA/HIP availability and run the test cases."""
    gpu_ok = torch.cuda.is_available()
    print('torch.cuda.is_available() ', gpu_ok)
    # Fix: only run the GPU-only empty-label test when a device is present.
    # The original called test_empty_label(test_cpu=False, test_gpu=True)
    # unconditionally, which crashes on hosts without CUDA/HIP even though
    # availability is checked immediately afterwards.
    # test_empty_label(test_cpu=True, test_gpu=False)
    if gpu_ok:
        test_empty_label(test_cpu=False, test_gpu=True)
    # test_empty_label(test_cpu=True, test_gpu=gpu_ok)
    test_ctcloss(test_cpu=True, test_gpu=gpu_ok)
# Example invocation on a DCU host:
#   HIP_VISIBLE_DEVICES=1 python3 test_gpu_new.py
if __name__ == '__main__':
    main()
pytorch_binding/tests/test_gpu_speed.py
View file @
cd398a3f
import
time
import
torch
import
warpctc_pytorch_change1
as
warp_ctc_new
import
warpctc_pytorch
as
warp_ctc
import
time
from
warpctc_pytorch
import
CTCLoss
def
test_compare_
cpu
(
repeat_num
=
20
):
def
test_compare_
basic
(
repeat_num
=
20
):
probs
=
torch
.
FloatTensor
([
[[
0.1
,
0.6
,
0.1
,
0.1
,
0.1
],
[
0.1
,
0.1
,
0.6
,
0.1
,
0.1
]],
[[
0.6
,
0.1
,
0.1
,
0.1
,
0.1
],
[
0.1
,
0.1
,
0.5
,
0.2
,
0.1
]]
...
...
@@ -17,59 +17,89 @@ def test_compare_cpu(repeat_num=20):
grads
=
torch
.
zeros
(
probs
.
size
())
time_st
=
time
.
perf_counter
()
# 1.运行
老版本
CPU
# 1.运行CPU
for
i
in
range
(
repeat_num
):
probs_
old
=
probs
.
clone
()
costs_
old
=
costs
.
clone
()
grads_
old
=
grads
.
clone
()
warp_ctc
.
cpu_ctc
(
probs_
old
,
grads_
old
,
labels
,
label_sizes
,
sizes
,
minibatch_size
,
costs_
old
,
0
)
probs_
new
=
probs
.
clone
()
costs_
new
=
costs
.
clone
()
grads_
new
=
grads
.
clone
()
warp_ctc
.
cpu_ctc
(
probs_
new
,
grads_
new
,
labels
,
label_sizes
,
sizes
,
minibatch_size
,
costs_
new
,
0
)
if
i
==
0
:
print
(
'CPU_costs
_old
: %f'
%
costs_
old
.
sum
())
print
(
'CPU probs_
old
={}
\n
grads_
old
={}
\n
costs_
old
={}'
.
format
(
probs_
old
,
grads_
old
,
costs_
old
))
print
(
'CPU_costs: %f'
%
costs_
new
.
sum
())
print
(
'CPU probs_
new
={}
\n
grads_
new
={}
\n
costs_
new
={}'
.
format
(
probs_
new
,
grads_
new
,
costs_
new
))
time_used
=
(
time
.
perf_counter
()
-
time_st
)
/
repeat_num
print
(
'CPU warp_ctc
old version
using time: '
,
time_used
)
print
(
'CPU warp_ctc using time: '
,
time_used
)
time_st
=
time
.
perf_counter
()
# 2.运行
新版本 C
PU
# 2.运行
G
PU
for
i
in
range
(
repeat_num
):
probs_new
=
probs
.
clone
()
costs_new
=
costs
.
clone
()
grads_new
=
grads
.
clone
()
warp_ctc
_new
.
cpu_ctc
(
probs_new
,
grads_new
,
labels
,
label_sizes
,
sizes
,
minibatch_size
,
costs_new
,
0
)
probs_new
=
probs
.
clone
()
.
cuda
()
costs_new
=
costs
.
clone
()
.
cuda
()
grads_new
=
grads
.
clone
()
.
cuda
()
warp_ctc
.
cpu_ctc
(
probs_new
,
grads_new
,
labels
,
label_sizes
,
sizes
,
minibatch_size
,
costs_new
,
0
)
if
i
==
0
:
print
(
'
C
PU_costs_new: %f'
%
costs_new
.
sum
())
print
(
'
C
PU probs={}
\n
grads_new={}
\n
costs_new={}'
.
format
(
probs_new
,
grads_new
,
costs_new
))
print
(
'
G
PU_costs_new: %f'
%
costs_new
.
sum
())
print
(
'
G
PU probs
_new
={}
\n
grads_new={}
\n
costs_new={}'
.
format
(
probs_new
,
grads_new
,
costs_new
))
time_used
=
(
time
.
perf_counter
()
-
time_st
)
/
repeat_num
print
(
'
C
PU warp_ctc
new version
using time: '
,
time_used
)
print
(
'
G
PU warp_ctc using time: '
,
time_used
)
def test_compare_gpu():
    """Compare the old (warp_ctc) and new (warp_ctc_new) cpu_ctc bindings on
    the same small fixed batch and print the costs/gradients from each so the
    two implementations can be eyeballed against each other.
    """
    probs0 = torch.FloatTensor([
        [[0.1, 0.6, 0.1, 0.1, 0.1],
         [0.1, 0.1, 0.6, 0.1, 0.1]],
        [[0.6, 0.1, 0.1, 0.1, 0.1],
         [0.1, 0.1, 0.5, 0.2, 0.1]]
    ]).contiguous().cuda()
    labels = torch.IntTensor([1, 2])
    # Two samples: the first has 2 labels, the second is empty.
    label_sizes = torch.IntTensor([2, 0])
    sizes = torch.IntTensor([2, 2])
    minibatch_size = probs0.size(1)

    # 1. Run the new-version binding.
    probs_new = probs0.clone().cuda()
    costs_new = torch.zeros(minibatch_size)
    grads_new = torch.zeros(probs0.size())
    warp_ctc_new.cpu_ctc(probs_new, grads_new, labels, label_sizes, sizes,
                         minibatch_size, costs_new, 0)
    print('CPU_costs_new: %f' % costs_new.sum())
    print('CPU probs_new={}\ngrads_new={}\ncosts_new={}'.format(
        probs_new, grads_new, costs_new))

    # 2. Run the old-version binding.
    probs = probs0.clone().cuda()
    costs = torch.zeros(minibatch_size)
    grads = torch.zeros(probs0.size())
    # Fix: pass the freshly-made clone `probs` (it was created for exactly
    # this call but left unused) instead of the shared `probs0`, mirroring
    # the new-version path above and keeping `probs0` pristine.
    warp_ctc.cpu_ctc(probs, grads, labels, label_sizes, sizes,
                     minibatch_size, costs, 0)
    print('CPU_cost: %f' % costs.sum())
    print('CPU probs={}\ngrads={}\ncosts={}'.format(probs, grads, costs))
def test_ctcloss_speed(test_cpu=True, test_gpu=True, repeat_num=100):
    """Benchmark CTCLoss forward+backward on CPU and/or GPU.

    Args:
        test_cpu: time the CPU path.
        test_gpu: time the CUDA/HIP path (requires a device).
        repeat_num: number of timed iterations; the printed figure is the
            per-iteration average in seconds.
    """
    criterion = CTCLoss(blank=0, size_average=False, length_average=False)
    # Test case modeled on the example in PyTorch's nn.CTCLoss docs.
    # Targets are to be un-padded.
    T = 400  # input sequence length
    C = 200  # number of classes (including blank)
    N = 64   # batch size
    # Random batch of input vectors with shape (T, N, C).
    # NOTE: renamed from `input` to avoid shadowing the builtin.
    inputs = torch.randn(T, N, C).log_softmax(2).detach().requires_grad_()
    input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.int32)
    # Random batch of targets (0 = blank, 1:C = classes).
    target_lengths = torch.randint(low=1, high=T, size=(N,), dtype=torch.int32)
    # Fix: torch.randint's size tuple must hold plain ints; sum() over an
    # IntTensor yields a 0-dim tensor, so convert explicitly.
    target = torch.randint(low=1, high=C,
                           size=(int(target_lengths.sum()),),
                           dtype=torch.int32)
    print('input shape: {}, target shape: {}'.format(inputs.shape, target.shape))

    def _sync_if_available():
        # Fix: torch.cuda.synchronize() raises on hosts without a CUDA/HIP
        # device, and the CPU path may legitimately be timed on such hosts.
        if torch.cuda.is_available():
            torch.cuda.synchronize()

    # Time the CPU path.
    if test_cpu:
        for _ in range(10):  # warmup
            input_cpu = inputs.detach().requires_grad_()
            loss = criterion(input_cpu, target, input_lengths, target_lengths)
            loss.backward()
        _sync_if_available()
        time_st = time.perf_counter()
        for _ in range(repeat_num):
            input_cpu = inputs.detach().requires_grad_()
            loss = criterion(input_cpu, target, input_lengths, target_lengths)
            loss.backward()
        _sync_if_available()
        time_used = (time.perf_counter() - time_st) / repeat_num
        print('CPU warp_ctc using time: ', time_used)

    # Time the GPU path (a device is required here, so synchronize directly).
    if test_gpu:
        for _ in range(10):  # warmup
            input_gpu = inputs.detach().cuda().requires_grad_()
            loss = criterion(input_gpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_st = time.perf_counter()
        for _ in range(repeat_num):
            input_gpu = inputs.detach().cuda().requires_grad_()
            loss = criterion(input_gpu, target, input_lengths, target_lengths)
            loss.backward()
        torch.cuda.synchronize()
        time_used = (time.perf_counter() - time_st) / repeat_num
        print('GPU warp_ctc using time: ', time_used)
# Script entry: report device availability, run the binding comparison,
# then the CTCLoss speed benchmark.
if __name__ == '__main__':
    print('torch.cuda.is_available() ', torch.cuda.is_available())
    test_compare_gpu()
    # test_compare_basic()
    test_ctcloss_speed(test_cpu=True, test_gpu=True, repeat_num=100)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment