OpenDAS / apex

Commit 2fa4dbaf, authored Apr 25, 2018 by Christian Sarofeen

Initial release

45 changed files in this commit; showing 5 of them, with 418 additions and 0 deletions:

    tests/raw_ops/compare.py        +42   -0
    tests/raw_ops/norm.py           +20   -0
    tests/raw_ops/test_autograd.py  +146  -0
    tests/raw_ops/test_backward.py  +129  -0
    tests/raw_ops/test_forward.py   +81   -0
tests/raw_ops/compare.py (new file, mode 100644)
import torch
import numpy as np

def compare(cuda_out, pt_out, pt_out_control, rows):
    print("Pytorch ops in fp16: ", pt_out)
    print("Kernel result: ", cuda_out)
    print("Control (Pytorch ops, sticking to fp32): ", pt_out_control)

    # Make upconverted copies for error check against fp32 control
    cuda_out_fp32 = cuda_out.float()
    pt_out_fp32 = pt_out.float()

    # Flatten all but the slowest dimension
    cuda_out = cuda_out.view(rows, -1)
    pt_out = pt_out.view(rows, -1)
    cuda_out_fp32 = cuda_out_fp32.view(rows, -1)
    pt_out_fp32 = pt_out_fp32.view(rows, -1)
    pt_out_control = pt_out_control.view(rows, -1)

    cuda_maxdiffs, cuda_maxdiff_locs = torch.max((pt_out_control - cuda_out_fp32).abs(), 1)
    pt_maxdiffs, pt_maxdiff_locs = torch.max((pt_out_control - pt_out_fp32).abs(), 1)

    print("cuda_maxdiffs = ", cuda_maxdiffs)
    print("cuda_maxdiff_locs = ", cuda_maxdiff_locs)
    print("pt_maxdiffs = ", pt_maxdiffs)
    print("pt_maxdiff_locs = ", pt_maxdiff_locs)

    row_indices = torch.LongTensor(np.arange(rows))

    print("cuda_out at cuda_maxdiff_locs in each row:")
    # bizarrely, this will work if you do it at the python prompt:
    # print(cuda_out[row_indices,cuda_maxdiff_locs])
    # ...but it only seems to work here if you wrap with numpy arrays:
    print(cuda_out[np.array(row_indices), np.array(cuda_maxdiff_locs)])
    print("pt_out_control at cuda_maxdiff_locs in each row:")
    print(pt_out_control[np.array(row_indices), np.array(cuda_maxdiff_locs)])
    print("pt_out at pt_maxdiff_locs in each row:")
    print(pt_out[np.array(row_indices), np.array(pt_maxdiff_locs)])
    print("pt_out_control at pt_maxdiff_locs in each row:")
    print(pt_out_control[np.array(row_indices), np.array(pt_maxdiff_locs)])
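compare() flattens each result to (rows, -1), then reports the largest per-row absolute deviation of both the fused-kernel output and the fp16 PyTorch output from the fp32 control, along with the column where each maximum occurs. A minimal sketch of calling it on synthetic tensors (shapes and values here are purely illustrative, and a CUDA build of PyTorch is assumed, as in the tests below):

    # Hypothetical standalone usage of compare(); not part of this commit.
    import torch
    from compare import compare

    rows, cols = 4, 1024
    control = torch.cuda.FloatTensor(rows, cols).normal_()  # fp32 reference
    fp16_out = control.half()        # stands in for the PyTorch fp16 result
    kernel_out = control.half()      # stands in for the fused-kernel result
    compare(kernel_out, fp16_out, control, rows)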
tests/raw_ops/norm.py (new file, mode 100644)
import torch

def get_norm_shape(p, dim):
    if dim == 0:
        output_size = (p.size(0),) + (1,) * (p.dim() - 1)
        return output_size
    elif dim == p.dim() - 1:
        output_size = (1,) * (p.dim() - 1) + (p.size(-1),)
        return output_size
    return None

def pt_norm(p, dim):
    """Computes the norm over all dimensions except dim"""
    if dim is None:
        return p.norm()
    elif dim == 0:
        return p.contiguous().view(p.size(0), -1).norm(2, dim=1).view(*get_norm_shape(p, dim))
    elif dim == p.dim() - 1:
        return p.contiguous().view(-1, p.size(-1)).norm(2, dim=0).view(*get_norm_shape(p, dim))
    return pt_norm(p.transpose(0, dim), 0).transpose(0, dim)
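pt_norm(p, dim) computes the L2 norm of p over every dimension except dim, and get_norm_shape gives the broadcast-friendly shape of that result (only dim == 0 and dim == p.dim() - 1 are handled directly; other dims go through a transpose). A quick shape check, as a sketch assuming these helpers are importable from the tests/raw_ops directory on a recent PyTorch (CPU tensors are fine here):

    # Illustrative shape check for get_norm_shape / pt_norm; not part of this commit.
    import torch
    from norm import get_norm_shape, pt_norm

    p = torch.randn(3, 768, 1536)

    print(get_norm_shape(p, 0))  # (3, 1, 1): one norm per slice along dim 0
    print(get_norm_shape(p, 2))  # (1, 1, 1536): one norm per slice along the last dim

    norms = pt_norm(p, 2)
    print(norms.shape)           # torch.Size([1, 1, 1536]); broadcasts against p in p * (g / norms)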
tests/raw_ops/test_autograd.py (new file, mode 100644)
import torch
from torch.autograd import Variable
from apex.fp16_utils import Fused_Weight_Norm

from compare import compare
from norm import pt_norm, get_norm_shape

torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)

rows = 1     # 321
cols = 4096  # 33
fast = 4096  # 185
dims = rows, cols, fast
dim = 2

CUDA_HALF = False

RAND = True
# If false, input gradients (the result of the backward pass)
# should be analytically zero.

# Loss will be computed via (output*elementwise).sum().
# This means that output gradients in the backward pass will be equal
# to elementwise, so by manipulating elementwise, we have easy
# fine-grained control over the output gradients we'd like to use for
# testing purposes.
#
# The alternative is just to create the output_gradients manually
# and call output.backward(gradient=output_gradients),
# as is done in test_backward.py.
# But I wanted a minimal working sample similar to an "actual" use case,
# where gradients are computed by calling backward() on a scalar Loss.

if RAND:
    # With std=6.0, I observe the pytorch fp16 ops going unstable
    # while the fused kernel remains stable (sometimes).
    pt_in_fp32 = torch.cuda.FloatTensor(*dims).normal_(std=1.0)
    norm_shape = get_norm_shape(pt_in_fp32, dim)
    pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).normal_(std=1.0)
    elementwise_fp32 = torch.cuda.FloatTensor(*dims).normal_(std=1.0)
else:
    pt_in_fp32 = torch.cuda.FloatTensor(*dims).fill_(1.0)
    norm_shape = get_norm_shape(pt_in_fp32, dim)
    pt_g_fp32 = torch.cuda.FloatTensor(*norm_shape).fill_(2.0)
    elementwise_fp32 = torch.cuda.FloatTensor(*dims).fill_(0.5)

pt_in_fp16 = pt_in_fp32.half()
cd_in_prec = pt_in_fp32.clone()
pt_g_fp16 = pt_g_fp32.half()
cd_g_prec = pt_g_fp32.clone()
elementwise_fp16 = elementwise_fp32.half()
elementwise_prec = elementwise_fp32.clone()

if CUDA_HALF:
    cd_in_prec = cd_in_prec.half()
    cd_g_prec = cd_g_prec.half()
    elementwise_prec = elementwise_prec.half()

pt_in_fp32 = Variable(pt_in_fp32, requires_grad=True)
pt_in_fp16 = Variable(pt_in_fp16, requires_grad=True)
cd_in_prec = Variable(cd_in_prec, requires_grad=True)

pt_g_fp32 = Variable(pt_g_fp32, requires_grad=True)
pt_g_fp16 = Variable(pt_g_fp16, requires_grad=True)
cd_g_prec = Variable(cd_g_prec, requires_grad=True)

elementwise_fp32 = Variable(elementwise_fp32, requires_grad=False)
elementwise_fp16 = Variable(elementwise_fp16, requires_grad=False)
elementwise_prec = Variable(elementwise_prec, requires_grad=False)

torch.cuda.nvtx.range_push("fp16 forward, {}".format(pt_in_fp16.size()))
pt_norms_fp16 = pt_norm(pt_in_fp16, dim)
pt_out_fp16 = pt_in_fp16 * (pt_g_fp16 / pt_norms_fp16)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()

torch.cuda.nvtx.range_push("fp32 forward, {}".format(pt_in_fp32.size()))
pt_norms_fp32 = pt_norm(pt_in_fp32, dim)
pt_out_fp32 = pt_in_fp32 * (pt_g_fp32 / pt_norms_fp32)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()

# print("pt_norms_fp16 = ", pt_norms_fp16)
# print("pt_norms_fp32 = ", pt_norms_fp32)
# print("cd_in_prec.data_ptr = {:x}".format(cd_in_prec.data_ptr()))
# print("elementwise_fp16 = ", elementwise_fp16)

cd_in_contig = cd_in_prec.contiguous()
# Deliberately make noncontig to see if fused_norm
# will handle the error
# cd_in_contig = cd_in_contig[:,0:5]
# print(type(cd_in_contig))

torch.cuda.nvtx.range_push("kernel forward")
fused_weight_norm = Fused_Weight_Norm.apply
cd_out_prec = fused_weight_norm(cd_in_contig, cd_g_prec, dim)
torch.cuda.nvtx.range_pop()
# torch.cuda.synchronize()

# print("type(cd_out_prec.data) = ", type(cd_out_prec.data))
# print("cd_out_prec.data_ptr = {:x}".format(cd_out_prec.data_ptr()))

print("\n\n\nCOMPARING FORWARD PASS RESULTS\n\n\n")
compare(cd_out_prec.data, pt_out_fp16.data, pt_out_fp32.data, rows)

# It's ok to use elementwise_fp16 as a leaf in both the cuda and pytorch graphs.
# This sharing should not affect the computed gradients wrt pt_in_fp16 and cd_in_prec.
# However, just remember:
# If we set requires_grad=True for elementwise_fp16, elementwise_fp16.grad.data
# will accumulate gradients during the backward passes for both the cd and pytorch Losses.
#
# I do need v these parentheses v
Loss_cd_prec = (cd_out_prec * elementwise_prec).sum()
# print(L_cd_fp16)
Loss_pt_fp16 = (pt_out_fp16 * elementwise_fp16).sum()
# print(L_pt_fp16)
Loss_pt_fp32 = (pt_out_fp32 * elementwise_fp32).sum()
# print(L_pt_fp32)

torch.cuda.nvtx.range_push("kernel backward")
Loss_cd_prec.backward()
torch.cuda.nvtx.range_pop()

torch.cuda.nvtx.range_push("fp16 backward")
Loss_pt_fp16.backward()
torch.cuda.nvtx.range_pop()

torch.cuda.nvtx.range_push("fp32 backward")
Loss_pt_fp32.backward()
torch.cuda.nvtx.range_pop()

print("\n\n\nCOMPARING v GRADIENT RESULTS\n\n\n")
compare(cd_in_prec.grad.data, pt_in_fp16.grad.data, pt_in_fp32.grad.data, rows)

print("\n\n\nCOMPARING g GRADIENT RESULTS\n\n\n")
compare(cd_g_prec.grad.data, pt_g_fp16.grad.data, pt_g_fp32.grad.data, cd_g_prec.size(0))
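The comment block in this file relies on the fact that with Loss = (output * elementwise).sum(), the gradient flowing back into output is exactly elementwise, which is what gives the test fine-grained control over the output gradients. A tiny standalone check of that identity (CPU-only, written against a recent PyTorch where Tensors carry requires_grad directly; not part of the test):

    # Sanity check: d/d(output) of (output * elementwise).sum() == elementwise, exactly.
    import torch

    output = torch.randn(2, 3, requires_grad=True)
    elementwise = torch.randn(2, 3)

    loss = (output * elementwise).sum()
    loss.backward()

    assert torch.equal(output.grad, elementwise)  # equality is exact, no rounding involved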
tests/raw_ops/test_backward.py (new file, mode 100644)
import torch
from torch.autograd import Variable
import apex._C
import numpy as np

from compare import compare
from norm import pt_norm, get_norm_shape

torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)

sizes = [
    # (3, 512, 1024),
    # (3, 512, 1536),
    (3, 768, 1536),
    # (3, 768, 2048),
    # (3, 1024, 2048),
    # (1, 1024, 4096),
    # (1, 2048, 8192),
    # (1, 4096, 4096), # this is not one of natalia's sizes, just a reference benchmark.
    # (4096, 4096, 1), # this is not one of natalia's sizes, just a reference benchmark.
]

# rows = 3
# cols = 512
# fast = 1024

HALF = True
RAND = True
dim = 2

for rows, cols, fast in sizes:
    dims = rows, cols, fast

    # Incoming gradient vectors we will use later
    # Need to create the fp16 versions as a half() copy of a Tensor first rather than
    # a Variable, because if you create pt_input_control as a Variable then say
    # pt_input_fp16 = pt_input_control.half(), you are accidentally making pt_input_fp16 part of
    # pLpOutput_control's computational graph, instead of the leaf of its own separate graph.
    # Careful: if you initialize with torch.ones, the gradient wrt input becomes analytically zero :P
    if RAND:
        pLpOutput_control = torch.cuda.FloatTensor(*dims).uniform_() * 1.0
        norm_shape = get_norm_shape(pLpOutput_control, dim)
        pLpg_control = torch.cuda.FloatTensor(*norm_shape).uniform_()
        pt_input_control = torch.cuda.FloatTensor(*dims).uniform_()
        pt_g_control = torch.cuda.FloatTensor(*norm_shape).uniform_()
    else:
        pLpOutput_control = torch.cuda.FloatTensor(*dims).fill_(1.)
        norm_shape = get_norm_shape(pLpOutput_control, dim)
        pLpg_control = torch.cuda.FloatTensor(*norm_shape).fill_(2.)
        pt_input_control = torch.cuda.FloatTensor(*dims).fill_(4.0)
        pt_g_control = torch.cuda.FloatTensor(*norm_shape).fill_(3.0)

    pLpOutput_fp16 = pLpOutput_control.clone()
    pLpg_fp16 = pLpg_control.clone()
    pt_input_fp16 = pt_input_control.clone()
    pt_g_fp16 = pt_g_control.clone()

    if HALF:
        pLpOutput_fp16 = pLpOutput_fp16.half()
        pLpg_fp16 = pLpg_fp16.half()
        pt_input_fp16 = pt_input_fp16.half()
        pt_g_fp16 = pt_g_fp16.half()

    pLpOutput_control = Variable(pLpOutput_control)
    pLpg_control = Variable(pLpg_control)
    pLpOutput_fp16 = Variable(pLpOutput_fp16)
    pLpg_fp16 = Variable(pLpg_fp16)

    pt_input_control = Variable(pt_input_control, requires_grad=True)
    pt_g_control = Variable(pt_g_control, requires_grad=True)
    pt_input_fp16 = Variable(pt_input_fp16, requires_grad=True)
    pt_g_fp16 = Variable(pt_g_fp16, requires_grad=True)

    # Do forward pass in fp16 and fp32
    pt_norms_fp16 = pt_norm(pt_input_fp16, dim)
    pt_norms_control = pt_norm(pt_input_control, dim)

    pt_output_fp16 = pt_input_fp16 * (pt_g_fp16 / pt_norms_fp16)
    pt_output_control = pt_input_control * (pt_g_control / pt_norms_control)

    # Run the Cuda version
    pLpInput_cuda = torch.cuda.FloatTensor(*dims).fill_(0.)
    pLpg_cuda = torch.cuda.FloatTensor(*norm_shape).fill_(0.)
    if HALF:
        pLpInput_cuda = pLpInput_cuda.half()
        pLpg_cuda = pLpg_cuda.half()

    torch.cuda.nvtx.range_push("kernel weight norm backward")
    apex._C.weight_norm_bwd(pLpInput_cuda,
                            pLpg_cuda,
                            pLpOutput_fp16,
                            pt_input_fp16,
                            pt_g_fp16,
                            pt_norms_control.data,
                            dim)
    torch.cuda.nvtx.range_pop()

    print("grad_output: ", pLpOutput_fp16.data)
    print(" grad_input: ", pLpInput_cuda)
    print(" savedInput: ", pt_input_fp16.data)
    print("pt_norms_control: ", pt_norms_control.data)
    print("pt_norms_fp16: ", pt_norms_fp16.data)

    torch.cuda.nvtx.range_push("pytorch fp16 backward")
    pt_output_fp16.backward(gradient=pLpOutput_fp16, create_graph=True)
    torch.cuda.nvtx.range_pop()

    torch.cuda.nvtx.range_push("pytorch fp32 backward")
    pt_output_control.backward(gradient=pLpOutput_control, create_graph=True)
    torch.cuda.nvtx.range_pop()

    # pt_output_fp16 and pt_output_control are still saved, but
    # pt_output_fp16.grad and pt_output_control.grad are None at this point
    # because the graph is freed in the backwards pass.
    # Specifying create_/retain_ graph don't seem to force saving of
    # either the intermediate variables or their gradients.

    print("Comparing gradients wrt v")
    torch.cuda.nvtx.range_push("compare pLpv")
    compare(pLpInput_cuda, pt_input_fp16.grad.data, pt_input_control.grad.data, rows)
    torch.cuda.nvtx.range_pop()

    print("Comparing gradients wrt g")
    torch.cuda.nvtx.range_push("compare pLpg")
    compare(pLpg_cuda, pt_g_fp16.grad.data, pt_g_control.grad.data, pLpg_cuda.size(0))
    torch.cuda.nvtx.range_pop()
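For reference, the gradients that the backward kernel is compared against follow from the forward expression w = v * (g / ||v||), with the norm taken over every dimension except dim. A small sketch (CPU, recent PyTorch, illustrative shapes; not part of the test) checking the closed-form gradients against autograd through pt_norm:

    # Closed-form weight-norm gradients vs. autograd; illustrative only.
    import torch
    from norm import pt_norm, get_norm_shape

    torch.manual_seed(0)
    dim = 2
    v = torch.randn(3, 8, 16, requires_grad=True)
    g = torch.randn(*get_norm_shape(v, dim), requires_grad=True)
    grad_out = torch.randn(3, 8, 16)

    # Forward: w = v * (g / ||v||), norm over all dims except `dim`
    norms = pt_norm(v, dim)
    w = v * (g / norms)
    w.backward(gradient=grad_out)

    # The same gradients written out by hand
    with torch.no_grad():
        vg_sum = (grad_out * v).sum(dim=(0, 1), keepdim=True)  # reduce all dims except `dim`
        dg = vg_sum / norms
        dv = (g / norms) * grad_out - g * vg_sum * v / norms.pow(3)

    print(torch.allclose(g.grad, dg, atol=1e-5), torch.allclose(v.grad, dv, atol=1e-5))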
tests/raw_ops/test_forward.py (new file, mode 100644)
import torch
import sys
import apex._C
import numpy as np

from compare import compare
from norm import pt_norm, get_norm_shape

torch.manual_seed(2)
torch.cuda.manual_seed(2)
# torch.cuda.manual_seed_all(2)
torch.set_printoptions(precision=10)

sizes = [
    # (3, 512, 1024),
    # (3, 512, 1536),
    # (3, 768, 1536),
    # (3, 768, 2048),
    # (3, 1024, 2048),
    # (1, 1024, 4096),
    # (1, 2048, 8192),
    # (1, 4096, 4096), # this is not one of natalia's sizes, just a reference benchmark.
    (4096, 4096, 1),   # this is not one of natalia's sizes, just a reference benchmark.
    # (353, 55, 353),  # this is not one of natalia's sizes, just a reference benchmark.
]

# rows = 3
# cols = 512
# fast = 1024

HALF = True
RAND = True
dim = 0

for rows, cols, fast in sizes:
    dims = rows, cols, fast

    print("\n\nTESTING dims = {}\n\n".format(dims))

    if RAND:
        pt_in = 1. * torch.cuda.FloatTensor(*dims).uniform_()
        g = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).uniform_()
    else:
        pt_in = torch.cuda.FloatTensor(*dims).fill_(1.)
        g = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(6.0)

    # per_col = torch.arange(1,cols+1).cuda()
    # print((rows*per_col*per_col).sqrt())
    # pt_in *= per_col

    cuda_out = torch.cuda.FloatTensor(*dims).fill_(0.)
    cuda_norms = torch.cuda.FloatTensor(*get_norm_shape(pt_in, dim)).fill_(0.)

    # Save a copy of the input as float
    pt_in_fp32 = pt_in.clone()
    g_fp32 = g.clone()

    if HALF:
        pt_in = pt_in.half()
        g = g.half()
        cuda_out = cuda_out.half()

    apex._C.weight_norm_fwd(cuda_out, cuda_norms, pt_in, g, dim)
    torch.cuda.synchronize()
    # quit()

    print("type(cuda_out) = {}\n".format(type(cuda_out)))

    rownorms = pt_norm(pt_in, dim)
    rownorms_fp32 = pt_norm(pt_in_fp32, dim)

    print("rownorms_fp32:")
    print(rownorms_fp32)
    print("cuda_norms")
    print(cuda_norms)

    # rownorms is broadcast; torch.div(pt_in, rownorms) and pt_in/rownorms work the same way
    pt_out = pt_in * (g / rownorms)
    pt_out_control = pt_in_fp32 * (g_fp32 / rownorms_fp32)

    compare(cuda_out, pt_out, pt_out_control, rows)
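compare() prints raw max differences and leaves the pass/fail judgment to whoever reads the output. One possible complement (hypothetical, not part of this commit) is an explicit tolerance check against the fp32 control, with thresholds loose enough for fp16:

    # Hypothetical tolerance check to pair with compare(); thresholds are illustrative.
    import torch

    def assert_close(result, control, rtol=1e-3, atol=1e-3):
        """Raise if `result` (fp16 or fp32) strays from the fp32 `control` beyond the bounds."""
        diff = (result.float() - control).abs()
        bound = atol + rtol * control.abs()
        if not bool((diff <= bound).all()):
            raise AssertionError("max abs diff {:.3e} exceeds tolerance".format(float(diff.max())))

    # e.g., after the kernel call in the loop above:
    # assert_close(cuda_out, pt_out_control)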