OpenDAS / apex
Commit fb7d4e1d
Authored May 25, 2018 by Michael Carilli

    Fleshed out Cuda version checking and compiling for multiple arches

Parent: d17a015f
Showing 5 changed files with 66 additions and 8 deletions.
apex/fp16_utils/fp16_optimizer.py    +1  -0
csrc/interface.cpp                   +4  -0
csrc/scale_cuda.cu                   +4  -0
setup.py                             +52 -3
tests/raw_ops/test_autograd.py       +5  -5
apex/fp16_utils/fp16_optimizer.py (view file @ fb7d4e1d)

@@ -49,6 +49,7 @@ class FP16_Module(nn.Module):
     def forward(self, *inputs, **kwargs):
         return fp16_to_fp32(self.module(*(fp32_to_fp16(inputs)), **kwargs))

+# TODO: Update overflow check + downscale to use Carl's fused kernel.
 class FP16_Optimizer(object):
     """
     :class:`FP16_Optimizer` is designed to wrap an existing PyTorch optimizer,
...
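For context, a minimal usage sketch (not part of the commit) of the wrapper whose forward() appears above: incoming fp32 tensors are cast to fp16 for the wrapped network and the result is cast back to fp32. The import path and constructor behavior are assumptions based on this file, not something the hunk itself shows.

# Illustrative only: assumes FP16_Module is importable from apex.fp16_utils
# and that its constructor simply wraps the given network.
import torch
import torch.nn as nn
from apex.fp16_utils import FP16_Module

net = FP16_Module(nn.Linear(4096, 4096).cuda().half())  # explicit .half() is harmless either way

x = torch.randn(8, 4096, device="cuda")  # fp32 input
y = net(x)                               # forward(): fp32_to_fp16 on inputs, module runs in fp16
print(y.dtype)                           # torch.float32, via fp16_to_fp32 on the output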
csrc/interface.cpp (view file @ fb7d4e1d)

@@ -61,6 +61,10 @@ void scale_check_overflow
 {
   AT_CHECK(grads.type().is_cuda(), "x must be a CUDA tensor");
   AT_CHECK(overflow_buf.type().is_cuda(), "y must be a CUDA tensor");
+  // Make sure we are downscaling the FP32 master grads
+  AT_CHECK(grads.type().scalarType() == at::ScalarType::Float,
+           "grads supplied to scale_check_overflow should be fp32 (master grads).");
   scale_check_overflow_cuda(grads, scale, overflow_buf);
 }
...
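To make the new check concrete, here is a rough pure-PyTorch equivalent of what this entry point dispatches to: downscale the fp32 master grads and flag an inf/nan overflow. It illustrates the semantics only; the helper name is made up and this is not the code behind the CUDA kernel.

# Illustrative reference, not the fused kernel.
import torch

def scale_check_overflow_reference(grads, scale, overflow_buf):
    # Mirror the new AT_CHECK: only fp32 master grads should be downscaled.
    assert grads.is_cuda and grads.dtype == torch.float32
    grads.mul_(scale)                    # downscale in place
    if not torch.isfinite(grads).all():  # inf/nan => overflow during the fp16 backward
        overflow_buf.fill_(1)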
csrc/scale_cuda.cu (view file @ fb7d4e1d)

@@ -13,6 +13,10 @@
+// It makes sense to lock the type to "float" here because the downscaling
+// should only be applied to the FP32 master gradients. Also, if "in" were
+// a different type, it would require divergent code for the vectorized load logic.
 // TODO:
 // Update overflow check to use reduction from kernel_utils.cuh with
 // ReduceOp from THCTensorMathReduce.cuh.
 __global__ void scale_reduce_overflow(float* in,
                                       size_t n,
...
setup.py (view file @ fb7d4e1d)

 import torch.cuda
+import os
+import re
+import subprocess
 from setuptools import setup, find_packages
 from distutils.command.clean import clean
 from torch.utils.cpp_extension import CppExtension, CUDAExtension
...
@@ -6,7 +9,53 @@ from torch.utils.cpp_extension import CUDA_HOME

 # TODO: multiple modules, so we don't have to route all interfaces through
 # the same interface.cpp file?
-if torch.cuda.is_available() and CUDA_HOME is not None:
+
+if not torch.cuda.is_available():
+    print("Warning: Torch did not find available GPUs on this system.\n",
+          "If your intention is to cross-compile, this is not an error.")
+
+def find(path, regex_func, collect=False):
+    collection = [] if collect else None
+    for root, dirs, files in os.walk(path):
+        for file in files:
+            if regex_func(file):
+                if collect:
+                    collection.append(os.path.join(root, file))
+                else:
+                    return os.path.join(root, file)
+    return list(set(collection))
+
+def get_cuda_version():
+    NVCC = find(CUDA_HOME + os.sep + "bin", re.compile('nvcc$').search)
+    print("Found NVCC = ", NVCC)
+
+    # Parse output of nvcc to get cuda major version
+    nvcc_output = subprocess.check_output([NVCC, '--version']).decode("utf-8")
+    CUDA_LIB = re.compile(', V[0-9]+\.[0-9]+\.[0-9]+').search(nvcc_output).group(0).split('V')[1]
+    print("Found CUDA_LIB = ", CUDA_LIB)
+
+    CUDA_MAJOR_VERSION = int(CUDA_LIB.split('.')[0])
+    print("Found CUDA_MAJOR_VERSION = ", CUDA_MAJOR_VERSION)
+
+    if CUDA_MAJOR_VERSION < 8:
+        raise RuntimeError("Apex requires CUDA 8.0 or newer")
+
+    return CUDA_MAJOR_VERSION
+
+if CUDA_HOME is not None:
+    print("Found CUDA_HOME = ", CUDA_HOME)
+    CUDA_MAJOR_VERSION = get_cuda_version()
+
+    gencodes = ['-gencode', 'arch=compute_52,code=sm_52',
+                '-gencode', 'arch=compute_60,code=sm_60',
+                '-gencode', 'arch=compute_61,code=sm_61',]
+
+    if CUDA_MAJOR_VERSION > 8:
+        gencodes += ['-gencode', 'arch=compute_70,code=sm_70',
+                     '-gencode', 'arch=compute_70,code=compute_70',]
+
     ext_modules = []
     extension = CUDAExtension(
         'apex._C', [
...
@@ -16,10 +65,10 @@ if torch.cuda.is_available() and CUDA_HOME is not None:
             'csrc/scale_cuda.cu',
         ],
         extra_compile_args={'cxx': ['-g'],
-                            'nvcc': ['-O2', '-arch=sm_70']})  # TODO: compile for all arches.
+                            'nvcc': ['-O3'] + gencodes})
     ext_modules.append(extension)
 else:
-    raise RuntimeError("Apex requires Cuda 9.0 or higher")
+    raise RuntimeError("Could not find Cuda install directory")

 setup(
     name='apex',
...
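For readers who want to experiment with the new version/arch logic outside of setup.py, here is a standalone sketch that isolates the nvcc major-version parsing and the gencode selection added above. The parsing regex and flag lists come straight from the diff; the sample nvcc banner string is illustrative, not captured output.

# Standalone sketch of the version parsing and arch selection from the diff above.
import re

def cuda_major_from_nvcc_output(nvcc_output):
    # Matches version strings such as ", V9.0.176" in `nvcc --version` output.
    version = re.search(r', V[0-9]+\.[0-9]+\.[0-9]+', nvcc_output).group(0).split('V')[1]
    return int(version.split('.')[0])

def gencode_flags(cuda_major):
    flags = ['-gencode', 'arch=compute_52,code=sm_52',
             '-gencode', 'arch=compute_60,code=sm_60',
             '-gencode', 'arch=compute_61,code=sm_61']
    if cuda_major > 8:
        # CUDA 9+ can target Volta (sm_70) and embed compute_70 PTX for forward compatibility.
        flags += ['-gencode', 'arch=compute_70,code=sm_70',
                  '-gencode', 'arch=compute_70,code=compute_70']
    return flags

sample_banner = "Cuda compilation tools, release 9.0, V9.0.176"  # illustrative banner
print(cuda_major_from_nvcc_output(sample_banner))  # 9
print(gencode_flags(9))                            # includes the sm_70 / compute_70 entries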
tests/raw_ops/test_autograd.py (view file @ fb7d4e1d)

@@ -9,13 +9,13 @@ torch.cuda.manual_seed(2)
 # torch.cuda.manual_seed_all(2)
 torch.set_printoptions(precision=10)

-rows = 1    # 321
-cols = 4096 # 33
-fast = 4096 # 185
+rows = 321  # 1
+cols = 33   # 4096
+fast = 185  # 4096

 dims = rows, cols, fast

-dim = 2
-CUDA_HALF = False
+dim = 0
+CUDA_HALF = True

 RAND = True
 # If false, input gradients (the result of the backward pass)
 # should be analytically zero.
...