OpenDAS / apex · Commits · 15648029

Commit 15648029
Authored Aug 26, 2019 by Michael Carilli

    Merge branch 'FDecaYed-deyuf/fused_optimizer_v2'

Parents: 880ab925, b9f0995b

Changes: 51 · Showing 20 changed files with 6329 additions and 144 deletions (+6329, -144)
README.md                                            +4     -4
apex/amp/_initialize.py                              +4     -26
apex/amp/_process_optimizer.py                       +161   -91
apex/amp/handle.py                                   +9     -11
apex/amp/scaler.py                                   +19    -12
apex/contrib/__init__.py                             +0     -0
apex/contrib/csrc/groupbn/batch_norm.cu              +331   -0
apex/contrib/csrc/groupbn/batch_norm.h               +734   -0
apex/contrib/csrc/groupbn/batch_norm_add_relu.cu     +343   -0
apex/contrib/csrc/groupbn/batch_norm_add_relu.h      +681   -0
apex/contrib/csrc/groupbn/cuda_utils.h               +20    -0
apex/contrib/csrc/groupbn/interface.cpp              +175   -0
apex/contrib/csrc/groupbn/ipc.cu                     +130   -0
apex/contrib/csrc/groupbn/nhwc_batch_norm_kernel.h   +2685  -0
apex/contrib/csrc/xentropy/interface.cpp             +52    -0
apex/contrib/csrc/xentropy/xentropy_kernel.cu        +610   -0
apex/contrib/groupbn/__init__.py                     +9     -0
apex/contrib/groupbn/batch_norm.py                   +225   -0
apex/contrib/test/test_label_smoothing.py            +128   -0
apex/contrib/xentropy/__init__.py                    +9     -0
README.md (+4 -4)

# Introduction

This repository holds NVIDIA-maintained utilities to streamline
mixed precision and distributed training in Pytorch.
Some of the code here will be included in upstream Pytorch eventually.
The intention of Apex is to make up-to-date utilities available to
users as quickly as possible.

## Full API Documentation: [https://nvidia.github.io/apex](https://nvidia.github.io/apex)

...
@@ -29,7 +29,7 @@ different flags to `amp.initialize`.

## 2. Distributed Training

`apex.parallel.DistributedDataParallel` is a module wrapper, similar to
`torch.nn.parallel.DistributedDataParallel`. It enables convenient multiprocess
distributed training, optimized for NVIDIA's NCCL communication library.

...
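For context, a minimal usage sketch of the wrapper described above (not part of this commit); it assumes a launch via torch.distributed.launch or an equivalent that sets the usual env:// variables, and the model here is a placeholder.

# Minimal sketch, assuming an env://-style multiprocess launch; the tiny model is illustrative only.
import torch
import torch.nn as nn
from apex.parallel import DistributedDataParallel as DDP

torch.distributed.init_process_group(backend="nccl", init_method="env://")
torch.cuda.set_device(torch.distributed.get_rank() % torch.cuda.device_count())

model = nn.Linear(128, 10).cuda()
# Wrap the module; gradients are allreduced over NCCL during backward.
model = DDP(model)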
apex/amp/_initialize.py (+4 -26)

...
@@ -10,7 +10,6 @@ from ._process_optimizer import _process_optimizer
 from apex.fp16_utils import convert_network
 from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
 from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
-from ..optimizers import FusedAdam
 from ..parallel import DistributedDataParallel as apex_DDP
 from ..parallel.LARC import LARC
...
@@ -124,29 +123,13 @@ def check_optimizers(optimizers):
         raise RuntimeError("An incoming optimizer is an instance of {}. ".format(bad_optim_type) +
                            "The optimizer(s) passed to amp.initialize() must be bare \n"
                            "instances of either ordinary Pytorch optimizers, or Apex fused \n"
-                           "optimizers (currently just FusedAdam, but FusedSGD will be added \n"
-                           "soon).  You should not manually wrap your optimizer in either \n"
+                           "optimizers (FusedAdam or FusedSGD). \n"
+                           "You should not manually wrap your optimizer in either \n"
                            "apex.fp16_utils.FP16_Optimizer or apex.optimizers.FP16_Optimizer. \n"
                            "amp.initialize will take care of that for you (if necessary) based \n"
                            "on the specified opt_level (and optional overridden properties).")


-def wrap_fused_adam(optimizer, properties):
-    msg = 'Currently, the usage of FusedAdam is restricted to '\
-          'amp.initialize(..., opt_level="O2", keep_batchnorm_fp32=False, '\
-          'loss_scale=float or "dynamic").  We are working on enabling more general usage.'
-
-    assert properties.master_weights is True, msg
-    assert properties.cast_model_type is torch.float16, msg
-    assert (properties.keep_batchnorm_fp32 is False or
-            properties.keep_batchnorm_fp32 is None), msg
-
-    if properties.loss_scale == "dynamic":
-        return FP16_Optimizer_for_fused(optimizer, dynamic_loss_scale=True)
-    else:
-        return FP16_Optimizer_for_fused(optimizer, static_loss_scale=properties.loss_scale)
-
-
 def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None):
     from apex.parallel import DistributedDataParallel as apex_DDP
     from .amp import init as amp_init
...
@@ -176,7 +159,6 @@ def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs
     if not _amp_state.allow_incoming_model_not_fp32:
         check_params_fp32(models)

     # In the future, when FP16_Optimizer can be deprecated and master weights can
     # become an attribute, remember to stash master weights before casting the model.
...
@@ -207,7 +189,7 @@ def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs
             model.forward = patch_forward(model.forward)

         # State dict trick to recast any preexisting per-param state tensors
         for optimizer in optimizers:
             optimizer.load_state_dict(optimizer.state_dict())
     elif cast_model_outputs is not None:
...
@@ -223,11 +205,7 @@ def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs
             model.forward = patch_forward(model.forward)

     for i, optimizer in enumerate(optimizers):
-        # Still need to special case this for the first pass
-        if isinstance(optimizer, FusedAdam):
-            optimizers[i] = wrap_fused_adam(optimizer, properties)
-        else:
-            optimizers[i] = _process_optimizer(optimizer, properties)
+        optimizers[i] = _process_optimizer(optimizer, properties)

     _amp_state.loss_scalers = []
     for _ in range(num_losses):
...
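A short sketch of the calling pattern implied by the updated check_optimizers message and the simplified loop above: fused optimizers are now passed to amp.initialize bare, exactly like torch.optim optimizers, with no manual FP16_Optimizer wrapping. The model, learning rate, and opt_level below are illustrative, not prescribed by this commit.

# Illustrative only: FusedAdam flows through _process_optimizer like any other optimizer.
import torch
from apex import amp
from apex.optimizers import FusedAdam

model = torch.nn.Linear(64, 64).cuda()
optimizer = FusedAdam(model.parameters(), lr=1e-3)

# No manual apex.optimizers.FP16_Optimizer wrapping; amp handles casting and scaling.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2")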
apex/amp/_process_optimizer.py (+161 -91)

...
@@ -3,6 +3,7 @@ from ..fp16_utils import master_params_to_model_params
 from ..multi_tensor_apply import multi_tensor_applier
 from ._amp_state import maybe_print
 import torch
+from ..optimizers import FusedSGD


 class AmpOptimizerState(object):
...
@@ -10,6 +11,20 @@ class AmpOptimizerState(object):
     pass


+def _master_params_to_model_params(self):
+    stash = self._amp_stash
+    if multi_tensor_applier.available:
+        if len(stash.all_fp16_params) > 0:
+            multi_tensor_applier(
+                stash.multi_tensor_scale,
+                stash.dummy_overflow_buf,
+                [stash.all_fp32_from_fp16_params, stash.all_fp16_params],
+                1.0)
+    else:
+        for fp16_group, fp32_from_fp16_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups):
+            master_params_to_model_params(fp16_group, fp32_from_fp16_group)
+
+
 def lazy_init_with_master_weights(self):
     stash = self._amp_stash
     stash.fp16_groups = []
...
@@ -60,6 +75,8 @@ def lazy_init_with_master_weights(self):
     for group in stash.fp32_from_fp32_groups:
         stash.all_fp32_from_fp32_params += group

+    # all_fp16_grad_stash is only needed for fused optimizers.
+    stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
     # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
     stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params]
...
@@ -73,15 +90,55 @@ def lazy_init_with_master_weights(self):
     self.load_state_dict(self.state_dict())


+def post_backward_models_are_masters(scaler, params, stashed_grads, scale_override=None):
+    grads_have_scale, stashed_have_scale, out_scale = scaler.loss_scale(), 1.0, 1.0
+    if scale_override is not None:
+        grads_have_scale, stashed_have_scale, out_scale = scale_override
+
+    # This is a lot of python overhead...
+    grads_needing_unscale = []
+    grads_needing_unscale_with_stash = []
+    stashed = []
+    for param, stashed_grad in zip(params, stashed_grads):
+        if param.grad is None and stashed_grad is not None:
+            param.grad = stashed_grad
+        elif param.grad is not None and stashed_grad is None:
+            grads_needing_unscale.append(param.grad)
+        elif param.grad is not None and stashed_grad is not None:
+            grads_needing_unscale_with_stash.append(param.grad)
+            stashed.append(stashed_grad)
+        else:  # param.grad is None and stashed_grad is None
+            continue
+
+    # unscale() implements grads*(1/scale), so "scale" should be grads_have_scale/out_scale.
+    if len(grads_needing_unscale) > 0:
+        scaler.unscale(
+            grads_needing_unscale,
+            grads_needing_unscale,
+            None,  # unused_scale, currently present to avoid API breakage elsewhere
+            models_are_masters=True,
+            scale_override=grads_have_scale/out_scale)
+
+    if len(grads_needing_unscale_with_stash) > 0:
+        scaler.unscale_with_stashed(
+            grads_needing_unscale_with_stash,
+            stashed,
+            grads_needing_unscale_with_stash,
+            scale_override=(grads_have_scale, stashed_have_scale, out_scale))
+
+    # Clear the stash.
+    for i in range(len(stashed_grads)):
+        stashed_grads[i] = None
+
+
 def prepare_backward_with_master_weights(self):
     stash = self._amp_stash

-    if not stash.lazy_init_called:
-        self._lazy_init_maybe_master_weights()
-        stash.lazy_init_called = True
+    self._amp_lazy_init()

     for i, param in enumerate(stash.all_fp16_params):
-        # Set up to leverage grad copy elision:
+        # Set up to leverage grad copy elision.
+        # This may behave differently from an unpatched optimizer if zero_grad is used and the param is unused.
         param.grad = None

     # for i, param in enumerate(stash.all_fp32_from_fp16_params):
...
@@ -96,6 +153,8 @@ def prepare_backward_with_master_weights(self):
 def post_backward_with_master_weights(self, scaler):
     stash = self._amp_stash

+    self._amp_lazy_init()
+
     # This is a lot of python overhead...
     fp16_grads_needing_unscale = []
     new_fp32_grads = []
...
@@ -129,37 +188,10 @@ def post_backward_with_master_weights(self, scaler):
                                     preexisting_fp32_grads)

     # fp32 params can be treated as they would be in the "no_master_weights" case.
-    grads_needing_unscale = []
-    grads_needing_unscale_with_stash = []
-    stashed = []
-    for param, stashed_grad in zip(stash.all_fp32_from_fp32_params,
-                                   stash.all_fp32_from_fp32_grad_stash):
-        if param.grad is None and stashed_grad is not None:
-            param.grad = stashed_grad
-        elif param.grad is not None and stashed_grad is None:
-            grads_needing_unscale.append(param.grad)
-        elif param.grad is not None and stashed_grad is not None:
-            grads_needing_unscale_with_stash.append(param.grad)
-            stashed.append(stashed_grad)
-        else:  # param.grad is None and stashed_grad is None:
-            continue
-
-    if len(grads_needing_unscale) > 0:
-        scaler.unscale(
-            grads_needing_unscale,
-            grads_needing_unscale,
-            scaler.loss_scale(),
-            models_are_masters=True)
-
-    if len(grads_needing_unscale_with_stash) > 0:
-        scaler.unscale_with_stashed(
-            grads_needing_unscale_with_stash,
-            stashed,
-            grads_needing_unscale_with_stash)
-
-    # Clear the stash.
-    for i in range(len(stash.all_fp32_from_fp32_grad_stash)):
-        stash.all_fp32_from_fp32_grad_stash[i] = None
+    post_backward_models_are_masters(
+        scaler,
+        stash.all_fp32_from_fp32_params,
+        stash.all_fp32_from_fp32_grad_stash)


 def lazy_init_no_master_weights(self):
...
@@ -184,9 +216,7 @@ def lazy_init_no_master_weights(self):
 def prepare_backward_no_master_weights(self):
     stash = self._amp_stash

-    if not stash.lazy_init_called:
-        self._lazy_init_maybe_master_weights()
-        stash.lazy_init_called = True
+    self._amp_lazy_init()

     for i, param in enumerate(stash.all_fp16_params):
         stash.all_fp16_grad_stash[i] = param.grad
...
@@ -202,55 +232,82 @@ def prepare_backward_no_master_weights(self):
 def post_backward_no_master_weights(self, scaler):
     stash = self._amp_stash

+    self._amp_lazy_init()
+
     split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
                    (stash.all_fp32_params, stash.all_fp32_grad_stash))

     for params, stashed_grads in split_types:
-        # This is a lot of python overhead...
-        grads_needing_unscale = []
-        grads_needing_unscale_with_stash = []
-        stashed = []
-        for param, stashed_grad in zip(params, stashed_grads):
-            if param.grad is None and stashed_grad is not None:
-                param.grad = stashed_grad
-            elif param.grad is not None and stashed_grad is None:
-                grads_needing_unscale.append(param.grad)
-            elif param.grad is not None and stashed_grad is not None:
-                grads_needing_unscale_with_stash.append(param.grad)
-                stashed.append(stashed_grad)
-            else:  # param.grad is None and stashed_grad is None
-                continue
-
-        if len(grads_needing_unscale) > 0:
-            scaler.unscale(
-                grads_needing_unscale,
-                grads_needing_unscale,
-                scaler.loss_scale(),
-                models_are_masters=True)
-
-        if len(grads_needing_unscale_with_stash) > 0:
-            scaler.unscale_with_stashed(
-                grads_needing_unscale_with_stash,
-                stashed,
-                grads_needing_unscale_with_stash)
-
-        # Clear the stash.
-        for i in range(len(stashed_grads)):
-            stashed_grads[i] = None
-
-
-def _master_params_to_model_params(self):
-    stash = self._amp_stash
-    if multi_tensor_applier.available:
-        if len(stash.all_fp16_params) > 0:
-            multi_tensor_applier(
-                stash.multi_tensor_scale,
-                stash.dummy_overflow_buf,
-                [stash.all_fp32_from_fp16_params, stash.all_fp16_params],
-                1.0)
-    else:
-        for fp16_group, fp32_from_fp16_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups):
-            master_params_to_model_params(fp16_group, fp32_from_fp16_group)
+        post_backward_models_are_masters(scaler, params, stashed_grads)
+
+
+#####################################################################################
+# FusedSGD versions
+#####################################################################################
+
+# FusedSGD never explicitly materializes the fp32 gradients for "fp32 from fp16" master params
+# outside the kernel, so we must accumulate directly into the model grads.
+def prepare_backward_with_master_weights_FusedSGD(self):
+    if self.materialize_master_grads:
+        prepare_backward_with_master_weights(self)
+    else:
+        stash = self._amp_stash
+
+        self._amp_lazy_init()
+
+        for i, param in enumerate(stash.all_fp16_params):
+            stash.all_fp16_grad_stash[i] = param.grad
+            # Set up to leverage grad copy elision:
+            param.grad = None
+
+        for i, param in enumerate(stash.all_fp32_from_fp32_params):
+            stash.all_fp32_from_fp32_grad_stash[i] = param.grad
+            # Set up to leverage grad copy elision:
+            param.grad = None
+
+
+def post_backward_with_master_weights_FusedSGD(self, scaler):
+    if self.materialize_master_grads:
+        post_backward_with_master_weights(self, scaler)
+    else:
+        stash = self._amp_stash
+
+        self._amp_lazy_init()
+
+        grads_have_scale = scaler.loss_scale()
+        stashed_have_scale = self.most_recent_scale
+        out_scale = grads_have_scale
+        if self.scale_set_by_backward:
+            out_scale = min(grads_have_scale, self.most_recent_scale)
+
+        split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
+                       (stash.all_fp32_from_fp32_params, stash.all_fp32_from_fp32_grad_stash))
+
+        # unscale_with_stashed() implements grads*1/scale + stashed_grads*1.
+        # stashed_grads are scaled by self.most_recent_scale.
+        for params, stashed_grads in split_types:
+            post_backward_models_are_masters(
+                scaler, params, stashed_grads,
+                (grads_have_scale, stashed_have_scale, out_scale))

+        self.most_recent_scale = out_scale
+        self.scale_set_by_backward = True
+
+
+def prepare_backward_no_master_weights_FusedSGD(self):
+    prepare_backward_no_master_weights(self)
+
+
+def post_backward_no_master_weights_FusedSGD(self, scaler):
+    post_backward_no_master_weights(self, scaler)
+
+
+def _amp_lazy_init(self):
+    stash = self._amp_stash
+
+    if not stash.lazy_init_called:
+        self._lazy_init_maybe_master_weights()
+        stash.lazy_init_called = True


 def _process_optimizer(optimizer, properties):
...
@@ -266,7 +323,8 @@ def _process_optimizer(optimizer, properties):
     for name in ("_lazy_init_maybe_master_weights",
                  "_master_params_to_model_params",
                  "_prepare_amp_backward",
-                 "_post_amp_backward"):
+                 "_post_amp_backward",
+                 "_amp_lazy_init"):
         if hasattr(optimizer, name):
             raise RuntimeError("Incoming optimizer already has {} defined.".format(name))
...
@@ -274,6 +332,7 @@ def _process_optimizer(optimizer, properties):
     if multi_tensor_applier.available:
         import amp_C
         optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale
+        optimizer._amp_stash.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
         optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0]);

     if properties.master_weights:
...
@@ -288,7 +347,8 @@ def _process_optimizer(optimizer, properties):
             if closure is not None:
                 raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
             retval = old_step()
-            self._master_params_to_model_params()
+            if not isinstance(self, FusedSGD):
+                self._master_params_to_model_params()
             # Clear the master grads that wouldn't be zeroed by model.zero_grad()
             for param in self._amp_stash.all_fp32_from_fp16_params:
                 param.grad = None
...
@@ -298,9 +358,7 @@ def _process_optimizer(optimizer, properties):
         old_zero_grad = optimizer.zero_grad
         def new_zero_grad(self):
             stash = self._amp_stash
-            if not stash.lazy_init_called:
-                self._lazy_init_maybe_master_weights()
-                stash.lazy_init_called = True
+            self._amp_lazy_init()
             # Zero the model grads.
             for param in stash.all_fp16_params:
                 if param.grad is not None:
...
@@ -315,20 +373,32 @@ def _process_optimizer(optimizer, properties):
                     param.grad = None
         optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)

-        optimizer._prepare_amp_backward = types.MethodType(
-            prepare_backward_with_master_weights, optimizer)
-
-        optimizer._post_amp_backward = types.MethodType(
-            post_backward_with_master_weights, optimizer)
+        if isinstance(optimizer, FusedSGD):
+            optimizer._prepare_amp_backward = types.MethodType(
+                prepare_backward_with_master_weights_FusedSGD, optimizer)
+            optimizer._post_amp_backward = types.MethodType(
+                post_backward_with_master_weights_FusedSGD, optimizer)
+        else:
+            optimizer._prepare_amp_backward = types.MethodType(
+                prepare_backward_with_master_weights, optimizer)
+            optimizer._post_amp_backward = types.MethodType(
+                post_backward_with_master_weights, optimizer)
     else:
         optimizer._lazy_init_maybe_master_weights = types.MethodType(
             lazy_init_no_master_weights, optimizer)

-        optimizer._prepare_amp_backward = types.MethodType(
-            prepare_backward_no_master_weights, optimizer)
-
-        optimizer._post_amp_backward = types.MethodType(
-            post_backward_no_master_weights, optimizer)
+        if isinstance(optimizer, FusedSGD):
+            optimizer._prepare_amp_backward = types.MethodType(
+                prepare_backward_no_master_weights_FusedSGD, optimizer)
+            optimizer._post_amp_backward = types.MethodType(
+                post_backward_no_master_weights_FusedSGD, optimizer)
+        else:
+            optimizer._prepare_amp_backward = types.MethodType(
+                prepare_backward_no_master_weights, optimizer)
+            optimizer._post_amp_backward = types.MethodType(
+                post_backward_no_master_weights, optimizer)
+
+    optimizer._amp_lazy_init = types.MethodType(_amp_lazy_init, optimizer)

     old_add_param_group = optimizer.add_param_group
...
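A pure-Python illustration (not part of the diff, and simplified to the accumulation case) of the scale bookkeeping used by post_backward_with_master_weights_FusedSGD above: gradients from the current backward carry grads_have_scale, previously stashed gradients carry stashed_have_scale, and both are rescaled so the accumulated result carries out_scale.

# Illustrative only; the real code picks out_scale = grads_have_scale unless a
# previous backward already set a scale, in which case it takes the min of the two.
def accumulate_scaled(grad, stashed, grads_have_scale, stashed_have_scale):
    out_scale = min(grads_have_scale, stashed_have_scale)
    a = out_scale / grads_have_scale      # factor applied to the incoming grads
    b = out_scale / stashed_have_scale    # factor applied to the stashed grads
    return a * grad + b * stashed, out_scale

# Example: loss scale 1024 for the new grads, 512 for the stash.
acc, out = accumulate_scaled(grad=2048.0, stashed=512.0,
                             grads_have_scale=1024.0, stashed_have_scale=512.0)
# acc == 2048*0.5 + 512*1.0 == 1536.0, carried at out_scale == 512.0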
apex/amp/handle.py (+9 -11)

...
@@ -6,8 +6,6 @@ from . import utils
 from .opt import OptimWrapper
 from .scaler import LossScaler
 from ._amp_state import _amp_state, master_params, maybe_print
-from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
-from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
 from ..parallel.LARC import LARC
...
@@ -89,13 +87,8 @@ def scale_loss(loss,
     if isinstance(optimizers, torch.optim.Optimizer) or isinstance(optimizers, LARC):
         optimizers = [optimizers]

-    # this is what happens when i have to support tools from different sources under the same API...
-    # TODO: Rewrite FusedAdam to use multi-tensor apply and the same loss scaler.
-    if isinstance(optimizers, FP16_Optimizer_for_fused):
-        loss_scale = optimizers.cur_scale
-    else:
-        loss_scaler = _amp_state.loss_scalers[loss_id]
-        loss_scale = loss_scaler.loss_scale()
+    loss_scaler = _amp_state.loss_scalers[loss_id]
+    loss_scale = loss_scaler.loss_scale()

     if ((not _amp_state.opt_properties.master_weights)
         and (not loss_scaler.dynamic)
...
@@ -120,8 +113,8 @@ def scale_loss(loss,
         for optimizer in optimizers:
             optimizer._amp_stash.params_have_scaled_gradients = True
     else:
-        # FusedAdam and FusedSGD will take care of unscaling as part of their step() methods.
-        if not isinstance(optimizers, FP16_Optimizer_for_fused):
+        # FusedSGD may take care of unscaling as part of their step() methods.
+        # if not isinstance(optimizers, FP16_Optimizer_for_fused):
             loss_scaler.clear_overflow_state()
             for optimizer in optimizers:
                 optimizer._post_amp_backward(loss_scaler)
...
@@ -142,10 +135,15 @@ def scale_loss(loss,
                 maybe_print(("Gradient overflow.  Skipping step, loss scaler " +
                              "{} reducing loss scale to {}").format(loss_id,
                              loss_scaler.loss_scale()))
+                # TODO:  I don't like the special casing for different optimizer implementations.
+                # Maybe skip should delegate to a method owned by the optimizers themselves.
                 if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"):
                     # Clear the master grads that wouldn't be zeroed by model.zero_grad()
                     for param in opt._amp_stash.all_fp32_from_fp16_params:
                         param.grad = None
+                if hasattr(opt, "most_recent_scale"):
+                    opt.most_recent_scale = 1.0
+                    opt.scale_set_by_backward = False
                 opt.step = opt_step
                 opt._amp_stash.already_patched = False
     return skip_step
...
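For context, a minimal sketch of the standard Amp training step in which the scale_loss code above runs (this usage is standard Amp, not introduced by this commit); the tiny model and random batch are placeholders.

import torch
from apex import amp

model = torch.nn.Linear(16, 1).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")

loss = model(torch.randn(8, 16, device="cuda")).sum()
with amp.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()   # gradients are produced at the current loss scale
optimizer.step()             # on context exit, _post_amp_backward has already unscaled them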
apex/amp/scaler.py (+19 -12)

...
@@ -16,7 +16,7 @@ def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=F
         master_grad.mul_(scale)
     return False

-def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, scale, check_overflow=False):
+def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, check_overflow=False):
     # Exception handling for 18.04 compatibility
     if check_overflow:
         cpu_sum = float(model_grad.float().sum())
...
@@ -26,9 +26,8 @@ def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, scale, ch
     # if master_grad is not model_grad: # copy_ probably internally short-circuits this
     # master_grad.copy_(model_grad)
     assert stashed_grad.dtype == master_grad.dtype
-    converted_model_grad = model_grad.to(master_grad.dtype)
-    stashed_grad.add_(scale, converted_model_grad)
-    master_grad.data = stashed_grad.data
+    converted_model_grad = model_grad.data.to(master_grad.dtype)
+    master_grad.data = a*converted_model_grad.data + b*stashed_grad.data
     return False

 class LossScaler(object):
...
@@ -92,11 +91,13 @@ class LossScaler(object):
                     break

     # unused_scale keeps some of the old API alive for hopefully a short time.
-    def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False):
+    def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False, scale_override=None):
         if self._has_overflow:
             return

         scale = self._loss_scale
+        if scale_override is not None:
+            scale = scale_override

         if scale == 1.0 and models_are_masters and not self.dynamic:
             return
...
@@ -126,7 +127,8 @@ class LossScaler(object):
                                       model_grads,
                                       stashed_master_grads,
                                       master_grads,
-                                      scale):
+                                      a,
+                                      b):
         for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads):
             if model is None and stashed is None:
                 continue
...
@@ -141,7 +143,8 @@ class LossScaler(object):
                 self._has_overflow = axpby_check_overflow_python(model,
                                                                  stashed,
                                                                  master,
-                                                                 1./scale,
+                                                                 a,
+                                                                 b,
                                                                  self.dynamic)
             if self._has_overflow and self.dynamic:
                 break
...
@@ -149,11 +152,14 @@ class LossScaler(object):
     def unscale_with_stashed(self,
                              model_grads,
                              stashed_master_grads,
-                             master_grads):
+                             master_grads,
+                             scale_override=None):
         if self._has_overflow:
             return

-        scale = self._loss_scale
+        grads_have_scale, stashed_have_scale, out_scale = self._loss_scale, 1.0, 1.0
+        if scale_override is not None:
+            grads_have_scale, stashed_have_scale, out_scale = scale_override

         if LossScaler.has_fused_kernel:
             if (not LossScaler.warned_unscaling_non_fp32_grad
...
@@ -167,14 +173,15 @@ class LossScaler(object):
                 multi_tensor_applier(LossScaler.multi_tensor_axpby_cuda,
                                      self._overflow_buf,
                                      [model_grads, stashed_master_grads, master_grads],
-                                     1./scale,
-                                     1.0,
+                                     out_scale/grads_have_scale,    # 1./scale,
+                                     out_scale/stashed_have_scale,  # 1.0,
                                      0) # check only arg 0, aka the incoming model grads, for infs
             else:
                 self.unscale_with_stashed_python(model_grads,
                                                  stashed_master_grads,
                                                  master_grads,
-                                                 scale)
+                                                 out_scale/grads_have_scale,
+                                                 out_scale/stashed_have_scale)

         # Defer to update_scale
         # If the fused kernel is available, we only need one D2H memcopy and sync.
...
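A hedged re-statement, in plain PyTorch, of the axpby semantics the updated scaler relies on (both the fused multi_tensor_axpby kernel and the Python fallback): master = a*model + b*stashed with a = out_scale/grads_have_scale and b = out_scale/stashed_have_scale, and only the incoming model grads checked for infs/nans. This helper is illustrative, not part of the diff.

import torch

def axpby_unscale(model_grads, stashed_grads, master_grads,
                  grads_have_scale, stashed_have_scale, out_scale):
    a = out_scale / grads_have_scale
    b = out_scale / stashed_have_scale
    overflow = False
    for model, stashed, master in zip(model_grads, stashed_grads, master_grads):
        # Check only the incoming model grads for infs/nans, as the kernel does.
        overflow |= not torch.isfinite(model).all().item()
        master.data = a * model.data.to(master.dtype) + b * stashed.data.to(master.dtype)
    return overflow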
tests/L0/run_mixed_adam/__init__.py → apex/contrib/__init__.py  (file moved)
apex/contrib/csrc/groupbn/batch_norm.cu  (new file, 0 → 100644, +331)

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include "THC/THC.h"

#include "batch_norm.h"

#include <cuda.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

static size_t round_up_to_multiple(size_t x, int multiple) {
  return ((x + multiple - 1) / multiple) * multiple;
}

// TODO: Stop manually allocating CUDA memory; allocate an ATen byte
// tensor instead.
struct Workspace {
  Workspace(size_t size) : size(size), data(NULL) {
    data = THCudaMalloc(at::globalContext().lazyInitCUDA(), size);
  }
  Workspace(const Workspace&) = delete;
  Workspace(Workspace&&) = default;
  Workspace& operator=(Workspace&&) = default;
  ~Workspace() {
    if (data) {
      THCudaFree(at::globalContext().lazyInitCUDA(), data);
    }
  }

  size_t size;
  void* data;
};

// Return {y}
at::Tensor nhwc_bn_fwd_train(
                       const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                       const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
                       const at::Tensor& ret_cta,
                       const float momentum, const float epsilon, const bool fuse_relu,
                       void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                       const int bn_group, const at::Tensor& magic_tensor,
                       const int occupancy, const int grid_dim_x, const bool coop) {

  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // generating new magic number and use that for sync
  int* magic = magic_tensor.data<int>();
  *magic = (*magic + 1) & 0xff;

  // Allocate output tensor
  at::Tensor y = at::empty({N, H, W, C}, x.options());

  // Create wrapper
  NhwcBatchNorm *bn = new NhwcBatchNorm();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data<at::Half>(), nullptr, y.data<at::Half>(), nullptr);

  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void *> workspace;
  workspace.push_back(minibatch_mean.data<float>());
  workspace.push_back(minibatch_inv_var.data<float>());

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[2];
  void* retired_ctas = ret_cta.data<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    void *ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index-3];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  // Don't fuse in ReLU for now at least
  bn->fwd(stream, fuse_relu, my_data, pair_data, pair_data2, pair_data3,
          bn_group, *magic, occupancy, grid_dim_x, coop);

  return y;
}

at::Tensor nhwc_bn_fwd_eval(
                       const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                       const at::Tensor& ret_cta, const int bn_group,
                       const float momentum, const float epsilon, const bool fuse_relu) {

  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // Allocate output tensor
  at::Tensor y = at::empty({N, H, W, C}, x.options());

  // Create wrapper
  NhwcBatchNorm *bn = new NhwcBatchNorm();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data<at::Half>(), nullptr, y.data<at::Half>(), nullptr);

  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void *> workspace;
  workspace.push_back(nullptr);
  workspace.push_back(nullptr);

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[2];
  void* retired_ctas = ret_cta.data<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    void *ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index-3];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  // Don't fuse in ReLU for now at least
  bn->fwdInference(stream, fuse_relu);

  return y;
}

std::vector<at::Tensor> nhwc_bn_bwd(
                       const at::Tensor& x, const at::Tensor& dy,
                       const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                       const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
                       const at::Tensor& ret_cta,
                       const float momentum, const float epsilon, const bool fuse_relu,
                       void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                       const int bn_group, const at::Tensor& magic_tensor,
                       const int occupancy, const int grid_dim_x, const bool coop) {
  // shape
  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // generating new magic number and use that for sync
  int* magic = magic_tensor.data<int>();
  *magic = (*magic + 1) & 0xff;

  // outputs
  at::Tensor x_grad, scale_grad, bias_grad;

  // Allocate outputs
  x_grad = at::empty_like(x);
  scale_grad = at::empty_like(scale);
  bias_grad = at::empty_like(bias);

  // Create wrapper
  NhwcBatchNorm *bn = new NhwcBatchNorm();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data<at::Half>(), x_grad.data<at::Half>(),
                             nullptr, dy.data<at::Half>());

  bn->setWeightPointers({scale.data<float>(), bias.data<float>()},
                        {scale_grad.data<float>(), bias_grad.data<float>()});
  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void *> workspace;
  workspace.push_back(minibatch_mean.data<float>());
  workspace.push_back(minibatch_inv_var.data<float>());

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[2];
  void* retired_ctas = ret_cta.data<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    void *ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index-3];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  bn->dgrad(stream, fuse_relu, my_data, pair_data, pair_data2, pair_data3,
            bn_group, *magic, occupancy, grid_dim_x, coop);

  return std::vector<at::Tensor>{x_grad, scale_grad, bias_grad};
}

int nhwc_bn_fwd_occupancy() {
  int device_id=-1;
  cudaGetDevice(&device_id);

  //max occupancy supported by the code is 2
  return NhwcBatchNorm::smem_driven_fwd_occupancy(device_id, 2);
}

int nhwc_bn_bwd_occupancy() {
  int device_id=-1;
  cudaGetDevice(&device_id);

  //max occupancy supported by the code is 2
  return NhwcBatchNorm::smem_driven_bwd_occupancy(device_id, 2);
}
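A small Python sketch (illustrative only, names are ad hoc) of the workspace packing scheme used in nhwc_bn_fwd_train and its siblings above: each extra workspace is placed at the next 512-byte-aligned offset inside one big allocation.

def round_up_to_multiple(x, multiple):
    return ((x + multiple - 1) // multiple) * multiple

def pack_workspaces(sizes_bytes, alignment=512):
    # Returns the aligned offset of each workspace and the total allocation size.
    offsets, total = [], 0
    for size in sizes_bytes:
        total = round_up_to_multiple(total, alignment)
        offsets.append(total)
        total += size
    return offsets, total

# Example: three workspaces of 100, 600 and 48 bytes
# -> offsets [0, 512, 1536], total 1584 bytes.
print(pack_workspaces([100, 600, 48]))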
apex/contrib/csrc/groupbn/batch_norm.h  (new file, 0 → 100644, +734) — this diff is collapsed on the page; contents not shown.
apex/contrib/csrc/groupbn/batch_norm_add_relu.cu  (new file, 0 → 100644, +343)

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include "THC/THC.h"

#include "batch_norm_add_relu.h"

#include <cuda.h>

//FIXME move the common stuff to common h file
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

static size_t round_up_to_multiple(size_t x, int multiple) {
  return ((x + multiple - 1) / multiple) * multiple;
}

// TODO: Stop manually allocating CUDA memory; allocate an ATen byte
// tensor instead.
struct Workspace {
  Workspace(size_t size) : size(size), data(NULL) {
    data = THCudaMalloc(at::globalContext().lazyInitCUDA(), size);
  }
  Workspace(const Workspace&) = delete;
  Workspace(Workspace&&) = default;
  Workspace& operator=(Workspace&&) = default;
  ~Workspace() {
    if (data) {
      THCudaFree(at::globalContext().lazyInitCUDA(), data);
    }
  }

  size_t size;
  void* data;
};

// Return {y}
at::Tensor nhwc_bn_addrelu_fwd_train(
                       const at::Tensor& x, const at::Tensor& z,
                       const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                       const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
                       const at::Tensor& bitmask, const at::Tensor& ret_cta,
                       const float momentum, const float epsilon,
                       void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                       const int bn_group, const at::Tensor& magic_tensor,
                       const int occupancy, const int grid_dim_x, const bool coop) {

  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // generating new magic number and use that for sync
  int* magic = magic_tensor.data<int>();
  *magic = (*magic + 1) & 0xff;

  // Allocate output tensor
  at::Tensor y = at::empty({N, H, W, C}, x.options());

  // Create wrapper
  NhwcBatchNormAddRelu *bn = new NhwcBatchNormAddRelu();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data<at::Half>(), nullptr,
                             y.data<at::Half>(), nullptr,
                             z.data<at::Half>(), nullptr);

  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void *> workspace;
  workspace.push_back(minibatch_mean.data<float>());
  workspace.push_back(minibatch_inv_var.data<float>());
  workspace.push_back(bitmask.data<int32_t>());

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[3];
  void* retired_ctas = ret_cta.data<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    void *ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index-4];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  // Don't fuse in ReLU for now at least
  bn->fwd(stream, my_data, pair_data, pair_data2, pair_data3,
          bn_group, *magic, occupancy, grid_dim_x, coop);

  return y;
}

at::Tensor nhwc_bn_addrelu_fwd_eval(
                       const at::Tensor& x, const at::Tensor& z,
                       const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                       const at::Tensor& ret_cta, const int bn_group,
                       const float momentum, const float epsilon) {

  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // Allocate output tensor
  at::Tensor y = at::empty({N, H, W, C}, x.options());

  // Create wrapper
  NhwcBatchNormAddRelu *bn = new NhwcBatchNormAddRelu();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data<at::Half>(), nullptr,
                             y.data<at::Half>(), nullptr,
                             z.data<at::Half>(), nullptr);

  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void *> workspace;
  workspace.push_back(nullptr);
  workspace.push_back(nullptr);
  workspace.push_back(nullptr);

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[3];
  void* retired_ctas = ret_cta.data<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    void *ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index-4];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  // Don't fuse in ReLU for now at least
  bn->fwdInference(stream);

  return y;
}

std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
                       const at::Tensor& x, const at::Tensor& dy,
                       const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                       const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
                       const at::Tensor& bitmask, const at::Tensor& ret_cta,
                       const float momentum, const float epsilon,
                       void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                       const int bn_group, const at::Tensor& magic_tensor,
                       const int occupancy, const int grid_dim_x, const bool coop) {
  // shape
  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // generating new magic number and use that for sync
  int* magic = magic_tensor.data<int>();
  *magic = (*magic + 1) & 0xff;

  // outputs
  at::Tensor x_grad, z_grad, scale_grad, bias_grad;

  // Allocate outputs
  x_grad = at::empty_like(x);
  z_grad = at::empty_like(x);
  scale_grad = at::empty_like(scale);
  bias_grad = at::empty_like(bias);

  // Create wrapper
  NhwcBatchNormAddRelu *bn = new NhwcBatchNormAddRelu();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data<at::Half>(), x_grad.data<at::Half>(),
                             nullptr, dy.data<at::Half>(),
                             nullptr, z_grad.data<at::Half>());

  bn->setWeightPointers({scale.data<float>(), bias.data<float>()},
                        {scale_grad.data<float>(), bias_grad.data<float>()});
  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void *> workspace;
  workspace.push_back(minibatch_mean.data<float>());
  workspace.push_back(minibatch_inv_var.data<float>());
  workspace.push_back(bitmask.data<int32_t>());

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[3];
  void* retired_ctas = ret_cta.data<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    void *ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index-4];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  bn->dgrad(stream, my_data, pair_data, pair_data2, pair_data3,
            bn_group, *magic, occupancy, grid_dim_x, coop);

  return std::vector<at::Tensor>{x_grad, z_grad, scale_grad, bias_grad};
}

int nhwc_bn_addrelu_fwd_occupancy() {
  int device_id=-1;
  cudaGetDevice(&device_id);

  //max occupancy supported by the code is 2
  return NhwcBatchNormAddRelu::smem_driven_fwd_occupancy(device_id, 2);
}

int nhwc_bn_addrelu_bwd_occupancy() {
  int device_id=-1;
  cudaGetDevice(&device_id);

  //max occupancy supported by the code is 2
  return NhwcBatchNormAddRelu::smem_driven_bwd_occupancy(device_id, 2);
}
apex/contrib/csrc/groupbn/batch_norm_add_relu.h  (new file, 0 → 100644, +681) — this diff is collapsed on the page; contents not shown.
apex/contrib/csrc/groupbn/cuda_utils.h  (new file, 0 → 100644, +20)

#include <ATen/cuda/CUDAContext.h>

#ifndef CUDA_UTILS_H
#define CUDA_UTILS_H

namespace at {
namespace cuda {

namespace utils {

static inline int MaxSharedMemoryPerMultiprocessor(int device_id) {
  return getDeviceProperties(device_id)->sharedMemPerMultiprocessor;
}

}
}
}

#endif
apex/contrib/csrc/groupbn/interface.cpp
0 → 100644
View file @
15648029
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>

#include <torch/extension.h>

#include <ATen/ATen.h>
#include <ATen/ArrayRef.h>
#include <ATen/ScalarType.h>
#include "ATen/Scalar.h"
#ifndef VERSION_GE_1_1
#include "ATen/Type.h"
#endif
#include "ATen/Tensor.h"
#include "ATen/Storage.h"
#include "ATen/Generator.h"

namespace py = pybind11;

int64_t get_buffer_size(const int bn_sync_steps);

void* get_data_ptr(const at::Tensor& data);

void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset);

void close_remote_data(const at::Tensor& handle);

at::Tensor nhwc_bn_fwd_train(
    const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_inv_var,
    const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
    const at::Tensor& ret_cta,
    const float momentum, const float epsilon, const bool fuse_relu,
    void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
    const int bn_group, const at::Tensor& magic_tensor,
    const int occupancy, const int grid_dim_x, const bool coop);

at::Tensor nhwc_bn_fwd_eval(
    const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_inv_var,
    const at::Tensor& ret_cta,
    const int bn_group, const float momentum, const float epsilon,
    const bool fuse_relu);

std::vector<at::Tensor> nhwc_bn_bwd(
    const at::Tensor& x, const at::Tensor& dy,
    const at::Tensor& scale, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_inv_var,
    const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
    const at::Tensor& ret_cta,
    const float momentum, const float epsilon, const bool fuse_relu,
    void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
    const int bn_group, const at::Tensor& magic_tensor,
    const int occupancy, const int grid_dim_x, const bool coop);

at::Tensor nhwc_bn_addrelu_fwd_train(
    const at::Tensor& x, const at::Tensor& z,
    const at::Tensor& scale, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_inv_var,
    const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
    const at::Tensor& bitmask, const at::Tensor& ret_cta,
    const float momentum, const float epsilon,
    void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
    const int bn_group, const at::Tensor& magic_tensor,
    const int occupancy, const int grid_dim_x, const bool coop);

at::Tensor nhwc_bn_addrelu_fwd_eval(
    const at::Tensor& x, const at::Tensor& z,
    const at::Tensor& scale, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_inv_var,
    const at::Tensor& ret_cta,
    const int bn_group, const float momentum, const float epsilon);

std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
    const at::Tensor& x, const at::Tensor& dy,
    const at::Tensor& scale, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_inv_var,
    const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
    const at::Tensor& bitmask, const at::Tensor& ret_cta,
    const float momentum, const float epsilon,
    void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
    const int bn_group, const at::Tensor& magic_tensor,
    const int occupancy, const int grid_dim_x, const bool coop);

int nhwc_bn_fwd_occupancy();
int nhwc_bn_bwd_occupancy();

int nhwc_bn_addrelu_fwd_occupancy();
int nhwc_bn_addrelu_bwd_occupancy();

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("get_buffer_size", &get_buffer_size, "get_buffer_size");
  m.def("get_data_ptr", &get_data_ptr, "get_data_ptr");
  m.def("get_remote_data_ptr", &get_remote_data_ptr, "get_remote_data_ptr");
  m.def("close_remote_data", &close_remote_data, "close_remote_data");

  m.def("bn_fwd_nhwc", &nhwc_bn_fwd_train, "bn_fwd_nhwc");
  m.def("bn_fwd_eval_nhwc", &nhwc_bn_fwd_eval, "bn_fwd_eval_nhwc");
  m.def("bn_bwd_nhwc", &nhwc_bn_bwd, "bn_bwd_nhwc");
  m.def("bn_fwd_nhwc_occupancy", &nhwc_bn_fwd_occupancy, "bn_fwd_nhwc_occupancy");
  m.def("bn_bwd_nhwc_occupancy", &nhwc_bn_bwd_occupancy, "bn_bwd_nhwc_occupancy");

  m.def("bn_addrelu_fwd_nhwc", &nhwc_bn_addrelu_fwd_train, "bn_addrelu_fwd_nhwc");
  m.def("bn_addrelu_fwd_eval_nhwc", &nhwc_bn_addrelu_fwd_eval, "bn_addrelu_fwd_eval_nhwc");
  m.def("bn_addrelu_bwd_nhwc", &nhwc_bn_addrelu_bwd, "bn_addrelu_bwd_nhwc");
  m.def("bn_addrelu_fwd_nhwc_occupancy", &nhwc_bn_addrelu_fwd_occupancy, "bn_addrelu_fwd_nhwc_occupancy");
  m.def("bn_addrelu_bwd_nhwc_occupancy", &nhwc_bn_addrelu_bwd_occupancy, "bn_addrelu_bwd_nhwc_occupancy");
}
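For orientation: this extension is imported on the Python side as bnp (see apex/contrib/groupbn/__init__.py below), and the names registered with m.def above are exactly what the Python wrapper calls. A hedged sketch of how the occupancy queries feed CTA and grid sizing, mirroring the logic in batch_norm.py further down; it assumes the bnp extension has been built and is importable:

# Illustrative only; values mirror the BatchNorm2d_NHWC defaults below.
import torch
import bnp

max_cta_per_sm = 2        # default cap used by BatchNorm2d_NHWC
cta_launch_margin = 12    # default launch margin used by BatchNorm2d_NHWC

# Kernel-reported occupancy, capped by the user-chosen CTA-per-SM limit.
fwd_occupancy = min(bnp.bn_fwd_nhwc_occupancy(), max_cta_per_sm)

# Grid width: one CTA per (SM * occupancy), minus a launch margin, never below 1.
mp_count = torch.cuda.get_device_properties(None).multi_processor_count
fwd_grid_dim_x = max(mp_count * fwd_occupancy - cta_launch_margin, 1)
print(fwd_occupancy, fwd_grid_dim_x)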
apex/contrib/csrc/groupbn/ipc.cu
0 → 100644
View file @
15648029
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include "THC/THC.h"

#include <cuda.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

template<>
struct std::hash<cudaIpcMemHandle_t> {
  size_t operator() (const cudaIpcMemHandle_t& handle) const {
    size_t hash = 0;
    uint8_t* ptr = (uint8_t*)&handle;
    assert(sizeof(uint8_t) == 1);
    for (int i = 0; i < sizeof(cudaIpcMemHandle_t); i++) {
      hash += *ptr;
      ptr++;
    }
    return hash;
  }
};

template<>
struct std::equal_to<cudaIpcMemHandle_t> {
  bool operator() (const cudaIpcMemHandle_t& lhs,
                   const cudaIpcMemHandle_t& rhs) const {
    return (std::memcmp((void*)&lhs, (void*)&rhs, sizeof(cudaIpcMemHandle_t)) == 0);
  }
};

namespace {

namespace gpuipc {
// from: src/operator/nn/cudnn/nhwc_batch_norm_kernel.h
// The number of threads per pixel.
const int THREADS_PER_PIXEL = 16;
// The number of elements per ldg.
const int ELEMENTS_PER_LDG = 4;
// The number of reducing ops, each uses its own space : mean, var, dscale, dbias
const int REDUCE_OPS = 4;
// Maximum block.y supported - limited due to buffer allocation
const int MAX_BLOCK_Y = 256;
const int MAX_OFFSET = REDUCE_OPS * MAX_BLOCK_Y;
const int BYTES_PER_ELEM = 4;
// Buffer size per sync step
const int SINGLE_SYNC_BUFFER_BYTES = MAX_OFFSET * THREADS_PER_PIXEL * 2 * ELEMENTS_PER_LDG * BYTES_PER_ELEM;
};

class IpcMemHandleRegistry {
public:
  void* getPtr(const cudaIpcMemHandle_t& handle, int64_t offset) {
    if (registry_.count(handle) == 0) {
      registry_.insert(std::make_pair(handle, RegistryEntry()));
      registry_[handle].dev_ptr = ipcOpenMem(handle);
    }
    registry_[handle].ref_count++;
    return (((uint8_t*)registry_[handle].dev_ptr) + offset);
  }

  void releasePtr(const cudaIpcMemHandle_t& handle) {
    if (registry_.count(handle) == 0) {
    }
    if (--registry_[handle].ref_count == 0) {
      ipcCloseMem(registry_[handle].dev_ptr);
      registry_.erase(handle);
    }
  }

  struct RegistryEntry {
    void* dev_ptr;
    int   ref_count;
    RegistryEntry() : dev_ptr(NULL), ref_count(0) {}
  };

protected:
  std::unordered_map<cudaIpcMemHandle_t, RegistryEntry> registry_;

  void* ipcOpenMem(const cudaIpcMemHandle_t& handle) {
    void* data;
    cudaIpcOpenMemHandle(&data, handle, cudaIpcMemLazyEnablePeerAccess);
    cudaCheckErrors("ipc init");
    return data;
  }

  void ipcCloseMem(void* dev_ptr) {
    cudaIpcCloseMemHandle(dev_ptr);
    cudaCheckErrors("ipc close");
  }
};

}

static IpcMemHandleRegistry ipc_mem_registry;

int64_t get_buffer_size(const int bn_sync_steps) {
  return bn_sync_steps * gpuipc::SINGLE_SYNC_BUFFER_BYTES;
}

void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset) {
  cudaIpcMemHandle_t my_handle;
  memcpy((unsigned char*)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
  return ipc_mem_registry.getPtr(my_handle, offset);
}

void close_remote_data(const at::Tensor& handle) {
  cudaIpcMemHandle_t my_handle;
  memcpy((unsigned char*)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
  ipc_mem_registry.releasePtr(my_handle);
}

void* get_data_ptr(const at::Tensor& data) {
  return data.data<uint8_t>();
}
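The buffer size returned by get_buffer_size is a fixed number of bytes per synchronization step, derived from the constants in the gpuipc namespace above. A quick check of that arithmetic, in plain Python mirroring those constants:

# Per-sync-step buffer size from the gpuipc constants above.
THREADS_PER_PIXEL = 16
ELEMENTS_PER_LDG = 4
REDUCE_OPS = 4            # mean, var, dscale, dbias
MAX_BLOCK_Y = 256
BYTES_PER_ELEM = 4

MAX_OFFSET = REDUCE_OPS * MAX_BLOCK_Y
SINGLE_SYNC_BUFFER_BYTES = MAX_OFFSET * THREADS_PER_PIXEL * 2 * ELEMENTS_PER_LDG * BYTES_PER_ELEM
print(SINGLE_SYNC_BUFFER_BYTES)            # 524288 bytes (512 KiB) per sync step

# get_buffer_size(bn_sync_steps) simply scales this; batch_norm.py below uses
# 1, 2, or 3 sync steps for bn_group 2, 4, or 8 respectively.
for bn_sync_steps in (1, 2, 3):
    print(bn_sync_steps, bn_sync_steps * SINGLE_SYNC_BUFFER_BYTES)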
apex/contrib/csrc/groupbn/nhwc_batch_norm_kernel.h
0 → 100644
View file @
15648029
This diff is collapsed.
apex/contrib/csrc/xentropy/interface.cpp
0 → 100644
View file @
15648029
#include <torch/extension.h>

// CUDA forward declarations

std::vector<at::Tensor> softmax_xentropy_cuda(
    const at::Tensor &input,
    const at::Tensor &labels,
    const float smoothing,
    const bool half_to_float);

at::Tensor softmax_xentropy_backward_cuda(
    const at::Tensor &grad_loss,
    const at::Tensor &logits,
    const at::Tensor &max_log_sum_exp,
    const at::Tensor &labels,
    const float smoothing);

// C++ interface

#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

std::vector<at::Tensor> softmax_xentropy_forward(
    const at::Tensor &input,
    const at::Tensor &labels,
    const float smoothing,
    const bool half_to_float) {
    CHECK_CUDA(input);
    CHECK_INPUT(labels);

    return softmax_xentropy_cuda(input, labels, smoothing, half_to_float);
}

at::Tensor softmax_xentropy_backward(
    const at::Tensor &grad_loss,
    const at::Tensor &logits,
    const at::Tensor &max_log_sum_exp,
    const at::Tensor &labels,
    const float smoothing) {
    CHECK_CUDA(grad_loss);
    CHECK_CUDA(logits);
    CHECK_INPUT(max_log_sum_exp);
    CHECK_INPUT(labels);

    return softmax_xentropy_backward_cuda(grad_loss, logits, max_log_sum_exp, labels, smoothing);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    m.def("forward", &softmax_xentropy_forward, "Softmax cross entropy loss with label smoothing forward (CUDA)");
    m.def("backward", &softmax_xentropy_backward, "Softmax cross entropy loss with label smoothing backward (CUDA)");
}
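On the Python side these bindings are wrapped by apex.contrib.xentropy.SoftmaxCrossEntropyLoss, an autograd Function exercised by the test further down. A minimal usage sketch, assuming apex was built with the --xentropy flag and a CUDA device is available; the argument order follows the test below:

# Illustrative usage of the fused softmax cross entropy with label smoothing.
import torch
from apex.contrib import xentropy as label_smoothing

N, T, H = 8, 4, 1000     # arbitrary example sizes
logits = torch.randn(N * T, H, dtype=torch.half, device='cuda', requires_grad=True)
labels = torch.randint(0, H, (N * T,), device='cuda')

loss_func = label_smoothing.SoftmaxCrossEntropyLoss.apply
half_to_float = (logits.dtype == torch.half)

# Arguments: logits, labels, smoothing, padding_idx, half_to_float.
losses = loss_func(logits, labels, 0.1, 0, half_to_float)
losses.sum().backward()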
apex/contrib/csrc/xentropy/xentropy_kernel.cu
0 → 100644
View file @
15648029
This diff is collapsed.
apex/contrib/groupbn/__init__.py
0 → 100644
View file @
15648029
try:
    import torch
    import bnp
    from .batch_norm import BatchNorm2d_NHWC
    del torch
    del bnp
    del batch_norm
except ImportError as err:
    print("apex was installed without --bnp flag, contrib.groupbn is not available")
apex/contrib/groupbn/batch_norm.py
0 → 100644
View file @
15648029
import torch
import numpy as np
from torch.nn.modules.batchnorm import _BatchNorm

import bnp


class bn_NHWC_impl(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, s, b, rm, riv, mini_m, mini_riv, ret_cta, mom, epsilon,
                fuse_relu, is_train, bn_group, my_data, pair_data, magic,
                pair_data2, pair_data3, fwd_occup, fwd_grid_x, bwd_occup,
                bwd_grid_x, multi_stream):
        if is_train:
            ctx.save_for_backward(x, s, b, rm, riv, mini_m, mini_riv)
            ctx.epsilon = epsilon
            ctx.momentum = mom
            ctx.ret_cta = ret_cta
            ctx.fuse_relu = fuse_relu
            ctx.my_data = my_data
            ctx.pair_data = pair_data
            ctx.magic = magic
            ctx.pair_data2 = pair_data2
            ctx.pair_data3 = pair_data3
            ctx.bn_group = bn_group
            ctx.bwd_occup = bwd_occup
            ctx.bwd_grid_x = bwd_grid_x
            ctx.multi_stream = multi_stream

            res = bnp.bn_fwd_nhwc(x, s, b, rm, riv, mini_m, mini_riv, ret_cta,
                                  mom, epsilon, fuse_relu,
                                  my_data, pair_data, pair_data2, pair_data3,
                                  bn_group, magic, fwd_occup, fwd_grid_x, multi_stream)
            return res
        else:
            return bnp.bn_fwd_eval_nhwc(x, s, b, rm, riv, ret_cta, bn_group,
                                        mom, epsilon, fuse_relu)

    @staticmethod
    def backward(ctx, grad_y):
        x, s, b, rm, riv, mini_m, mini_riv = ctx.saved_variables
        epsilon = ctx.epsilon
        mom = ctx.momentum
        ret_cta = ctx.ret_cta
        fuse_relu = ctx.fuse_relu
        my_data = ctx.my_data
        pair_data = ctx.pair_data
        magic = ctx.magic
        pair_data2 = ctx.pair_data2
        pair_data3 = ctx.pair_data3
        bn_group = ctx.bn_group
        bwd_occup = ctx.bwd_occup
        bwd_grid_x = ctx.bwd_grid_x
        multi_stream = ctx.multi_stream

        dx, dscale, dbias = bnp.bn_bwd_nhwc(x, grad_y, s, b, rm, riv, mini_m, mini_riv,
                                            ret_cta, mom, epsilon, fuse_relu,
                                            my_data, pair_data, pair_data2, pair_data3,
                                            bn_group, magic, bwd_occup, bwd_grid_x, multi_stream)

        return (dx, dscale, dbias, None, None, None, None, None, None, None, None,
                None, None, None, None, None, None, None, None, None, None, None, None)


class bn_addrelu_NHWC_impl(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, z, s, b, rm, riv, mini_m, mini_riv, grid_dim_y, ret_cta,
                mom, epsilon, is_train, bn_group, my_data, pair_data, magic,
                pair_data2, pair_data3, fwd_occup, fwd_grid_x, bwd_occup,
                bwd_grid_x, multi_stream):
        if is_train:
            bitmask = torch.cuda.IntTensor(((x.numel() + 31) // 32) * 2 * grid_dim_y)
            ctx.save_for_backward(x, s, b, rm, riv, mini_m, mini_riv, bitmask)
            ctx.epsilon = epsilon
            ctx.momentum = mom
            ctx.ret_cta = ret_cta
            ctx.my_data = my_data
            ctx.pair_data = pair_data
            ctx.magic = magic
            ctx.pair_data2 = pair_data2
            ctx.pair_data3 = pair_data3
            ctx.bn_group = bn_group
            ctx.bwd_occup = bwd_occup
            ctx.bwd_grid_x = bwd_grid_x
            ctx.multi_stream = multi_stream

            res = bnp.bn_addrelu_fwd_nhwc(x, z, s, b, rm, riv, mini_m, mini_riv,
                                          bitmask, ret_cta, mom, epsilon,
                                          my_data, pair_data, pair_data2, pair_data3,
                                          bn_group, magic, fwd_occup, fwd_grid_x, multi_stream)
            return res
        else:
            return bnp.bn_addrelu_fwd_eval_nhwc(x, z, s, b, rm, riv, ret_cta,
                                                bn_group, mom, epsilon)

    @staticmethod
    def backward(ctx, grad_y):
        x, s, b, rm, riv, mini_m, mini_riv, bitmask = ctx.saved_variables
        epsilon = ctx.epsilon
        mom = ctx.momentum
        ret_cta = ctx.ret_cta
        my_data = ctx.my_data
        pair_data = ctx.pair_data
        magic = ctx.magic
        pair_data2 = ctx.pair_data2
        pair_data3 = ctx.pair_data3
        bn_group = ctx.bn_group
        bwd_occup = ctx.bwd_occup
        bwd_grid_x = ctx.bwd_grid_x
        multi_stream = ctx.multi_stream

        dx, dz, dscale, dbias = bnp.bn_addrelu_bwd_nhwc(x, grad_y, s, b, rm, riv,
                                                        mini_m, mini_riv, bitmask, ret_cta,
                                                        mom, epsilon,
                                                        my_data, pair_data, pair_data2, pair_data3,
                                                        bn_group, magic, bwd_occup, bwd_grid_x, multi_stream)

        return (dx, dz, dscale, dbias, None, None, None, None, None, None, None, None,
                None, None, None, None, None, None, None, None, None, None, None, None)


class BatchNorm2d_NHWC(_BatchNorm):
    # if using BatchNorm2d_NHWC simultaneously with multiple streams set multi_stream to True
    def __init__(self, num_features, fuse_relu=False, bn_group=1,
                 max_cta_per_sm=2, cta_launch_margin=12, multi_stream=False):
        super(BatchNorm2d_NHWC, self).__init__(num_features)

        self.fuse_relu = fuse_relu
        self.multi_stream = multi_stream

        self.minibatch_mean = torch.cuda.FloatTensor(num_features)
        self.minibatch_riv = torch.cuda.FloatTensor(num_features)

        # default to distributed bn disabled
        self.bn_group = bn_group
        self.max_cta_per_sm = max_cta_per_sm          # used only in training fwd and bwd
        self.cta_launch_margin = cta_launch_margin    # used only in training fwd and bwd
        self.my_data = None
        self.pair_data = None
        self.pair_data2 = None
        self.pair_data3 = None
        self.local_rank = 0
        self.magic = torch.IntTensor([0])

        # calculate cta per sm occupancies
        assert(max_cta_per_sm > 0)  # won't be able to do much with 0 CTAs :)
        self.fwd_occupancy = min(bnp.bn_fwd_nhwc_occupancy(), max_cta_per_sm)
        self.bwd_occupancy = min(bnp.bn_bwd_nhwc_occupancy(), max_cta_per_sm)
        self.addrelu_fwd_occupancy = min(bnp.bn_addrelu_fwd_nhwc_occupancy(), max_cta_per_sm)
        self.addrelu_bwd_occupancy = min(bnp.bn_addrelu_bwd_nhwc_occupancy(), max_cta_per_sm)

        # calculate grid dimensions based on occupancy numbers
        mp_count = torch.cuda.get_device_properties(None).multi_processor_count
        self.fwd_grid_dim_x = max(mp_count * self.fwd_occupancy - cta_launch_margin, 1)
        self.bwd_grid_dim_x = max(mp_count * self.bwd_occupancy - cta_launch_margin, 1)
        self.addrelu_fwd_grid_dim_x = max(mp_count * self.addrelu_fwd_occupancy - cta_launch_margin, 1)
        self.addrelu_bwd_grid_dim_x = max(mp_count * self.addrelu_bwd_occupancy - cta_launch_margin, 1)
        self.grid_dim_y = (num_features + 63) // 64

        # allocate scratch space used by implementation
        # TODO: scratch space that is not supposed to be exposed at user code. We only need one time initialization, the
        # same buffer could be reused in future iterations. Currently we exposed it here instead of requesting new
        # buffer from cache allocator to avoid unnecessary initialization at future iterations.
        self.ret_cta = torch.cuda.ByteTensor(8192).fill_(0)

        # FIXME: turn pair handles into an array
        if bn_group > 1:
            local_rank = torch.distributed.get_rank()
            world_size = torch.distributed.get_world_size()
            assert(world_size >= bn_group)
            assert(world_size % bn_group == 0)

            bn_sync_steps = 1
            if (bn_group == 4):
                bn_sync_steps = 2
            if (bn_group == 8):
                bn_sync_steps = 3

            self.ipc_buffer = torch.cuda.ByteTensor(bnp.get_buffer_size(bn_sync_steps))
            self.my_data = bnp.get_data_ptr(self.ipc_buffer)
            # we are walking on very thin ice here by utilizing internal `_share_cuda_()`
            self.storage = self.ipc_buffer.storage()
            self.share_cuda = self.storage._share_cuda_()
            internal_cuda_mem = self.share_cuda
            # internal_cuda_mem[1]: ipc_mem_handle
            my_handle = torch.cuda.ByteTensor(np.frombuffer(internal_cuda_mem[1], dtype=np.uint8))
            # internal_cuda_mem[3]: offset
            my_offset = torch.cuda.IntTensor([internal_cuda_mem[3]])

            handles_all = torch.empty(world_size, my_handle.size(0),
                                      dtype=my_handle.dtype, device=my_handle.device)
            handles_l = list(handles_all.unbind(0))
            torch.distributed.all_gather(handles_l, my_handle)

            offsets_all = torch.empty(world_size, my_offset.size(0),
                                      dtype=my_offset.dtype, device=my_offset.device)
            offsets_l = list(offsets_all.unbind(0))
            torch.distributed.all_gather(offsets_l, my_offset)

            # whom do I actually care about? that would be local_rank XOR 1
            self.pair_handle = handles_l[local_rank ^ 1].cpu().contiguous()
            pair_offset = offsets_l[local_rank ^ 1].cpu()
            self.pair_data = bnp.get_remote_data_ptr(self.pair_handle, pair_offset)

            if bn_group > 2:
                self.pair_handle2 = handles_l[local_rank ^ 2].cpu().contiguous()
                pair_offset2 = offsets_l[local_rank ^ 2].cpu()
                self.pair_data2 = bnp.get_remote_data_ptr(self.pair_handle2, pair_offset2)

            if bn_group > 4:
                self.pair_handle3 = handles_l[local_rank ^ 4].cpu().contiguous()
                pair_offset3 = offsets_l[local_rank ^ 4].cpu()
                self.pair_data3 = bnp.get_remote_data_ptr(self.pair_handle3, pair_offset3)

            # FIXME: get magic value into C code and eliminate from here
            self.magic = torch.IntTensor([2])
            self.local_rank = local_rank

    def forward(self, x, z=None):
        if z is not None:
            assert(self.fuse_relu == True)
            return bn_addrelu_NHWC_impl.apply(x, z,
                                              self.weight, self.bias,
                                              self.running_mean, self.running_var,
                                              self.minibatch_mean, self.minibatch_riv,
                                              self.grid_dim_y, self.ret_cta,
                                              self.momentum, self.eps,
                                              self.training, self.bn_group,
                                              self.my_data, self.pair_data, self.magic,
                                              self.pair_data2, self.pair_data3,
                                              self.addrelu_fwd_occupancy, self.addrelu_fwd_grid_dim_x,
                                              self.addrelu_bwd_occupancy, self.addrelu_bwd_grid_dim_x,
                                              self.multi_stream)
        else:
            return bn_NHWC_impl.apply(x,
                                      self.weight, self.bias,
                                      self.running_mean, self.running_var,
                                      self.minibatch_mean, self.minibatch_riv,
                                      self.ret_cta,
                                      self.momentum, self.eps, self.fuse_relu,
                                      self.training, self.bn_group,
                                      self.my_data, self.pair_data, self.magic,
                                      self.pair_data2, self.pair_data3,
                                      self.fwd_occupancy, self.fwd_grid_dim_x,
                                      self.bwd_occupancy, self.bwd_grid_dim_x,
                                      self.multi_stream)

    def __del__(self):
        if self.bn_group > 1:
            bnp.close_remote_data(self.pair_handle)
            if self.bn_group > 2:
                bnp.close_remote_data(self.pair_handle2)
                if self.bn_group > 4:
                    bnp.close_remote_data(self.pair_handle3)
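For reference, a hedged usage sketch of BatchNorm2d_NHWC as defined above: it expects NHWC half-precision activations (the kernels operate on at::Half data), and the optional second argument z is the residual added before the fused ReLU, which requires fuse_relu=True. This is illustrative only; it assumes apex was built with --bnp and a CUDA device is available.

# Illustrative usage of the NHWC batch norm module (single-GPU, bn_group=1).
import torch
from apex.contrib.groupbn import BatchNorm2d_NHWC

C = 64
bn = BatchNorm2d_NHWC(C, fuse_relu=True).cuda()   # bn_group=1: no cross-GPU sync

x = torch.randn(8, 32, 32, C, dtype=torch.half, device='cuda')  # NHWC layout
z = torch.randn_like(x)                                         # residual input

y = bn(x)       # BN (+ ReLU, since fuse_relu=True)
y2 = bn(x, z)   # BN + residual add + ReLU (uses the addrelu kernels)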
apex/contrib/test/test_label_smoothing.py
0 → 100644
View file @
15648029
import torch
from apex.contrib import xentropy as label_smoothing
import unittest

import warnings
import random
import numpy as np
import time


def label_smoothing_raw(x, target, padding_idx, smoothing):
    logprobs = torch.nn.functional.log_softmax(x, dim=-1, dtype=torch.float32)

    non_pad_mask = (target != padding_idx)
    nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
    nll_loss = nll_loss.squeeze(1)[non_pad_mask]
    smooth_loss = -logprobs.mean(dim=-1)[non_pad_mask]
    loss = (1.0 - smoothing) * nll_loss + smoothing * smooth_loss
    return loss


def label_smoothing_opt_1(x, target, padding_idx, smoothing):
    logprobs = torch.nn.functional.log_softmax(x, dim=-1, dtype=torch.float32)

    pad_mask = (target == padding_idx)
    ll_loss = logprobs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
    smooth_loss = logprobs.mean(dim=-1)
    loss = (smoothing - 1.0) * ll_loss - smoothing * smooth_loss
    loss.masked_fill_(pad_mask, 0)
    return loss


class LabelSmoothingTest(unittest.TestCase):
    def setUp(self, seed=1234):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        # Set pytorch print precision
        torch.set_printoptions(precision=10)

    def gen_test_inputs(self, N, T, H, smoothing, padding_idx):
        logits = torch.randn((N * T, H), dtype=torch.half, device='cuda',
                             requires_grad=True)
        labels = torch.randint(0, H, [N * T], device='cuda')
        for i in random.sample(range(N * T), N * T // 6):
            labels[i] = padding_idx
        half_to_float = (logits.dtype == torch.half)

        return logits, labels, half_to_float

    def print_max_diff_elem(self, ref, tst):
        ref, tst = ref.flatten(), tst.flatten()
        diff = (ref - tst).abs().max()
        idx = (ref - tst).abs().argmax()
        print("Max atol idx: {}, diff: {:.6f}, ref: {:.6f}, tst: {:.6f}".format(
            idx, diff, ref[idx], tst[idx]))

    def test_label_smoothing_function(self):
        # Set label smoothing configuration
        smoothing, padding_idx = 0.1, 0
        N, T, H = 128, 74, 32320
        iters = 10
        loss_func = label_smoothing.SoftmaxCrossEntropyLoss.apply

        for i in range(iters):
            logits, labels, half_to_float = self.gen_test_inputs(
                N, T, H, smoothing, padding_idx)

            # Run original softmax cross entropy with label smoothing
            logits.grad = None
            losses = label_smoothing_raw(logits, labels, padding_idx, smoothing)
            loss = losses.sum()
            loss.backward()

            ref_loss = loss.clone().detach()
            ref_grad = logits.grad.clone().detach()

            # Run optimized softmax cross entropy with label smoothing
            logits.grad = None
            losses = loss_func(logits, labels, smoothing, padding_idx, half_to_float)
            loss = losses.sum()
            loss.backward()

            val_loss = loss.clone().detach()
            val_grad = logits.grad.clone().detach()

            # Validate
            self.print_max_diff_elem(ref_grad, val_grad)
            self.assertTrue(torch.allclose(ref_loss, val_loss, atol=1e-5, rtol=1e-5))
            self.assertTrue(torch.allclose(ref_grad, val_grad, atol=1e-5, rtol=1e-5))

    def test_label_smoothing_perf(self):
        # Set label smoothing configuration
        smoothing, padding_idx = 0.1, 0
        N, T, H = 128, 74, 32320
        iters = 1000
        loss_func = label_smoothing.SoftmaxCrossEntropyLoss.apply
        print()

        logits, labels, half_to_float = self.gen_test_inputs(
            N, T, H, smoothing, padding_idx)

        # Run original softmax cross entropy with label smoothing
        torch.cuda.synchronize()
        ts = time.time()
        for i in range(iters):
            logits.grad = None
            losses = label_smoothing_raw(logits, labels, padding_idx, smoothing)
            loss = losses.sum() / N
            loss.backward()
        torch.cuda.synchronize()
        print("Raw time {:.2f} s elapsed for {} iterations, norm {:.4f}".format(
            time.time() - ts, iters, logits.grad.norm()))

        # Run optimized softmax cross entropy with label smoothing
        torch.cuda.synchronize()
        ts = time.time()
        for i in range(iters):
            logits.grad = None
            losses = loss_func(logits, labels, smoothing, padding_idx, half_to_float)
            loss = losses.sum() / N
            loss.backward()
        torch.cuda.synchronize()
        print("Opt time {:.2f} s elapsed for {} iterations, norm {:.4f}".format(
            time.time() - ts, iters, logits.grad.norm()))


if __name__ == '__main__':
    unittest.main()
apex/contrib/xentropy/__init__.py
0 → 100644
View file @
15648029
try:
    import torch
    import xentropy_cuda
    from .softmax_xentropy import SoftmaxCrossEntropyLoss
    del torch
    del xentropy_cuda
    del softmax_xentropy
except ImportError as err:
    print("apex was installed without --xentropy flag, contrib.xentropy is not available")