OpenDAS / apex / Commits / 15648029

Commit 15648029, authored Aug 26, 2019 by Michael Carilli
Merge branch 'FDecaYed-deyuf/fused_optimizer_v2'
Parents: 880ab925 b9f0995b
Changes: 51

Showing 20 changed files with 6329 additions and 144 deletions (+6329, -144)
README.md                                              +4     -4
apex/amp/_initialize.py                                +4     -26
apex/amp/_process_optimizer.py                         +161   -91
apex/amp/handle.py                                     +9     -11
apex/amp/scaler.py                                     +19    -12
apex/contrib/__init__.py                               +0     -0
apex/contrib/csrc/groupbn/batch_norm.cu                +331   -0
apex/contrib/csrc/groupbn/batch_norm.h                 +734   -0
apex/contrib/csrc/groupbn/batch_norm_add_relu.cu       +343   -0
apex/contrib/csrc/groupbn/batch_norm_add_relu.h        +681   -0
apex/contrib/csrc/groupbn/cuda_utils.h                 +20    -0
apex/contrib/csrc/groupbn/interface.cpp                +175   -0
apex/contrib/csrc/groupbn/ipc.cu                       +130   -0
apex/contrib/csrc/groupbn/nhwc_batch_norm_kernel.h     +2685  -0
apex/contrib/csrc/xentropy/interface.cpp               +52    -0
apex/contrib/csrc/xentropy/xentropy_kernel.cu          +610   -0
apex/contrib/groupbn/__init__.py                       +9     -0
apex/contrib/groupbn/batch_norm.py                     +225   -0
apex/contrib/test/test_label_smoothing.py              +128   -0
apex/contrib/xentropy/__init__.py                      +9     -0
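The bulk of the Python-side changes teach amp to treat the fused optimizers (FusedAdam, and the newly supported FusedSGD) like ordinary optimizers instead of wrapping them in apex.optimizers.FP16_Optimizer. A minimal usage sketch of that workflow follows; it assumes an apex build with CUDA extensions, and the model, data, and hyperparameters are placeholders rather than anything taken from this commit.

# Hedged sketch of the workflow these changes target (placeholder model/data).
import torch
from apex import amp
from apex.optimizers import FusedAdam

model = torch.nn.Linear(1024, 1024).cuda()
optimizer = FusedAdam(model.parameters(), lr=1e-3)

# After this merge, a fused optimizer is passed to amp.initialize() bare,
# exactly like a torch.optim optimizer; amp patches it via _process_optimizer.
model, optimizer = amp.initialize(model, optimizer, opt_level="O2", loss_scale="dynamic")

for data in [torch.randn(8, 1024, device="cuda") for _ in range(3)]:
    loss = model(data).sum()
    optimizer.zero_grad()
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    optimizer.step()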
README.md
apex/amp/_initialize.py

@@ -10,7 +10,6 @@ from ._process_optimizer import _process_optimizer
 from apex.fp16_utils import convert_network
 from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
 from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
-from ..optimizers import FusedAdam
 from ..parallel import DistributedDataParallel as apex_DDP
 from ..parallel.LARC import LARC

@@ -124,29 +123,13 @@ def check_optimizers(optimizers):
         raise RuntimeError("An incoming optimizer is an instance of {}. ".format(bad_optim_type) +
                            "The optimizer(s) passed to amp.initialize() must be bare \n"
                            "instances of either ordinary Pytorch optimizers, or Apex fused \n"
-                           "optimizers (currently just FusedAdam, but FusedSGD will be added \n"
-                           "soon).  You should not manually wrap your optimizer in either \n"
+                           "optimizers (FusedAdam or FusedSGD). \n"
+                           "You should not manually wrap your optimizer in either \n"
                            "apex.fp16_utils.FP16_Optimizer or apex.optimizers.FP16_Optimizer. \n"
                            "amp.initialize will take care of that for you (if necessary) based \n"
                            "on the specified opt_level (and optional overridden properties).")


-def wrap_fused_adam(optimizer, properties):
-    msg = 'Currently, the usage of FusedAdam is restricted to '\
-          'amp.initialize(..., opt_level="O2", keep_batchnorm_fp32=False, '\
-          'loss_scale=float or "dynamic"). We are working on enabling more general usage.'
-
-    assert properties.master_weights is True, msg
-    assert properties.cast_model_type is torch.float16, msg
-    assert (properties.keep_batchnorm_fp32 is False or
-            properties.keep_batchnorm_fp32 is None), msg
-
-    if properties.loss_scale == "dynamic":
-        return FP16_Optimizer_for_fused(optimizer, dynamic_loss_scale=True)
-    else:
-        return FP16_Optimizer_for_fused(optimizer, static_loss_scale=properties.loss_scale)
-
-
 def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs=None):
     from apex.parallel import DistributedDataParallel as apex_DDP
     from .amp import init as amp_init

@@ -176,7 +159,6 @@ def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs
     if not _amp_state.allow_incoming_model_not_fp32:
         check_params_fp32(models)

     # In the future, when FP16_Optimizer can be deprecated and master weights can
     # become an attribute, remember to stash master weights before casting the model.

@@ -223,10 +205,6 @@ def _initialize(models, optimizers, properties, num_losses=1, cast_model_outputs
         model.forward = patch_forward(model.forward)

     for i, optimizer in enumerate(optimizers):
-        # Still need to special case this for the first pass
-        if isinstance(optimizer, FusedAdam):
-            optimizers[i] = wrap_fused_adam(optimizer, properties)
-        else:
-            optimizers[i] = _process_optimizer(optimizer, properties)
+        optimizers[i] = _process_optimizer(optimizer, properties)

     _amp_state.loss_scalers = []
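With wrap_fused_adam gone, every optimizer handed to amp.initialize() goes through _process_optimizer, and the error message above spells out the contract: pass optimizers bare. A small hedged sketch of what that looks like from user code; an ordinary torch.optim optimizer is used here purely as an example.

import torch
from apex import amp

model = torch.nn.Linear(16, 16).cuda()
opt = torch.optim.SGD(model.parameters(), lr=0.1)   # a bare optimizer: accepted
model, opt = amp.initialize(model, opt, opt_level="O1")
# Wrapping `opt` in apex.fp16_utils.FP16_Optimizer yourself and passing the wrapper
# instead would trigger the RuntimeError raised in check_optimizers above.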
apex/amp/_process_optimizer.py

@@ -3,6 +3,7 @@ from ..fp16_utils import master_params_to_model_params
 from ..multi_tensor_apply import multi_tensor_applier
 from ._amp_state import maybe_print
 import torch
+from ..optimizers import FusedSGD


 class AmpOptimizerState(object):

@@ -10,6 +11,20 @@ class AmpOptimizerState(object):
     pass


+def _master_params_to_model_params(self):
+    stash = self._amp_stash
+    if multi_tensor_applier.available:
+        if len(stash.all_fp16_params) > 0:
+            multi_tensor_applier(
+                stash.multi_tensor_scale,
+                stash.dummy_overflow_buf,
+                [stash.all_fp32_from_fp16_params, stash.all_fp16_params],
+                1.0)
+    else:
+        for fp16_group, fp32_from_fp16_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups):
+            master_params_to_model_params(fp16_group, fp32_from_fp16_group)
+
+
 def lazy_init_with_master_weights(self):
     stash = self._amp_stash
     stash.fp16_groups = []

@@ -60,6 +75,8 @@ def lazy_init_with_master_weights(self):
     for group in stash.fp32_from_fp32_groups:
         stash.all_fp32_from_fp32_params += group

+    # all_fp16_grad_stash is only needed for fused optimizers.
+    stash.all_fp16_grad_stash = [None for _ in stash.all_fp16_params]
     # stash.all_fp32_from_fp16_grad_stash = [None for _ in stash.all_fp32_from_fp16_params]
     stash.all_fp32_from_fp32_grad_stash = [None for _ in stash.all_fp32_from_fp32_params]

@@ -73,15 +90,55 @@ def lazy_init_with_master_weights(self):
     self.load_state_dict(self.state_dict())


+def post_backward_models_are_masters(scaler, params, stashed_grads, scale_override=None):
+    grads_have_scale, stashed_have_scale, out_scale = scaler.loss_scale(), 1.0, 1.0
+
+    if scale_override is not None:
+        grads_have_scale, stashed_have_scale, out_scale = scale_override
+
+    # This is a lot of python overhead...
+    grads_needing_unscale = []
+    grads_needing_unscale_with_stash = []
+    stashed = []
+    for param, stashed_grad in zip(params, stashed_grads):
+        if param.grad is None and stashed_grad is not None:
+            param.grad = stashed_grad
+        elif param.grad is not None and stashed_grad is None:
+            grads_needing_unscale.append(param.grad)
+        elif param.grad is not None and stashed_grad is not None:
+            grads_needing_unscale_with_stash.append(param.grad)
+            stashed.append(stashed_grad)
+        else:  # param.grad is None and stashed_grad is None
+            continue
+
+    # unscale() implements grads*(1/scale), so "scale" should be grads_have_scale/out_scale.
+    if len(grads_needing_unscale) > 0:
+        scaler.unscale(
+            grads_needing_unscale,
+            grads_needing_unscale,
+            None,  # unused_scale, currently present to avoid API breakage elsewhere
+            models_are_masters=True,
+            scale_override=grads_have_scale/out_scale)
+
+    if len(grads_needing_unscale_with_stash) > 0:
+        scaler.unscale_with_stashed(
+            grads_needing_unscale_with_stash,
+            stashed,
+            grads_needing_unscale_with_stash,
+            scale_override=(grads_have_scale, stashed_have_scale, out_scale))
+
+    # Clear the stash.
+    for i in range(len(stashed_grads)):
+        stashed_grads[i] = None
+
+
 def prepare_backward_with_master_weights(self):
     stash = self._amp_stash

-    if not stash.lazy_init_called:
-        self._lazy_init_maybe_master_weights()
-        stash.lazy_init_called = True
+    self._amp_lazy_init()

     for i, param in enumerate(stash.all_fp16_params):
-        # Set up to leverage grad copy elision:
+        # Set up to leverage grad copy elision.
+        # This may behave differently from an unpatched optimizer if zero_grad is used and the param is unused.
         param.grad = None

     # for i, param in enumerate(stash.all_fp32_from_fp16_params):

@@ -96,6 +153,8 @@ def prepare_backward_with_master_weights(self):
 def post_backward_with_master_weights(self, scaler):
     stash = self._amp_stash

+    self._amp_lazy_init()
+
     # This is a lot of python overhead...
     fp16_grads_needing_unscale = []
     new_fp32_grads = []

@@ -129,37 +188,10 @@ def post_backward_with_master_weights(self, scaler):
                                      preexisting_fp32_grads)

     # fp32 params can be treated as they would be in the "no_master_weights" case.
-    grads_needing_unscale = []
-    grads_needing_unscale_with_stash = []
-    stashed = []
-    for param, stashed_grad in zip(stash.all_fp32_from_fp32_params,
-                                   stash.all_fp32_from_fp32_grad_stash):
-        if param.grad is None and stashed_grad is not None:
-            param.grad = stashed_grad
-        elif param.grad is not None and stashed_grad is None:
-            grads_needing_unscale.append(param.grad)
-        elif param.grad is not None and stashed_grad is not None:
-            grads_needing_unscale_with_stash.append(param.grad)
-            stashed.append(stashed_grad)
-        else:  # param.grad is None and stashed_grad is None:
-            continue
-
-    if len(grads_needing_unscale) > 0:
-        scaler.unscale(
-            grads_needing_unscale,
-            grads_needing_unscale,
-            scaler.loss_scale(),
-            models_are_masters=True)
-
-    if len(grads_needing_unscale_with_stash) > 0:
-        scaler.unscale_with_stashed(
-            grads_needing_unscale_with_stash,
-            stashed,
-            grads_needing_unscale_with_stash)
-
-    # Clear the stash.
-    for i in range(len(stash.all_fp32_from_fp32_grad_stash)):
-        stash.all_fp32_from_fp32_grad_stash[i] = None
+    post_backward_models_are_masters(
+        scaler,
+        stash.all_fp32_from_fp32_params,
+        stash.all_fp32_from_fp32_grad_stash)


 def lazy_init_no_master_weights(self):

@@ -184,9 +216,7 @@ def lazy_init_no_master_weights(self):
 def prepare_backward_no_master_weights(self):
     stash = self._amp_stash

-    if not stash.lazy_init_called:
-        self._lazy_init_maybe_master_weights()
-        stash.lazy_init_called = True
+    self._amp_lazy_init()

     for i, param in enumerate(stash.all_fp16_params):
         stash.all_fp16_grad_stash[i] = param.grad

@@ -202,55 +232,82 @@ def prepare_backward_no_master_weights(self):
 def post_backward_no_master_weights(self, scaler):
     stash = self._amp_stash

+    self._amp_lazy_init()
+
     split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
                    (stash.all_fp32_params, stash.all_fp32_grad_stash))

     for params, stashed_grads in split_types:
-        # This is a lot of python overhead...
-        grads_needing_unscale = []
-        grads_needing_unscale_with_stash = []
-        stashed = []
-        for param, stashed_grad in zip(params, stashed_grads):
-            if param.grad is None and stashed_grad is not None:
-                param.grad = stashed_grad
-            elif param.grad is not None and stashed_grad is None:
-                grads_needing_unscale.append(param.grad)
-            elif param.grad is not None and stashed_grad is not None:
-                grads_needing_unscale_with_stash.append(param.grad)
-                stashed.append(stashed_grad)
-            else:  # param.grad is None and stashed_grad is None
-                continue
-
-        if len(grads_needing_unscale) > 0:
-            scaler.unscale(
-                grads_needing_unscale,
-                grads_needing_unscale,
-                scaler.loss_scale(),
-                models_are_masters=True)
-
-        if len(grads_needing_unscale_with_stash) > 0:
-            scaler.unscale_with_stashed(
-                grads_needing_unscale_with_stash,
-                stashed,
-                grads_needing_unscale_with_stash)
-
-        # Clear the stash.
-        for i in range(len(stashed_grads)):
-            stashed_grads[i] = None
-
-
-def _master_params_to_model_params(self):
-    stash = self._amp_stash
-    if multi_tensor_applier.available:
-        if len(stash.all_fp16_params) > 0:
-            multi_tensor_applier(
-                stash.multi_tensor_scale,
-                stash.dummy_overflow_buf,
-                [stash.all_fp32_from_fp16_params, stash.all_fp16_params],
-                1.0)
-    else:
-        for fp16_group, fp32_from_fp16_group in zip(stash.fp16_groups, stash.fp32_from_fp16_groups):
-            master_params_to_model_params(fp16_group, fp32_from_fp16_group)
+        post_backward_models_are_masters(scaler, params, stashed_grads)
+
+
+#####################################################################################
+# FusedSGD versions
+#####################################################################################
+
+# FusedSGD never explicitly materializes the fp32 gradients for "fp32 from fp16" master params
+# outside the kernel, so we must accumulate directly into the model grads.
+def prepare_backward_with_master_weights_FusedSGD(self):
+    if self.materialize_master_grads:
+        prepare_backward_with_master_weights(self)
+    else:
+        stash = self._amp_stash
+
+        self._amp_lazy_init()
+
+        for i, param in enumerate(stash.all_fp16_params):
+            stash.all_fp16_grad_stash[i] = param.grad
+            # Set up to leverage grad copy elision:
+            param.grad = None
+
+        for i, param in enumerate(stash.all_fp32_from_fp32_params):
+            stash.all_fp32_from_fp32_grad_stash[i] = param.grad
+            # Set up to leverage grad copy elision:
+            param.grad = None
+
+
+def post_backward_with_master_weights_FusedSGD(self, scaler):
+    if self.materialize_master_grads:
+        post_backward_with_master_weights(self, scaler)
+    else:
+        stash = self._amp_stash
+
+        self._amp_lazy_init()
+
+        grads_have_scale = scaler.loss_scale()
+        stashed_have_scale = self.most_recent_scale
+        out_scale = grads_have_scale
+        if self.scale_set_by_backward:
+            out_scale = min(grads_have_scale, self.most_recent_scale)
+
+        split_types = ((stash.all_fp16_params, stash.all_fp16_grad_stash),
+                       (stash.all_fp32_from_fp32_params, stash.all_fp32_from_fp32_grad_stash))
+
+        # unscale_with_stashed() implements grads*1/scale + stashed_grads*1.
+        # stashed_grads are scaled by self.most_recent_scale.
+        for params, stashed_grads in split_types:
+            post_backward_models_are_masters(scaler, params, stashed_grads,
+                                             (grads_have_scale, stashed_have_scale, out_scale))

+        self.most_recent_scale = out_scale
+        self.scale_set_by_backward = True
+
+
+def prepare_backward_no_master_weights_FusedSGD(self):
+    prepare_backward_no_master_weights(self)
+
+
+def post_backward_no_master_weights_FusedSGD(self, scaler):
+    post_backward_no_master_weights(self, scaler)
+
+
+def _amp_lazy_init(self):
+    stash = self._amp_stash
+
+    if not stash.lazy_init_called:
+        self._lazy_init_maybe_master_weights()
+        stash.lazy_init_called = True


 def _process_optimizer(optimizer, properties):

@@ -266,7 +323,8 @@ def _process_optimizer(optimizer, properties):
     for name in ("_lazy_init_maybe_master_weights",
                  "_master_params_to_model_params",
                  "_prepare_amp_backward",
-                 "_post_amp_backward"):
+                 "_post_amp_backward",
+                 "_amp_lazy_init"):
         if hasattr(optimizer, name):
             raise RuntimeError("Incoming optimizer already has {} defined.".format(name))

@@ -274,6 +332,7 @@ def _process_optimizer(optimizer, properties):
     if multi_tensor_applier.available:
         import amp_C
         optimizer._amp_stash.multi_tensor_scale = amp_C.multi_tensor_scale
+        optimizer._amp_stash.multi_tensor_l2norm = amp_C.multi_tensor_l2norm
         optimizer._amp_stash.dummy_overflow_buf = torch.cuda.IntTensor([0]);

     if properties.master_weights:

@@ -288,6 +347,7 @@ def _process_optimizer(optimizer, properties):
             if closure is not None:
                 raise RuntimeError("Currently, Amp does not support closure use with optimizers.")
             retval = old_step()
-            self._master_params_to_model_params()
+            if not isinstance(self, FusedSGD):
+                self._master_params_to_model_params()
             # Clear the master grads that wouldn't be zeroed by model.zero_grad()
             for param in self._amp_stash.all_fp32_from_fp16_params:

@@ -298,9 +358,7 @@ def _process_optimizer(optimizer, properties):
         old_zero_grad = optimizer.zero_grad
         def new_zero_grad(self):
             stash = self._amp_stash
-            if not stash.lazy_init_called:
-                self._lazy_init_maybe_master_weights()
-                stash.lazy_init_called = True
+            self._amp_lazy_init()
             # Zero the model grads.
             for param in stash.all_fp16_params:
                 if param.grad is not None:

@@ -315,21 +373,33 @@ def _process_optimizer(optimizer, properties):
                 param.grad = None
         optimizer.zero_grad = types.MethodType(new_zero_grad, optimizer)

-        optimizer._prepare_amp_backward = types.MethodType(
-            prepare_backward_with_master_weights, optimizer)
-        optimizer._post_amp_backward = types.MethodType(
-            post_backward_with_master_weights, optimizer)
+        if isinstance(optimizer, FusedSGD):
+            optimizer._prepare_amp_backward = types.MethodType(
+                prepare_backward_with_master_weights_FusedSGD, optimizer)
+            optimizer._post_amp_backward = types.MethodType(
+                post_backward_with_master_weights_FusedSGD, optimizer)
+        else:
+            optimizer._prepare_amp_backward = types.MethodType(
+                prepare_backward_with_master_weights, optimizer)
+            optimizer._post_amp_backward = types.MethodType(
+                post_backward_with_master_weights, optimizer)
     else:
         optimizer._lazy_init_maybe_master_weights = types.MethodType(
             lazy_init_no_master_weights, optimizer)

-        optimizer._prepare_amp_backward = types.MethodType(
-            prepare_backward_no_master_weights, optimizer)
-        optimizer._post_amp_backward = types.MethodType(
-            post_backward_no_master_weights, optimizer)
+        if isinstance(optimizer, FusedSGD):
+            optimizer._prepare_amp_backward = types.MethodType(
+                prepare_backward_no_master_weights_FusedSGD, optimizer)
+            optimizer._post_amp_backward = types.MethodType(
+                post_backward_no_master_weights_FusedSGD, optimizer)
+        else:
+            optimizer._prepare_amp_backward = types.MethodType(
+                prepare_backward_no_master_weights, optimizer)
+            optimizer._post_amp_backward = types.MethodType(
+                post_backward_no_master_weights, optimizer)
+
+    optimizer._amp_lazy_init = types.MethodType(_amp_lazy_init, optimizer)

     old_add_param_group = optimizer.add_param_group

     def new_add_param_group(self, new_group):
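The new post_backward_models_are_masters helper and the FusedSGD post-backward path combine a freshly produced gradient (carrying grads_have_scale) with a stashed gradient (carrying stashed_have_scale) into a result at out_scale, which the FusedSGD path sets to the minimum of the two. The pure-PyTorch check below, independent of apex, verifies that arithmetic; the scale values are made-up examples.

import torch

grads_have_scale   = 65536.0      # scale of the gradient from the latest backward pass
stashed_have_scale = 32768.0      # scale the stashed gradient was produced with
out_scale          = min(grads_have_scale, stashed_have_scale)

true_new, true_old = torch.randn(4), torch.randn(4)
new_grad     = true_new * grads_have_scale      # what backward() just produced
stashed_grad = true_old * stashed_have_scale    # what was stashed earlier

# master = a*new + b*stashed with a = out_scale/grads_have_scale, b = out_scale/stashed_have_scale
a = out_scale / grads_have_scale
b = out_scale / stashed_have_scale
master = a * new_grad + b * stashed_grad

# The combined gradient ends up expressed at a single, consistent scale (out_scale).
assert torch.allclose(master, (true_new + true_old) * out_scale)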
apex/amp/handle.py

@@ -6,8 +6,6 @@ from . import utils
 from .opt import OptimWrapper
 from .scaler import LossScaler
 from ._amp_state import _amp_state, master_params, maybe_print
-from ..fp16_utils import FP16_Optimizer as FP16_Optimizer_general
-from ..optimizers import FP16_Optimizer as FP16_Optimizer_for_fused
 from ..parallel.LARC import LARC

@@ -89,11 +87,6 @@ def scale_loss(loss,
     if isinstance(optimizers, torch.optim.Optimizer) or isinstance(optimizers, LARC):
         optimizers = [optimizers]

-    # this is what happens when i have to support tools from different sources under the same API...
-    # TODO: Rewrite FusedAdam to use multi-tensor apply and the same loss scaler.
-    if isinstance(optimizers, FP16_Optimizer_for_fused):
-        loss_scale = optimizers.cur_scale
-    else:
-        loss_scaler = _amp_state.loss_scalers[loss_id]
-        loss_scale = loss_scaler.loss_scale()
+    loss_scaler = _amp_state.loss_scalers[loss_id]
+    loss_scale = loss_scaler.loss_scale()

@@ -120,8 +113,8 @@ def scale_loss(loss,
         for optimizer in optimizers:
             optimizer._amp_stash.params_have_scaled_gradients = True
     else:
-        # FusedAdam and FusedSGD will take care of unscaling as part of their step() methods.
-        if not isinstance(optimizers, FP16_Optimizer_for_fused):
+        # FusedSGD may take care of unscaling as part of their step() methods.
+        # if not isinstance(optimizers, FP16_Optimizer_for_fused):
         loss_scaler.clear_overflow_state()
         for optimizer in optimizers:
             optimizer._post_amp_backward(loss_scaler)

@@ -142,10 +135,15 @@ def scale_loss(loss,
             maybe_print(("Gradient overflow.  Skipping step, loss scaler " +
                          "{} reducing loss scale to {}").format(loss_id,
                          loss_scaler.loss_scale()))
+            # TODO: I don't like the special casing for different optimizer implementations.
+            # Maybe skip should delegate to a method owned by the optimizers themselves.
             if hasattr(opt._amp_stash, "all_fp32_from_fp16_params"):
                 # Clear the master grads that wouldn't be zeroed by model.zero_grad()
                 for param in opt._amp_stash.all_fp32_from_fp16_params:
                     param.grad = None
+            if hasattr(opt, "most_recent_scale"):
+                opt.most_recent_scale = 1.0
+                opt.scale_set_by_backward = False
             opt.step = opt_step
             opt._amp_stash.already_patched = False
         return skip_step
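The last hunk above runs inside the skip_step closure that scale_loss installs when the scaler reports a gradient overflow: it clears master grads, resets most_recent_scale for FusedSGD, and then hands control back to the real step(). The snippet below is a loose, self-contained mirror of just the step-patching idea, not apex's exact code; install_skip_step and the print message formatting are illustrative.

import torch

def install_skip_step(opt):
    # Loose mirror of the patching scale_loss performs on overflow (not apex's exact code).
    opt_step = opt.step
    def skip_step(closure=None):
        print("Gradient overflow.  Skipping step.")   # apex prints the loss-scale message seen above
        opt.step = opt_step                           # restore the real step for the next iteration
    opt.step = skip_step

m = torch.nn.Linear(2, 2)
opt = torch.optim.SGD(m.parameters(), lr=0.1)
install_skip_step(opt)
opt.step()   # this call is skipped
opt.step()   # the real optimizer step runs again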
apex/amp/scaler.py

@@ -16,7 +16,7 @@ def scale_check_overflow_python(model_grad, master_grad, scale, check_overflow=F
         master_grad.mul_(scale)
     return False

-def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, scale, check_overflow=False):
+def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, a, b, check_overflow=False):
     # Exception handling for 18.04 compatibility
     if check_overflow:
         cpu_sum = float(model_grad.float().sum())

@@ -26,9 +26,8 @@ def axpby_check_overflow_python(model_grad, stashed_grad, master_grad, scale, ch
     # if master_grad is not model_grad: # copy_ probably internally short-circuits this
     #     master_grad.copy_(model_grad)
     assert stashed_grad.dtype == master_grad.dtype
-    converted_model_grad = model_grad.to(master_grad.dtype)
-    stashed_grad.add_(scale, converted_model_grad)
-    master_grad.data = stashed_grad.data
+    converted_model_grad = model_grad.data.to(master_grad.dtype)
+    master_grad.data = a*converted_model_grad.data + b*stashed_grad.data
     return False

 class LossScaler(object):

@@ -92,11 +91,13 @@ class LossScaler(object):
                 break

     # unused_scale keeps some of the old API alive for hopefully a short time.
-    def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False):
+    def unscale(self, model_grads, master_grads, unused_scale, models_are_masters=False, scale_override=None):
         if self._has_overflow:
             return

         scale = self._loss_scale
+        if scale_override is not None:
+            scale = scale_override

         if scale == 1.0 and models_are_masters and not self.dynamic:
             return

@@ -126,7 +127,8 @@ class LossScaler(object):
                                      model_grads,
                                      stashed_master_grads,
                                      master_grads,
-                                     scale):
+                                     a,
+                                     b):
         for model, stashed, master in zip(model_grads, stashed_master_grads, master_grads):
             if model is None and stashed is None:
                 continue

@@ -141,7 +143,8 @@ class LossScaler(object):
                 self._has_overflow = axpby_check_overflow_python(model,
                                                                  stashed,
                                                                  master,
-                                                                 1./scale,
+                                                                 a,
+                                                                 b,
                                                                  self.dynamic)
             if self._has_overflow and self.dynamic:
                 break

@@ -149,11 +152,14 @@ class LossScaler(object):
     def unscale_with_stashed(self,
                              model_grads,
                              stashed_master_grads,
-                             master_grads):
+                             master_grads,
+                             scale_override=None):
         if self._has_overflow:
             return

-        scale = self._loss_scale
+        grads_have_scale, stashed_have_scale, out_scale = self._loss_scale, 1.0, 1.0
+        if scale_override is not None:
+            grads_have_scale, stashed_have_scale, out_scale = scale_override

         if LossScaler.has_fused_kernel:
             if (not LossScaler.warned_unscaling_non_fp32_grad

@@ -167,14 +173,15 @@ class LossScaler(object):
                 multi_tensor_applier(LossScaler.multi_tensor_axpby_cuda,
                                      self._overflow_buf,
                                      [model_grads, stashed_master_grads, master_grads],
-                                     1./scale,
-                                     1.0,
+                                     out_scale/grads_have_scale, # 1./scale,
+                                     out_scale/stashed_have_scale, # 1.0,
                                      0) # check only arg 0, aka the incoming model grads, for infs
         else:
             self.unscale_with_stashed_python(model_grads,
                                              stashed_master_grads,
                                              master_grads,
-                                             scale)
+                                             out_scale/grads_have_scale,
+                                             out_scale/stashed_have_scale)

         # Defer to update_scale
         # If the fused kernel is available, we only need one D2H memcopy and sync.
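axpby_check_overflow_python keeps the existing overflow test (summing the gradient and checking for inf or NaN) and only changes the accumulation rule to master = a*model_grad + b*stashed_grad. A self-contained sketch of that overflow test, written against plain PyTorch rather than apex:

import torch

def has_overflow(grad):
    # Mirrors the python fallback's check: a sum that is inf or NaN flags the iteration
    # so update_scale() can back off the loss scale instead of applying a bad step.
    cpu_sum = float(grad.float().sum())
    return cpu_sum == float("inf") or cpu_sum == -float("inf") or cpu_sum != cpu_sum

print(has_overflow(torch.tensor([1.0, 2.0], dtype=torch.float16)))           # False
print(has_overflow(torch.tensor([1.0, float("inf")], dtype=torch.float16)))  # True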
tests/L0/run_mixed_adam/__init__.py → apex/contrib/__init__.py    (file moved)
apex/contrib/csrc/groupbn/batch_norm.cu    (new file, mode 100644)

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include "THC/THC.h"
#include "batch_norm.h"
#include <cuda.h>

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

static size_t round_up_to_multiple(size_t x, int multiple) {
  return ((x + multiple - 1) / multiple) * multiple;
}

// TODO: Stop manually allocating CUDA memory; allocate an ATen byte
// tensor instead.
struct Workspace {
  Workspace(size_t size) : size(size), data(NULL) {
    data = THCudaMalloc(at::globalContext().lazyInitCUDA(), size);
  }
  Workspace(const Workspace&) = delete;
  Workspace(Workspace&&) = default;
  Workspace& operator=(Workspace&&) = default;
  ~Workspace() {
    if (data) {
      THCudaFree(at::globalContext().lazyInitCUDA(), data);
    }
  }
  size_t size;
  void* data;
};

// Return {y}
at::Tensor nhwc_bn_fwd_train(
                       const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                       const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
                       const at::Tensor& ret_cta,
                       const float momentum, const float epsilon, const bool fuse_relu,
                       void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                       const int bn_group, const at::Tensor& magic_tensor,
                       const int occupancy, const int grid_dim_x, const bool coop) {

  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // generating new magic number and use that for sync
  int* magic = magic_tensor.data<int>();
  *magic = (*magic + 1) & 0xff;

  // Allocate output tensor
  at::Tensor y = at::empty({N, H, W, C}, x.options());

  // Create wrapper
  NhwcBatchNorm* bn = new NhwcBatchNorm();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data<at::Half>(), nullptr, y.data<at::Half>(), nullptr);

  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void*> workspace;
  workspace.push_back(minibatch_mean.data<float>());
  workspace.push_back(minibatch_inv_var.data<float>());

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[2];
  void* retired_ctas = ret_cta.data<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    void* ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index - 3];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  // Don't fuse in ReLU for now at least
  bn->fwd(stream, fuse_relu, my_data, pair_data, pair_data2, pair_data3,
          bn_group, *magic, occupancy, grid_dim_x, coop);

  return y;
}

at::Tensor nhwc_bn_fwd_eval(
                       const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                       const at::Tensor& ret_cta, const int bn_group,
                       const float momentum, const float epsilon, const bool fuse_relu) {

  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // Allocate output tensor
  at::Tensor y = at::empty({N, H, W, C}, x.options());

  // Create wrapper
  NhwcBatchNorm* bn = new NhwcBatchNorm();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data<at::Half>(), nullptr, y.data<at::Half>(), nullptr);

  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void*> workspace;
  workspace.push_back(nullptr);
  workspace.push_back(nullptr);

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[2];
  void* retired_ctas = ret_cta.data<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    void* ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index - 3];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  // Don't fuse in ReLU for now at least
  bn->fwdInference(stream, fuse_relu);

  return y;
}

std::vector<at::Tensor> nhwc_bn_bwd(
                       const at::Tensor& x, const at::Tensor& dy,
                       const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                       const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
                       const at::Tensor& ret_cta,
                       const float momentum, const float epsilon, const bool fuse_relu,
                       void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                       const int bn_group, const at::Tensor& magic_tensor,
                       const int occupancy, const int grid_dim_x, const bool coop) {
  // shape
  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // generating new magic number and use that for sync
  int* magic = magic_tensor.data<int>();
  *magic = (*magic + 1) & 0xff;

  // outputs
  at::Tensor x_grad, scale_grad, bias_grad;

  // Allocate outputs
  x_grad = at::empty_like(x);
  scale_grad = at::empty_like(scale);
  bias_grad = at::empty_like(bias);

  // Create wrapper
  NhwcBatchNorm* bn = new NhwcBatchNorm();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data<at::Half>(), x_grad.data<at::Half>(),
                             nullptr, dy.data<at::Half>());

  bn->setWeightPointers({scale.data<float>(), bias.data<float>()},
                        {scale_grad.data<float>(), bias_grad.data<float>()});
  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void*> workspace;
  workspace.push_back(minibatch_mean.data<float>());
  workspace.push_back(minibatch_inv_var.data<float>());

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[2];
  void* retired_ctas = ret_cta.data<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 3; index < workspace_bytes.size(); ++index) {
    void* ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index - 3];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  bn->dgrad(stream, fuse_relu, my_data, pair_data, pair_data2, pair_data3,
            bn_group, *magic, occupancy, grid_dim_x, coop);

  return std::vector<at::Tensor>{x_grad, scale_grad, bias_grad};
}

int nhwc_bn_fwd_occupancy() {
  int device_id = -1;
  cudaGetDevice(&device_id);

  //max occupancy supported by the code is 2
  return NhwcBatchNorm::smem_driven_fwd_occupancy(device_id, 2);
}

int nhwc_bn_bwd_occupancy() {
  int device_id = -1;
  cudaGetDevice(&device_id);

  //max occupancy supported by the code is 2
  return NhwcBatchNorm::smem_driven_bwd_occupancy(device_id, 2);
}
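These kernels are reached from Python through the new apex.contrib.groupbn package (apex/contrib/groupbn/batch_norm.py in the file list above). The sketch below is a guess at that wrapper's surface: the class name BatchNorm2d_NHWC and its constructor arguments are assumptions about the added module, and it only runs on a CUDA apex build that compiled the groupbn sources.

# Hedged usage sketch for the new NHWC group batch norm; class name and arguments are assumed.
import torch
from apex.contrib.groupbn.batch_norm import BatchNorm2d_NHWC

bn = BatchNorm2d_NHWC(64, fuse_relu=False, bn_group=1).cuda()     # channels-last (NHWC) batch norm
x = torch.randn(8, 32, 32, 64, device="cuda", dtype=torch.float16)  # layout: N, H, W, C
y = bn(x)
print(y.shape)   # torch.Size([8, 32, 32, 64])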
apex/contrib/csrc/groupbn/batch_norm.h    (new file, mode 100644)

This diff is collapsed.
apex/contrib/csrc/groupbn/batch_norm_add_relu.cu    (new file, mode 100644)

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include "THC/THC.h"
#include "batch_norm_add_relu.h"
#include <cuda.h>

//FIXME move the common stuff to common h file
#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

static size_t round_up_to_multiple(size_t x, int multiple) {
  return ((x + multiple - 1) / multiple) * multiple;
}

// TODO: Stop manually allocating CUDA memory; allocate an ATen byte
// tensor instead.
struct Workspace {
  Workspace(size_t size) : size(size), data(NULL) {
    data = THCudaMalloc(at::globalContext().lazyInitCUDA(), size);
  }
  Workspace(const Workspace&) = delete;
  Workspace(Workspace&&) = default;
  Workspace& operator=(Workspace&&) = default;
  ~Workspace() {
    if (data) {
      THCudaFree(at::globalContext().lazyInitCUDA(), data);
    }
  }
  size_t size;
  void* data;
};

// Return {y}
at::Tensor nhwc_bn_addrelu_fwd_train(
                       const at::Tensor& x, const at::Tensor& z,
                       const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                       const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
                       const at::Tensor& bitmask, const at::Tensor& ret_cta,
                       const float momentum, const float epsilon,
                       void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                       const int bn_group, const at::Tensor& magic_tensor,
                       const int occupancy, const int grid_dim_x, const bool coop) {

  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // generating new magic number and use that for sync
  int* magic = magic_tensor.data<int>();
  *magic = (*magic + 1) & 0xff;

  // Allocate output tensor
  at::Tensor y = at::empty({N, H, W, C}, x.options());

  // Create wrapper
  NhwcBatchNormAddRelu* bn = new NhwcBatchNormAddRelu();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data<at::Half>(), nullptr,
                             y.data<at::Half>(), nullptr,
                             z.data<at::Half>(), nullptr);

  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void*> workspace;
  workspace.push_back(minibatch_mean.data<float>());
  workspace.push_back(minibatch_inv_var.data<float>());
  workspace.push_back(bitmask.data<int32_t>());

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[3];
  void* retired_ctas = ret_cta.data<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    void* ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index - 4];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  // Don't fuse in ReLU for now at least
  bn->fwd(stream, my_data, pair_data, pair_data2, pair_data3,
          bn_group, *magic, occupancy, grid_dim_x, coop);

  return y;
}

at::Tensor nhwc_bn_addrelu_fwd_eval(
                       const at::Tensor& x, const at::Tensor& z,
                       const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                       const at::Tensor& ret_cta, const int bn_group,
                       const float momentum, const float epsilon) {

  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // Allocate output tensor
  at::Tensor y = at::empty({N, H, W, C}, x.options());

  // Create wrapper
  NhwcBatchNormAddRelu* bn = new NhwcBatchNormAddRelu();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data<at::Half>(), nullptr,
                             y.data<at::Half>(), nullptr,
                             z.data<at::Half>(), nullptr);

  bn->setWeightPointers({scale.data<float>(), bias.data<float>()}, {nullptr, nullptr});
  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void*> workspace;
  workspace.push_back(nullptr);
  workspace.push_back(nullptr);
  workspace.push_back(nullptr);

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[3];
  void* retired_ctas = ret_cta.data<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    void* ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index - 4];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  // Don't fuse in ReLU for now at least
  bn->fwdInference(stream);

  return y;
}

std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
                       const at::Tensor& x, const at::Tensor& dy,
                       const at::Tensor& scale, const at::Tensor& bias,
                       const at::Tensor& running_mean, const at::Tensor& running_inv_var,
                       const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
                       const at::Tensor& bitmask, const at::Tensor& ret_cta,
                       const float momentum, const float epsilon,
                       void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
                       const int bn_group, const at::Tensor& magic_tensor,
                       const int occupancy, const int grid_dim_x, const bool coop) {
  // shape
  const int N = x.size(0);
  const int H = x.size(1);
  const int W = x.size(2);
  const int C = x.size(3);

  // generating new magic number and use that for sync
  int* magic = magic_tensor.data<int>();
  *magic = (*magic + 1) & 0xff;

  // outputs
  at::Tensor x_grad, z_grad, scale_grad, bias_grad;

  // Allocate outputs
  x_grad = at::empty_like(x);
  z_grad = at::empty_like(x);
  scale_grad = at::empty_like(scale);
  bias_grad = at::empty_like(bias);

  // Create wrapper
  NhwcBatchNormAddRelu* bn = new NhwcBatchNormAddRelu();

  bn->setInputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W, bn_group);
  bn->setOutputDescriptor(CUDNN_TENSOR_NHWC, CUDNN_DATA_HALF, N, C, H, W);

  bn->setConstants(momentum, epsilon);

  // set pointers within the wrapper
  bn->setInputOutputPointers(x.data<at::Half>(), x_grad.data<at::Half>(),
                             nullptr, dy.data<at::Half>(),
                             nullptr, z_grad.data<at::Half>());

  bn->setWeightPointers({scale.data<float>(), bias.data<float>()},
                        {scale_grad.data<float>(), bias_grad.data<float>()});
  bn->setParameterPointers({running_mean.data<float>(), running_inv_var.data<float>()});

  // deal with workspace(s)
  auto workspace_bytes = bn->numWorkspaceBytes();
  // We'll create explicit tensors for the first 2 workspace ptrs, then allocate & offset
  // an allocated workspace for the others
  size_t total_workspace_bytes = 0;
  std::vector<size_t> workspace_offsets;

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    total_workspace_bytes = round_up_to_multiple(total_workspace_bytes, 512);
    workspace_offsets.push_back(total_workspace_bytes);

    auto alloc_bytes = workspace_bytes[index];
    total_workspace_bytes += alloc_bytes;
  }

  // Allocate the workspace
  Workspace ws(total_workspace_bytes);

  std::vector<void*> workspace;
  workspace.push_back(minibatch_mean.data<float>());
  workspace.push_back(minibatch_inv_var.data<float>());
  workspace.push_back(bitmask.data<int32_t>());

  auto stream = at::cuda::getCurrentCUDAStream().stream();
  const int retired_cta_bytes = workspace_bytes[3];
  void* retired_ctas = ret_cta.data<uint8_t>();
  assert(ret_cta.size(0) >= retired_cta_bytes);
  workspace.push_back(retired_ctas);

  for (auto index = 4; index < workspace_bytes.size(); ++index) {
    void* ptr = reinterpret_cast<uint8_t*>(ws.data) + workspace_offsets[index - 4];
    workspace.push_back(ptr);
  }

  bn->setWorkspacePointers(workspace, workspace_bytes);

  bn->dgrad(stream, my_data, pair_data, pair_data2, pair_data3,
            bn_group, *magic, occupancy, grid_dim_x, coop);

  return std::vector<at::Tensor>{x_grad, z_grad, scale_grad, bias_grad};
}

int nhwc_bn_addrelu_fwd_occupancy() {
  int device_id = -1;
  cudaGetDevice(&device_id);

  //max occupancy supported by the code is 2
  return NhwcBatchNormAddRelu::smem_driven_fwd_occupancy(device_id, 2);
}

int nhwc_bn_addrelu_bwd_occupancy() {
  int device_id = -1;
  cudaGetDevice(&device_id);

  //max occupancy supported by the code is 2
  return NhwcBatchNormAddRelu::smem_driven_bwd_occupancy(device_id, 2);
}
apex/contrib/csrc/groupbn/batch_norm_add_relu.h    (new file, mode 100644)

This diff is collapsed.
apex/contrib/csrc/groupbn/cuda_utils.h    (new file, mode 100644)

#include <ATen/cuda/CUDAContext.h>

#ifndef CUDA_UTILS_H
#define CUDA_UTILS_H

namespace at {
namespace cuda {

namespace utils {

static inline int MaxSharedMemoryPerMultiprocessor(int device_id) {
  return getDeviceProperties(device_id)->sharedMemPerMultiprocessor;
}

}
}
}

#endif
apex/contrib/csrc/groupbn/interface.cpp
0 → 100644
View file @
15648029
#include <pybind11/pybind11.h>
#include <pybind11/numpy.h>
#include <pybind11/stl.h>
#include <torch/extension.h>
#include <ATen/ATen.h>
#include <ATen/ArrayRef.h>
#include <ATen/ScalarType.h>
#include "ATen/Scalar.h"
#ifndef VERSION_GE_1_1
#include "ATen/Type.h"
#endif
#include "ATen/Tensor.h"
#include "ATen/Storage.h"
#include "ATen/Generator.h"
namespace py = pybind11;

int64_t get_buffer_size(const int bn_sync_steps);

void* get_data_ptr(const at::Tensor& data);

void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset);

void close_remote_data(const at::Tensor& handle);

at::Tensor nhwc_bn_fwd_train(
    const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_inv_var,
    const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
    const at::Tensor& ret_cta,
    const float momentum, const float epsilon, const bool fuse_relu,
    void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
    const int bn_group, const at::Tensor& magic_tensor,
    const int occupancy, const int grid_dim_x, const bool coop);

at::Tensor nhwc_bn_fwd_eval(
    const at::Tensor& x, const at::Tensor& scale, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_inv_var,
    const at::Tensor& ret_cta,
    const int bn_group, const float momentum, const float epsilon,
    const bool fuse_relu);

std::vector<at::Tensor> nhwc_bn_bwd(
    const at::Tensor& x, const at::Tensor& dy,
    const at::Tensor& scale, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_inv_var,
    const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
    const at::Tensor& ret_cta,
    const float momentum, const float epsilon, const bool fuse_relu,
    void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
    const int bn_group, const at::Tensor& magic_tensor,
    const int occupancy, const int grid_dim_x, const bool coop);

at::Tensor nhwc_bn_addrelu_fwd_train(
    const at::Tensor& x, const at::Tensor& z,
    const at::Tensor& scale, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_inv_var,
    const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
    const at::Tensor& bitmask, const at::Tensor& ret_cta,
    const float momentum, const float epsilon,
    void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
    const int bn_group, const at::Tensor& magic_tensor,
    const int occupancy, const int grid_dim_x, const bool coop);

at::Tensor nhwc_bn_addrelu_fwd_eval(
    const at::Tensor& x, const at::Tensor& z,
    const at::Tensor& scale, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_inv_var,
    const at::Tensor& ret_cta,
    const int bn_group, const float momentum, const float epsilon);

std::vector<at::Tensor> nhwc_bn_addrelu_bwd(
    const at::Tensor& x, const at::Tensor& dy,
    const at::Tensor& scale, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_inv_var,
    const at::Tensor& minibatch_mean, const at::Tensor& minibatch_inv_var,
    const at::Tensor& bitmask, const at::Tensor& ret_cta,
    const float momentum, const float epsilon,
    void* my_data, void* pair_data, void* pair_data2, void* pair_data3,
    const int bn_group, const at::Tensor& magic_tensor,
    const int occupancy, const int grid_dim_x, const bool coop);

int nhwc_bn_fwd_occupancy();
int nhwc_bn_bwd_occupancy();
int nhwc_bn_addrelu_fwd_occupancy();
int nhwc_bn_addrelu_bwd_occupancy();

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("get_buffer_size", &get_buffer_size, "get_buffer_size");
  m.def("get_data_ptr", &get_data_ptr, "get_data_ptr");
  m.def("get_remote_data_ptr", &get_remote_data_ptr, "get_remote_data_ptr");
  m.def("close_remote_data", &close_remote_data, "close_remote_data");
  m.def("bn_fwd_nhwc", &nhwc_bn_fwd_train, "bn_fwd_nhwc");
  m.def("bn_fwd_eval_nhwc", &nhwc_bn_fwd_eval, "bn_fwd_eval_nhwc");
  m.def("bn_bwd_nhwc", &nhwc_bn_bwd, "bn_bwd_nhwc");
  m.def("bn_fwd_nhwc_occupancy", &nhwc_bn_fwd_occupancy, "bn_fwd_nhwc_occupancy");
  m.def("bn_bwd_nhwc_occupancy", &nhwc_bn_bwd_occupancy, "bn_bwd_nhwc_occupancy");
  m.def("bn_addrelu_fwd_nhwc", &nhwc_bn_addrelu_fwd_train, "bn_addrelu_fwd_nhwc");
  m.def("bn_addrelu_fwd_eval_nhwc", &nhwc_bn_addrelu_fwd_eval, "bn_addrelu_fwd_eval_nhwc");
  m.def("bn_addrelu_bwd_nhwc", &nhwc_bn_addrelu_bwd, "bn_addrelu_bwd_nhwc");
  m.def("bn_addrelu_fwd_nhwc_occupancy", &nhwc_bn_addrelu_fwd_occupancy, "bn_addrelu_fwd_nhwc_occupancy");
  m.def("bn_addrelu_bwd_nhwc_occupancy", &nhwc_bn_addrelu_bwd_occupancy, "bn_addrelu_bwd_nhwc_occupancy");
}
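For orientation, the bindings above expose the NHWC batch norm kernels to Python under whatever name the extension is built as; the groupbn Python code in this diff imports it as bnp. A minimal sketch of poking at the module, assuming it has been built:

# Sketch: assumes the extension above was built and is importable as `bnp`
# (the name used by apex/contrib/groupbn in this diff).
import bnp

# Occupancy queries take no arguments and return the CTAs-per-SM each kernel supports.
print(bnp.bn_fwd_nhwc_occupancy(), bnp.bn_bwd_nhwc_occupancy())
print(bnp.bn_addrelu_fwd_nhwc_occupancy(), bnp.bn_addrelu_bwd_nhwc_occupancy())

# IPC helpers used for multi-GPU synchronization when bn_group > 1.
buf_bytes = bnp.get_buffer_size(1)  # bytes needed for one sync step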
apex/contrib/csrc/groupbn/ipc.cu
0 → 100644
View file @
15648029
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <THC/THCNumerics.cuh>
#include "THC/THC.h"
#include <cuda.h>
#define cudaCheckErrors(msg) \
do { \
cudaError_t __err = cudaGetLastError(); \
if (__err != cudaSuccess) { \
fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
msg, cudaGetErrorString(__err), \
__FILE__, __LINE__); \
fprintf(stderr, "*** FAILED - ABORTING\n"); \
exit(1); \
} \
} while (0)
template <>
struct std::hash<cudaIpcMemHandle_t> {
  size_t operator() (const cudaIpcMemHandle_t& handle) const {
    size_t hash = 0;
    uint8_t* ptr = (uint8_t*)&handle;
    assert(sizeof(uint8_t) == 1);
    for (int i = 0; i < sizeof(cudaIpcMemHandle_t); i++) {
      hash += *ptr;
      ptr++;
    }
    return hash;
  }
};

template <>
struct std::equal_to<cudaIpcMemHandle_t> {
  bool operator() (const cudaIpcMemHandle_t& lhs, const cudaIpcMemHandle_t& rhs) const {
    return (std::memcmp((void*)&lhs, (void*)&rhs, sizeof(cudaIpcMemHandle_t)) == 0);
  }
};

namespace {

namespace gpuipc {
//from: src/operator/nn/cudnn/nhwc_batch_norm_kernel.h
// The number of threads per pixel.
const int THREADS_PER_PIXEL = 16;
// The number of elements per ldg.
const int ELEMENTS_PER_LDG = 4;
// The number of reducing ops, each uses its own space : mean, var, dscale, dbias
const int REDUCE_OPS = 4;
// Maximum block.y supported - limited due to buffer allocation
const int MAX_BLOCK_Y = 256;
const int MAX_OFFSET = REDUCE_OPS * MAX_BLOCK_Y;
const int BYTES_PER_ELEM = 4;
// Buffer size per sync step
const int SINGLE_SYNC_BUFFER_BYTES = MAX_OFFSET * THREADS_PER_PIXEL * 2 * ELEMENTS_PER_LDG * BYTES_PER_ELEM;
};

class IpcMemHandleRegistry {
public:
  void* getPtr(const cudaIpcMemHandle_t& handle, int64_t offset) {
    if (registry_.count(handle) == 0) {
      registry_.insert(std::make_pair(handle, RegistryEntry()));
      registry_[handle].dev_ptr = ipcOpenMem(handle);
    }
    registry_[handle].ref_count++;
    return (((uint8_t*)registry_[handle].dev_ptr) + offset);
  }

  void releasePtr(const cudaIpcMemHandle_t& handle) {
    if (registry_.count(handle) == 0) {
    }
    if (--registry_[handle].ref_count == 0) {
      ipcCloseMem(registry_[handle].dev_ptr);
      registry_.erase(handle);
    }
  }

  struct RegistryEntry {
    void* dev_ptr;
    int ref_count;
    RegistryEntry() : dev_ptr(NULL), ref_count(0) {}
  };

protected:
  std::unordered_map<cudaIpcMemHandle_t, RegistryEntry> registry_;

  void* ipcOpenMem(const cudaIpcMemHandle_t& handle) {
    void* data;
    cudaIpcOpenMemHandle(&data, handle, cudaIpcMemLazyEnablePeerAccess);
    cudaCheckErrors("ipc init");
    return data;
  }

  void ipcCloseMem(void* dev_ptr) {
    cudaIpcCloseMemHandle(dev_ptr);
    cudaCheckErrors("ipc close");
  }
};

}

static IpcMemHandleRegistry ipc_mem_registry;

int64_t get_buffer_size(const int bn_sync_steps) {
  return bn_sync_steps * gpuipc::SINGLE_SYNC_BUFFER_BYTES;
}

void* get_remote_data_ptr(const at::Tensor& handle, const int64_t offset) {
  cudaIpcMemHandle_t my_handle;
  memcpy((unsigned char*)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
  return ipc_mem_registry.getPtr(my_handle, offset);
}

void close_remote_data(const at::Tensor& handle) {
  cudaIpcMemHandle_t my_handle;
  memcpy((unsigned char*)(&my_handle), handle.data<uint8_t>(), sizeof(my_handle));
  ipc_mem_registry.releasePtr(my_handle);
}

void* get_data_ptr(const at::Tensor& data) {
  return data.data<uint8_t>();
}
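The IPC helpers above are driven from Python by BatchNorm2d_NHWC (see apex/contrib/groupbn/batch_norm.py later in this diff): each rank allocates a sync buffer, publishes its CUDA IPC handle and offset, and opens its peers' buffers. A condensed sketch of that flow, assuming the extension is importable as bnp and torch.distributed is already initialized:

# Condensed sketch of how the IPC helpers are used (mirrors BatchNorm2d_NHWC below).
import numpy as np
import torch
import bnp

bn_sync_steps = 1
ipc_buffer = torch.cuda.ByteTensor(bnp.get_buffer_size(bn_sync_steps))
my_data = bnp.get_data_ptr(ipc_buffer)  # raw device pointer into my own buffer

# Export this buffer's IPC handle and offset via the internal _share_cuda_() API.
share = ipc_buffer.storage()._share_cuda_()
my_handle = torch.cuda.ByteTensor(np.frombuffer(share[1], dtype=np.uint8))  # ipc_mem_handle
my_offset = torch.cuda.IntTensor([share[3]])                                # offset

# After all_gather-ing handles/offsets across ranks, a peer's buffer is opened with
#   pair_data = bnp.get_remote_data_ptr(peer_handle_cpu, peer_offset_cpu)
# and released at teardown with
#   bnp.close_remote_data(peer_handle_cpu)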
apex/contrib/csrc/groupbn/nhwc_batch_norm_kernel.h
0 → 100644
View file @
15648029
This diff is collapsed; contents not shown.
apex/contrib/csrc/xentropy/interface.cpp
0 → 100644
View file @
15648029
#include <torch/extension.h>
// CUDA forward declarations
std::vector<at::Tensor> softmax_xentropy_cuda(
    const at::Tensor& input,
    const at::Tensor& labels,
    const float smoothing,
    const bool half_to_float);

at::Tensor softmax_xentropy_backward_cuda(
    const at::Tensor& grad_loss,
    const at::Tensor& logits,
    const at::Tensor& max_log_sum_exp,
    const at::Tensor& labels,
    const float smoothing);

// C++ interface

#define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
#define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)

std::vector<at::Tensor> softmax_xentropy_forward(
    const at::Tensor& input,
    const at::Tensor& labels,
    const float smoothing,
    const bool half_to_float) {
  CHECK_CUDA(input);
  CHECK_INPUT(labels);

  return softmax_xentropy_cuda(input, labels, smoothing, half_to_float);
}

at::Tensor softmax_xentropy_backward(
    const at::Tensor& grad_loss,
    const at::Tensor& logits,
    const at::Tensor& max_log_sum_exp,
    const at::Tensor& labels,
    const float smoothing) {
  CHECK_CUDA(grad_loss);
  CHECK_CUDA(logits);
  CHECK_INPUT(max_log_sum_exp);
  CHECK_INPUT(labels);

  return softmax_xentropy_backward_cuda(grad_loss, logits, max_log_sum_exp, labels, smoothing);
}

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
  m.def("forward", &softmax_xentropy_forward, "Softmax cross entropy loss with label smoothing forward (CUDA)");
  m.def("backward", &softmax_xentropy_backward, "Softmax cross entropy loss with label smoothing backward (CUDA)");
}
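The Python wrapper for these bindings (apex/contrib/xentropy/softmax_xentropy.py, not shown on this page) is an autograd.Function. What follows is only a hedged sketch of how such a wrapper could look, not the actual apex implementation; it assumes, as the backward signature above suggests, that the forward kernel returns the per-example losses together with a max_log_sum_exp tensor that the backward pass consumes, and that padding positions are masked out on the Python side.

# Hedged sketch of wrapping xentropy_cuda.forward/backward in an autograd.Function.
import torch
import xentropy_cuda

class SoftmaxCrossEntropyLossSketch(torch.autograd.Function):
    @staticmethod
    def forward(ctx, logits, labels, smoothing=0.0, padding_idx=0, half_to_float=False):
        # Assumption: kernel returns (losses, max_log_sum_exp); pad tokens zeroed here.
        losses, max_log_sum_exp = xentropy_cuda.forward(logits, labels, smoothing, half_to_float)
        losses.masked_fill_(labels == padding_idx, 0)
        ctx.save_for_backward(logits, max_log_sum_exp, labels)
        ctx.smoothing = smoothing
        ctx.padding_idx = padding_idx
        return losses

    @staticmethod
    def backward(ctx, grad_loss):
        logits, max_log_sum_exp, labels = ctx.saved_tensors
        # Zero the incoming gradient at padding positions before calling the kernel.
        grad = grad_loss.contiguous().masked_fill(labels == ctx.padding_idx, 0)
        grad_logits = xentropy_cuda.backward(grad, logits, max_log_sum_exp, labels, ctx.smoothing)
        return grad_logits, None, None, None, None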
apex/contrib/csrc/xentropy/xentropy_kernel.cu
0 → 100644
View file @
15648029
This diff is collapsed; contents not shown.
apex/contrib/groupbn/__init__.py
0 → 100644
View file @
15648029
try:
    import torch
    import bnp
    from .batch_norm import BatchNorm2d_NHWC
    del torch
    del bnp
    del batch_norm
except ImportError as err:
    print("apex was installed without --bnp flag, contrib.groupbn is not available")
apex/contrib/groupbn/batch_norm.py
0 → 100644
View file @
15648029
import torch
import numpy as np
from torch.nn.modules.batchnorm import _BatchNorm

import bnp


class bn_NHWC_impl(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, s, b, rm, riv, mini_m, mini_riv, ret_cta, mom, epsilon,
                fuse_relu, is_train, bn_group, my_data, pair_data, magic,
                pair_data2, pair_data3, fwd_occup, fwd_grid_x, bwd_occup, bwd_grid_x,
                multi_stream):
        if is_train:
            ctx.save_for_backward(x, s, b, rm, riv, mini_m, mini_riv)
            ctx.epsilon = epsilon
            ctx.momentum = mom
            ctx.ret_cta = ret_cta
            ctx.fuse_relu = fuse_relu
            ctx.my_data = my_data
            ctx.pair_data = pair_data
            ctx.magic = magic
            ctx.pair_data2 = pair_data2
            ctx.pair_data3 = pair_data3
            ctx.bn_group = bn_group
            ctx.bwd_occup = bwd_occup
            ctx.bwd_grid_x = bwd_grid_x
            ctx.multi_stream = multi_stream

            res = bnp.bn_fwd_nhwc(x, s, b, rm, riv, mini_m, mini_riv, ret_cta, mom,
                                  epsilon, fuse_relu, my_data, pair_data, pair_data2,
                                  pair_data3, bn_group, magic, fwd_occup, fwd_grid_x,
                                  multi_stream)
            return res
        else:
            return bnp.bn_fwd_eval_nhwc(x, s, b, rm, riv, ret_cta, bn_group, mom,
                                        epsilon, fuse_relu)

    @staticmethod
    def backward(ctx, grad_y):
        x, s, b, rm, riv, mini_m, mini_riv = ctx.saved_variables
        epsilon = ctx.epsilon
        mom = ctx.momentum
        ret_cta = ctx.ret_cta
        fuse_relu = ctx.fuse_relu
        my_data = ctx.my_data
        pair_data = ctx.pair_data
        magic = ctx.magic
        pair_data2 = ctx.pair_data2
        pair_data3 = ctx.pair_data3
        bn_group = ctx.bn_group
        bwd_occup = ctx.bwd_occup
        bwd_grid_x = ctx.bwd_grid_x
        multi_stream = ctx.multi_stream

        dx, dscale, dbias = bnp.bn_bwd_nhwc(x, grad_y, s, b, rm, riv, mini_m, mini_riv,
                                            ret_cta, mom, epsilon, fuse_relu, my_data,
                                            pair_data, pair_data2, pair_data3, bn_group,
                                            magic, bwd_occup, bwd_grid_x, multi_stream)

        return (dx, dscale, dbias,
                None, None, None, None, None, None, None, None, None, None,
                None, None, None, None, None, None, None, None, None, None)


class bn_addrelu_NHWC_impl(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x, z, s, b, rm, riv, mini_m, mini_riv, grid_dim_y, ret_cta, mom,
                epsilon, is_train, bn_group, my_data, pair_data, magic,
                pair_data2, pair_data3, fwd_occup, fwd_grid_x, bwd_occup, bwd_grid_x,
                multi_stream):
        if is_train:
            bitmask = torch.cuda.IntTensor(((x.numel() + 31) // 32) * 2 * grid_dim_y)
            ctx.save_for_backward(x, s, b, rm, riv, mini_m, mini_riv, bitmask)
            ctx.epsilon = epsilon
            ctx.momentum = mom
            ctx.ret_cta = ret_cta
            ctx.my_data = my_data
            ctx.pair_data = pair_data
            ctx.magic = magic
            ctx.pair_data2 = pair_data2
            ctx.pair_data3 = pair_data3
            ctx.bn_group = bn_group
            ctx.bwd_occup = bwd_occup
            ctx.bwd_grid_x = bwd_grid_x
            ctx.multi_stream = multi_stream

            res = bnp.bn_addrelu_fwd_nhwc(x, z, s, b, rm, riv, mini_m, mini_riv, bitmask,
                                          ret_cta, mom, epsilon, my_data, pair_data,
                                          pair_data2, pair_data3, bn_group, magic,
                                          fwd_occup, fwd_grid_x, multi_stream)
            return res
        else:
            return bnp.bn_addrelu_fwd_eval_nhwc(x, z, s, b, rm, riv, ret_cta, bn_group,
                                                mom, epsilon)

    @staticmethod
    def backward(ctx, grad_y):
        x, s, b, rm, riv, mini_m, mini_riv, bitmask = ctx.saved_variables
        epsilon = ctx.epsilon
        mom = ctx.momentum
        ret_cta = ctx.ret_cta
        my_data = ctx.my_data
        pair_data = ctx.pair_data
        magic = ctx.magic
        pair_data2 = ctx.pair_data2
        pair_data3 = ctx.pair_data3
        bn_group = ctx.bn_group
        bwd_occup = ctx.bwd_occup
        bwd_grid_x = ctx.bwd_grid_x
        multi_stream = ctx.multi_stream

        dx, dz, dscale, dbias = bnp.bn_addrelu_bwd_nhwc(x, grad_y, s, b, rm, riv, mini_m,
                                                        mini_riv, bitmask, ret_cta, mom,
                                                        epsilon, my_data, pair_data,
                                                        pair_data2, pair_data3, bn_group,
                                                        magic, bwd_occup, bwd_grid_x,
                                                        multi_stream)

        return (dx, dz, dscale, dbias,
                None, None, None, None, None, None, None, None, None, None,
                None, None, None, None, None, None, None, None, None, None)


class BatchNorm2d_NHWC(_BatchNorm):
    # if using BatchNorm2d_NHWC simultaneously with multiple streams set multi_stream to True
    def __init__(self, num_features, fuse_relu=False, bn_group=1, max_cta_per_sm=2,
                 cta_launch_margin=12, multi_stream=False):
        super(BatchNorm2d_NHWC, self).__init__(num_features)

        self.fuse_relu = fuse_relu
        self.multi_stream = multi_stream

        self.minibatch_mean = torch.cuda.FloatTensor(num_features)
        self.minibatch_riv = torch.cuda.FloatTensor(num_features)

        #defaut to distributed bn disabled
        self.bn_group = bn_group
        self.max_cta_per_sm = max_cta_per_sm        #used only in training fwd and bwd
        self.cta_launch_margin = cta_launch_margin  #used only in training fwd and bwd
        self.my_data = None
        self.pair_data = None
        self.pair_data2 = None
        self.pair_data3 = None
        self.local_rank = 0
        self.magic = torch.IntTensor([0])

        #calculate cta per sm occupancies
        assert(max_cta_per_sm > 0)  # won't be able to do much with 0 CTAs :)
        self.fwd_occupancy = min(bnp.bn_fwd_nhwc_occupancy(), max_cta_per_sm)
        self.bwd_occupancy = min(bnp.bn_bwd_nhwc_occupancy(), max_cta_per_sm)
        self.addrelu_fwd_occupancy = min(bnp.bn_addrelu_fwd_nhwc_occupancy(), max_cta_per_sm)
        self.addrelu_bwd_occupancy = min(bnp.bn_addrelu_bwd_nhwc_occupancy(), max_cta_per_sm)

        #calculate grid dimentions based on occupancy numbers
        mp_count = torch.cuda.get_device_properties(None).multi_processor_count
        self.fwd_grid_dim_x = max(mp_count * self.fwd_occupancy - cta_launch_margin, 1)
        self.bwd_grid_dim_x = max(mp_count * self.bwd_occupancy - cta_launch_margin, 1)
        self.addrelu_fwd_grid_dim_x = max(mp_count * self.addrelu_fwd_occupancy - cta_launch_margin, 1)
        self.addrelu_bwd_grid_dim_x = max(mp_count * self.addrelu_bwd_occupancy - cta_launch_margin, 1)
        self.grid_dim_y = (num_features + 63) // 64

        # allocate scratch space used by implementation
        # TODO: scratch space that is not supposed to be exposed at user code. We only need one time initialization, the
        # same buffer could be reused in future iterations. Currently we exposed it here instead of requesting new
        # buffer from cache allocator to avoid unnecessary initialization at future iterations.
        self.ret_cta = torch.cuda.ByteTensor(8192).fill_(0)

        #FIXME: turn pair handles into an array
        if bn_group > 1:
            local_rank = torch.distributed.get_rank()
            world_size = torch.distributed.get_world_size()
            assert(world_size >= bn_group)
            assert(world_size % bn_group == 0)

            bn_sync_steps = 1
            if (bn_group == 4):
                bn_sync_steps = 2
            if (bn_group == 8):
                bn_sync_steps = 3

            self.ipc_buffer = torch.cuda.ByteTensor(bnp.get_buffer_size(bn_sync_steps))
            self.my_data = bnp.get_data_ptr(self.ipc_buffer)
            # we are walking on very thin ice here by utilizing internal `_share_cuda_()`
            self.storage = self.ipc_buffer.storage()
            self.share_cuda = self.storage._share_cuda_()
            internal_cuda_mem = self.share_cuda
            # internal_cuda_mem[1]: ipc_mem_handle
            my_handle = torch.cuda.ByteTensor(np.frombuffer(internal_cuda_mem[1], dtype=np.uint8))
            # internal_cuda_mem[3]: offset
            my_offset = torch.cuda.IntTensor([internal_cuda_mem[3]])

            handles_all = torch.empty(world_size, my_handle.size(0),
                                      dtype=my_handle.dtype, device=my_handle.device)
            handles_l = list(handles_all.unbind(0))
            torch.distributed.all_gather(handles_l, my_handle)

            offsets_all = torch.empty(world_size, my_offset.size(0),
                                      dtype=my_offset.dtype, device=my_offset.device)
            offsets_l = list(offsets_all.unbind(0))
            torch.distributed.all_gather(offsets_l, my_offset)

            #whom do I actually care about? that would be local_rank XOR 1
            self.pair_handle = handles_l[local_rank ^ 1].cpu().contiguous()
            pair_offset = offsets_l[local_rank ^ 1].cpu()
            self.pair_data = bnp.get_remote_data_ptr(self.pair_handle, pair_offset)

            if bn_group > 2:
                self.pair_handle2 = handles_l[local_rank ^ 2].cpu().contiguous()
                pair_offset2 = offsets_l[local_rank ^ 2].cpu()
                self.pair_data2 = bnp.get_remote_data_ptr(self.pair_handle2, pair_offset2)

            if bn_group > 4:
                self.pair_handle3 = handles_l[local_rank ^ 4].cpu().contiguous()
                pair_offset3 = offsets_l[local_rank ^ 4].cpu()
                self.pair_data3 = bnp.get_remote_data_ptr(self.pair_handle3, pair_offset3)

            #FIXME: get magic value into C code and eliminate from here
            self.magic = torch.IntTensor([2])
            self.local_rank = local_rank

    def forward(self, x, z=None):
        if z is not None:
            assert(self.fuse_relu == True)
            return bn_addrelu_NHWC_impl.apply(
                x, z, self.weight, self.bias,
                self.running_mean, self.running_var,
                self.minibatch_mean, self.minibatch_riv, self.grid_dim_y, self.ret_cta,
                self.momentum, self.eps, self.training, self.bn_group,
                self.my_data, self.pair_data, (self.magic), self.pair_data2, self.pair_data3,
                self.addrelu_fwd_occupancy, self.addrelu_fwd_grid_dim_x,
                self.addrelu_bwd_occupancy, self.addrelu_bwd_grid_dim_x,
                self.multi_stream)
        else:
            return bn_NHWC_impl.apply(
                x, self.weight, self.bias,
                self.running_mean, self.running_var,
                self.minibatch_mean, self.minibatch_riv, self.ret_cta,
                self.momentum, self.eps, self.fuse_relu, self.training, self.bn_group,
                self.my_data, self.pair_data, (self.magic), self.pair_data2, self.pair_data3,
                self.fwd_occupancy, self.fwd_grid_dim_x,
                self.bwd_occupancy, self.bwd_grid_dim_x,
                self.multi_stream)

    def __del__(self):
        if self.bn_group > 1:
            bnp.close_remote_data(self.pair_handle)
            if self.bn_group > 2:
                bnp.close_remote_data(self.pair_handle2)
                if self.bn_group > 4:
                    bnp.close_remote_data(self.pair_handle3)
apex/contrib/test/test_label_smoothing.py
0 → 100644
View file @
15648029
import torch
from apex.contrib import xentropy as label_smoothing
import unittest

import warnings
import random
import numpy as np
import time


def label_smoothing_raw(x, target, padding_idx, smoothing):
    logprobs = torch.nn.functional.log_softmax(x, dim=-1, dtype=torch.float32)

    non_pad_mask = (target != padding_idx)
    nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
    nll_loss = nll_loss.squeeze(1)[non_pad_mask]
    smooth_loss = -logprobs.mean(dim=-1)[non_pad_mask]
    loss = (1.0 - smoothing) * nll_loss + smoothing * smooth_loss
    return loss


def label_smoothing_opt_1(x, target, padding_idx, smoothing):
    logprobs = torch.nn.functional.log_softmax(x, dim=-1, dtype=torch.float32)

    pad_mask = (target == padding_idx)
    ll_loss = logprobs.gather(dim=-1, index=target.unsqueeze(1)).squeeze(1)
    smooth_loss = logprobs.mean(dim=-1)
    loss = (smoothing - 1.0) * ll_loss - smoothing * smooth_loss
    loss.masked_fill_(pad_mask, 0)
    return loss


class LabelSmoothingTest(unittest.TestCase):
    def setUp(self, seed=1234):
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

        # Set pytorch print precision
        torch.set_printoptions(precision=10)

    def gen_test_inputs(self, N, T, H, smoothing, padding_idx):
        logits = torch.randn((N * T, H), dtype=torch.half, device='cuda',
                             requires_grad=True)
        labels = torch.randint(0, H, [N * T], device='cuda')
        for i in random.sample(range(N * T), N * T // 6):
            labels[i] = padding_idx
        half_to_float = (logits.dtype == torch.half)

        return logits, labels, half_to_float

    def print_max_diff_elem(self, ref, tst):
        ref, tst = ref.flatten(), tst.flatten()
        diff = (ref - tst).abs().max()
        idx = (ref - tst).abs().argmax()
        print("Max atol idx: {}, diff: {:.6f}, ref: {:.6f}, tst: {:.6f}".format(
            idx, diff, ref[idx], tst[idx]))

    def test_label_smoothing_function(self):
        # Set label smoothing configuration
        smoothing, padding_idx = 0.1, 0
        N, T, H = 128, 74, 32320
        iters = 10
        loss_func = label_smoothing.SoftmaxCrossEntropyLoss.apply

        for i in range(iters):
            logits, labels, half_to_float = self.gen_test_inputs(
                N, T, H, smoothing, padding_idx)

            # Run original softmax cross entropy with label smoothing
            logits.grad = None
            losses = label_smoothing_raw(logits, labels, padding_idx, smoothing)
            loss = losses.sum()
            loss.backward()

            ref_loss = loss.clone().detach()
            ref_grad = logits.grad.clone().detach()

            # Run optimized softmax cross entropy with label smoothing
            logits.grad = None
            losses = loss_func(logits, labels, smoothing, padding_idx, half_to_float)
            loss = losses.sum()
            loss.backward()

            val_loss = loss.clone().detach()
            val_grad = logits.grad.clone().detach()

            # Validate
            self.print_max_diff_elem(ref_grad, val_grad)
            self.assertTrue(torch.allclose(ref_loss, val_loss, atol=1e-5, rtol=1e-5))
            self.assertTrue(torch.allclose(ref_grad, val_grad, atol=1e-5, rtol=1e-5))

    def test_label_smoothing_perf(self):
        # Set label smoothing configuration
        smoothing, padding_idx = 0.1, 0
        N, T, H = 128, 74, 32320
        iters = 1000
        loss_func = label_smoothing.SoftmaxCrossEntropyLoss.apply
        print()

        logits, labels, half_to_float = self.gen_test_inputs(
            N, T, H, smoothing, padding_idx)

        # Run original softmax cross entropy with label smoothing
        torch.cuda.synchronize()
        ts = time.time()
        for i in range(iters):
            logits.grad = None
            losses = label_smoothing_raw(logits, labels, padding_idx, smoothing)
            loss = losses.sum() / N
            loss.backward()
        torch.cuda.synchronize()
        print("Raw time {:.2f} s elapsed for {} iterations, norm {:.4f}".format(
            time.time() - ts, iters, logits.grad.norm()))

        # Run optimized softmax cross entropy with label smoothing
        torch.cuda.synchronize()
        ts = time.time()
        for i in range(iters):
            logits.grad = None
            losses = loss_func(logits, labels, smoothing, padding_idx, half_to_float)
            loss = losses.sum() / N
            loss.backward()
        torch.cuda.synchronize()
        print("Opt time {:.2f} s elapsed for {} iterations, norm {:.4f}".format(
            time.time() - ts, iters, logits.grad.norm()))


if __name__ == '__main__':
    unittest.main()
apex/contrib/xentropy/__init__.py
0 → 100644
View file @
15648029
try:
    import torch
    import xentropy_cuda
    from .softmax_xentropy import SoftmaxCrossEntropyLoss
    del torch
    del xentropy_cuda
    del softmax_xentropy
except ImportError as err:
    print("apex was installed without --xentropy flag, contrib.xentropy is not available")