OpenDAS / apex
"...git@developer.sourcefind.cn:OpenDAS/megatron-lm.git" did not exist on "d520d24fdf2eb7c2e76aa5ca3020cbfd07c42910"
Commit fad78c16, authored Feb 10, 2019 by Michael Carilli
Commit message: "Stashing work"
Parent: 8db3f95c
15 changed files, 443 additions and 48 deletions (+443, -48).
Changed files:

apex/amp/__init__.py                 +1    -0
apex/amp/amp.py                      +2    -3
apex/amp/frontend.py                 +1    -1
apex/amp/handle.py                   +6    -4
apex/amp/initialize.py               +14   -1
apex/amp/multi_tensor_apply.py       +103  -0
apex/amp/opt.py                      +3    -1
apex/amp/scaler.py                   +34   -21
apex/fp16_utils/fp16_optimizer.py    +4    -4
apex/optimizers/fp16_optimizer.py    +20   -11
apex/parallel/distributed.py         +0    -1
csrc/multi_tensor_apply.h            +49   -0
csrc/multi_tensor_unscale_kernel.cu  +134  -0
csrc/scale_check_overflow.cpp        +66   -0
setup.py                             +6    -1
apex/amp/__init__.py (+1, -0)

 from .amp import init, half_function, float_function, promote_function, \
     register_half_function, register_float_function, register_promote_function, \
     register
+from .multi_tensor_apply import MultiTensorApply
apex/amp/amp.py (+2, -3)

 from . import compat, rnn_compat, utils, wrap
 from .handle import AmpHandle, NoOpHandle
 from .lists import functional_overrides, torch_overrides, tensor_overrides
-from ..fp16_utils import FP16_Optimizer
 from .frontend import *

 import functools
@@ -65,7 +64,7 @@ def register_promote_function(module, name):

 # Top-level function to insert _all_ the hooks.
-def init(enabled=True, enable_caching=True, verbose=False, allow_banned=False):
+def init(enabled=True, loss_scale="dynamic", enable_caching=True, verbose=False, allow_banned=False):
     global _DECORATOR_HANDLE

     if not enabled:
@@ -73,7 +72,7 @@ def init(enabled=True, enable_caching=True, verbose=False, allow_banned=False):
         _DECORATOR_HANDLE = handle
         return handle

-    handle = AmpHandle(enable_caching, verbose)
+    handle = AmpHandle(loss_scale, enable_caching, verbose)

     # 0) Force-{fp16, fp32} for user-annotated functions
     for mod, fn, cast_fn in _USER_CAST_REGISTRY:
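For context, a minimal usage sketch of amp under the new init signature. The model, optimizer, and loss below are placeholders; scale_loss is the pre-existing handle API (see apex/amp/handle.py below), and loss_scale is the argument this commit adds, so treat this as an illustration rather than a documented recipe.

import torch
from apex import amp

model = torch.nn.Linear(10, 10).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# loss_scale may be "dynamic" (the new default) or a fixed value such as 128.
handle = amp.init(loss_scale="dynamic")

loss = model(torch.randn(4, 10, device="cuda").half()).float().sum()
with handle.scale_loss(loss, optimizer) as scaled_loss:
    scaled_loss.backward()      # backward runs on loss * loss_scale
optimizer.step()                # wrapped so it can be skipped after an overflow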
apex/amp/frontend.py (+1, -1)

@@ -199,7 +199,7 @@ def register(enabled=False,
     for k, v in amp.opt_properties.options:
         print("{:20} : {}", k, v)

-    initialize(optimizers, models)
+    return initialize(optimizers, models)

 def check_option_consistency(enabled=False,
apex/amp/handle.py (+6, -4)

@@ -4,14 +4,14 @@ import warnings
 from . import utils
 from .opt import OptimWrapper
-from .scaler import LossScaler
+from .scaler import LossScaler, iter_params

 class AmpHandle(object):
-    def __init__(self, enable_caching=True, verbose=False):
+    def __init__(self, loss_scale="dynamic", enable_caching=True, verbose=False):
         self._enable_caching = enable_caching
         self._verbose = verbose
         self._cache = dict()
-        self._default_scaler = LossScaler()
+        self._default_scaler = LossScaler(loss_scale)
         self._is_active = True
         self._all_wrappers = []
@@ -44,7 +44,9 @@ class AmpHandle(object):
         yield loss * loss_scale

         should_skip = self._default_scaler.unscale_and_update(
-            optimizer.param_groups, loss_scale)
+            iter_params(optimizer.param_groups),
+            iter_params(optimizer.param_groups),
+            loss_scale)
         if should_skip:
             optimizer_step = optimizer.step
             def skip_step():
apex/amp/initialize.py (+14, -1)

 import torch
 from torch._six import container_abcs, string_classes
 import functools
+from apex.fp16_utils import convert_network

 def to_type(dtype, t):
@@ -36,7 +37,7 @@ def initialize(optimizers, models, properties):
     if properties.cast_model_type is not None:
         if properties.cast_batchnorm is not None:
             for model in models:
-                model.to(properties.cast_model_type)
+                convert_network(model, properties.cast_model_type)
         else:
             for model in models:
                 model.to(properties.cast_model_type)
@@ -55,3 +56,15 @@ def initialize(optimizers, models, properties):
     # State dict trick to recast any preexisting per-param state tensors
     for optimizer in optimizers:
         optimizer.load_state_dict(optimizer.state_dict())
+
+    if properties.master_weights:
+        for i, optimizer in enumerate(optimizers):
+            if properties.loss_scale == "dynamic":
+                optimizers[i] = FP16_Optimizer(optimizer[i], dynamic_loss_scale=True)
+            else:
+                optimizers[i] = FP16_Optimizer(optimizer[i],
+                                               static_loss_scale=properties.loss_scale)
+
+    if properties.cast_torch_functions:
+        handle = amp.init()
+        # the handle is also globally accessible as amp._DECORATOR_HANDLE
+
+    return optimizers, models
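One note on the block added above: optimizer is already the element yielded by enumerate(optimizers), so the optimizer[i] indexing looks like work in progress (the file also does not yet import FP16_Optimizer or amp). A minimal sketch of the presumably intended wrapping, using FP16_Optimizer from apex.fp16_utils and a hypothetical helper name:

from apex.fp16_utils import FP16_Optimizer

def wrap_with_master_weights(optimizers, properties):
    # Replace each optimizer with an FP16_Optimizer that keeps fp32 master weights,
    # choosing dynamic or static loss scaling from the requested properties.
    for i, optimizer in enumerate(optimizers):
        if properties.loss_scale == "dynamic":
            optimizers[i] = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizers[i] = FP16_Optimizer(optimizer,
                                           static_loss_scale=properties.loss_scale)
    return optimizers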
apex/amp/multi_tensor_apply.py (new file, +103 lines)

import torch

from amp_C import prep_multi_tensor_launch

class MultiTensorApply(object):
    def __init__(self, max_blocks, max_tensors, max_depth, chunk_size):
        self.chunk_size = chunk_size
        self.reallocate(max_blocks, max_tensors, max_depth)

    def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
        self.assign_blocks(tensor_lists)
        # print(self.gpu_block_to_tensor)
        # print(self.gpu_block_to_chunk)
        # print(self.gpu_tensor_sizes)
        return op(self.nblocks,
                  noop_flag_buffer,
                  self.cpu_tensor_addresses,
                  self.gpu_block_to_tensor,
                  self.gpu_block_to_chunk,
                  self.gpu_tensor_sizes,
                  self.gpu_tensor_addresses,
                  self.chunk_size,
                  tensor_lists,
                  *args)
        # print()
        # print([[p.data_ptr() for p in l] for l in tensor_lists])
        # print()
        # print(self.gpu_tensor_addresses)

    def assign_blocks(self, tensor_lists):
        needs_reallocate = False
        # Currently, this loop appears prohibitively expensive.
        # Need to move to c++.
        torch.cuda.nvtx.range_push("assign_blocks loop")
        # list0 = tensor_lists[0]
        # self.nblocks = 0
        # for t, tensor in enumerate(list0):
        #     blocks_this_tensor = (tensor.numel() +
        #                           self.chunk_size - 1)//self.chunk_size
        #     if not needs_reallocate:
        #         self.cpu_tensor_sizes[t] = tensor.numel()
        #     for chunk in range(blocks_this_tensor):
        #         if self.nblocks >= self.max_blocks:
        #             needs_reallocate = True
        #         if not needs_reallocate:
        #             self.cpu_block_to_tensor[self.nblocks] = t
        #             self.cpu_block_to_chunk[self.nblocks] = chunk
        #         self.nblocks += 1
        needs_reallocate, self.nblocks = prep_multi_tensor_launch(
            self.cpu_block_to_tensor,
            self.cpu_block_to_chunk,
            self.cpu_tensor_sizes,
            self.gpu_block_to_tensor,
            self.gpu_block_to_chunk,
            self.gpu_tensor_sizes,
            self.chunk_size,
            self.max_depth,
            self.max_tensors,
            self.max_blocks,
            tensor_lists)
        torch.cuda.nvtx.range_pop()

        print(self.nblocks)

        if self.nblocks > self.max_blocks:
            self.max_blocks = self.nblocks
        if len(tensor_lists) > self.max_depth:
            self.max_depth = len(tensor_lists)
        if len(tensor_lists[0]) > self.max_tensors:
            self.max_tensors = len(tensor_lists[0])

        if needs_reallocate:
            self.reallocate(self.max_blocks, self.max_tensors, self.max_depth)
            needs_reallocate, self.nblocks = prep_multi_tensor_launch(
                self.cpu_block_to_tensor,
                self.cpu_block_to_chunk,
                self.cpu_tensor_sizes,
                self.gpu_block_to_tensor,
                self.gpu_block_to_chunk,
                self.gpu_tensor_sizes,
                self.chunk_size,
                self.max_depth,
                self.max_tensors,
                self.max_blocks,
                tensor_lists)
            assert needs_reallocate == 0, "Should not need reallocate on second attempt."
            assert self.nblocks <= self.max_blocks, "Should not need to increase blocks again."

    def reallocate(self, max_blocks, max_tensors, max_depth):
        self.max_blocks = max_blocks
        self.max_tensors = max_tensors
        self.max_depth = max_depth
        self.cpu_block_to_tensor = torch.IntTensor(max_blocks).pin_memory()
        self.cpu_block_to_chunk = torch.IntTensor(max_blocks).pin_memory()
        self.cpu_tensor_sizes = torch.IntTensor(max_tensors).pin_memory()
        self.cpu_tensor_addresses = torch.LongTensor(max_depth, max_tensors).pin_memory()
        self.gpu_block_to_tensor = torch.cuda.IntTensor(max_blocks)
        self.gpu_block_to_chunk = torch.cuda.IntTensor(max_blocks)
        self.gpu_tensor_sizes = torch.cuda.IntTensor(max_tensors)
        self.gpu_tensor_addresses = torch.cuda.LongTensor(max_depth, max_tensors)
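A hedged usage sketch of the new helper, pairing it with the multi_tensor_unscale binding added in csrc/scale_check_overflow.cpp. The sizes and tensors are illustrative, and it assumes apex was built with the amp_C CUDA extension on a CUDA-capable machine.

import torch
import amp_C
from apex.amp.multi_tensor_apply import MultiTensorApply

# Two "lists of tensors": fp16 model grads in, fp32 master grads out.
model_grads = [torch.randn(5000, device="cuda").half() for _ in range(4)]
master_grads = [torch.empty(5000, device="cuda") for _ in range(4)]

overflow_buf = torch.cuda.IntTensor([0])   # the kernel's noop/overflow flag
applier = MultiTensorApply(max_blocks=512, max_tensors=16, max_depth=2, chunk_size=2048)

# __call__ forwards (nblocks, noop_flag, the bookkeeping buffers, chunk_size,
# tensor_lists, *args) to the op, matching multi_tensor_unscale_cuda's signature.
applier(amp_C.multi_tensor_unscale,
        overflow_buf,
        [model_grads, master_grads],
        1. / 128.)                         # the unscale factor, i.e. 1/loss_scale

print("overflow detected:", bool(overflow_buf.item()))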
apex/amp/opt.py (+3, -1)

@@ -38,7 +38,9 @@ class OptimWrapper(object):
         yield loss * loss_scale

         self._skip_next[self._loss_idx] = self._cur_loss_scaler().unscale_and_update(
-            self._optimizer.param_groups, loss_scale)
+            iter_params(self._optimizer.param_groups),
+            iter_params(self._optimizer.param_groups),
+            loss_scale)
         self._loss_idx += 1

         if len(cached_grads) > 0:
apex/amp/scaler.py (+34, -21)

@@ -3,7 +3,7 @@ import logging
 # from apex_C import scale_check_overflow

-def scale_check_overflow_python(d_grads, scale):
+def scale_check_overflow_python(model_grad, scale, master_grad):
     # Exception handling for 18.04 compatibility
     try:
         cpu_sum = float(d_grads.float().sum())
@@ -14,7 +14,9 @@ def scale_check_overflow_python(d_grads, scale):
     else:
         if cpu_sum == float('inf') or cpu_sum == -float('inf') or cpu_sum != cpu_sum:
             return True
-        d_grads.mul_(scale)
+        if master_grad is not model_grad:
+            master_grad.copy_(model_grad)
+        master_grad.mul_(scale)
         return False

 class LossScaler(object):
@@ -22,10 +24,19 @@ class LossScaler(object):
     warned_fp16_grad = False
     has_fused_kernel = False

-    def __init__(self):
-        self._loss_scale = 2.**16
+    def __init__(self,
+                 loss_scale,
+                 init_scale=2.**16,
+                 scale_factor=2.,
+                 scale_window=2000):
+        if loss_scale == "dynamic":
+            self.dynamic = True
+            self._loss_scale = init_scale
+        else:
+            self.dynamic = False
+            self._loss_scale = loss_scale
         self._max_loss_scale = 2.**24
-        self._scale_seq_len = 2000
+        self._scale_seq_len = scale_window
         self._unskipped = 0
         self._has_overflow = False
         try:
@@ -44,35 +55,37 @@ class LossScaler(object):
     def loss_scale(self):
         return self._loss_scale

-    def unscale_and_update(self, param_groups, scale):
+    def unscale_and_update(self, model_params, master_params, scale):
         if LossScaler.has_fused_kernel:
             self._overflow_buf.zero_()
         self._has_overflow = False
-        for p in iter_params(param_groups):
-            if p.grad is not None:
-                if LossScaler.has_fused_kernel and p.grad.data.type() == "torch.cuda.FloatTensor":
-                    LossScaler.scale_check_overflow_cuda(p.grad.data,
+        for model, master in zip(model_params, master_params):
+            if model.grad is not None:
+                if LossScaler.has_fused_kernel and master.grad.data.type() == "torch.cuda.FloatTensor":
+                    LossScaler.scale_check_overflow_cuda(model.grad.data,
                                                          1./scale,
                                                          self._overflow_buf,
-                                                         p.grad.data)
+                                                         master.grad.data)
                 else:
-                    if (p.grad.data.type() != "torch.cuda.FloatTensor"
+                    if (master.grad.data.type() != "torch.cuda.FloatTensor"
                             and not LossScaler.warned_fp16_grad):
                         logger = logging.getLogger("apex.amp")
-                        logger.warning("Incoming grads are not fp32 (not master grads). "
+                        logger.warning("Attempting to downscale {} grads. ".format(master.grad.data.type()) +
                                        "Downscaling non-fp32 grads may indicate an error. "
                                        "When using Amp, you don't need to call .half() on your model.")
                         LossScaler.warned_fp16_grad = True
-                    self._has_overflow = scale_check_overflow_python(p.grad.data,
-                                                                     1./scale)
-                    if self._has_overflow:
+                    self._has_overflow = scale_check_overflow_python(model.grad.data,
+                                                                     1./scale,
+                                                                     master.grad.data)
+                    if self._has_overflow and self.dynamic:
                         break

         # If the fused kernel is available, we only need one D2H memcopy and sync.
-        if LossScaler.has_fused_kernel and not self._has_overflow:
+        if LossScaler.has_fused_kernel and self.dynamic and not self._has_overflow:
             self._has_overflow = self._overflow_buf.item()

-        if self._has_overflow:
+        if self._has_overflow and self.dynamic:
             should_skip = True
             self._loss_scale /= 2.
             self._unskipped = 0
@@ -80,7 +93,7 @@ class LossScaler(object):
             should_skip = False
             self._unskipped += 1

-        if self._unskipped == self._scale_seq_len:
+        if self._unskipped == self._scale_seq_len and self.dynamic:
             self._loss_scale = min(self._max_loss_scale, self._loss_scale * 2.)
             self._unskipped = 0
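To make the new constructor and the three-argument unscale_and_update concrete, here is a small hedged sketch mirroring how apex/amp/handle.py now calls it; the same iterable is passed for model and master params because no separate master copy exists on that path.

import torch
from apex.amp.scaler import LossScaler, iter_params

model = torch.nn.Linear(10, 10).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
loss = model(torch.randn(4, 10, device="cuda")).sum()

scaler = LossScaler("dynamic")            # or LossScaler(128.) for a fixed scale
(loss * scaler.loss_scale()).backward()

should_skip = scaler.unscale_and_update(
    iter_params(optimizer.param_groups),  # model grads
    iter_params(optimizer.param_groups),  # master grads (same tensors here)
    scaler.loss_scale())
if not should_skip:                       # only dynamic scaling ever requests a skip
    optimizer.step()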
apex/fp16_utils/fp16_optimizer.py (+4, -4)

@@ -4,7 +4,7 @@ from torch.autograd import Variable
 from torch.nn.parameter import Parameter
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors

-from .loss_scaler import DynamicLossScaler, LossScaler
+from ..amp.scaler import LossScaler
 from .fp16util import model_grads_to_master_grads, master_params_to_model_params, clip_grad_norm

 # TODO: Update overflow check + downscale to use Carl's fused kernel.
@@ -162,9 +162,9 @@ class FP16_Optimizer(object):
         if dynamic_loss_scale:
             self.dynamic_loss_scale = True
             if dynamic_loss_args is not None:
-                self.loss_scaler = DynamicLossScaler(**dynamic_loss_args)
+                self.loss_scaler = LossScaler("dynamic", **dynamic_loss_args)
             else:
-                self.loss_scaler = DynamicLossScaler()
+                self.loss_scaler = LossScaler("dynamic")
         else:
             self.dynamic_loss_scale = False
             self.loss_scaler = LossScaler(static_loss_scale)
@@ -480,7 +480,7 @@ class FP16_Optimizer(object):
         # a loss scale that works. After you find a loss scale that works, do a final dummy
         # backward pass with retain_graph=False to tear down the graph. Doing this would avoid
         # discarding the iteration, but probably wouldn't improve overall efficiency.
-        self.loss_scaler.backward(loss.float(), retain_graph=retain_graph)
+        (loss.float()*self.loss_scaler.loss_scale()).backward(retain_graph=retain_graph)
         if update_master_grads:
             self.update_master_grads()
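For reference, a hedged sketch of FP16_Optimizer usage touching the changed paths: the dynamic branch now builds LossScaler("dynamic") instead of DynamicLossScaler, and backward() now scales the loss explicitly before calling .backward().

import torch
from apex.fp16_utils import FP16_Optimizer

model = torch.nn.Linear(10, 10).cuda().half()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

# Internally this path now constructs LossScaler("dynamic").
fp16_optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)

loss = model(torch.randn(4, 10, device="cuda").half()).float().sum()
fp16_optimizer.backward(loss)   # runs (loss.float() * loss_scale).backward()
fp16_optimizer.step()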
apex/optimizers/fp16_optimizer.py (+20, -11)

@@ -2,17 +2,26 @@ import torch
 from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
 import ctypes

-lib = ctypes.cdll.LoadLibrary(None)
-lib.THCudaHalfTensor_normall.argtypes=[ctypes.c_void_p, ctypes.c_void_p]
-lib.THCudaHalfTensor_normall.restype = ctypes.c_float
-
-def fused_norm(input):
-    if input.type() == 'torch.cuda.HalfTensor':
-        # 16384 is half 2 if you stare at it long enough
-        return lib.THCudaHalfTensor_normall(torch.cuda._state_cdata,
-                                            input._cdata, 16384)
-    else:
-        return input.norm()
+stashed_err = None
+try:
+    lib = ctypes.cdll.LoadLibrary(None)
+    lib.THCudaHalfTensor_normall.argtypes=[ctypes.c_void_p, ctypes.c_void_p]
+    lib.THCudaHalfTensor_normall.restype = ctypes.c_float
+
+    def fused_norm(input):
+        if input.type() == 'torch.cuda.HalfTensor':
+            # 16384 is half 2 if you stare at it long enough
+            return lib.THCudaHalfTensor_normall(torch.cuda._state_cdata,
+                                                input._cdata, 16384)
+        else:
+            return input.norm()
+except TypeError as err:
+    stashed_err = err
+    def fused_norm(input):
+        raise RuntimeError("Failed to create fused_norm. This may happen on Windows "
+                           "because of lib = ctypes.cdll.LoadLibrary(None): you can't "
+                           "LoadLibrary with None. Original exception message was ", stashed_err)

 class FP16_Optimizer(object):
     """
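A short hedged sketch of what the new try/except changes in practice: where ctypes.cdll.LoadLibrary(None) succeeds, fused_norm still calls the THC half-precision norm; where it raises TypeError (notably Windows), importing the module no longer fails and the error only surfaces if fused_norm is actually called.

import torch
from apex.optimizers.fp16_optimizer import fused_norm

grads = torch.randn(1024, device="cuda").half()
try:
    norm = fused_norm(grads)      # THCudaHalfTensor_normall when the ctypes path loaded
except RuntimeError:
    norm = grads.float().norm()   # a fallback a caller might choose on such platforms
print(float(norm))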
apex/parallel/distributed.py (+0, -1)

@@ -322,7 +322,6 @@ class DistributedDataParallel(Module):
                     grad_acc = param_tmp.grad_fn.next_functions[0][0]

                     def allreduce_hook(*unused):
-                        print("hook fired")
                         if self.delay_allreduce or self.needs_refresh:
                             # TODO: How do we want to handle multiple backward passes between
                             # each forward, e.g., backward passes with retain_graph=True?
csrc/multi_tensor_apply.h (new file, +49 lines)

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>

#include <assert.h>
#include <cuda_runtime.h>

template<typename T, typename... ArgTypes>
__global__ void multi_tensor_apply_kernel(
  volatile int* noop_flag,
  int* block_to_tensor,
  int* block_to_chunk, // could also get this from scan
  int* tensor_sizes,
  int chunk_size,
  void** addresses,
  int addresses_x,
  T callable,
  ArgTypes... args) // in_t** in, float** out, float scale
{
  __shared__ int noop;
  __shared__ int chunk_idx;
  __shared__ int tensor_idx;
  __shared__ int n;

  if(threadIdx.x == 0)
  {
    noop = *noop_flag;
    tensor_idx = block_to_tensor[blockIdx.x];
    chunk_idx = block_to_chunk[blockIdx.x];
    n = tensor_sizes[tensor_idx];
  }

  __syncthreads();

  if(noop == 1)
    return;

  // Hand the chunk information to the user-supplied functor to process however it likes.
  callable(noop_flag, tensor_idx, chunk_idx, chunk_size, n, addresses, addresses_x, args...);
}
csrc/multi_tensor_unscale_kernel.cu (new file, +134 lines)

#include <ATen/ATen.h>
#include <ATen/AccumulateType.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/Exceptions.h>
#include "multi_tensor_apply.h"

#include <assert.h>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256
#define ILP 4

template<typename in_t>
struct UnscaleFunctor
{
  __device__ __forceinline__ void operator()(
    volatile int* noop_flag,
    int tensor_idx,
    int chunk_idx,
    int chunk_size,
    int n,
    void** addresses,
    int addresses_x,
    float scale)
  {
    __shared__ int noop;

    in_t* in = (in_t*)addresses[tensor_idx];
    in += chunk_idx*chunk_size;

    float* out = (float*)addresses[addresses_x + tensor_idx];
    out += chunk_idx*chunk_size;

    n -= chunk_idx*chunk_size;

    // Non-divergent exit condition for the __syncthreads
    float incoming_vals[ILP];
    for(int i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x*ILP)
    {
      if(threadIdx.x == 0)
        noop = *noop_flag;

      __syncthreads();

      if(noop == 1)
        break;

      #pragma unroll
      for(int ii = 0; ii < ILP; ii++)
      {
        incoming_vals[ii] = 0;
        int i = i_start + threadIdx.x + ii*blockDim.x;
        if(i < n)
          incoming_vals[ii] = static_cast<float>(in[i]);
      }

      #pragma unroll
      for(int ii = 0; ii < ILP; ii++)
      {
        int i = i_start + threadIdx.x + ii*blockDim.x;
        if(i < n)
          if(isfinite(incoming_vals[ii]))
            out[i] = incoming_vals[ii]*scale;
          else
            *noop_flag = 1; // Blindly fire off a write. These will race but that's ok.
      }
      // This is NOT guaranteed to be seen immediately by thread 0 on the next iteration.
    }
    // I wonder if there's a way we can rig the short-circuiting with only one syncthreads.
  }
  // It's possible we can just lean on the cache (no smem or syncs) and still be fast.
};

void multi_tensor_unscale_cuda(
  int nblocks,
  at::Tensor noop_flag,
  at::Tensor cpu_tensor_addresses,
  at::Tensor gpu_block_to_tensor,
  at::Tensor gpu_block_to_chunk,
  at::Tensor gpu_tensor_sizes,
  at::Tensor gpu_tensor_addresses,
  int chunk_size,
  std::vector<std::vector<at::Tensor>> tensor_lists,
  float scale)
{
  using namespace at;

  int addresses_x = gpu_tensor_addresses.size(1);

  // <.< >.> i don't see any cops. i'm going to access the pointers directly.
  // auto addresses_a = cpu_tensor_addresses.accessor<int64_t, 2>();
  // This logic could be moved to prep_multi_tensor_launch, but we might need to
  // pick which kernel instantiation to launch based on the RTTI of tensor_lists,
  // so we may as well accept tensor_lists and extract the pointers here.
  void** addresses_a = (void**)cpu_tensor_addresses.data_ptr();
  int len0 = tensor_lists[0].size();
  for(unsigned int l = 0; l < tensor_lists.size(); l++)
  {
    AT_CHECK(tensor_lists[l].size() == len0, "Lengths of tensor lists do not match.");
    for(unsigned int t = 0; t < tensor_lists[l].size(); t++)
    {
      AT_CHECK(tensor_lists[l][t].numel() == tensor_lists[0][t].numel(),
               "Numel mismatch in corresponding tensors in different lists.");
      addresses_a[l*addresses_x + t] = tensor_lists[l][t].data_ptr();
      // addresses_a[l][t] = (void*)tensor_lists[l][t].data<float>();
    }
  }

  cudaStream_t stream = at::cuda::getCurrentCUDAStream();

  gpu_tensor_addresses.copy_(cpu_tensor_addresses, 1/*non_blocking*/);

  // Lock the output (downscaled) type to float.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensor_lists[0][0].type(), "multi_tensor_unscale_cuda",
    [&]
    {
      // using accscalar_t = acc_type<scalar_t, true>;
      multi_tensor_apply_kernel<<<nblocks, BLOCK_SIZE, 0, stream>>>(
        noop_flag.data<int>(),
        gpu_block_to_tensor.data<int>(),
        gpu_block_to_chunk.data<int>(),
        gpu_tensor_sizes.data<int>(),
        chunk_size,
        (void**)gpu_tensor_addresses.data_ptr(),
        addresses_x,
        UnscaleFunctor<scalar_t>(),
        scale);
    });

  AT_CUDA_CHECK(cudaGetLastError());

  // AT_CUDA_CHECK(cudaDeviceSynchronize());
}
csrc/scale_check_overflow.cpp (+66, -0)

 #include <torch/extension.h>

+void multi_tensor_unscale_cuda(int nblocks,
+                               at::Tensor noop_flag,
+                               at::Tensor cpu_tensor_addresses,
+                               at::Tensor gpu_block_to_tensor,
+                               at::Tensor gpu_block_to_chunk,
+                               at::Tensor gpu_tensor_sizes,
+                               at::Tensor gpu_tensor_addresses,
+                               int chunk_size,
+                               std::vector<std::vector<at::Tensor>> tensor_lists,
+                               float scale);
+
+std::vector<int> prep_multi_tensor_launch(at::Tensor cpu_block_to_tensor,
+                                          at::Tensor cpu_block_to_chunk,
+                                          at::Tensor cpu_tensor_sizes,
+                                          at::Tensor gpu_block_to_tensor,
+                                          at::Tensor gpu_block_to_chunk,
+                                          at::Tensor gpu_tensor_sizes,
+                                          int chunk_size,
+                                          int max_depth,
+                                          int max_tensors,
+                                          int max_blocks,
+                                          std::vector<std::vector<at::Tensor>> tensor_lists)
+{
+  int needs_reallocate = 0;
+  if(tensor_lists.size() > max_depth || tensor_lists[0].size() > max_tensors)
+    needs_reallocate = 1;
+
+  auto cpu_tensor_sizes_a = cpu_tensor_sizes.accessor<int,1>();
+  auto cpu_block_to_tensor_a = cpu_block_to_tensor.accessor<int,1>();
+  auto cpu_block_to_chunk_a = cpu_block_to_chunk.accessor<int,1>();
+
+  int nblocks = 0;
+  for(int t = 0; t < tensor_lists[0].size(); t++)
+  {
+    int blocks_this_tensor = (tensor_lists[0][t].numel() + chunk_size - 1)/chunk_size;
+    if(!needs_reallocate)
+      cpu_tensor_sizes_a[t] = tensor_lists[0][t].numel();
+
+    for(int chunk = 0; chunk < blocks_this_tensor; chunk++)
+    {
+      if(nblocks >= max_blocks)
+        needs_reallocate = 1;
+      if(!needs_reallocate)
+      {
+        cpu_block_to_tensor_a[nblocks] = t;
+        cpu_block_to_chunk_a[nblocks] = chunk;
+      }
+      nblocks++;
+    }
+  }
+
+  if(!needs_reallocate)
+  {
+    gpu_block_to_tensor.copy_(cpu_block_to_tensor, 1);
+    gpu_block_to_chunk.copy_(cpu_block_to_chunk, 1);
+    gpu_tensor_sizes.copy_(cpu_tensor_sizes, 1);
+  }
+
+  return std::vector<int>{needs_reallocate, nblocks};
+}
+
 void scale_check_overflow_cuda(const at::Tensor& grads,
                                float scale,
                                const at::Tensor& d_buf,

@@ -27,4 +90,7 @@ void scale_check_overflow(at::Tensor grads,
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   m.def("scale_check_overflow", &scale_check_overflow, "Fused overflow check + scale for FP32 tensors");
+  m.def("prep_multi_tensor_launch", &prep_multi_tensor_launch, "Prepare multitensor launch");
+  m.def("multi_tensor_unscale", &multi_tensor_unscale_cuda,
+        "Fused overflow check + unscale for a list of contiguous tensors");
 }
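A hedged sketch of calling the new prep_multi_tensor_launch binding directly from Python, mirroring MultiTensorApply.assign_blocks; buffer sizes are illustrative and the amp_C extension is assumed to be built with --cuda_ext.

import torch
import amp_C

max_blocks, max_tensors, max_depth, chunk_size = 64, 8, 2, 2048
cpu_block_to_tensor = torch.IntTensor(max_blocks).pin_memory()
cpu_block_to_chunk = torch.IntTensor(max_blocks).pin_memory()
cpu_tensor_sizes = torch.IntTensor(max_tensors).pin_memory()
gpu_block_to_tensor = torch.cuda.IntTensor(max_blocks)
gpu_block_to_chunk = torch.cuda.IntTensor(max_blocks)
gpu_tensor_sizes = torch.cuda.IntTensor(max_tensors)

tensor_lists = [[torch.randn(5000, device="cuda").half()],
                [torch.empty(5000, device="cuda")]]

# Returns {needs_reallocate, nblocks}; 5000 elements at chunk_size 2048 give 3 chunks.
needs_reallocate, nblocks = amp_C.prep_multi_tensor_launch(
    cpu_block_to_tensor, cpu_block_to_chunk, cpu_tensor_sizes,
    gpu_block_to_tensor, gpu_block_to_chunk, gpu_tensor_sizes,
    chunk_size, max_depth, max_tensors, max_blocks, tensor_lists)
print(needs_reallocate, nblocks)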
setup.py (+6, -1)

@@ -39,7 +39,12 @@ if "--cuda_ext" in sys.argv:
     ext_modules.append(
         CUDAExtension(name='amp_C',
                       sources=['csrc/scale_check_overflow.cpp',
-                               'csrc/scale_check_overflow_kernel.cu']))
+                               'csrc/scale_check_overflow_kernel.cu',
+                               'csrc/multi_tensor_unscale_kernel.cu'],
+                      extra_compile_args={'cxx': ['-O3',],
+                                          'nvcc':['-lineinfo',
+                                                  '-O3',
+                                                  '--use_fast_math']}))
     ext_modules.append(
         CUDAExtension(name='fused_adam_cuda',
                       sources=['apex/optimizers/csrc/fused_adam_cuda.cpp',
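As a quick hedged check that the rebuilt extension exposes the bindings registered in csrc/scale_check_overflow.cpp (assuming apex was reinstalled with setup.py's --cuda_ext option so amp_C picks up the new kernel):

import amp_C

print(hasattr(amp_C, "scale_check_overflow"))      # pre-existing binding
print(hasattr(amp_C, "prep_multi_tensor_launch"))  # new in this commit
print(hasattr(amp_C, "multi_tensor_unscale"))      # new in this commit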