OpenDAS / apex / Commits / 2cbca1a4

Commit 2cbca1a4: Merge branch 'master' into api_refactor
Authored Feb 06, 2019 by Michael Carilli
Parents: a9a3fe57, 340e71a4
Changes: 29
Showing 9 changed files with 296 additions and 23 deletions (+296, -23)
examples/word_language_model/data.py                  +12   -1
examples/word_language_model/main.py                  +32  -14
examples/word_language_model/main_fp16_optimizer.py   +17   -5
setup.py                                               +4   -0
tests/distributed/ddp_race_condition_test.py           +2   -2
tests/run_amp/test_scale.py                           +93   -0
tests/run_fp16_optimizer/test_fp16_optimizer.py        +2   -0
tests/run_mixed_adam/test_fp16_optimizer.py          +133   -0
tests/run_test.py                                      +1   -1
examples/word_language_model/data.py  (+12 -1)

@@ -18,12 +18,23 @@ class Dictionary(object):
 class Corpus(object):
-    def __init__(self, path):
+    def __init__(self, path, pad_to_multiple_of=1):
+        # Synthetic elements used to pad the dictionary length.
+        # It is assumed that these synthetic elements do not appear in the actual data files.
+        self.synthetic = ["vvvvvvvv" + str(i) for i in range(pad_to_multiple_of - 1)]
         self.dictionary = Dictionary()
         self.train = self.tokenize(os.path.join(path, 'train.txt'))
         self.valid = self.tokenize(os.path.join(path, 'valid.txt'))
         self.test = self.tokenize(os.path.join(path, 'test.txt'))
+        # Pad dictionary size to desired multiple. For example, padding to a multiple of 8
+        # is necessary to ensure Tensor Core usage for the decoder.
+        pad_elem = pad_to_multiple_of - len(self.dictionary) % pad_to_multiple_of
+        if pad_elem != pad_to_multiple_of:
+            for i in range(pad_elem):
+                self.dictionary.add_word(self.synthetic[i])

     def tokenize(self, path):
         """Tokenizes a text file."""
         assert os.path.exists(path)
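As a quick illustration of the arithmetic above: pad_elem is the number of synthetic words needed to round the vocabulary up, and the guard skips padding when the size is already an exact multiple. A standalone sketch (the vocabulary sizes here are hypothetical, not from the commit):

def padded_len(vocab_len, multiple):
    # Mirrors the Corpus logic: round vocab_len up to the next multiple.
    pad_elem = multiple - vocab_len % multiple
    if pad_elem != multiple:   # already a multiple -> nothing to add
        vocab_len += pad_elem
    return vocab_len

assert padded_len(33278, 8) == 33280   # e.g. a wikitext-2-sized vocabulary
assert padded_len(33280, 8) == 33280   # multiples of 8 are left unchanged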
examples/word_language_model/main.py  (+32 -14)

@@ -17,9 +17,9 @@ parser.add_argument('--data', type=str, default='./data/wikitext-2',
                     help='location of the data corpus')
 parser.add_argument('--model', type=str, default='LSTM',
                     help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
-parser.add_argument('--emsize', type=int, default=200,
+parser.add_argument('--emsize', type=int, default=1504,
                     help='size of word embeddings')
-parser.add_argument('--nhid', type=int, default=200,
+parser.add_argument('--nhid', type=int, default=1504,
                     help='number of hidden units per layer')
 parser.add_argument('--nlayers', type=int, default=2,
                     help='number of layers')

@@ -29,11 +29,11 @@ parser.add_argument('--clip', type=float, default=0.25,
                     help='gradient clipping')
 parser.add_argument('--epochs', type=int, default=40,
                     help='upper epoch limit')
-parser.add_argument('--batch_size', type=int, default=20, metavar='N',
+parser.add_argument('--batch_size', type=int, default=24, metavar='N',
                     help='batch size')
 parser.add_argument('--bptt', type=int, default=35,
                     help='sequence length')
-parser.add_argument('--dropout', type=float, default=0.2,
+parser.add_argument('--dropout', type=float, default=0.65,
                     help='dropout applied to layers (0 = no dropout)')
 parser.add_argument('--tied', action='store_true',
                     help='tie the word embedding and softmax weights')

@@ -47,7 +47,7 @@ parser.add_argument('--save', type=str, default='model.pt',
                     help='path to save the final model')
 parser.add_argument('--fp16', action='store_true',
                     help='Run model in pseudo-fp16 mode (fp16 storage fp32 math).')
-parser.add_argument('--static-loss-scale', type=float, default=1,
+parser.add_argument('--static-loss-scale', type=float, default=128.0,
                     help='Static loss scale, positive power of 2 values can improve fp16 convergence.')
 args = parser.parse_args()

@@ -64,7 +64,9 @@ if args.fp16 and not args.cuda:
 # Load data
 ###############################################################################

-corpus = data.Corpus(args.data)
+# Ensure that the dictionary length is a multiple of 8,
+# so that the decoder's GEMMs will use Tensor Cores.
+corpus = data.Corpus(args.data, pad_to_multiple_of=8)

 # Starting from sequential data, batchify arranges the dataset into columns.
 # For instance, with the alphabet as the sequence and batch size 4, we'd get

@@ -99,6 +101,16 @@ test_data = batchify(corpus.test, eval_batch_size)
 ###############################################################################

 ntokens = len(corpus.dictionary)
+if args.fp16 and args.cuda:
+    if ntokens % 8 != 0:
+        print("Warning: the dictionary size (ntokens = {}) should be a multiple of 8 to ensure "
+              "Tensor Core use for the decoder's GEMMs.".format(ntokens))
+    if args.emsize % 8 != 0 or args.nhid % 8 != 0 or args.batch_size % 8 != 0:
+        print("Warning: emsize = {}, nhid = {}, batch_size = {} should all be multiples of 8 "
+              "to ensure Tensor Core use for the RNN's GEMMs.".format(args.emsize, args.nhid, args.batch_size))
+
 model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
 if args.cuda and args.fp16:

@@ -106,6 +118,12 @@ if args.cuda and args.fp16:
     model_params, master_params = prep_param_lists(model)
 elif args.cuda:
     model.cuda()

+if (not args.fp16) or (not args.cuda):
+    print("Warning: static_loss_scale != 1.0 is only necessary with --fp16. "
+          "Resetting static_loss_scale to 1.0")
+    args.static_loss_scale = 1.0
+
 criterion = nn.CrossEntropyLoss()

 ###############################################################################

@@ -172,21 +190,21 @@ def train():
         loss = criterion(output.view(-1, ntokens), targets)
         loss = loss * args.static_loss_scale
         loss.backward()
-        loss = loss / args.static_loss_scale
+        loss.data = loss.data / args.static_loss_scale
         # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
+        # apex.fp16_utils.clip_grad_norm selects between "torch.nn.utils.clip_grad_norm"
+        # and "torch.nn.utils.clip_grad_norm_" based on Pytorch version.
+        # It's not FP16-specific, just a small fix to avoid deprecation warnings.
-        clip_grad_norm(model.parameters(), args.clip)
         if args.fp16 and args.cuda:
             model_grads_to_master_grads(model_params, master_params)
+            if args.static_loss_scale != 1:
+                for param in master_params:
+                    param.grad.data = param.grad.data / args.static_loss_scale
+            clip_grad_norm(master_params, args.clip)
             for param in master_params:
-                param.data = param.data - param.grad.data * (lr / args.static_loss_scale)
+                param.data = param.data - param.grad.data * lr
             master_params_to_model_params(model_params, master_params)
         else:
+            clip_grad_norm(model.parameters(), args.clip)
             for p in model.parameters():
-                p.data.add_(-lr / args.static_loss_scale, p.grad.data)
+                p.data.add_(-lr, p.grad.data)
         total_loss += loss.data
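The train() hunk above is the standard static-loss-scaling recipe: multiply the loss by the scale before backward() so small fp16 gradients don't flush to zero, copy the gradients to fp32 master weights, unscale, clip, step on the masters, and copy the result back into the fp16 model. A condensed standalone sketch of the same pattern in plain PyTorch (the model, lr, and clip value are hypothetical, and torch.nn.utils.clip_grad_norm_ stands in for apex's version-selecting wrapper):

import torch

scale, lr = 128.0, 0.1
model = torch.nn.Linear(16, 16).cuda().half()
# fp32 "master" copies of the fp16 parameters
master = [p.detach().clone().float() for p in model.parameters()]

x = torch.randn(4, 16, device='cuda', dtype=torch.float16)
loss = model(x).sum() * scale            # scale up so fp16 grads don't underflow
loss.backward()

for p, m in zip(model.parameters(), master):
    m.grad = p.grad.float() / scale      # model grads -> fp32 master grads, unscaled
torch.nn.utils.clip_grad_norm_(master, 0.25)
for p, m in zip(model.parameters(), master):
    m.data.add_(m.grad, alpha=-lr)       # SGD step on the fp32 masters
    p.data.copy_(m.data)                 # master weights -> fp16 model weights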
examples/word_language_model/main_fp16_optimizer.py  (+17 -5)

@@ -17,9 +17,9 @@ parser.add_argument('--data', type=str, default='./data/wikitext-2',
                     help='location of the data corpus')
 parser.add_argument('--model', type=str, default='LSTM',
                     help='type of recurrent net (RNN_TANH, RNN_RELU, LSTM, GRU)')
-parser.add_argument('--emsize', type=int, default=200,
+parser.add_argument('--emsize', type=int, default=1504,
                     help='size of word embeddings')
-parser.add_argument('--nhid', type=int, default=200,
+parser.add_argument('--nhid', type=int, default=1504,
                     help='number of hidden units per layer')
 parser.add_argument('--nlayers', type=int, default=2,
                     help='number of layers')

@@ -29,7 +29,7 @@ parser.add_argument('--clip', type=float, default=0.25,
                     help='gradient clipping')
 parser.add_argument('--epochs', type=int, default=40,
                     help='upper epoch limit')
-parser.add_argument('--batch_size', type=int, default=20, metavar='N',
+parser.add_argument('--batch_size', type=int, default=24, metavar='N',
                     help='batch size')
 parser.add_argument('--bptt', type=int, default=35,
                     help='sequence length')

@@ -47,7 +47,7 @@ parser.add_argument('--save', type=str, default='model.pt',
                     help='path to save the final model')
 parser.add_argument('--fp16', action='store_true',
                     help='Run model in pseudo-fp16 mode (fp16 storage fp32 math).')
-parser.add_argument('--static-loss-scale', type=float, default=1,
+parser.add_argument('--static-loss-scale', type=float, default=128.0,
                     help='Static loss scale, positive power of 2 values can improve fp16 convergence.')
 parser.add_argument('--dynamic-loss-scale', action='store_true',
                     help='Use dynamic loss scaling. If supplied, this argument supersedes ' +

@@ -67,7 +67,9 @@ if args.fp16 and not args.cuda:
 # Load data
 ###############################################################################

-corpus = data.Corpus(args.data)
+# Ensure that the dictionary length is a multiple of 8,
+# so that the decoder's GEMMs will use Tensor Cores.
+corpus = data.Corpus(args.data, pad_to_multiple_of=8)

 # Starting from sequential data, batchify arranges the dataset into columns.
 # For instance, with the alphabet as the sequence and batch size 4, we'd get

@@ -102,6 +104,16 @@ test_data = batchify(corpus.test, eval_batch_size)
 ###############################################################################

 ntokens = len(corpus.dictionary)
+if args.fp16 and args.cuda:
+    if ntokens % 8 != 0:
+        print("Warning: the dictionary size (ntokens = {}) should be a multiple of 8 to ensure "
+              "Tensor Core use for the decoder's GEMMs.".format(ntokens))
+    if args.emsize % 8 != 0 or args.nhid % 8 != 0 or args.batch_size % 8 != 0:
+        print("Warning: emsize = {}, nhid = {}, batch_size = {} should all be multiples of 8 "
+              "to ensure Tensor Core use for the RNN's GEMMs.".format(args.emsize, args.nhid, args.batch_size))
+
 model = model.RNNModel(args.model, ntokens, args.emsize, args.nhid, args.nlayers, args.dropout, args.tied)
 if args.cuda and args.fp16:
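main_fp16_optimizer.py reaches the same defaults through apex.fp16_utils.FP16_Optimizer, which owns the master weights and the loss scaling instead of the hand-rolled loop in main.py. A minimal sketch of that wrapper's training step, using only the calls this commit's tests also exercise (the model and data are hypothetical):

import torch
import apex

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = torch.optim.Adam(model.parameters())
optimizer = apex.fp16_utils.FP16_Optimizer(optimizer, static_loss_scale=128.0)
# or: apex.fp16_utils.FP16_Optimizer(optimizer, dynamic_loss_scale=True)

x = torch.randn(64, 1024, device='cuda', dtype=torch.float16)
loss = model(x).sum()
optimizer.backward(loss)   # scales the loss, then backprops
optimizer.step()           # unscales grads, steps fp32 masters, copies back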
setup.py  (+4 -0)

@@ -36,6 +36,10 @@ if "--cuda_ext" in sys.argv:
     if torch.utils.cpp_extension.CUDA_HOME is None:
         print("Warning: nvcc is not available. Ignoring --cuda-ext")
     else:
         ext_modules.append(
             CUDAExtension(name='amp_C',
                           sources=['csrc/scale_check_overflow.cpp',
                                    'csrc/scale_check_overflow_kernel.cu']))
+        ext_modules.append(
+            CUDAExtension(name='fused_adam_cuda',
+                          sources=['apex/optimizers/csrc/fused_adam_cuda.cpp',
...
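The new fused_adam_cuda extension, like amp_C, is only compiled when apex is installed with the --cuda_ext flag and nvcc is available. A quick post-install sanity check, modeled on the try/except guard that test_scale.py uses below (a sketch, not part of the commit):

# Check which optional CUDA extensions were built and installed.
for name in ("amp_C", "fused_adam_cuda"):
    try:
        __import__(name)
        print(name, "is available")
    except ImportError as err:
        print(name, "unavailable:", err)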
tests/distributed/ddp_race_condition_test.py  (+2 -2)

@@ -34,8 +34,8 @@ class Model(Module):
         return (input*self.a)*self.b

 model = Model()
-model = DDP(model, message_size=1, gradient_predivide_factor=8.0)
-# model = DDP(model, delay_allreduce=True)
+# model = DDP(model, message_size=1, gradient_predivide_factor=8.0)
+model = DDP(model, delay_allreduce=True)
 # model = DDP(model, message_size=1, allreduce_trigger_params=[model.b])

 x = torch.cuda.FloatTensor(4096*4096)
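The test now defaults to delay_allreduce=True and keeps the other two wrapping variants as comments. A sketch of the three choices as this file exercises them (it assumes a process group is already initialized; the one-line option summaries are my reading of apex.parallel at this time, not text from the commit):

import torch
from apex.parallel import DistributedDataParallel as DDP

# Wrapping variants the test toggles between (assumes torch.distributed
# has been initialized, e.g. via init_process_group):
#   message_size=1                 -> allreduce eagerly, in tiny buckets
#   gradient_predivide_factor=8.0  -> pre-divide grads by 8 before allreduce
#   delay_allreduce=True           -> single allreduce after backward completes
model = torch.nn.Linear(8, 8).cuda()
model = DDP(model, delay_allreduce=True)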
tests/run_amp/test_scale.py  (new file, +93 -0)

import unittest

import functools as ft
import itertools as it

from apex import amp
import torch
from torch import nn
import torch.nn.functional as F

from utils import common_init, HALF, FLOAT, \
    ALWAYS_HALF, ALWAYS_FLOAT, MATCH_INPUT

try:
    import amp_C
    scale_check_overflow = amp_C.scale_check_overflow
    disabled = False
except ImportError as err:
    print("amp_C fused kernel unavailable, disabling TestScale. ImportError was ", err)
    disabled = True


class TestScale(unittest.TestCase):
    def setUp(self):
        self.scale = 128.0
        self.nx = 999
        self.ny = 888
        self.overflow_buf = torch.cuda.IntTensor([0])

        self.fp16 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float16)
        self.fp32 = torch.ones((self.ny, self.nx), device='cuda', dtype=torch.float32)

        self.fp16_ref = torch.ones((1, 1), device='cuda', dtype=torch.float16)
        self.fp32_ref = torch.ones((1, 1), device='cuda', dtype=torch.float32)

        common_init(self)

    def tearDown(self):
        pass

    def downscale_test(self, input, output, ref):
        self.overflow_buf.zero_()
        input.fill_(1.0)
        if input is not output:
            output.fill_(3.0)
        input.mul_(self.scale)
        scale_check_overflow(input, 1./self.scale, self.overflow_buf, output)
        self.assertTrue(torch.allclose(output, ref))
        self.assertTrue(self.overflow_buf.item() == 0)

    def find_inf_test(self, input, output, ref, x, y, val):
        self.overflow_buf.zero_()
        input.fill_(1.0)
        if input is not output:
            output.fill_(3.0)
        input[x, y] = val
        scale_check_overflow(input, 1./self.scale, self.overflow_buf, output)
        self.assertTrue(self.overflow_buf.item())

    # Currently, the fused kernel gives a hard error if you attempt to downscale
    # into fp16 output, which imo is the desired behavior. Maybe someday we
    # will learn otherwise.
    # @unittest.skipIf(disabled, "amp_C is unavailable")
    # def test_fp16_to_fp16(self):
    #     self.downscale_test(self.fp16, self.fp16, self.fp16_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp16_to_fp32(self):
        self.downscale_test(self.fp16, self.fp32, self.fp32_ref)

    # @unittest.skipIf(disabled, "amp_C is unavailable")
    # def test_fp32_to_fp16(self):
    #     self.downscale_test(self.fp32, self.fp16, self.fp16_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp32_to_fp32(self):
        self.downscale_test(self.fp32, self.fp32, self.fp32_ref)

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp16_to_fp32_find_inf_nan(self):
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, 0, 0, float('nan'))
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny//2, self.nx//2, float('inf'))
        self.find_inf_test(self.fp16, self.fp32, self.fp32_ref, self.ny-1, self.nx-1, float('nan'))

    @unittest.skipIf(disabled, "amp_C is unavailable")
    def test_fp32_to_fp32_find_inf_nan(self):
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, 0, 0, float('inf'))
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny//2, self.nx//2, float('nan'))
        self.find_inf_test(self.fp32, self.fp32, self.fp32_ref, self.ny-1, self.nx-1, float('inf'))


if __name__ == '__main__':
    unittest.main()
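Taken together, downscale_test and find_inf_test pin down the fused kernel's contract: scale_check_overflow(input, scale_factor, overflow_buf, output) writes input * scale_factor into output and sets overflow_buf nonzero if it encounters an inf or NaN. A small usage sketch under that reading (assumes amp_C was built with --cuda_ext):

import torch
import amp_C  # available only if apex was installed with --cuda_ext

scale = 128.0
overflow_buf = torch.cuda.IntTensor([0])
grads = torch.full((4, 4), 2.0 * scale, device='cuda', dtype=torch.float16)
out = torch.empty((4, 4), device='cuda', dtype=torch.float32)

# Downscale fp16 grads into the fp32 buffer; inf/NaN would flag overflow_buf.
amp_C.scale_check_overflow(grads, 1. / scale, overflow_buf, out)
assert overflow_buf.item() == 0 and out[0, 0].item() == 2.0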
tests/run_fp16_optimizer/test_fp16_optimizer.py  (+2 -0)

@@ -6,6 +6,8 @@ import itertools as it
 import torch

 from apex.fp16_utils import FP16_Optimizer

+# Currently no-ops (tested via examples).
+# FP16_Optimizer to be deprecated and moved under unified Amp API.
 class TestFP16Optimizer(unittest.TestCase):
     def setUp(self):
         N, D_in, D_out = 64, 1024, 16
tests/run_mixed_adam/test_fp16_optimizer.py  (new file, +133 -0)

import unittest
import os

import torch
import apex


class TestFP16Optimizer(unittest.TestCase):
    def setUp(self, max_abs_diff=1e-3, max_rel_diff=1, iters=7):
        self.max_abs_diff = max_abs_diff
        self.max_rel_diff = max_rel_diff
        self.iters = iters
        torch.cuda.manual_seed(13337)

        N, D_in, D_out = 64, 1024, 16
        self.N = N
        self.D_in = D_in
        self.D_out = D_out
        self.x = torch.randn((N, D_in), dtype=torch.float16, device='cuda')
        self.ref_model = torch.nn.Linear(D_in, D_out).cuda().half()
        self.tst_model = torch.nn.Linear(D_in, D_out).cuda().half()
        for p, q in zip(self.tst_model.parameters(), self.ref_model.parameters()):
            p.data.copy_(q.data)

    def get_max_diff(self, ref_param, tst_param):
        max_abs_diff = max_rel_diff = 0
        for p_ref, p_tst in zip(ref_param, tst_param):
            max_abs_diff_p = (p_ref - p_tst).abs().max().item()
            max_rel_diff_p = ((p_ref - p_tst) / p_ref).abs().max().item()

            if max_abs_diff_p > max_abs_diff:
                max_abs_diff = max_abs_diff_p
            if max_rel_diff_p > max_rel_diff:
                max_rel_diff = max_rel_diff_p

        return max_abs_diff, max_rel_diff

    def test_fp16_optimizer(self):
        ref_optim = torch.optim.Adam(self.ref_model.parameters())
        ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)

        tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
        tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)

        for i in range(self.iters):
            ref_loss = self.ref_model(self.x).sum()
            ref_optim.backward(ref_loss)
            ref_optim.step()

            tst_loss = self.tst_model(self.x).sum()
            tst_optim.backward(tst_loss)
            tst_optim.step()

            max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(),
                                                           self.tst_model.parameters())
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_loss_scaling(self):
        ref_optim = torch.optim.Adam(self.ref_model.parameters())
        ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, static_loss_scale=128.0, verbose=False)

        tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters())
        tst_optim = apex.optimizers.FP16_Optimizer(tst_optim, static_loss_scale=128.0)

        for i in range(self.iters):
            ref_loss = self.ref_model(self.x).sum()
            ref_optim.backward(ref_loss)
            ref_optim.step()

            tst_loss = self.tst_model(self.x).sum()
            tst_optim.backward(tst_loss)
            tst_optim.step()

            max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(),
                                                           self.tst_model.parameters())
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_parameter_groups(self):
        ref_groups = [{'params': [self.ref_model.weight]}, {'params': [self.ref_model.bias]}]
        ref_optim = torch.optim.Adam(ref_groups)
        ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)

        tst_groups = [{'params': [self.tst_model.weight]}, {'params': [self.tst_model.bias]}]
        tst_optim = apex.optimizers.FusedAdam(tst_groups)
        tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)

        for i in range(self.iters):
            ref_loss = self.ref_model(self.x).sum()
            ref_optim.backward(ref_loss)
            ref_optim.step()

            tst_loss = self.tst_model(self.x).sum()
            tst_optim.backward(tst_loss)
            tst_optim.step()

            max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(),
                                                           self.tst_model.parameters())
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    def test_grad_clip(self):
        ref_optim = torch.optim.Adam(self.ref_model.parameters())
        ref_optim = apex.fp16_utils.FP16_Optimizer(ref_optim, verbose=False)

        tst_optim = apex.optimizers.FusedAdam(self.tst_model.parameters(), max_grad_norm=0.01)
        tst_optim = apex.optimizers.FP16_Optimizer(tst_optim)

        for i in range(self.iters):
            ref_loss = self.ref_model(self.x).sum()
            ref_optim.backward(ref_loss)
            ref_optim.clip_master_grads(0.01)
            ref_optim.step()

            tst_loss = self.tst_model(self.x).sum()
            tst_optim.backward(tst_loss)
            tst_optim.step()

            max_abs_diff, max_rel_diff = self.get_max_diff(self.ref_model.parameters(),
                                                           self.tst_model.parameters())
            self.assertLessEqual(max_abs_diff, self.max_abs_diff)
            self.assertLessEqual(max_rel_diff, self.max_rel_diff)

    @unittest.skip('Not support grad being None')
    def test_grad_None(self):
        self.fail()

    @unittest.skip('Not support same weight decay as pytorch')
    def test_weight_decay(self):
        self.fail()

    @unittest.skip('Not support empty parameter groups')
    def test_group_empty(self):
        self.fail()


if __name__ == '__main__':
    script_path = os.path.dirname(os.path.realpath(__file__))
    unittest.main()
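The test body doubles as the usage recipe for the new fused optimizer: build apex.optimizers.FusedAdam, optionally wrap it in apex.optimizers.FP16_Optimizer for fp32 master weights and loss scaling, and drive it with backward(loss) and step(). A minimal sketch mirroring the test (hypothetical model and data; requires the --cuda_ext build so fused_adam_cuda is available):

import torch
import apex

model = torch.nn.Linear(1024, 16).cuda().half()
optimizer = apex.optimizers.FusedAdam(model.parameters())
optimizer = apex.optimizers.FP16_Optimizer(optimizer, static_loss_scale=128.0)

x = torch.randn(64, 1024, device='cuda', dtype=torch.float16)
loss = model(x).sum()
optimizer.backward(loss)   # scaled backward pass
optimizer.step()           # fused Adam update on fp32 masters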
tests/run_test.py  (+1 -1)

 import unittest
 import sys

-test_dirs = ["run_fp16_optimizer", "run_amp", "run_mixed_adam"]
+test_dirs = ["run_amp", "run_mixed_adam"]

 runner = unittest.TextTestRunner(verbosity=2)