OpenDAS / nni · Commits

Commit 1011377c, authored Mar 31, 2022 by qianyj
Commit message: the source code of NNI for DCU
Parent commit: abc22158

Changes: 788 files. Showing 20 changed files with 1358 additions and 0 deletions (+1358 −0).
examples/model_compress/pruning/v2/simulated_anealing_pruning_torch.py  +109 −0
examples/model_compress/pruning/v2/slim_pruning_torch.py  +136 −0
examples/model_compress/pruning/v2/taylorfo_pruning_torch.py  +136 −0
examples/model_compress/quantization/BNN_quantizer_cifar10.py  +154 −0
examples/model_compress/quantization/DoReFaQuantizer_torch_mnist.py  +71 −0
examples/model_compress/quantization/LSQ_torch_quantizer.py  +142 −0
examples/model_compress/quantization/QAT_torch_quantizer.py  +115 −0
examples/model_compress/quantization/mixed_precision_speedup_mnist.py  +152 −0
examples/model_compress/quantization/observer_quantizer.py  +117 −0
examples/nas/.gitignore  +10 −0
examples/nas/benchmarks/.gitignore  +5 −0
examples/nas/benchmarks/nasbench101.requirements.txt  +5 −0
examples/nas/benchmarks/nasbench101.sh  +19 −0
examples/nas/benchmarks/nasbench201.requirements.txt  +4 −0
examples/nas/benchmarks/nasbench201.sh  +19 −0
examples/nas/benchmarks/nds.requirements.txt  +2 −0
examples/nas/benchmarks/nds.sh  +20 −0
examples/nas/benchmarks/nlp.requirements.txt  +1 −0
examples/nas/benchmarks/nlp.sh  +39 −0
examples/nas/legacy/cdarts/aux_head.py  +102 −0

Too many changes to show; to preserve performance only 788 of 788+ files are displayed.

examples/model_compress/pruning/v2/simulated_anealing_pruning_torch.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

'''
NNI example for the simulated annealing pruning algorithm.
In this example, we show the end-to-end iterative pruning process: pre-training -> pruning -> fine-tuning.
'''
import sys
import argparse

from tqdm import tqdm

import torch
from torchvision import datasets, transforms

from nni.algorithms.compression.v2.pytorch.pruning import SimulatedAnnealingPruner

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)

criterion = torch.nn.CrossEntropyLoss()


def trainer(model, optimizer, criterion, epoch):
    model.train()
    for data, target in tqdm(iterable=train_loader, desc='Epoch {}'.format(epoch)):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()


def finetuner(model):
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()
    for data, target in tqdm(iterable=train_loader, desc='Epoch PFs'):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()


def evaluator(model):
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in tqdm(iterable=test_loader, desc='Test'):
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Iterative Example for model compression')
    parser.add_argument('--pretrain-epochs', type=int, default=10,
                        help='number of epochs to pretrain the model')
    parser.add_argument('--pruning-algo', type=str, default='l1',
                        choices=['level', 'l1', 'l2', 'fpgm', 'slim', 'apoz',
                                 'mean_activation', 'taylorfo', 'admm'],
                        help='algorithm to evaluate weights to prune')
    parser.add_argument('--cool-down-rate', type=float, default=0.9,
                        help='Cool down rate of the temperature.')
    args = parser.parse_args()

    model = VGG().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()

    # pre-train the model
    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion, i)
    evaluator(model)

    config_list = [{'op_types': ['Conv2d'], 'total_sparsity': 0.8}]

    # evaluator in 'SimulatedAnnealingPruner' cannot be None.
    pruner = SimulatedAnnealingPruner(model, config_list, pruning_algorithm=args.pruning_algo,
                                      evaluator=evaluator, cool_down_rate=args.cool_down_rate,
                                      finetuner=finetuner)
    pruner.compress()
    _, model, masks, _, _ = pruner.get_best_result()
    evaluator(model)
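
The script above only keeps the in-memory result of pruner.get_best_result(). As a minimal follow-up sketch (not part of the committed file, file names are placeholders), the best weights and masks it returns can be persisted with plain torch.save so they can be reloaded later, e.g. for speedup or further fine-tuning:

# Hypothetical follow-up to simulated_anealing_pruning_torch.py: persist the best
# pruned weights and the per-layer masks returned by pruner.get_best_result().
torch.save(model.state_dict(), 'pruned_vgg_cifar10.pth')    # weights of the best pruned model
torch.save(masks, 'pruned_vgg_cifar10_masks.pth')           # dict of 0/1 masks keyed by layer name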

examples/model_compress/pruning/v2/slim_pruning_torch.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

'''
NNI example for supported slim pruning algorithms.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> speedup -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys

import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR

import nni
from nni.compression.pytorch import ModelSpeedup
from nni.compression.pytorch.utils.counter import count_flops_params
from nni.algorithms.compression.v2.pytorch.pruning.basic_pruner import SlimPruner

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

g_epoch = 0

train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)


def trainer(model, optimizer, criterion):
    global g_epoch
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx and batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                g_epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    g_epoch += 1


def evaluator(model):
    model.eval()
    correct = 0.0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc


def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
    optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
    scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
    return optimizer, scheduler


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
    parser.add_argument('--pretrain-epochs', type=int, default=20,
                        help='number of epochs to pretrain the model')
    parser.add_argument('--fine-tune-epochs', type=int, default=20,
                        help='number of epochs to fine tune the model')
    args = parser.parse_args()

    print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
    model = VGG().to(device)
    optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
    criterion = torch.nn.CrossEntropyLoss()
    pre_best_acc = 0.0
    best_state_dict = None

    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        acc = evaluator(model)
        if acc > pre_best_acc:
            pre_best_acc = acc
            best_state_dict = model.state_dict()
    print("Best accuracy: {}".format(pre_best_acc))
    model.load_state_dict(best_state_dict)
    pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    g_epoch = 0

    # Start to prune and speed up
    print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
    config_list = [{
        'total_sparsity': 0.5,
        'op_types': ['BatchNorm2d'],
        'max_sparsity_per_layer': 0.9
    }]

    # make sure you have used nni.trace to wrap the optimizer class before initialization
    traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    pruner = SlimPruner(model, config_list, trainer, traced_optimizer, criterion,
                        training_epochs=1, scale=0.0001, mode='global')
    _, masks = pruner.compress()
    pruner.show_pruned_weights()
    pruner._unwrap_model()
    ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()
    print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
    evaluator(model)

    # The optimizer used in the pruner might be patched, so it is recommended to create a new optimizer for the fine-tuning stage.
    print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
    optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)

    best_acc = 0.0
    g_epoch = 0
    for i in range(args.fine_tune_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        best_acc = max(evaluator(model), best_acc)
    flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    print(f'Pretrained model FLOPs {pre_flops / 1e6:.2f} M, #Params: {pre_params / 1e6:.2f} M, Accuracy: {pre_best_acc:.2f}%')
    print(f'Finetuned model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f} M, Accuracy: {best_acc:.2f}%')
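
Since the slim example already computes pre_flops/pre_params before pruning and flops/params after fine-tuning, here is a small sketch (not in the commit) of how the relative savings could be reported from those same variables:

# Hypothetical addition: report relative savings from the counts computed above.
flops_reduction = 1.0 - flops / pre_flops
params_reduction = 1.0 - params / pre_params
print(f'FLOPs reduced by {flops_reduction:.1%}, parameters reduced by {params_reduction:.1%}')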

examples/model_compress/pruning/v2/taylorfo_pruning_torch.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

'''
NNI example for supported TaylorFOWeight pruning algorithms.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys

import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR

import nni
from nni.compression.pytorch import ModelSpeedup
from nni.compression.pytorch.utils.counter import count_flops_params
from nni.algorithms.compression.v2.pytorch.pruning.basic_pruner import TaylorFOWeightPruner

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

g_epoch = 0

train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)


def trainer(model, optimizer, criterion):
    global g_epoch
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx and batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                g_epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    g_epoch += 1


def evaluator(model):
    model.eval()
    correct = 0.0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc


def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
    optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
    scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
    return optimizer, scheduler


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
    parser.add_argument('--pretrain-epochs', type=int, default=20,
                        help='number of epochs to pretrain the model')
    parser.add_argument('--fine-tune-epochs', type=int, default=20,
                        help='number of epochs to fine tune the model')
    args = parser.parse_args()

    print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
    model = VGG().to(device)
    optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
    criterion = torch.nn.CrossEntropyLoss()
    pre_best_acc = 0.0
    best_state_dict = None

    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        acc = evaluator(model)
        if acc > pre_best_acc:
            pre_best_acc = acc
            best_state_dict = model.state_dict()
    print("Best accuracy: {}".format(pre_best_acc))
    model.load_state_dict(best_state_dict)
    pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    g_epoch = 0

    # Start to prune and speed up
    print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
    config_list = [{
        'total_sparsity': 0.5,
        'op_types': ['Conv2d']
    }]

    # make sure you have used nni.trace to wrap the optimizer class before initialization
    traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    pruner = TaylorFOWeightPruner(model, config_list, trainer, traced_optimizer, criterion, training_batches=20)
    _, masks = pruner.compress()
    pruner.show_pruned_weights()
    pruner._unwrap_model()
    ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()
    print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
    evaluator(model)

    # The optimizer used in the pruner might be patched, so it is recommended to create a new optimizer for the fine-tuning stage.
    print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
    optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)

    best_acc = 0.0
    g_epoch = 0
    for i in range(args.fine_tune_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        best_acc = max(evaluator(model), best_acc)
    flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    print(f'Pretrained model FLOPs {pre_flops / 1e6:.2f} M, #Params: {pre_params / 1e6:.2f} M, Accuracy: {pre_best_acc:.2f}%')
    print(f'Finetuned model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f} M, Accuracy: {best_acc:.2f}%')
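
After ModelSpeedup(...).speedup_model() the TaylorFO example's model is a genuinely smaller PyTorch module, so it can be exported like any other network. A sketch under that assumption (not part of the commit; the output path is a placeholder):

# Hypothetical export of the sped-up, fine-tuned model to TorchScript for deployment.
scripted = torch.jit.trace(model.cpu().eval(), torch.rand(1, 3, 32, 32))
scripted.save('pruned_vgg_taylorfo.pt')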

examples/model_compress/quantization/BNN_quantizer_cifar10.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from nni.algorithms.compression.pytorch.quantization import BNNQuantizer


class VGG_Cifar10(nn.Module):
    def __init__(self, num_classes=1000):
        super(VGG_Cifar10, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 128, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(128, eps=1e-4, momentum=0.1),
            nn.Hardtanh(inplace=True),

            nn.Conv2d(128, 128, kernel_size=3, padding=1, bias=False),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.BatchNorm2d(128, eps=1e-4, momentum=0.1),
            nn.Hardtanh(inplace=True),

            nn.Conv2d(128, 256, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(256, eps=1e-4, momentum=0.1),
            nn.Hardtanh(inplace=True),

            nn.Conv2d(256, 256, kernel_size=3, padding=1, bias=False),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.BatchNorm2d(256, eps=1e-4, momentum=0.1),
            nn.Hardtanh(inplace=True),

            nn.Conv2d(256, 512, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(512, eps=1e-4, momentum=0.1),
            nn.Hardtanh(inplace=True),

            nn.Conv2d(512, 512, kernel_size=3, padding=1, bias=False),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.BatchNorm2d(512, eps=1e-4, momentum=0.1),
            nn.Hardtanh(inplace=True)
        )

        self.classifier = nn.Sequential(
            nn.Linear(512 * 4 * 4, 1024, bias=False),
            nn.BatchNorm1d(1024),
            nn.Hardtanh(inplace=True),
            nn.Linear(1024, 1024, bias=False),
            nn.BatchNorm1d(1024),
            nn.Hardtanh(inplace=True),
            nn.Linear(1024, num_classes),  # do not quantize output
            nn.BatchNorm1d(num_classes, affine=False)
        )

    def forward(self, x):
        x = self.features(x)
        x = x.view(-1, 512 * 4 * 4)
        x = self.classifier(x)
        return x


def train(model, device, train_loader, optimizer):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()
        for name, param in model.named_parameters():
            if name.endswith('old_weight'):
                param = param.clamp(-1, 1)
        if batch_idx % 100 == 0:
            print('{:2.0f}%  Loss {}'.format(100 * batch_idx / len(train_loader), loss.item()))


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    acc = 100 * correct / len(test_loader.dataset)

    print('Loss: {}  Accuracy: {}%\n'.format(test_loss, acc))
    return acc


def adjust_learning_rate(optimizer, epoch):
    update_list = [55, 100, 150, 200, 400, 600]
    if epoch in update_list:
        for param_group in optimizer.param_groups:
            param_group['lr'] = param_group['lr'] * 0.1
    return


def main():
    torch.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('./data.cifar10', train=True, download=True,
                         transform=transforms.Compose([
                             transforms.ToTensor(),
                             transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=64, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.CIFAR10('./data.cifar10', train=False,
                         transform=transforms.Compose([
                             transforms.ToTensor(),
                             transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
                         ])),
        batch_size=200, shuffle=False)

    model = VGG_Cifar10(num_classes=10)
    model.to(device)

    configure_list = [{
        'quant_types': ['weight'],
        'quant_bits': 1,
        'op_types': ['Conv2d', 'Linear'],
        'op_names': ['features.3', 'features.7', 'features.10', 'features.14', 'classifier.0', 'classifier.3']
    }, {
        'quant_types': ['output'],
        'quant_bits': 1,
        'op_types': ['Hardtanh'],
        'op_names': ['features.6', 'features.9', 'features.13', 'features.16', 'features.20', 'classifier.2', 'classifier.5']
    }]

    quantizer = BNNQuantizer(model, configure_list)
    model = quantizer.compress()

    print('=' * 10 + 'train' + '=' * 10)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
    best_top1 = 0
    for epoch in range(400):
        print('# Epoch {} #'.format(epoch))
        train(model, device, train_loader, optimizer)
        adjust_learning_rate(optimizer, epoch)
        top1 = test(model, device, test_loader)
        if top1 > best_top1:
            best_top1 = top1
        print(best_top1)


if __name__ == '__main__':
    main()

examples/model_compress/quantization/DoReFaQuantizer_torch_mnist.py (new file, mode 100644)

import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
from nni.algorithms.compression.pytorch.quantization import DoReFaQuantizer

import sys
sys.path.append('../models')
from mnist.naive import NaiveModel


def train(model, quantizer, device, train_loader, optimizer):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('{:2.0f}%  Loss {}'.format(100 * batch_idx / len(train_loader), loss.item()))


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)

    print('Loss: {}  Accuracy: {}%\n'.format(test_loss, 100 * correct / len(test_loader.dataset)))


def main():
    torch.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True, transform=trans),
        batch_size=64, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=trans),
        batch_size=1000, shuffle=True)

    model = NaiveModel()
    model = model.to(device)
    configure_list = [{
        'quant_types': ['weight'],
        'quant_bits': {
            'weight': 8,
        },  # you can just use `int` here because all `quant_types` share the same bit length, see the config for `ReLU6` below.
        'op_types': ['Conv2d', 'Linear']
    }]
    quantizer = DoReFaQuantizer(model, configure_list)
    quantizer.compress()

    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.5)
    for epoch in range(10):
        print('# Epoch {} #'.format(epoch))
        train(model, quantizer, device, train_loader, optimizer)
        test(model, device, test_loader)


if __name__ == '__main__':
    main()
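
The inline comment in the DoReFa configure_list notes that quant_bits may be given as a plain int when every listed quantization type shares the same bit width. A minimal, illustrative sketch of that equivalent shorthand (new variable name, not from the commit):

# Equivalent shorthand for the configure_list above: a plain int applies the same
# bit width to all listed quant_types.
configure_list_short = [{
    'quant_types': ['weight'],
    'quant_bits': 8,                     # same meaning as {'weight': 8}
    'op_types': ['Conv2d', 'Linear']
}]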

examples/model_compress/quantization/LSQ_torch_quantizer.py (new file, mode 100644)

import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
from nni.algorithms.compression.pytorch.quantization import LsqQuantizer
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT


class Mnist(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = torch.nn.Conv2d(1, 20, 5, 1)
        self.conv2 = torch.nn.Conv2d(20, 50, 5, 1)
        self.fc1 = torch.nn.Linear(4 * 4 * 50, 500)
        self.fc2 = torch.nn.Linear(500, 10)
        self.relu1 = torch.nn.ReLU6()
        self.relu2 = torch.nn.ReLU6()
        self.relu3 = torch.nn.ReLU6()
        self.max_pool1 = torch.nn.MaxPool2d(2, 2)
        self.max_pool2 = torch.nn.MaxPool2d(2, 2)

    def forward(self, x):
        x = self.relu1(self.conv1(x))
        x = self.max_pool1(x)
        x = self.relu2(self.conv2(x))
        x = self.max_pool2(x)
        x = x.view(-1, 4 * 4 * 50)
        x = self.relu3(self.fc1(x))
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)


def train(model, quantizer, device, train_loader, optimizer):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('{:2.0f}%  Loss {}'.format(100 * batch_idx / len(train_loader), loss.item()))


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)

    print('Loss: {}  Accuracy: {}%\n'.format(test_loss, 100 * correct / len(test_loader.dataset)))


def test_trt(engine, test_loader):
    test_loss = 0
    correct = 0
    time_elasped = 0
    for data, target in test_loader:
        output, time = engine.inference(data)
        test_loss += F.nll_loss(output, target, reduction='sum').item()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        time_elasped += time
    test_loss /= len(test_loader.dataset)
    print('Loss: {}  Accuracy: {}%'.format(test_loss, 100 * correct / len(test_loader.dataset)))
    print("Inference elapsed_time (whole dataset): {}s".format(time_elasped))


def main():
    torch.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True, transform=trans),
        batch_size=64, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=trans),
        batch_size=1000, shuffle=True)

    model = Mnist()

    configure_list = [{
        'quant_types': ['weight', 'input'],
        'quant_bits': {'weight': 8, 'input': 8},
        'op_names': ['conv1']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu1']
    }, {
        'quant_types': ['weight', 'input'],
        'quant_bits': {'weight': 8, 'input': 8},
        'op_names': ['conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['max_pool2']
    }]

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    quantizer = LsqQuantizer(model, configure_list, optimizer)
    quantizer.compress()

    model.to(device)
    for epoch in range(40):
        print('# Epoch {} #'.format(epoch))
        train(model, quantizer, device, train_loader, optimizer)
        test(model, device, test_loader)

    model_path = "mnist_model.pth"
    calibration_path = "mnist_calibration.pth"
    calibration_config = quantizer.export_model(model_path, calibration_path)

    test(model, device, test_loader)

    print("calibration_config: ", calibration_config)

    batch_size = 32
    input_shape = (batch_size, 1, 28, 28)

    engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=batch_size)
    engine.compress()

    test_trt(engine, test_loader)


if __name__ == '__main__':
    main()

examples/model_compress/quantization/QAT_torch_quantizer.py (new file, mode 100644)

import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
from nni.compression.pytorch.quantization.settings import set_quant_scheme_dtype

import sys
sys.path.append('../models')
from mnist.naive import NaiveModel


def train(model, device, train_loader, optimizer):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('{:2.0f}%  Loss {}'.format(100 * batch_idx / len(train_loader), loss.item()))


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)

    print('Loss: {}  Accuracy: {}%\n'.format(test_loss, 100 * correct / len(test_loader.dataset)))


def main():
    torch.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True, transform=trans),
        batch_size=64, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=trans),
        batch_size=1000, shuffle=True)

    # Two things should be kept in mind when setting this configure_list:
    # 1. When deploying the model on a backend, some layers will be fused into one layer. For example, the consecutive
    #    conv + bn + relu layers will be fused into one big layer. If we want to execute the big layer in quantization
    #    mode, we should tell the backend the quantization information of the input, output, and the weight tensor of
    #    the big layer, which correspond to conv's input, conv's weight and relu's output.
    # 2. The same tensor should be quantized only once. For example, if a tensor is the output of layer A and the input
    #    of layer B, you should configure either {'quant_types': ['output'], 'op_names': ['a']} or
    #    {'quant_types': ['input'], 'op_names': ['b']} in the configure_list.
    configure_list = [{
        'quant_types': ['weight', 'input'],
        'quant_bits': {'weight': 8, 'input': 8},
        'op_names': ['conv1', 'conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu1', 'relu2']
    }, {
        'quant_types': ['output', 'weight', 'input'],
        'quant_bits': {'output': 8, 'weight': 8, 'input': 8},
        'op_names': ['fc1', 'fc2'],
    }]

    # you can also set the quantization dtype and scheme layer-wise through configure_list like:
    # configure_list = [{
    #     'quant_types': ['weight', 'input'],
    #     'quant_bits': {'weight': 8, 'input': 8},
    #     'op_names': ['conv1', 'conv2'],
    #     'quant_dtype': 'int',
    #     'quant_scheme': 'per_channel_symmetric'
    # }]
    # For now quant_dtype's options are 'int' and 'uint'. And quant_scheme's options are per_tensor_affine,
    # per_tensor_symmetric, per_channel_affine and per_channel_symmetric.
    set_quant_scheme_dtype('weight', 'per_channel_symmetric', 'int')
    set_quant_scheme_dtype('output', 'per_tensor_symmetric', 'int')
    set_quant_scheme_dtype('input', 'per_tensor_symmetric', 'int')

    model = NaiveModel().to(device)
    dummy_input = torch.randn(1, 1, 28, 28).to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    # To enable batch normalization folding in the training process, you should
    # pass dummy_input to the QAT_Quantizer.
    quantizer = QAT_Quantizer(model, configure_list, optimizer, dummy_input=dummy_input)
    quantizer.compress()

    model.to(device)
    for epoch in range(40):
        print('# Epoch {} #'.format(epoch))
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)

    model_path = "mnist_model.pth"
    calibration_path = "mnist_calibration.pth"
    onnx_path = "mnist_model.onnx"
    input_shape = (1, 1, 28, 28)
    device = torch.device("cuda")

    calibration_config = quantizer.export_model(model_path, calibration_path, onnx_path, input_shape, device)
    print("Generated calibration config is: ", calibration_config)


if __name__ == '__main__':
    main()
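
quantizer.export_model() in the QAT example writes the model weights, a calibration file, and an ONNX graph to the paths defined in main(). A small, hypothetical inspection sketch (assuming those paths and the imports of the script above) for checking what was exported:

# Hypothetical inspection of the exported checkpoint produced above.
state = torch.load("mnist_model.pth", map_location="cpu")
print(sorted(state.keys())[:5])   # a few parameter names from the exported state dict
# The state dict can typically be fed back into a fresh model via load_state_dict.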

examples/model_compress/quantization/mixed_precision_speedup_mnist.py (new file, mode 100644)

import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
from nni.algorithms.compression.pytorch.quantization import QAT_Quantizer
from nni.compression.pytorch.quantization_speedup import ModelSpeedupTensorRT

import sys
sys.path.append('../models')
from mnist.naive import NaiveModel


def train(model, device, train_loader, optimizer):
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('{:2.0f}%  Loss {}'.format(100 * batch_idx / len(train_loader), loss.item()))


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)

    print('Loss: {}  Accuracy: {}%\n'.format(test_loss, 100 * correct / len(test_loader.dataset)))


def test_trt(engine, test_loader):
    test_loss = 0
    correct = 0
    time_elasped = 0
    for data, target in test_loader:
        output, time = engine.inference(data)
        test_loss += F.nll_loss(output, target, reduction='sum').item()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()
        time_elasped += time
    test_loss /= len(test_loader.dataset)
    print('Loss: {}  Accuracy: {}%'.format(test_loss, 100 * correct / len(test_loader.dataset)))
    print("Inference elapsed_time (whole dataset): {}s".format(time_elasped))


def post_training_quantization_example(train_loader, test_loader, device):
    model = NaiveModel()

    config = {
        'conv1': {'weight_bits': 8, 'output_bits': 8},
        'conv2': {'weight_bits': 32, 'output_bits': 32},
        'fc1': {'weight_bits': 16, 'output_bits': 16},
        'fc2': {'weight_bits': 8, 'output_bits': 8}
    }

    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    model.to(device)
    for epoch in range(1):
        print('# Epoch {} #'.format(epoch))
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)

    batch_size = 32
    input_shape = (batch_size, 1, 28, 28)

    engine = ModelSpeedupTensorRT(model, input_shape, config=config, calib_data_loader=train_loader, batchsize=batch_size)
    engine.compress()

    test_trt(engine, test_loader)


def quantization_aware_training_example(train_loader, test_loader, device):
    model = NaiveModel()

    configure_list = [{
        'quant_types': ['input', 'weight'],
        'quant_bits': {'input': 8, 'weight': 8},
        'op_names': ['conv1']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu1']
    }, {
        'quant_types': ['input', 'weight'],
        'quant_bits': {'input': 8, 'weight': 8},
        'op_names': ['conv2']
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu2']
    }]

    # finetune the model by using QAT
    # enable batchnorm folding mode
    dummy_input = torch.randn(1, 1, 28, 28)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)
    quantizer = QAT_Quantizer(model, configure_list, optimizer, dummy_input=dummy_input)
    quantizer.compress()

    model.to(device)
    for epoch in range(1):
        print('# Epoch {} #'.format(epoch))
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)

    model_path = "mnist_model.pth"
    calibration_path = "mnist_calibration.pth"
    calibration_config = quantizer.export_model(model_path, calibration_path)

    test(model, device, test_loader)

    print("calibration_config: ", calibration_config)

    batch_size = 32
    input_shape = (batch_size, 1, 28, 28)

    engine = ModelSpeedupTensorRT(model, input_shape, config=calibration_config, batchsize=batch_size)
    engine.compress()

    test_trt(engine, test_loader)


def main():
    torch.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True, transform=trans),
        batch_size=64, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=trans),
        batch_size=1000, shuffle=True)

    # post-training quantization on TensorRT
    post_training_quantization_example(train_loader, test_loader, device)

    # combine NNI quantization algorithm QAT with backend framework TensorRT
    quantization_aware_training_example(train_loader, test_loader, device)


if __name__ == '__main__':
    main()

examples/model_compress/quantization/observer_quantizer.py (new file, mode 100644)

import torch
import torch.nn.functional as F
from torchvision import datasets, transforms
from nni.algorithms.compression.pytorch.quantization import ObserverQuantizer

import sys
sys.path.append('../models')
from mnist.naive import NaiveModel


def train(model, device, train_loader, optimizer):
    model.to(device)
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % 100 == 0:
            print('{:2.0f}%  Loss {}'.format(100 * batch_idx / len(train_loader), loss.item()))


def test(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)

    print('Loss: {}  Accuracy: {}%\n'.format(test_loss, 100 * correct / len(test_loader.dataset)))


def calibration(model, device, test_loader):
    model.eval()
    with torch.no_grad():
        for data, _ in test_loader:
            data = data.to(device)
            model(data)


def main():
    torch.manual_seed(0)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    trans = transforms.Compose([transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True, transform=trans),
        batch_size=64, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=trans),
        batch_size=1000, shuffle=True)

    model = NaiveModel()
    configure_list = [{
        'quant_types': ['weight', 'input'],
        'quant_bits': {'weight': 8, 'input': 8},
        'op_names': ['conv1'],
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu1'],
    }, {
        'quant_types': ['weight', 'input'],
        'quant_bits': {'weight': 8, 'input': 8},
        'op_names': ['conv2'],
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['relu2'],
    }, {
        'quant_types': ['output'],
        'quant_bits': {'output': 8},
        'op_names': ['max_pool2'],
    }]
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

    # Train the model to get a baseline performance
    for epoch in range(5):
        print('# Epoch {} #'.format(epoch))
        train(model, device, train_loader, optimizer)
        test(model, device, test_loader)

    # Construct the ObserverQuantizer. Note that currently ObserverQuantizer only works
    # in evaluation mode.
    quantizer = ObserverQuantizer(model.eval(), configure_list, optimizer)
    # Use the test data set to do calibration; this will not change the model parameters
    calibration(model, device, test_loader)
    # obtain the quantization information and switch the model to "accuracy verification" mode
    quantizer.compress()

    # measure the accuracy of the quantized model.
    test(model, device, test_loader)

    model_path = "mnist_model.pth"
    calibration_path = "mnist_calibration.pth"
    calibration_config = quantizer.export_model(model_path, calibration_path)
    print("calibration_config: ", calibration_config)

    # For now the quantization settings of ObserverQuantizer do not match TensorRT,
    # so TensorRT conversion is not supported.
    # Current settings:
    #     weight      : per_tensor_symmetric, qint8
    #     activation  : per_tensor_affine, quint8, reduce_range=True


if __name__ == '__main__':
    main()
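
The ObserverQuantizer example calibrates by running the entire test set through the model. A small variant sketch (the function name and batch count below are new, not from the commit; it assumes the imports of the script above): observing only a limited number of batches, which is often enough for range estimation:

import itertools

def calibration_subset(model, device, loader, num_batches=32):
    # Run only the first `num_batches` batches through the model so that the
    # quantizer's observers can record activation ranges.
    model.eval()
    with torch.no_grad():
        for data, _ in itertools.islice(loader, num_batches):
            model(data.to(device))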

examples/nas/.gitignore (new file, mode 100644)
data
checkpoints
runs
nni_auto_gen_search_space.json
checkpoint.json
_generated_model.py
_generated_model_*.py
_generated_model
generated
lightning_logs

examples/nas/benchmarks/.gitignore (new file, mode 100644)
nasbench_full.tfrecord
a.pth
data.zip
nds_data
nlp_data

examples/nas/benchmarks/nasbench101.requirements.txt (new file, mode 100644)
# nasbench claims it supports tensorflow>=1.12.0 and we have tested on 1.15.2
tensorflow
tqdm
peewee
git+https://github.com/google-research/nasbench

examples/nas/benchmarks/nasbench101.sh (new file, mode 100755)

#!/bin/bash
set -e

if [ -z "${NASBENCHMARK_DIR}" ]; then
    NASBENCHMARK_DIR=~/.nni/nasbenchmark
fi

echo "Downloading NAS-Bench-101..."
if [ -f "nasbench_full.tfrecord" ]; then
    echo "nasbench_full.tfrecord found. Skip download."
else
    wget https://storage.googleapis.com/nasbench/nasbench_full.tfrecord
fi

echo "Generating database..."
rm -f ${NASBENCHMARK_DIR}/nasbench101.db ${NASBENCHMARK_DIR}/nasbench101.db-journal
mkdir -p ${NASBENCHMARK_DIR}
python3 -m nni.nas.benchmarks.nasbench101.db_gen nasbench_full.tfrecord
rm -f nasbench_full.tfrecord

examples/nas/benchmarks/nasbench201.requirements.txt (new file, mode 100644)
torch
gdown
tqdm
peewee

examples/nas/benchmarks/nasbench201.sh (new file, mode 100755)

#!/bin/bash
set -e

if [ -z "${NASBENCHMARK_DIR}" ]; then
    NASBENCHMARK_DIR=~/.nni/nasbenchmark
fi

echo "Downloading NAS-Bench-201..."
if [ -f "a.pth" ]; then
    echo "a.pth found. Skip download."
else
    gdown https://drive.google.com/uc\?id\=1OOfVPpt-lA4u2HJrXbgrRd42IbfvJMyE -O a.pth
fi

echo "Generating database..."
rm -f ${NASBENCHMARK_DIR}/nasbench201.db ${NASBENCHMARK_DIR}/nasbench201.db-journal
mkdir -p ${NASBENCHMARK_DIR}
python3 -m nni.nas.benchmarks.nasbench201.db_gen a.pth
rm -f a.pth

examples/nas/benchmarks/nds.requirements.txt (new file, mode 100644)
tqdm
peewee

examples/nas/benchmarks/nds.sh (new file, mode 100755)

#!/bin/bash
set -e

if [ -z "${NASBENCHMARK_DIR}" ]; then
    NASBENCHMARK_DIR=~/.nni/nasbenchmark
fi

echo "Downloading NDS..."
if [ -f "data.zip" ]; then
    echo "data.zip found. Skip download."
else
    wget https://dl.fbaipublicfiles.com/nds/data.zip -O data.zip
fi
unzip data.zip

echo "Generating database..."
rm -f ${NASBENCHMARK_DIR}/nds.db ${NASBENCHMARK_DIR}/nds.db-journal
mkdir -p ${NASBENCHMARK_DIR}
python3 -m nni.nas.benchmarks.nds.db_gen nds_data
rm -rf data.zip nds_data

examples/nas/benchmarks/nlp.requirements.txt (new file, mode 100644)
peewee

examples/nas/benchmarks/nlp.sh (new file, mode 100644)

#!/bin/bash
set -e

if [ -z "${NASBENCHMARK_DIR}" ]; then
    NASBENCHMARK_DIR=~/.nni/nasbenchmark
fi

mkdir -p nlp_data
cd nlp_data

echo "Downloading NLP[1/3] wikitext2_data.zip..."
if [ -f "wikitext2_data.zip" ]; then
    echo "wikitext2_data.zip found. Skip download."
else
    wget -O wikitext2_data.zip https://github.com/fmsnew/nas-bench-nlp-release/blob/master/train_logs_wikitext-2/logs.zip?raw=true
fi

echo "Downloading NLP[2/3] ptb_single_run_data.zip..."
if [ -f "ptb_single_run_data.zip" ]; then
    echo "ptb_single_run_data.zip found. Skip download."
else
    wget -O ptb_single_run_data.zip https://github.com/fmsnew/nas-bench-nlp-release/blob/master/train_logs_single_run/logs.zip?raw=true
fi

echo "Downloading NLP[3/3] ptb_multi_runs_data.zip..."
if [ -f "ptb_multi_runs_data.zip" ]; then
    echo "ptb_multi_runs_data.zip found. Skip download."
else
    wget -O ptb_multi_runs_data.zip https://github.com/fmsnew/nas-bench-nlp-release/blob/master/train_logs_multi_runs/logs.zip?raw=true
fi

echo "### there exist duplicate log files in ptb_single_run_data.zip and ptb_multi_runs_data.zip, you can ignore all or replace all ###"
unzip -q wikitext2_data.zip
unzip -q ptb_single_run_data.zip
unzip -q ptb_multi_runs_data.zip
cd ..

echo "Generating database..."
rm -f ${NASBENCHMARK_DIR}/nlp.db ${NASBENCHMARK_DIR}/nlp.db-journal
mkdir -p ${NASBENCHMARK_DIR}
python3 -m nni.nas.benchmarks.nlp.db_gen nlp_data
rm -rf nlp_data
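
Each of the four benchmark scripts above writes its generated database into NASBENCHMARK_DIR (defaulting to ~/.nni/nasbenchmark). A tiny sketch, independent of NNI itself, for confirming the expected files are in place after running them:

# Hypothetical sanity check: verify the generated benchmark databases exist.
from pathlib import Path

bench_dir = Path.home() / '.nni' / 'nasbenchmark'   # the scripts' default NASBENCHMARK_DIR
for db in ('nasbench101.db', 'nasbench201.db', 'nds.db', 'nlp.db'):
    print(db, 'found' if (bench_dir / db).exists() else 'missing')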

examples/nas/legacy/cdarts/aux_head.py (new file, mode 100644)

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import torch.nn as nn


class DistillHeadCIFAR(nn.Module):

    def __init__(self, C, size, num_classes, bn_affine=False):
        """assuming input size 8x8 or 16x16"""
        super(DistillHeadCIFAR, self).__init__()
        self.features = nn.Sequential(
            nn.ReLU(),
            nn.AvgPool2d(size, stride=2, padding=0, count_include_pad=False),  # image size = 2 x 2 / 6 x 6
            nn.Conv2d(C, 128, 1, bias=False),
            nn.BatchNorm2d(128, affine=bn_affine),
            nn.ReLU(),
            nn.Conv2d(128, 768, 2, bias=False),
            nn.BatchNorm2d(768, affine=bn_affine),
            nn.ReLU()
        )
        self.classifier = nn.Linear(768, num_classes)
        self.gap = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        x = self.features(x)
        x = self.gap(x)
        x = self.classifier(x.view(x.size(0), -1))
        return x


class DistillHeadImagenet(nn.Module):

    def __init__(self, C, size, num_classes, bn_affine=False):
        """assuming input size 7x7 or 14x14"""
        super(DistillHeadImagenet, self).__init__()
        self.features = nn.Sequential(
            nn.ReLU(),
            nn.AvgPool2d(size, stride=2, padding=0, count_include_pad=False),  # image size = 2 x 2 / 6 x 6
            nn.Conv2d(C, 128, 1, bias=False),
            nn.BatchNorm2d(128, affine=bn_affine),
            nn.ReLU(),
            nn.Conv2d(128, 768, 2, bias=False),
            nn.BatchNorm2d(768, affine=bn_affine),
            nn.ReLU()
        )
        self.classifier = nn.Linear(768, num_classes)
        self.gap = nn.AdaptiveAvgPool2d(1)

    def forward(self, x):
        x = self.features(x)
        x = self.gap(x)
        x = self.classifier(x.view(x.size(0), -1))
        return x


class AuxiliaryHeadCIFAR(nn.Module):

    def __init__(self, C, size=5, num_classes=10):
        """assuming input size 8x8"""
        super(AuxiliaryHeadCIFAR, self).__init__()
        self.features = nn.Sequential(
            nn.ReLU(inplace=True),
            nn.AvgPool2d(5, stride=3, padding=0, count_include_pad=False),  # image size = 2 x 2
            nn.Conv2d(C, 128, 1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 768, 2, bias=False),
            nn.BatchNorm2d(768),
            nn.ReLU(inplace=True)
        )
        self.classifier = nn.Linear(768, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x.view(x.size(0), -1))
        return x


class AuxiliaryHeadImageNet(nn.Module):

    def __init__(self, C, size=5, num_classes=1000):
        """assuming input size 7x7"""
        super(AuxiliaryHeadImageNet, self).__init__()
        self.features = nn.Sequential(
            nn.ReLU(inplace=True),
            nn.AvgPool2d(size, stride=2, padding=0, count_include_pad=False),
            nn.Conv2d(C, 128, 1, bias=False),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 768, 2, bias=False),
            # NOTE: This batchnorm was omitted in my earlier implementation due to a typo.
            # Commenting it out for consistency with the experiments in the paper.
            # nn.BatchNorm2d(768),
            nn.ReLU(inplace=True)
        )
        self.classifier = nn.Linear(768, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = self.classifier(x.view(x.size(0), -1))
        return x
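
A quick, hypothetical shape check for the heads defined above: with the default pooling parameters, an 8x8 CIFAR feature map collapses to 1x1 before the classifier, so AuxiliaryHeadCIFAR maps (N, C, 8, 8) to (N, num_classes). The channel count used here is just an illustrative choice:

import torch

head = AuxiliaryHeadCIFAR(C=256, num_classes=10)
logits = head(torch.randn(2, 256, 8, 8))
print(logits.shape)  # expected: torch.Size([2, 10])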