Commit e773dfcc (OpenDAS/nni), authored Mar 21, 2023 by qianyj
create branch for v2.9
Showing 13 changed files with 1635 additions and 0 deletions.
examples/model_compress/pruning/legacy/speedup/speedup_nanodet.py (+40, -0)
examples/model_compress/pruning/legacy/speedup/speedup_yolov3.py (+36, -0)
examples/model_compress/pruning/legacy/transformers/run.sh (+43, -0)
examples/model_compress/pruning/legacy/transformers/transformer_pruning.py (+387, -0)
examples/model_compress/pruning/level_pruning_torch.py (+130, -0)
examples/model_compress/pruning/movement_pruning_glue.py (+128, -0)
examples/model_compress/pruning/norm_pruning_torch.py (+137, -0)
examples/model_compress/pruning/scheduler_torch.py (+100, -0)
examples/model_compress/pruning/simple_pruning_torch.py (+88, -0)
examples/model_compress/pruning/simulated_anealing_pruning_torch.py (+109, -0)
examples/model_compress/pruning/slim_pruning_torch.py (+136, -0)
examples/model_compress/pruning/taylorfo_lightning_evaluator.py (+165, -0)
examples/model_compress/pruning/taylorfo_pruning_torch.py (+136, -0)
examples/model_compress/pruning/legacy/speedup/speedup_nanodet.py
import torch
from nanodet.model.arch import build_model
from nanodet.util import cfg, load_config

from nni.compression.pytorch import ModelSpeedup
from nni.algorithms.compression.pytorch.pruning import L1FilterPruner

"""
NanoDet model can be installed from https://github.com/RangiLyu/nanodet.git
"""

cfg_path = r"nanodet/config/nanodet-RepVGG-A0_416.yml"
load_config(cfg, cfg_path)

model = build_model(cfg.model).cpu()
dummy_input = torch.rand(8, 3, 416, 416)

op_names = []
# these three conv layers are followed by reshape-like functions
# that cannot be replaced, so we skip these three conv layers;
# you can also get such layers with the `not_safe_to_prune` function
excludes = ['head.gfl_cls.0', 'head.gfl_cls.1', 'head.gfl_cls.2']
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Conv2d):
        if name not in excludes:
            op_names.append(name)

cfg_list = [{'op_types': ['Conv2d'], 'sparsity': 0.5, 'op_names': op_names}]

pruner = L1FilterPruner(model, cfg_list)
pruner.compress()
pruner.export_model('./model', './mask')
# need to call _unwrap_model if you want to run the speedup on the same model
pruner._unwrap_model()

# Speedup the nanodet
ms = ModelSpeedup(model, dummy_input, './mask')
ms.speedup_model()

model(dummy_input)
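As the comment in speedup_nanodet.py notes, the excluded layers can also be obtained automatically with `not_safe_to_prune` (the same helper used in speedup_yolov3.py below). A minimal sketch of that variant; whether it returns exactly the three `head.gfl_cls.*` layers for NanoDet is an assumption:

from nni.compression.pytorch.utils import not_safe_to_prune

# layers whose outputs feed reshape-like ops that ModelSpeedup cannot replace
not_safe = not_safe_to_prune(model, dummy_input)
op_names = [name for name, module in model.named_modules()
            if isinstance(module, torch.nn.Conv2d) and name not in not_safe]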
examples/model_compress/pruning/legacy/speedup/speedup_yolov3.py
import torch
from pytorchyolo import models

from nni.compression.pytorch import ModelSpeedup
from nni.algorithms.compression.pytorch.pruning import L1FilterPruner, LevelPruner
from nni.compression.pytorch.utils import not_safe_to_prune

# The YOLOv3 model can be downloaded from https://github.com/eriklindernoren/PyTorch-YOLOv3.git
prefix = '/home/user/PyTorch-YOLOv3'  # replace this path with yours

# Load the YOLO model
model = models.load_model(
    "%s/config/yolov3.cfg" % prefix,
    "%s/yolov3.weights" % prefix).cpu()
model.eval()
dummy_input = torch.rand(8, 3, 320, 320)
model(dummy_input)

# Generate the config list for the pruner
# Filter out the layers that may not be safe to prune
not_safe = not_safe_to_prune(model, dummy_input)
cfg_list = []
for name, module in model.named_modules():
    if name in not_safe:
        continue
    if isinstance(module, torch.nn.Conv2d):
        cfg_list.append({'op_types': ['Conv2d'], 'sparsity': 0.6, 'op_names': [name]})

# Prune the model
pruner = L1FilterPruner(model, cfg_list)
pruner.compress()
pruner.export_model('./model', './mask')
pruner._unwrap_model()

# Speedup the model
ms = ModelSpeedup(model, dummy_input, './mask')
ms.speedup_model()
model(dummy_input)
examples/model_compress/pruning/legacy/transformers/run.sh
#!/bin/bash
# Usage: ./run.sh gpu_id glue_task

export CUDA_VISIBLE_DEVICES=$1
TASK_NAME=$2                            # "cola", "sst2", "mrpc", "stsb", "qqp", "mnli", "qnli", "rte", "wnli"
PRETRAINED_MODEL="bert-base-uncased"    # "distilbert-base-uncased", "roberta-base", "bert-base-cased", ...

# parameters for pruning
SPARSITY=0.5
RANKING_CRITERION=l1_weight             # "l1_weight", "l2_weight", "l1_activation", "l2_activation", "taylorfo"
NUM_ITERATIONS=1                        # 1 for one-shot pruning
EPOCHS_PER_ITERATION=1

# other training parameters, no need to change
MAX_LENGTH=128
BATCH_SIZE=32
LR=2e-5
N_EPOCHS=3

time=$(date "+%Y%m%d%H%M%S")
OUTDIR="models_${PRETRAINED_MODEL}_${TASK_NAME}_$time/"

TASK_LIST=("cola" "sst2" "mrpc" "stsb" "qqp" "mnli" "qnli" "rte" "wnli")
if [[ ${TASK_LIST[*]} =~ (^|[[:space:]])$TASK_NAME($|[[:space:]]) ]]; then
    mkdir $OUTDIR
    python transformer_pruning.py \
        --sparsity $SPARSITY \
        --ranking_criterion $RANKING_CRITERION \
        --num_iterations $NUM_ITERATIONS \
        --epochs_per_iteration $EPOCHS_PER_ITERATION \
        --speedup \
        --model_name $PRETRAINED_MODEL \
        --task_name $TASK_NAME \
        --max_length $MAX_LENGTH \
        --batch_size $BATCH_SIZE \
        --learning_rate $LR \
        --num_train_epochs $N_EPOCHS \
        --output_dir $OUTDIR \
        2>&1 | tee "$OUTDIR/output.log"
else
    echo "Unsupported task $TASK_NAME."
fi
examples/model_compress/pruning/legacy/transformers/transformer_pruning.py
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

import argparse
import logging
import os

import torch
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm

from nni.compression.pytorch.utils import count_flops_params
from nni.algorithms.compression.pytorch.pruning import TransformerHeadPruner

import datasets
from datasets import load_dataset, load_metric

import transformers
from transformers import (
    AdamW,
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    get_scheduler,
)

logger = logging.getLogger("bert_pruning_example")


def parse_args():
    parser = argparse.ArgumentParser(description="Example: prune a Huggingface transformer and finetune on GLUE tasks.")

    parser.add_argument("--model_name", type=str, required=True,
                        help="Pretrained model architecture.")
    parser.add_argument("--task_name", type=str, default=None,
                        help="The name of the GLUE task.",
                        choices=["cola", "mnli", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"])
    parser.add_argument("--output_dir", type=str, default=None,
                        help="Where to store the model and mask.")
    parser.add_argument("--sparsity", type=float, required=True,
                        help="Sparsity: proportion of heads to prune (should be between 0 and 1)")
    parser.add_argument("--global_sort", action="store_true", default=False,
                        help="Rank the heads globally and prune the heads with lowest scores. If set to False, the "
                             "heads are only ranked within one layer")
    parser.add_argument("--ranking_criterion", type=str, default="l1_weight",
                        choices=["l1_weight", "l2_weight", "l1_activation", "l2_activation", "taylorfo"],
                        help="Criterion by which the attention heads are ranked.")
    parser.add_argument("--num_iterations", type=int, default=1,
                        help="Number of pruning iterations (1 for one-shot pruning).")
    parser.add_argument("--epochs_per_iteration", type=int, default=1,
                        help="Epochs to finetune before the next pruning iteration "
                             "(only effective if num_iterations > 1).")
    parser.add_argument("--speedup", action="store_true", default=False,
                        help="Whether to speedup the pruned model")

    # parameters for model training; no need to change them for running examples
    parser.add_argument("--max_length", type=int, default=128,
                        help=("The maximum total input sequence length after tokenization. Sequences longer than this "
                              "will be truncated, sequences shorter will be padded if `--pad_to_max_length` is passed."))
    parser.add_argument("--batch_size", type=int, default=8,
                        help="Batch size.")
    parser.add_argument("--learning_rate", type=float, default=5e-5,
                        help="Initial learning rate.")
    parser.add_argument("--num_train_epochs", type=int, default=3,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--lr_scheduler_type", default="linear",
                        choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"])
    parser.add_argument("--num_warmup_steps", type=int, default=0,
                        help="Number of steps for the warmup in the lr scheduler.")

    args = parser.parse_args()

    if args.output_dir is not None:
        os.makedirs(args.output_dir, exist_ok=True)

    return args


def get_raw_dataset(task_name):
    """
    Get a GLUE dataset using huggingface datasets.
    """
    raw_dataset = load_dataset("glue", task_name)
    is_regression = task_name == "stsb"
    num_labels = 1 if is_regression else len(raw_dataset["train"].features["label"].names)

    return raw_dataset, is_regression, num_labels


def preprocess(args, tokenizer, raw_dataset):
    """
    Tokenization and column renaming.
    """
    assert args.task_name is not None
    task_to_keys = {
        "cola": ("sentence", None),
        "mnli": ("premise", "hypothesis"),
        "mrpc": ("sentence1", "sentence2"),
        "qnli": ("question", "sentence"),
        "qqp": ("question1", "question2"),
        "rte": ("sentence1", "sentence2"),
        "sst2": ("sentence", None),
        "stsb": ("sentence1", "sentence2"),
        "wnli": ("sentence1", "sentence2"),
    }
    sentence1_key, sentence2_key = task_to_keys[args.task_name]

    def tokenize(data):
        texts = (
            (data[sentence1_key],) if sentence2_key is None else (data[sentence1_key], data[sentence2_key])
        )
        result = tokenizer(*texts, padding=False, max_length=args.max_length, truncation=True)

        if "label" in data:
            result["labels"] = data["label"]
        return result

    processed_datasets = raw_dataset.map(tokenize, batched=True,
                                         remove_columns=raw_dataset["train"].column_names)

    return processed_datasets


def get_dataloader_and_optimizer(args, tokenizer, model, train_dataset, eval_dataset):
    data_collator = DataCollatorWithPadding(tokenizer)
    train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.batch_size)
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.batch_size)

    optimizer = AdamW(model.parameters(), lr=args.learning_rate)

    return optimizer, train_dataloader, eval_dataloader, data_collator


def train_model(args, model, is_regression, train_dataloader, eval_dataloader, optimizer, lr_scheduler, metric, device):
    """
    Train the model using train_dataloader and evaluate after every epoch using eval_dataloader.
    This function is called before and after pruning for "pretraining" on the GLUE task and further "finetuning".
    """
    train_steps = args.num_train_epochs * len(train_dataloader)
    progress_bar = tqdm(range(train_steps), position=0, leave=True)

    for epoch in range(args.num_train_epochs):
        model.train()
        for step, batch in enumerate(train_dataloader):
            for field in batch.keys():
                batch[field] = batch[field].to(device)
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)

        model.eval()
        for step, batch in enumerate(eval_dataloader):
            for field in batch.keys():
                batch[field] = batch[field].to(device)
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1) if not is_regression \
                else outputs.logits.squeeze()
            metric.add_batch(predictions=predictions, references=batch["labels"])

        eval_metric = metric.compute()
        logger.info(f"epoch {epoch}: {eval_metric}")


def trainer_helper(model, train_dataloader, optimizer, device):
    """
    This function is used to create a "trainer" that is passed to the pruner.
    Finetune the model for 1 epoch. This function is called by the pruner during pruning iterations (or called to
    calculate scores for pruning when the ranking criterion is "taylorfo").
    """
    logger.info("Training for 1 epoch...")
    progress_bar = tqdm(range(len(train_dataloader)), position=0, leave=True)

    train_epoch = 1
    for epoch in range(train_epoch):
        for step, batch in enumerate(train_dataloader):
            for field in batch.keys():
                batch[field] = batch[field].to(device)
            outputs = model(**batch)
            outputs.loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            progress_bar.update(1)


def forward_runner_helper(model, train_dataloader, device):
    """
    This function is used to create a "forward_runner" that is passed to the pruner.
    The function just runs forward on the train set without updating the parameters.
    This allows the pruner to collect data for activation-based pruning methods.
    """
    logger.info("Running forward on the entire train set without updating parameters...")
    progress_bar = tqdm(range(len(train_dataloader)), position=0, leave=True)

    forward_epoch = 1
    for epoch in range(forward_epoch):
        for step, batch in enumerate(train_dataloader):
            for field in batch.keys():
                batch[field] = batch[field].to(device)
            _ = model(**batch)
            # note: no loss.backward() or optimizer.step() is performed here
            progress_bar.update(1)


def final_eval_for_mnli(args, model, processed_datasets, metric, data_collator):
    """
    If the task is MNLI, perform a final evaluation on the mismatched validation set.
    """
    eval_dataset = processed_datasets["validation_mismatched"]
    eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.batch_size)

    model.eval()
    for step, batch in enumerate(eval_dataloader):
        outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        metric.add_batch(
            predictions=predictions,
            references=batch["labels"],
        )

    eval_metric = metric.compute()
    logger.info(f"mnli-mm: {eval_metric}")


def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args = parse_args()

    #########################################################################
    # Prepare model, tokenizer, dataset, optimizer, and the scheduler
    logger.setLevel(logging.INFO)
    datasets.utils.logging.set_verbosity_warning()
    transformers.utils.logging.set_verbosity_info()

    # Load dataset and tokenizer, and then preprocess the dataset
    raw_dataset, is_regression, num_labels = get_raw_dataset(args.task_name)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
    processed_datasets = preprocess(args, tokenizer, raw_dataset)
    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation_matched" if args.task_name == "mnli" else "validation"]

    # Load pretrained model
    config = AutoConfig.from_pretrained(args.model_name, num_labels=num_labels, finetuning_task=args.task_name)
    model = AutoModelForSequenceClassification.from_pretrained(args.model_name, config=config)
    model.to(device)

    #########################################################################
    # Finetune on the target GLUE task before pruning
    optimizer, train_dataloader, eval_dataloader, data_collator = \
        get_dataloader_and_optimizer(args, tokenizer, model, train_dataset, eval_dataset)
    train_steps = args.num_train_epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                                 num_warmup_steps=args.num_warmup_steps, num_training_steps=train_steps)
    metric = load_metric("glue", args.task_name)

    logger.info("================= Finetuning before pruning =================")
    train_model(args, model, is_regression, train_dataloader, eval_dataloader, optimizer, lr_scheduler, metric, device)

    if args.output_dir is not None:
        torch.save(model.state_dict(), args.output_dir + "/model_before_pruning.pt")

    if args.task_name == "mnli":
        final_eval_for_mnli(args, model, processed_datasets, metric, data_collator)

    #########################################################################
    # Pruning
    optimizer, train_dataloader, eval_dataloader, data_collator = \
        get_dataloader_and_optimizer(args, tokenizer, model, train_dataset, eval_dataset)

    dummy_input = next(iter(train_dataloader))["input_ids"].to(device)
    flops, params, results = count_flops_params(model, dummy_input)
    print(f"Initial model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f} M")

    # Here the criterion is embedded in the model. Upper levels can just pass None to trainer.
    def trainer(model, optimizer, criterion, epoch):
        return trainer_helper(model, train_dataloader, optimizer, device)

    def forward_runner(model):
        return forward_runner_helper(model, train_dataloader, device)

    # example: prune different layers with different sparsity
    attention_name_groups = list(zip(["bert.encoder.layer.{}.attention.self.query".format(i) for i in range(12)],
                                     ["bert.encoder.layer.{}.attention.self.key".format(i) for i in range(12)],
                                     ["bert.encoder.layer.{}.attention.self.value".format(i) for i in range(12)],
                                     ["bert.encoder.layer.{}.attention.output.dense".format(i) for i in range(12)]))

    kwargs = {"ranking_criterion": args.ranking_criterion,
              "global_sort": args.global_sort,
              "num_iterations": args.num_iterations,
              "epochs_per_iteration": args.epochs_per_iteration,
              "attention_name_groups": attention_name_groups,
              "head_hidden_dim": 64,
              "trainer": trainer,
              "optimizer": optimizer,
              "forward_runner": forward_runner}

    config_list = [{
        "sparsity": args.sparsity,
        "op_types": ["Linear"],
        "op_names": [x for layer in attention_name_groups[:6] for x in layer]
    }, {
        "sparsity": args.sparsity / 2,
        "op_types": ["Linear"],
        "op_names": [x for layer in attention_name_groups[6:] for x in layer]
    }]

    pruner = TransformerHeadPruner(model, config_list, **kwargs)
    pruner.compress()

    #########################################################################
    # uncomment the following part to export the pruned model masks
    # model_path = os.path.join(args.output_dir, "pruned_{}_{}.pth".format(args.model_name, args.task_name))
    # mask_path = os.path.join(args.output_dir, "mask_{}_{}.pth".format(args.model_name, args.task_name))
    # pruner.export_model(model_path=model_path, mask_path=mask_path)

    #########################################################################
    # Speedup
    # Currently, speeding up Transformers through NNI ModelSpeedup is not supported because of shape inference issues.
    # However, if you are using the transformers library, you can use the following workaround:
    # The following code gets the head pruning decisions from the pruner and calls the _prune_heads() function
    # implemented in models from the transformers library to speedup the model.
    if args.speedup:
        speedup_rules = {}
        for group_idx, group in enumerate(pruner.attention_name_groups):
            # get the layer index
            layer_idx = None
            for part in group[0].split("."):
                try:
                    layer_idx = int(part)
                    break
                except:
                    continue
            if layer_idx is not None:
                speedup_rules[layer_idx] = pruner.pruned_heads[group_idx]
        pruner._unwrap_model()
        model.bert._prune_heads(speedup_rules)
        print(model)

    #########################################################################
    # After pruning, finetune again on the target task
    # Get the metric function
    metric = load_metric("glue", args.task_name)

    # re-initialize the optimizer and the scheduler
    optimizer, _, _, data_collator = get_dataloader_and_optimizer(args, tokenizer, model, train_dataset, eval_dataset)
    lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                                 num_warmup_steps=args.num_warmup_steps, num_training_steps=train_steps)

    logger.info("================= Finetuning after Pruning =================")
    train_model(args, model, is_regression, train_dataloader, eval_dataloader, optimizer, lr_scheduler, metric, device)

    if args.output_dir is not None:
        torch.save(model.state_dict(), args.output_dir + "/model_after_pruning.pt")

    if args.task_name == "mnli":
        final_eval_for_mnli(args, model, processed_datasets, metric, data_collator)

    flops, params, results = count_flops_params(model, dummy_input)
    print(f"Final model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f} M")


if __name__ == "__main__":
    main()
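The speedup workaround in transformer_pruning.py ultimately relies on the head-pruning support built into the transformers library. A minimal standalone sketch of that mechanism, using the public `prune_heads()` wrapper instead of the internal `_prune_heads()` call; the model name and head indices are illustrative only:

from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# {layer_index: [head indices to remove]} -- physically shrinks the attention projections
model.prune_heads({0: [0, 2], 3: [5]})
print(model.config.pruned_heads)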
examples/model_compress/pruning/level_pruning_torch.py
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

'''
NNI example for the supported level pruning algorithm.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys

import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR

from nni.compression.pytorch.utils import count_flops_params
from nni.compression.pytorch.pruning import LevelPruner

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0

train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)

def trainer(model, optimizer, criterion):
    global g_epoch
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx and batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                g_epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    g_epoch += 1

def evaluator(model):
    model.eval()
    correct = 0.0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc

def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
    optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
    scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
    return optimizer, scheduler

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
    parser.add_argument('--pretrain-epochs', type=int, default=20,
                        help='number of epochs to pretrain the model')
    parser.add_argument('--fine-tune-epochs', type=int, default=20,
                        help='number of epochs to fine tune the model')
    args = parser.parse_args()

    print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
    model = VGG().to(device)
    optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
    criterion = torch.nn.CrossEntropyLoss()
    pre_best_acc = 0.0
    best_state_dict = None

    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        acc = evaluator(model)
        if acc > pre_best_acc:
            pre_best_acc = acc
            best_state_dict = model.state_dict()
    print("Best accuracy: {}".format(pre_best_acc))
    model.load_state_dict(best_state_dict)
    pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))

    # Start to prune and speedup
    print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
    config_list = [{
        'sparsity': 0.5,
        'op_types': ['default']
    }]
    pruner = LevelPruner(model, config_list)
    _, masks = pruner.compress()
    pruner.show_pruned_weights()

    # Fine-grained methods do not need speedup
    print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER PRUNING ' + '=' * 50)
    evaluator(model)

    # The optimizer used in the pruner might be patched, so it is recommended to create a new optimizer for the fine-tuning stage.
    print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
    optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)

    best_acc = 0.0
    g_epoch = 0
    for i in range(args.fine_tune_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        best_acc = max(evaluator(model), best_acc)
    flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    print(f'Pretrained model FLOPs {pre_flops / 1e6:.2f} M, #Params: {pre_params / 1e6:.2f} M, Accuracy: {pre_best_acc: .2f}%')
    print(f'Finetuned model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f} M, Accuracy: {best_acc: .2f}%')
examples/model_compress/pruning/movement_pruning_glue.py
import functools
import time

from tqdm import tqdm
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from datasets import load_metric, load_dataset
from transformers import (
    BertForSequenceClassification,
    BertTokenizerFast,
    DataCollatorWithPadding,
    set_seed
)

import nni
from nni.compression.pytorch.pruning import MovementPruner

task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
gradient_accumulation_steps = 8

# a fake criterion because the huggingface output already contains the loss
def criterion(input, target):
    return input.loss

def trainer(model, optimizer, criterion, train_dataloader):
    model.train()
    counter = 0
    for batch in (train_dataloader):
        counter += 1
        batch.to(device)
        optimizer.zero_grad()
        outputs = model(**batch)
        # the pruner may wrap the criterion, e.g. loss = origin_loss + norm(weight), so call criterion to get the loss here
        loss = criterion(outputs, None)
        loss = loss / gradient_accumulation_steps
        loss.backward()
        if counter % gradient_accumulation_steps == 0 or counter == len(train_dataloader):
            optimizer.step()
        if counter % 800 == 0:
            print('[{}]: {}'.format(time.asctime(time.localtime(time.time())), counter))
        if counter % 8000 == 0:
            print('Step {}: {}'.format(counter // gradient_accumulation_steps,
                                       evaluator(model, metric, is_regression, validate_dataloader)))

def evaluator(model, metric, is_regression, eval_dataloader):
    model.eval()
    for batch in (eval_dataloader):
        batch.to(device)
        outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1) if not is_regression else outputs.logits.squeeze()
        metric.add_batch(
            predictions=predictions,
            references=batch["labels"],
        )
    return metric.compute()

if __name__ == '__main__':
    task_name = 'mnli'
    is_regression = False
    num_labels = 1 if is_regression else (3 if task_name == 'mnli' else 2)
    train_batch_size = 4
    eval_batch_size = 4

    set_seed(1024)
    tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
    sentence1_key, sentence2_key = task_to_keys[task_name]

    # used to preprocess the raw data
    def preprocess_function(examples):
        # Tokenize the texts
        args = (
            (examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
        )
        result = tokenizer(*args, padding=False, max_length=128, truncation=True)

        if "label" in examples:
            # In all cases, rename the column to labels because the model will expect that.
            result["labels"] = examples["label"]
        return result

    raw_datasets = load_dataset('glue', task_name, cache_dir='./data')
    processed_datasets = raw_datasets.map(preprocess_function, batched=True,
                                          remove_columns=raw_datasets["train"].column_names)

    train_dataset = processed_datasets['train']
    validate_dataset = processed_datasets['validation_matched' if task_name == "mnli" else 'validation']

    data_collator = DataCollatorWithPadding(tokenizer)
    train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=train_batch_size)
    validate_dataloader = DataLoader(validate_dataset, collate_fn=data_collator, batch_size=eval_batch_size)

    metric = load_metric("glue", task_name)

    model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=num_labels).to(device)
    print('Initial: {}'.format(evaluator(model, metric, is_regression, validate_dataloader)))

    config_list = [{'op_types': ['Linear'], 'op_partial_names': ['bert.encoder'], 'sparsity': 0.9}]
    p_trainer = functools.partial(trainer, train_dataloader=train_dataloader)

    # make sure you have used nni.trace to wrap the optimizer class before initializing it
    traced_optimizer = nni.trace(Adam)(model.parameters(), lr=2e-5)
    pruner = MovementPruner(model, config_list, p_trainer, traced_optimizer, criterion, training_epochs=10,
                            warm_up_step=12272, cool_down_beginning_step=110448)
    _, masks = pruner.compress()
    pruner.show_pruned_weights()

    print('Final: {}'.format(evaluator(model, metric, is_regression, validate_dataloader)))

    optimizer = Adam(model.parameters(), lr=2e-5)
    trainer(model, optimizer, criterion, train_dataloader)
    print('After 1 epoch finetuning: {}'.format(evaluator(model, metric, is_regression, validate_dataloader)))
examples/model_compress/pruning/norm_pruning_torch.py
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

'''
NNI example for the supported l1norm and l2norm pruning algorithms.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys

import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR

from nni.compression.pytorch import ModelSpeedup
from nni.compression.pytorch.utils import count_flops_params
from nni.compression.pytorch.pruning import L1NormPruner, L2NormPruner

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0

train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)

def trainer(model, optimizer, criterion):
    global g_epoch
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx and batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                g_epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    g_epoch += 1

def evaluator(model):
    model.eval()
    correct = 0.0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc

def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
    optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
    scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
    return optimizer, scheduler

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
    parser.add_argument('--pruner', type=str, default='l1norm',
                        choices=['l1norm', 'l2norm'],
                        help='pruner to use')
    parser.add_argument('--pretrain-epochs', type=int, default=20,
                        help='number of epochs to pretrain the model')
    parser.add_argument('--fine-tune-epochs', type=int, default=20,
                        help='number of epochs to fine tune the model')
    args = parser.parse_args()

    print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
    model = VGG().to(device)
    optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
    criterion = torch.nn.CrossEntropyLoss()
    pre_best_acc = 0.0
    best_state_dict = None

    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        acc = evaluator(model)
        if acc > pre_best_acc:
            pre_best_acc = acc
            best_state_dict = model.state_dict()
    print("Best accuracy: {}".format(pre_best_acc))
    model.load_state_dict(best_state_dict)
    pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    g_epoch = 0

    # Start to prune and speedup
    print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
    config_list = [{
        'sparsity': 0.5,
        'op_types': ['Conv2d']
    }]
    if 'l1' in args.pruner:
        pruner = L1NormPruner(model, config_list)
    else:
        pruner = L2NormPruner(model, config_list)
    _, masks = pruner.compress()
    pruner.show_pruned_weights()
    pruner._unwrap_model()
    ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()

    print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
    evaluator(model)

    # The optimizer used in the pruner might be patched, so it is recommended to create a new optimizer for the fine-tuning stage.
    print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
    optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)

    best_acc = 0.0
    for i in range(args.fine_tune_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        best_acc = max(evaluator(model), best_acc)
    flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    print(f'Pretrained model FLOPs {pre_flops / 1e6:.2f} M, #Params: {pre_params / 1e6:.2f} M, Accuracy: {pre_best_acc: .2f}%')
    print(f'Finetuned model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f} M, Accuracy: {best_acc: .2f}%')
examples/model_compress/pruning/scheduler_torch.py
import sys

from tqdm import tqdm
import torch
from torchvision import datasets, transforms

from nni.compression.pytorch.pruning import L1NormPruner
from nni.algorithms.compression.v2.pytorch.pruning.tools import AGPTaskGenerator
from nni.algorithms.compression.v2.pytorch.pruning.basic_scheduler import PruningScheduler

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)

criterion = torch.nn.CrossEntropyLoss()

def trainer(model, optimizer, criterion, epoch):
    model.train()
    for data, target in tqdm(iterable=train_loader, desc='Epoch {}'.format(epoch)):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

def finetuner(model):
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()
    for data, target in tqdm(iterable=train_loader, desc='Epoch PFs'):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

def evaluator(model):
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in tqdm(iterable=test_loader, desc='Test'):
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc

if __name__ == '__main__':
    model = VGG().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()

    # pre-train the model
    for i in range(5):
        trainer(model, optimizer, criterion, i)

    # No need to pass model and config_list to the pruner during initialization when using a scheduler.
    pruner = L1NormPruner(None, None)

    # You can specify log_dir; all intermediate results and the best result will be saved under this folder.
    # If you don't want to keep intermediate results, you can set `keep_intermediate_result=False`.
    config_list = [{'op_types': ['Conv2d'], 'sparsity': 0.8}]
    task_generator = AGPTaskGenerator(10, model, config_list, log_dir='.', keep_intermediate_result=True)

    dummy_input = torch.rand(10, 3, 32, 32).to(device)

    # If you just want to keep the final result as the best result, you can pass evaluator as None.
    # Otherwise, the result with the highest score (given by evaluator) will be the best result.
    # scheduler = PruningScheduler(pruner, task_generator, finetuner=finetuner, speedup=True, dummy_input=dummy_input, evaluator=evaluator)
    scheduler = PruningScheduler(pruner, task_generator, finetuner=finetuner, speedup=True, dummy_input=dummy_input, evaluator=None, reset_weight=False)

    scheduler.compress()

    _, model, masks, _, _ = scheduler.get_best_result()
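As in the unpacking on the last line above, `get_best_result()` yields a 5-tuple whose second and third elements are the best pruned model and its masks. A minimal follow-up sketch that simply persists them; the file names are arbitrary:

_, best_model, best_masks, _, _ = scheduler.get_best_result()
torch.save(best_model.state_dict(), './agp_best_model.pt')
torch.save(best_masks, './agp_best_masks.pt')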
examples/model_compress/pruning/simple_pruning_torch.py
import sys

from tqdm import tqdm
import torch
from torchvision import datasets, transforms

from nni.compression.pytorch.pruning import L1NormPruner
from nni.compression.pytorch.speedup import ModelSpeedup

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)

criterion = torch.nn.CrossEntropyLoss()

def trainer(model, optimizer, criterion, epoch):
    model.train()
    for data, target in tqdm(iterable=train_loader, desc='Epoch {}'.format(epoch)):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

def evaluator(model):
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in tqdm(iterable=test_loader, desc='Test'):
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc

if __name__ == '__main__':
    model = VGG().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()

    print('\nPre-train the model:')
    for i in range(5):
        trainer(model, optimizer, criterion, i)
    evaluator(model)

    config_list = [{'op_types': ['Conv2d'], 'sparsity': 0.8}]
    pruner = L1NormPruner(model, config_list)
    _, masks = pruner.compress()

    print('\nThe accuracy with masks:')
    evaluator(model)

    pruner._unwrap_model()
    ModelSpeedup(model, dummy_input=torch.rand(10, 3, 32, 32).to(device), masks_file=masks).speedup_model()

    print('\nThe accuracy after speedup:')
    evaluator(model)

    # Need a new optimizer because the modules in the model will be replaced during speedup.
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    print('\nFinetune the model after speedup:')
    for i in range(5):
        trainer(model, optimizer, criterion, i)
    evaluator(model)
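Because `speedup_model()` replaces the masked modules with physically smaller ones (which is why a fresh optimizer is created above), the effect can be sanity-checked by listing the convolution shapes after speedup. A minimal sketch; the exact channel counts depend on the VGG definition in `cifar10.vgg`, which is not part of this commit excerpt:

for name, module in model.named_modules():
    if isinstance(module, torch.nn.Conv2d):
        print(f'{name}: in_channels={module.in_channels}, out_channels={module.out_channels}')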
examples/model_compress/pruning/simulated_anealing_pruning_torch.py
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

'''
NNI example for the simulated annealing pruning algorithm.
In this example, we show the end-to-end iterative pruning process: pre-training -> pruning -> fine-tuning.
'''
import sys
import argparse

from tqdm import tqdm
import torch
from torchvision import datasets, transforms

from nni.compression.pytorch.pruning import SimulatedAnnealingPruner

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))

train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)

criterion = torch.nn.CrossEntropyLoss()

def trainer(model, optimizer, criterion, epoch):
    model.train()
    for data, target in tqdm(iterable=train_loader, desc='Epoch {}'.format(epoch)):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

def finetuner(model):
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()
    for data, target in tqdm(iterable=train_loader, desc='Epoch PFs'):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()

def evaluator(model):
    model.eval()
    correct = 0
    with torch.no_grad():
        for data, target in tqdm(iterable=test_loader, desc='Test'):
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Iterative Example for model compression')
    parser.add_argument('--pretrain-epochs', type=int, default=10,
                        help='number of epochs to pretrain the model')
    parser.add_argument('--pruning-algo', type=str, default='l1',
                        choices=['level', 'l1', 'l2', 'fpgm', 'slim', 'apoz', 'mean_activation', 'taylorfo', 'admm'],
                        help='algorithm to evaluate weights to prune')
    parser.add_argument('--cool-down-rate', type=float, default=0.9,
                        help='Cool down rate of the temperature.')
    args = parser.parse_args()

    model = VGG().to(device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
    criterion = torch.nn.CrossEntropyLoss()

    # pre-train the model
    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion, i)
    evaluator(model)

    config_list = [{'op_types': ['Conv2d'], 'total_sparsity': 0.8}]

    # evaluator in 'SimulatedAnnealingPruner' must not be None.
    pruner = SimulatedAnnealingPruner(model, config_list, pruning_algorithm=args.pruning_algo, evaluator=evaluator,
                                      cool_down_rate=args.cool_down_rate, finetuner=finetuner)
    pruner.compress()
    _, model, masks, _, _ = pruner.get_best_result()
    evaluator(model)
examples/model_compress/pruning/slim_pruning_torch.py
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

'''
NNI example for the supported slim pruning algorithm.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> speedup -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys

import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR

import nni
from nni.compression.pytorch import ModelSpeedup
from nni.compression.pytorch.utils import count_flops_params
from nni.compression.pytorch.pruning import SlimPruner

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0

train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)

def trainer(model, optimizer, criterion):
    global g_epoch
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx and batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                g_epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    g_epoch += 1

def evaluator(model):
    model.eval()
    correct = 0.0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc

def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
    optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
    scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
    return optimizer, scheduler

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
    parser.add_argument('--pretrain-epochs', type=int, default=20,
                        help='number of epochs to pretrain the model')
    parser.add_argument('--fine-tune-epochs', type=int, default=20,
                        help='number of epochs to fine tune the model')
    args = parser.parse_args()

    print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
    model = VGG().to(device)
    optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
    criterion = torch.nn.CrossEntropyLoss()
    pre_best_acc = 0.0
    best_state_dict = None

    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        acc = evaluator(model)
        if acc > pre_best_acc:
            pre_best_acc = acc
            best_state_dict = model.state_dict()
    print("Best accuracy: {}".format(pre_best_acc))
    model.load_state_dict(best_state_dict)
    pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    g_epoch = 0

    # Start to prune and speedup
    print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
    config_list = [{
        'total_sparsity': 0.5,
        'op_types': ['BatchNorm2d'],
        'max_sparsity_per_layer': 0.9
    }]

    # make sure you have used nni.trace to wrap the optimizer class before initializing it
    traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    pruner = SlimPruner(model, config_list, trainer, traced_optimizer, criterion, training_epochs=1, scale=0.0001, mode='global')
    _, masks = pruner.compress()
    pruner.show_pruned_weights()
    pruner._unwrap_model()
    ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()

    print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
    evaluator(model)

    # The optimizer used in the pruner might be patched, so it is recommended to create a new optimizer for the fine-tuning stage.
    print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
    optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)

    best_acc = 0.0
    g_epoch = 0
    for i in range(args.fine_tune_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        best_acc = max(evaluator(model), best_acc)
    flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    print(f'Pretrained model FLOPs {pre_flops / 1e6:.2f} M, #Params: {pre_params / 1e6:.2f} M, Accuracy: {pre_best_acc: .2f}%')
    print(f'Finetuned model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f} M, Accuracy: {best_acc: .2f}%')
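SlimPruner follows the network-slimming idea: the sparsity-regularized training (`scale=0.0001`) pushes BatchNorm scaling factors toward zero, and channels with the smallest factors are pruned. A minimal inspection sketch of those factors; the 1e-3 threshold is arbitrary and is not the pruner's internal rule:

for name, module in model.named_modules():
    if isinstance(module, torch.nn.BatchNorm2d):
        gamma = module.weight.detach().abs()
        print(f'{name}: {(gamma < 1e-3).float().mean().item():.2%} of scaling factors below 1e-3')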
examples/model_compress/pruning/taylorfo_lightning_evaluator.py
from
__future__
import
annotations
import
pytorch_lightning
as
pl
from
pytorch_lightning.loggers
import
TensorBoardLogger
import
torch
from
torch.optim.lr_scheduler
import
StepLR
from
torch.utils.data
import
DataLoader
from
torchmetrics.functional
import
accuracy
from
torchvision
import
datasets
,
transforms
import
nni
from
nni.algorithms.compression.v2.pytorch
import
LightningEvaluator
import
sys
from
pathlib
import
Path
sys
.
path
.
append
(
str
(
Path
(
__file__
).
absolute
().
parents
[
1
]
/
'models'
))
from
cifar10.vgg
import
VGG
class
SimpleLightningModel
(
pl
.
LightningModule
):
def
__init__
(
self
):
super
().
__init__
()
self
.
model
=
VGG
()
self
.
criterion
=
torch
.
nn
.
CrossEntropyLoss
()
def
forward
(
self
,
x
):
return
self
.
model
(
x
)
def
training_step
(
self
,
batch
,
batch_idx
):
x
,
y
=
batch
logits
=
self
(
x
)
loss
=
self
.
criterion
(
logits
,
y
)
self
.
log
(
"train_loss"
,
loss
)
return
loss
def
evaluate
(
self
,
batch
,
stage
=
None
):
x
,
y
=
batch
logits
=
self
(
x
)
loss
=
self
.
criterion
(
logits
,
y
)
preds
=
torch
.
argmax
(
logits
,
dim
=
1
)
acc
=
accuracy
(
preds
,
y
)
if
stage
:
self
.
log
(
f
"default"
,
loss
,
prog_bar
=
False
)
self
.
log
(
f
"
{
stage
}
_loss"
,
loss
,
prog_bar
=
True
)
self
.
log
(
f
"
{
stage
}
_acc"
,
acc
,
prog_bar
=
True
)
def
validation_step
(
self
,
batch
,
batch_idx
):
self
.
evaluate
(
batch
,
"val"
)
def
test_step
(
self
,
batch
,
batch_idx
):
self
.
evaluate
(
batch
,
"test"
)
def
configure_optimizers
(
self
):
optimizer
=
nni
.
trace
(
torch
.
optim
.
Adam
)(
self
.
parameters
(),
lr
=
0.001
)
scheduler_dict
=
{
"scheduler"
:
nni
.
trace
(
StepLR
)(
optimizer
,
step_size
=
1
,
gamma
=
0.5
),
"interval"
:
"epoch"
,
}
return
{
"optimizer"
:
optimizer
,
"lr_scheduler"
:
scheduler_dict
}
class ImageNetDataModule(pl.LightningDataModule):
    def __init__(self, data_dir: str = "./data"):
        super().__init__()
        self.data_dir = data_dir

    def prepare_data(self):
        # download
        datasets.CIFAR10(self.data_dir, train=True, download=True)
        datasets.CIFAR10(self.data_dir, train=False, download=True)

    def setup(self, stage: str | None = None):
        if stage == "fit" or stage is None:
            self.cifar10_train_data = datasets.CIFAR10(root='data', train=True, transform=transforms.Compose([
                transforms.RandomHorizontalFlip(),
                transforms.RandomCrop(32, 4),
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            ]))
            self.cifar10_val_data = datasets.CIFAR10(root='./data', train=False, transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            ]))
        if stage == "test" or stage is None:
            self.cifar10_test_data = datasets.CIFAR10(root='./data', train=False, transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            ]))
        if stage == "predict" or stage is None:
            self.cifar10_predict_data = datasets.CIFAR10(root='./data', train=False, transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
            ]))

    def train_dataloader(self):
        return DataLoader(self.cifar10_train_data, batch_size=128, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.cifar10_val_data, batch_size=128, shuffle=False)

    def test_dataloader(self):
        return DataLoader(self.cifar10_test_data, batch_size=128, shuffle=False)

    def predict_dataloader(self):
        return DataLoader(self.cifar10_predict_data, batch_size=128, shuffle=False)
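
# Despite its name, ImageNetDataModule above serves CIFAR-10, matching the CIFAR-10 VGG model
# used in this example.
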
# Train the model
pl_trainer = nni.trace(pl.Trainer)(
    accelerator='auto',
    devices=1,
    max_epochs=3,
    logger=TensorBoardLogger('./lightning_logs', name="vgg"),
)
pl_data = nni.trace(ImageNetDataModule)(data_dir='./data')
model = SimpleLightningModel()
pl_trainer.fit(model, pl_data)
metric = pl_trainer.test(model, pl_data)
print(f'The trained model accuracy: {metric}')

# create traced optimizer / lr_scheduler
optimizer = nni.trace(torch.optim.Adam)(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
lr_scheduler = nni.trace(StepLR)(optimizer, step_size=1, gamma=0.5)
dummy_input = torch.rand(4, 3, 224, 224)

# LightningEvaluator initialization
evaluator = LightningEvaluator(pl_trainer, pl_data)

# apply pruning
from nni.compression.pytorch.pruning import TaylorFOWeightPruner
from nni.compression.pytorch.speedup import ModelSpeedup

pruner = TaylorFOWeightPruner(model, config_list=[{'total_sparsity': 0.5, 'op_types': ['Conv2d']}], evaluator=evaluator, training_steps=100)
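# training_steps=100 lets the pruner run roughly 100 training steps through the evaluator to
# accumulate first-order Taylor importance scores before the masks are generated.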
_, masks = pruner.compress()
metric = pl_trainer.test(model, pl_data)
print(f'The masked model accuracy: {metric}')
pruner.show_pruned_weights()
pruner._unwrap_model()

ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]), masks_file=masks).speedup_model()
metric = pl_trainer.test(model, pl_data)
print(f'The speedup model accuracy: {metric}')

# finetune the speedup model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.5)
pl_trainer = pl.Trainer(
    accelerator='auto',
    devices=1,
    max_epochs=3,
    logger=TensorBoardLogger('./lightning_logs', name="vgg"),
)
pl_trainer.fit(model, pl_data)
metric = pl_trainer.test(model, pl_data)
print(f'The speedup model after finetuning accuracy: {metric}')
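
# A minimal follow-up sketch (optional; the file name is illustrative): after speedup_model() the
# layer shapes no longer match a freshly constructed VGG(), so the simplest way to persist the
# compressed network is to save the whole module object rather than only its state_dict.
torch.save(model, './pruned_vgg_lightning.pth')
# To reuse it later, load the whole object back (the VGG class must still be importable):
# reloaded = torch.load('./pruned_vgg_lightning.pth')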
examples/model_compress/pruning/taylorfo_pruning_torch.py
0 → 100644
View file @
e773dfcc
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for the TaylorFOWeight pruning algorithm.
It shows the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate pruning; to obtain a genuinely compressed model, model speedup is required.
'''
import argparse
import sys

import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR

import nni
from nni.compression.pytorch import ModelSpeedup
from nni.compression.pytorch.utils import count_flops_params
from nni.compression.pytorch.pruning import TaylorFOWeightPruner

from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0
train_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, 4),
        transforms.ToTensor(),
        normalize,
    ]), download=True),
    batch_size=128, shuffle=True)

test_loader = torch.utils.data.DataLoader(
    datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
        transforms.ToTensor(),
        normalize,
    ])),
    batch_size=128, shuffle=False)
def trainer(model, optimizer, criterion):
    global g_epoch
    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx and batch_idx % 100 == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                g_epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
    g_epoch += 1
def evaluator(model):
    model.eval()
    correct = 0.0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    acc = 100 * correct / len(test_loader.dataset)
    print('Accuracy: {}%\n'.format(acc))
    return acc
def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
    optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
    scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
    return optimizer, scheduler
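
# The schedule above decays the learning rate by a factor of 10 at 50% and 75% of total_epoch.
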
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
    parser.add_argument('--pretrain-epochs', type=int, default=20,
                        help='number of epochs to pretrain the model')
    parser.add_argument('--fine-tune-epochs', type=int, default=20,
                        help='number of epochs to fine tune the model')
    args = parser.parse_args()
    print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
    model = VGG().to(device)
    optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
    criterion = torch.nn.CrossEntropyLoss()
    pre_best_acc = 0.0
    best_state_dict = None

    for i in range(args.pretrain_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        acc = evaluator(model)
        if acc > pre_best_acc:
            pre_best_acc = acc
            best_state_dict = model.state_dict()
    print("Best accuracy: {}".format(pre_best_acc))
    model.load_state_dict(best_state_dict)
    pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    g_epoch = 0

    # Start to prune and speedup
    print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
    config_list = [{
        'total_sparsity': 0.5,
        'op_types': ['Conv2d'],
    }]
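    # 'total_sparsity': 0.5 asks the pruner to remove about half of the weights across all
    # Conv2d layers taken together; individual layers may end up more or less sparse.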
    # make sure you have used nni.trace to wrap the optimizer class before initializing it
    traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
    pruner = TaylorFOWeightPruner(model, config_list, trainer, traced_optimizer, criterion, training_batches=20)
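    # training_batches=20 means the pruner reuses `trainer` for about 20 batches to collect
    # first-order Taylor importance scores before generating the masks.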
    _, masks = pruner.compress()
    pruner.show_pruned_weights()
    pruner._unwrap_model()
    ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()
    print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
    evaluator(model)
    # The optimizer used in the pruner might have been patched, so create a new optimizer for the fine-tuning stage.
    print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
    optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)

    best_acc = 0.0
    g_epoch = 0
    for i in range(args.fine_tune_epochs):
        trainer(model, optimizer, criterion)
        scheduler.step()
        best_acc = max(evaluator(model), best_acc)
    flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
    print(f'Pretrained model FLOPs {pre_flops/1e6:.2f}M, #Params: {pre_params/1e6:.2f}M, Accuracy: {pre_best_acc: .2f}%')
    print(f'Finetuned model FLOPs {flops/1e6:.2f}M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}%')
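
    # A minimal follow-up sketch (optional; numbers depend on hardware): time a few forward
    # passes of the compressed, fine-tuned model to see whether the FLOPs reduction reported
    # above translates into lower inference latency on this device.
    import time
    model.eval()
    sample = torch.randn([128, 3, 32, 32]).to(device)
    with torch.no_grad():
        for _ in range(5):  # warm-up
            model(sample)
        if device.type == 'cuda':
            torch.cuda.synchronize()
        start = time.perf_counter()
        for _ in range(20):
            model(sample)
        if device.type == 'cuda':
            torch.cuda.synchronize()
    print(f'Average forward latency: {(time.perf_counter() - start) * 1000 / 20:.2f} ms per batch of 128')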