Commit e773dfcc authored by qianyj

create branch for v2.9
import torch
from nanodet.model.arch import build_model
from nanodet.util import cfg, load_config
from nni.compression.pytorch import ModelSpeedup
from nni.algorithms.compression.pytorch.pruning import L1FilterPruner
"""
NanoDet model can be installed from https://github.com/RangiLyu/nanodet.git
"""
cfg_path = r"nanodet/config/nanodet-RepVGG-A0_416.yml"
load_config(cfg, cfg_path)
model = build_model(cfg.model).cpu()
dummy_input = torch.rand(8, 3, 416, 416)
op_names = []
# These three conv layers are followed by reshape-like operations that cannot be
# replaced during speedup, so we skip them here. Such layers can also be obtained
# automatically with the `not_safe_to_prune` function.
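# A minimal sketch (an assumption, mirroring the `not_safe_to_prune` helper used in the
# YOLOv3 example below) of deriving this exclude list automatically instead of hard-coding it:
#   from nni.compression.pytorch.utils import not_safe_to_prune
#   excludes = not_safe_to_prune(model, dummy_input)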
excludes = ['head.gfl_cls.0', 'head.gfl_cls.1', 'head.gfl_cls.2']
for name, module in model.named_modules():
    if isinstance(module, torch.nn.Conv2d):
        if name not in excludes:
            op_names.append(name)
cfg_list = [{'op_types':['Conv2d'], 'sparsity':0.5, 'op_names':op_names}]
pruner = L1FilterPruner(model, cfg_list)
pruner.compress()
pruner.export_model('./model', './mask')
# _unwrap_model must be called if you want to run speedup on the same model
pruner._unwrap_model()
# Speedup the nanodet
ms = ModelSpeedup(model, dummy_input, './mask')
ms.speedup_model()
model(dummy_input)
import torch
from pytorchyolo import models
from nni.compression.pytorch import ModelSpeedup
from nni.algorithms.compression.pytorch.pruning import L1FilterPruner, LevelPruner
from nni.compression.pytorch.utils import not_safe_to_prune
# The YOLOv3 model can be downloaded from https://github.com/eriklindernoren/PyTorch-YOLOv3.git
prefix = '/home/user/PyTorch-YOLOv3' # replace this path with yours
# Load the YOLO model
model = models.load_model(
    "%s/config/yolov3.cfg" % prefix,
    "%s/yolov3.weights" % prefix).cpu()
model.eval()
dummy_input = torch.rand(8, 3, 320, 320)
model(dummy_input)
# Generate the config list for the pruner
# Skip the layers that may not be safe to prune
not_safe = not_safe_to_prune(model, dummy_input)
cfg_list = []
for name, module in model.named_modules():
    if name in not_safe:
        continue
    if isinstance(module, torch.nn.Conv2d):
        cfg_list.append({'op_types':['Conv2d'], 'sparsity':0.6, 'op_names':[name]})
# Prune the model
pruner = L1FilterPruner(model, cfg_list)
pruner.compress()
pruner.export_model('./model', './mask')
pruner._unwrap_model()
# Speedup the model
ms = ModelSpeedup(model, dummy_input, './mask')
ms.speedup_model()
model(dummy_input)
#!/bin/bash
# Usage: ./run.sh gpu_id glue_task
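# Example (hypothetical invocation): ./run.sh 0 mrpc   # prune on the MRPC task using GPU 0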
export CUDA_VISIBLE_DEVICES=$1
TASK_NAME=$2 # "cola", "sst2", "mrpc", "stsb", "qqp", "mnli", "qnli", "rte", "wnli"
PRETRAINED_MODEL="bert-base-uncased" # "distilbert-base-uncased", "roberta-base", "bert-base-cased", ...
# parameters for pruning
SPARSITY=0.5
RANKING_CRITERION=l1_weight # "l1_weight", "l2_weight", "l1_activation", "l2_activation", "taylorfo"
NUM_ITERATIONS=1 # 1 for one-shot pruning
EPOCHS_PER_ITERATION=1
# other training parameters, no need to change
MAX_LENGTH=128
BATCH_SIZE=32
LR=2e-5
N_EPOCHS=3
time=$(date "+%Y%m%d%H%M%S")
OUTDIR="models_${PRETRAINED_MODEL}_${TASK_NAME}_$time/"
TASK_LIST=("cola" "sst2" "mrpc" "stsb" "qqp" "mnli" "qnli" "rte" "wnli")
if [[ ${TASK_LIST[*]} =~ (^|[[:space:]])$TASK_NAME($|[[:space:]]) ]]; then
    mkdir $OUTDIR
    python transformer_pruning.py \
        --sparsity $SPARSITY \
        --ranking_criterion $RANKING_CRITERION \
        --num_iterations $NUM_ITERATIONS \
        --epochs_per_iteration $EPOCHS_PER_ITERATION \
        --speedup \
        --model_name $PRETRAINED_MODEL \
        --task_name $TASK_NAME \
        --max_length $MAX_LENGTH \
        --batch_size $BATCH_SIZE \
        --learning_rate $LR \
        --num_train_epochs $N_EPOCHS \
        --output_dir $OUTDIR \
        2>&1 | tee "$OUTDIR/output.log"
else
    echo "Unsupported task $TASK_NAME."
fi
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import argparse
import logging
import os
import torch
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
from nni.compression.pytorch.utils import count_flops_params
from nni.algorithms.compression.pytorch.pruning import TransformerHeadPruner
import datasets
from datasets import load_dataset, load_metric
import transformers
from transformers import (
AdamW,
AutoConfig,
AutoModelForSequenceClassification,
AutoTokenizer,
DataCollatorWithPadding,
get_scheduler,
)
logger = logging.getLogger("bert_pruning_example")
def parse_args():
parser = argparse.ArgumentParser(
description="Example: prune a Huggingface transformer and finetune on GLUE tasks.")
parser.add_argument("--model_name", type=str, required=True,
help="Pretrained model architecture.")
parser.add_argument("--task_name", type=str, default=None,
help="The name of the GLUE task.",
choices=["cola", "mnli", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"])
parser.add_argument("--output_dir", type=str, default=None,
help="Where to store the model and mask.")
parser.add_argument("--sparsity", type=float, required=True,
help="Sparsity: proportion of heads to prune (should be between 0 and 1)")
parser.add_argument("--global_sort", action="store_true", default=False,
help="Rank the heads globally and prune the heads with lowest scores. If set to False, the "
"heads are only ranked within one layer")
parser.add_argument("--ranking_criterion", type=str, default="l1_weight",
choices=["l1_weight", "l2_weight",
"l1_activation", "l2_activation", "taylorfo"],
help="Criterion by which the attention heads are ranked.")
parser.add_argument("--num_iterations", type=int, default=1,
help="Number of pruning iterations (1 for one-shot pruning).")
parser.add_argument("--epochs_per_iteration", type=int, default=1,
help="Epochs to finetune before the next pruning iteration "
"(only effective if num_iterations > 1).")
parser.add_argument("--speedup", action="store_true", default=False,
help="Whether to speedup the pruned model")
# parameters for model training; no need to change them for running examples
parser.add_argument("--max_length", type=int, default=128,
help=("The maximum total input sequence length after tokenization. Sequences longer than this "
"will be truncated, sequences shorter will be padded if `--pad_to_max_lengh` is passed."))
parser.add_argument("--batch_size", type=int, default=8,
help="Batch size.")
parser.add_argument("--learning_rate", type=float, default=5e-5,
help="Initial learning rate.")
parser.add_argument("--num_train_epochs", type=int, default=3,
help="Total number of training epochs to perform.")
parser.add_argument("--lr_scheduler_type", default="linear",
choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant",
"constant_with_warmup"])
parser.add_argument("--num_warmup_steps", type=int, default=0,
help="Number of steps for the warmup in the lr scheduler.")
args = parser.parse_args()
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
def get_raw_dataset(task_name):
"""
Get a GLUE dataset using huggingface datasets.
"""
raw_dataset = load_dataset("glue", task_name)
is_regression = task_name == "stsb"
num_labels = 1 if is_regression else len(
raw_dataset["train"].features["label"].names)
return raw_dataset, is_regression, num_labels
def preprocess(args, tokenizer, raw_dataset):
"""
Tokenization and column renaming.
"""
assert args.task_name is not None
task_to_keys = {
"cola": ("sentence", None),
"mnli": ("premise", "hypothesis"),
"mrpc": ("sentence1", "sentence2"),
"qnli": ("question", "sentence"),
"qqp": ("question1", "question2"),
"rte": ("sentence1", "sentence2"),
"sst2": ("sentence", None),
"stsb": ("sentence1", "sentence2"),
"wnli": ("sentence1", "sentence2"),
}
sentence1_key, sentence2_key = task_to_keys[args.task_name]
def tokenize(data):
texts = (
(data[sentence1_key],) if sentence2_key is None else (
data[sentence1_key], data[sentence2_key])
)
result = tokenizer(*texts, padding=False,
max_length=args.max_length, truncation=True)
if "label" in data:
result["labels"] = data["label"]
return result
processed_datasets = raw_dataset.map(
tokenize, batched=True, remove_columns=raw_dataset["train"].column_names)
return processed_datasets
def get_dataloader_and_optimizer(args, tokenizer, model, train_dataset, eval_dataset):
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator,
batch_size=args.batch_size)
eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator,
batch_size=args.batch_size)
optimizer = AdamW(model.parameters(), lr=args.learning_rate)
return optimizer, train_dataloader, eval_dataloader, data_collator
def train_model(args, model, is_regression, train_dataloader, eval_dataloader, optimizer, lr_scheduler, metric, device):
"""
Train the model using train_dataloader and evaluate after every epoch using eval_dataloader.
This function is called before and after pruning for "pretraining" on the GLUE task and further "finetuning".
"""
train_steps = args.num_train_epochs * len(train_dataloader)
progress_bar = tqdm(range(train_steps), position=0, leave=True)
for epoch in range(args.num_train_epochs):
model.train()
for step, batch in enumerate(train_dataloader):
for field in batch.keys():
batch[field] = batch[field].to(device)
outputs = model(**batch)
outputs.loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
model.eval()
for step, batch in enumerate(eval_dataloader):
for field in batch.keys():
batch[field] = batch[field].to(device)
outputs = model(**batch)
predictions = outputs.logits.argmax(dim=-1) if not is_regression \
else outputs.logits.squeeze()
metric.add_batch(predictions=predictions, references=batch["labels"])
eval_metric = metric.compute()
logger.info(f"epoch {epoch}: {eval_metric}")
def trainer_helper(model, train_dataloader, optimizer, device):
"""
This function is used to create a "trainer" that is passed to the pruner.
Finetune the model for 1 epoch. This function is called by the pruner during pruning iterations (or called to
calculate scores for pruning when ranking criterion is "taylorfo").
"""
logger.info("Training for 1 epoch...")
progress_bar = tqdm(range(len(train_dataloader)), position=0, leave=True)
train_epoch = 1
for epoch in range(train_epoch):
for step, batch in enumerate(train_dataloader):
for field in batch.keys():
batch[field] = batch[field].to(device)
outputs = model(**batch)
outputs.loss.backward()
optimizer.step()
optimizer.zero_grad()
progress_bar.update(1)
def forward_runner_helper(model, train_dataloader, device):
"""
This function is used to create a "forward_runner" that is passed to the pruner.
The function just runs forward on the train set without updating the parameters.
This allows the pruner to collect data for activation-based pruning methods.
"""
logger.info("Running forward on the entire train set without updating parameters...")
progress_bar = tqdm(range(len(train_dataloader)), position=0, leave=True)
forward_epoch = 1
for epoch in range(forward_epoch):
for step, batch in enumerate(train_dataloader):
for field in batch.keys():
batch[field] = batch[field].to(device)
_ = model(**batch)
# note: no loss.backward or optimizer.step() is performed here
progress_bar.update(1)
def final_eval_for_mnli(args, model, processed_datasets, metric, data_collator):
"""
If the task is MNLI, perform a final evaluation on mismatched validation set
"""
eval_dataset = processed_datasets["validation_mismatched"]
eval_dataloader = DataLoader(
eval_dataset, collate_fn=data_collator, batch_size=args.batch_size
)
model.eval()
for step, batch in enumerate(eval_dataloader):
# move the batch to the same device as the model before running inference
batch = {field: tensor.to(next(model.parameters()).device) for field, tensor in batch.items()}
outputs = model(**batch)
predictions = outputs.logits.argmax(dim=-1)
metric.add_batch(
predictions=predictions,
references=batch["labels"],
)
eval_metric = metric.compute()
logger.info(f"mnli-mm: {eval_metric}")
def main():
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args = parse_args()
#########################################################################
# Prepare model, tokenizer, dataset, optimizer, and the scheduler
logger.setLevel(logging.INFO)
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
# Load dataset and tokenizer, and then preprocess the dataset
raw_dataset, is_regression, num_labels = get_raw_dataset(args.task_name)
tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
processed_datasets = preprocess(args, tokenizer, raw_dataset)
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation_matched" if args.task_name ==
"mnli" else "validation"]
# Load pretrained model
config = AutoConfig.from_pretrained(
args.model_name, num_labels=num_labels, finetuning_task=args.task_name)
model = AutoModelForSequenceClassification.from_pretrained(
args.model_name, config=config)
model.to(device)
#########################################################################
# Finetune on the target GLUE task before pruning
optimizer, train_dataloader, eval_dataloader, data_collator = get_dataloader_and_optimizer(args, tokenizer,
model,
train_dataset,
eval_dataset)
train_steps = args.num_train_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps,
num_training_steps=train_steps)
metric = load_metric("glue", args.task_name)
logger.info("================= Finetuning before pruning =================")
train_model(args, model, is_regression, train_dataloader,
eval_dataloader, optimizer, lr_scheduler, metric, device)
if args.output_dir is not None:
torch.save(model.state_dict(), args.output_dir + "/model_before_pruning.pt")
if args.task_name == "mnli":
final_eval_for_mnli(args, model, processed_datasets, metric, data_collator)
#########################################################################
# Pruning
optimizer, train_dataloader, eval_dataloader, data_collator = get_dataloader_and_optimizer(args, tokenizer,
model,
train_dataset,
eval_dataset)
dummy_input = next(iter(train_dataloader))["input_ids"].to(device)
flops, params, results = count_flops_params(model, dummy_input)
print(f"Initial model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f}M")
# The criterion is embedded in the HuggingFace model output, so the pruner can simply pass None as the criterion to this trainer.
def trainer(model, optimizer, criterion, epoch):
return trainer_helper(model, train_dataloader, optimizer, device)
def forward_runner(model):
return forward_runner_helper(model, train_dataloader, device)
# example: prune different layers with different sparsity
attention_name_groups = list(zip(["bert.encoder.layer.{}.attention.self.query".format(i) for i in range(12)],
["bert.encoder.layer.{}.attention.self.key".format(i) for i in range(12)],
["bert.encoder.layer.{}.attention.self.value".format(i) for i in range(12)],
["bert.encoder.layer.{}.attention.output.dense".format(i) for i in range(12)]))
kwargs = {"ranking_criterion": args.ranking_criterion,
"global_sort": args.global_sort,
"num_iterations": args.num_iterations,
"epochs_per_iteration": args.epochs_per_iteration,
"attention_name_groups": attention_name_groups,
"head_hidden_dim": 64,
"trainer": trainer,
"optimizer": optimizer,
"forward_runner": forward_runner}
config_list = [{
"sparsity": args.sparsity,
"op_types": ["Linear"],
"op_names": [x for layer in attention_name_groups[:6] for x in layer]
}, {
"sparsity": args.sparsity / 2,
"op_types": ["Linear"],
"op_names": [x for layer in attention_name_groups[6:] for x in layer]
}]
pruner = TransformerHeadPruner(model, config_list, **kwargs)
pruner.compress()
#########################################################################
# uncomment the following part to export the pruned model masks
# model_path = os.path.join(args.output_dir, "pruned_{}_{}.pth".format(args.model_name, args.task_name))
# mask_path = os.path.join(args.output_dir, "mask_{}_{}.pth".format(args.model_name, args.task_name))
# pruner.export_model(model_path=model_path, mask_path=mask_path)
#########################################################################
# Speedup
# Currently, speeding up Transformers through NNI ModelSpeedup is not supported because of shape-inference issues.
# However, if you are using the transformers library, you can use the following workaround:
# the code below reads the head-pruning decisions from the pruner and calls the _prune_heads() function
# implemented by the models in the transformers library to speed up the model.
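# As consumed below, pruner.pruned_heads is assumed to map each attention group index to the
# indices of the heads pruned in that group, while BertModel._prune_heads() expects a
# {layer index: head indices} dict, hence the translation from group names to layer indices.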
if args.speedup:
speedup_rules = {}
for group_idx, group in enumerate(pruner.attention_name_groups):
# get the layer index
layer_idx = None
for part in group[0].split("."):
try:
layer_idx = int(part)
break
except ValueError:
continue
if layer_idx is not None:
speedup_rules[layer_idx] = pruner.pruned_heads[group_idx]
pruner._unwrap_model()
model.bert._prune_heads(speedup_rules)
print(model)
#########################################################################
# After pruning, finetune again on the target task
# Get the metric function
metric = load_metric("glue", args.task_name)
# re-initialize the optimizer and the scheduler
optimizer, _, _, data_collator = get_dataloader_and_optimizer(args, tokenizer, model, train_dataset,
eval_dataset)
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps,
num_training_steps=train_steps)
logger.info("================= Finetuning after Pruning =================")
train_model(args, model, is_regression, train_dataloader,
eval_dataloader, optimizer, lr_scheduler, metric, device)
if args.output_dir is not None:
torch.save(model.state_dict(), args.output_dir +
"/model_after_pruning.pt")
if args.task_name == "mnli":
final_eval_for_mnli(args, model, processed_datasets,
metric, data_collator)
flops, params, results = count_flops_params(model, dummy_input)
print(f"Final model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f}M")
if __name__ == "__main__":
main()
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for supported level pruning algorithm.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys
import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR
from nni.compression.pytorch.utils import count_flops_params
from nni.compression.pytorch.pruning import LevelPruner
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
def trainer(model, optimizer, criterion):
global g_epoch
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
if batch_idx and batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
g_epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
g_epoch += 1
def evaluator(model):
model.eval()
correct = 0.0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
return optimizer, scheduler
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
parser.add_argument('--pretrain-epochs', type=int, default=20,
help='number of epochs to pretrain the model')
parser.add_argument('--fine-tune-epochs', type=int, default=20,
help='number of epochs to fine tune the model')
args = parser.parse_args()
print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
model = VGG().to(device)
optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
criterion = torch.nn.CrossEntropyLoss()
pre_best_acc = 0.0
best_state_dict = None
for i in range(args.pretrain_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
acc = evaluator(model)
if acc > pre_best_acc:
pre_best_acc = acc
best_state_dict = model.state_dict()
print("Best accuracy: {}".format(pre_best_acc))
model.load_state_dict(best_state_dict)
pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
# Start to prune and speedup
print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
config_list = [{
'sparsity': 0.5,
'op_types': ['default']
}]
pruner = LevelPruner(model, config_list)
_, masks = pruner.compress()
pruner.show_pruned_weights()
# Fine-grained pruning does not need speedup
print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER PRUNING ' + '=' * 50)
evaluator(model)
# The optimizer used by the pruner might have been patched, so it is recommended to create a new optimizer for the fine-tuning stage.
print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)
best_acc = 0.0
g_epoch = 0
for i in range(args.fine_tune_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
best_acc = max(evaluator(model), best_acc)
flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
print(f'Pretrained model FLOPs {pre_flops/1e6:.2f} M, #Params: {pre_params/1e6:.2f}M, Accuracy: {pre_best_acc: .2f}%')
print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}%')
import functools
import time
from tqdm import tqdm
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from datasets import load_metric, load_dataset
from transformers import (
BertForSequenceClassification,
BertTokenizerFast,
DataCollatorWithPadding,
set_seed
)
import nni
from nni.compression.pytorch.pruning import MovementPruner
task_to_keys = {
"cola": ("sentence", None),
"mnli": ("premise", "hypothesis"),
"mrpc": ("sentence1", "sentence2"),
"qnli": ("question", "sentence"),
"qqp": ("question1", "question2"),
"rte": ("sentence1", "sentence2"),
"sst2": ("sentence", None),
"stsb": ("sentence1", "sentence2"),
"wnli": ("sentence1", "sentence2"),
}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
gradient_accumulation_steps = 8
# a placeholder criterion: the HuggingFace model output already contains the loss
def criterion(input, target):
return input.loss
def trainer(model, optimizer, criterion, train_dataloader):
model.train()
counter = 0
for batch in (train_dataloader):
counter += 1
batch.to(device)
optimizer.zero_grad()
outputs = model(**batch)
# the pruner may wrap the criterion (for example, loss = origin_loss + norm(weight)), so call criterion here to get the loss
loss = criterion(outputs, None)
loss = loss / gradient_accumulation_steps
loss.backward()
if counter % gradient_accumulation_steps == 0 or counter == len(train_dataloader):
optimizer.step()
if counter % 800 == 0:
print('[{}]: {}'.format(time.asctime(time.localtime(time.time())), counter))
if counter % 8000 == 0:
print('Step {}: {}'.format(counter // gradient_accumulation_steps, evaluator(model, metric, is_regression, validate_dataloader)))
def evaluator(model, metric, is_regression, eval_dataloader):
model.eval()
for batch in (eval_dataloader):
batch.to(device)
outputs = model(**batch)
predictions = outputs.logits.argmax(dim=-1) if not is_regression else outputs.logits.squeeze()
metric.add_batch(
predictions=predictions,
references=batch["labels"],
)
return metric.compute()
if __name__ == '__main__':
task_name = 'mnli'
is_regression = False
num_labels = 1 if is_regression else (3 if task_name == 'mnli' else 2)
train_batch_size = 4
eval_batch_size = 4
set_seed(1024)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
sentence1_key, sentence2_key = task_to_keys[task_name]
# used to preprocess the raw data
def preprocess_function(examples):
# Tokenize the texts
args = (
(examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
)
result = tokenizer(*args, padding=False, max_length=128, truncation=True)
if "label" in examples:
# In all cases, rename the column to labels because the model will expect that.
result["labels"] = examples["label"]
return result
raw_datasets = load_dataset('glue', task_name, cache_dir='./data')
processed_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names)
train_dataset = processed_datasets['train']
validate_dataset = processed_datasets['validation_matched' if task_name == "mnli" else 'validation']
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=train_batch_size)
validate_dataloader = DataLoader(validate_dataset, collate_fn=data_collator, batch_size=eval_batch_size)
metric = load_metric("glue", task_name)
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=num_labels).to(device)
print('Initial: {}'.format(evaluator(model, metric, is_regression, validate_dataloader)))
config_list = [{'op_types': ['Linear'], 'op_partial_names': ['bert.encoder'], 'sparsity': 0.9}]
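# 'op_partial_names' matches modules whose names contain the given substring, so this config
# targets every Linear layer under bert.encoder with a target sparsity of 0.9.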
p_trainer = functools.partial(trainer, train_dataloader=train_dataloader)
# make sure the optimizer class is wrapped with nni.trace before it is initialized
traced_optimizer = nni.trace(Adam)(model.parameters(), lr=2e-5)
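# The step counts below appear to be derived from the MNLI train set (~392,702 examples):
# 392702 / (train_batch_size 4 * gradient_accumulation_steps 8) ~= 12272 optimizer steps per
# epoch, so pruning warms up for roughly the first epoch (warm_up_step) and the sparsity
# schedule finishes ramping around epoch 9 of the 10 training epochs (cool_down_beginning_step).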
pruner = MovementPruner(model, config_list, p_trainer, traced_optimizer, criterion, training_epochs=10,
warm_up_step=12272, cool_down_beginning_step=110448)
_, masks = pruner.compress()
pruner.show_pruned_weights()
print('Final: {}'.format(evaluator(model, metric, is_regression, validate_dataloader)))
optimizer = Adam(model.parameters(), lr=2e-5)
trainer(model, optimizer, criterion, train_dataloader)
print('After 1 epoch finetuning: {}'.format(evaluator(model, metric, is_regression, validate_dataloader)))
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for supported l1norm and l2norm pruning algorithms.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
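# A rough conceptual sketch of what "masking" means here (not NNI's actual wrapper code): after
# pruner.compress(), each pruned Conv2d still holds its full-size weight and the pruner keeps a
# binary mask, so the effective weight is roughly `conv.weight * masks[name]['weight']`;
# only ModelSpeedup (called below) rebuilds physically smaller layers from those masks.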
import argparse
import sys
import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR
from nni.compression.pytorch import ModelSpeedup
from nni.compression.pytorch.utils import count_flops_params
from nni.compression.pytorch.pruning import L1NormPruner, L2NormPruner
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
def trainer(model, optimizer, criterion):
global g_epoch
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
if batch_idx and batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
g_epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
g_epoch += 1
def evaluator(model):
model.eval()
correct = 0.0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
return optimizer, scheduler
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
parser.add_argument('--pruner', type=str, default='l1norm',
choices=['l1norm', 'l2norm'],
help='pruner to use')
parser.add_argument('--pretrain-epochs', type=int, default=20,
help='number of epochs to pretrain the model')
parser.add_argument('--fine-tune-epochs', type=int, default=20,
help='number of epochs to fine tune the model')
args = parser.parse_args()
print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
model = VGG().to(device)
optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
criterion = torch.nn.CrossEntropyLoss()
pre_best_acc = 0.0
best_state_dict = None
for i in range(args.pretrain_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
acc = evaluator(model)
if acc > pre_best_acc:
pre_best_acc = acc
best_state_dict = model.state_dict()
print("Best accuracy: {}".format(pre_best_acc))
model.load_state_dict(best_state_dict)
pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
g_epoch = 0
# Start to prune and speedup
print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
config_list = [{
'sparsity': 0.5,
'op_types': ['Conv2d']
}]
if 'l1' in args.pruner:
pruner = L1NormPruner(model, config_list)
else:
pruner = L2NormPruner(model, config_list)
_, masks = pruner.compress()
pruner.show_pruned_weights()
pruner._unwrap_model()
ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()
print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
evaluator(model)
# The optimizer used by the pruner might have been patched, so it is recommended to create a new optimizer for the fine-tuning stage.
print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)
best_acc = 0.0
for i in range(args.fine_tune_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
best_acc = max(evaluator(model), best_acc)
flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
print(f'Pretrained model FLOPs {pre_flops/1e6:.2f} M, #Params: {pre_params/1e6:.2f}M, Accuracy: {pre_best_acc: .2f}%')
print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}%')
import sys
from tqdm import tqdm
import torch
from torchvision import datasets, transforms
from nni.compression.pytorch.pruning import L1NormPruner
from nni.algorithms.compression.v2.pytorch.pruning.tools import AGPTaskGenerator
from nni.algorithms.compression.v2.pytorch.pruning.basic_scheduler import PruningScheduler
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
criterion = torch.nn.CrossEntropyLoss()
def trainer(model, optimizer, criterion, epoch):
model.train()
for data, target in tqdm(iterable=train_loader, desc='Epoch {}'.format(epoch)):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
def finetuner(model):
model.train()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
for data, target in tqdm(iterable=train_loader, desc='Epoch PFs'):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
def evaluator(model):
model.eval()
correct = 0
with torch.no_grad():
for data, target in tqdm(iterable=test_loader, desc='Test'):
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
if __name__ == '__main__':
model = VGG().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
# pre-train the model
for i in range(5):
trainer(model, optimizer, criterion, i)
# There is no need to pass the model and config_list to the pruner at initialization when using a scheduler.
pruner = L1NormPruner(None, None)
# You can specify log_dir; all intermediate results and the best result will be saved under this folder.
# If you don't want to keep intermediate results, set `keep_intermediate_result=False`.
config_list = [{'op_types': ['Conv2d'], 'sparsity': 0.8}]
task_generator = AGPTaskGenerator(10, model, config_list, log_dir='.', keep_intermediate_result=True)
dummy_input = torch.rand(10, 3, 32, 32).to(device)
# If you just want to keep the final result as the best result, pass evaluator as None.
# Otherwise, the result with the highest score (given by the evaluator) is kept as the best result.
# scheduler = PruningScheduler(pruner, task_generator, finetuner=finetuner, speedup=True, dummy_input=dummy_input, evaluator=evaluator)
scheduler = PruningScheduler(pruner, task_generator, finetuner=finetuner, speedup=True, dummy_input=dummy_input, evaluator=None, reset_weight=False)
scheduler.compress()
_, model, masks, _, _ = scheduler.get_best_result()
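# With speedup=True above, the best model returned by the scheduler should already be the
# compact (sped-up) model, so it can be evaluated or fine-tuned directly, e.g.:
#   evaluator(model)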
import sys
from tqdm import tqdm
import torch
from torchvision import datasets, transforms
from nni.compression.pytorch.pruning import L1NormPruner
from nni.compression.pytorch.speedup import ModelSpeedup
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
criterion = torch.nn.CrossEntropyLoss()
def trainer(model, optimizer, criterion, epoch):
model.train()
for data, target in tqdm(iterable=train_loader, desc='Epoch {}'.format(epoch)):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
def evaluator(model):
model.eval()
correct = 0
with torch.no_grad():
for data, target in tqdm(iterable=test_loader, desc='Test'):
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
if __name__ == '__main__':
model = VGG().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
print('\nPre-train the model:')
for i in range(5):
trainer(model, optimizer, criterion, i)
evaluator(model)
config_list = [{'op_types': ['Conv2d'], 'sparsity': 0.8}]
pruner = L1NormPruner(model, config_list)
_, masks = pruner.compress()
print('\nThe accuracy with masks:')
evaluator(model)
pruner._unwrap_model()
ModelSpeedup(model, dummy_input=torch.rand(10, 3, 32, 32).to(device), masks_file=masks).speedup_model()
print('\nThe accuracy after speedup:')
evaluator(model)
# A new optimizer is needed because the modules in the model are replaced during speedup.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
print('\nFinetune the model after speedup:')
for i in range(5):
trainer(model, optimizer, criterion, i)
evaluator(model)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for the simulated annealing pruning algorithm.
In this example, we show the end-to-end iterative pruning process: pre-training -> pruning -> fine-tuning.
'''
import sys
import argparse
from tqdm import tqdm
import torch
from torchvision import datasets, transforms
from nni.compression.pytorch.pruning import SimulatedAnnealingPruner
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
criterion = torch.nn.CrossEntropyLoss()
def trainer(model, optimizer, criterion, epoch):
model.train()
for data, target in tqdm(iterable=train_loader, desc='Epoch {}'.format(epoch)):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
def finetuner(model):
model.train()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
for data, target in tqdm(iterable=train_loader, desc='Epoch PFs'):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
def evaluator(model):
model.eval()
correct = 0
with torch.no_grad():
for data, target in tqdm(iterable=test_loader, desc='Test'):
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PyTorch Iterative Example for model compression')
parser.add_argument('--pretrain-epochs', type=int, default=10,
help='number of epochs to pretrain the model')
parser.add_argument('--pruning-algo', type=str, default='l1',
choices=['level', 'l1', 'l2', 'fpgm', 'slim', 'apoz',
'mean_activation', 'taylorfo', 'admm'],
help='algorithm to evaluate weights to prune')
parser.add_argument('--cool-down-rate', type=float, default=0.9,
help='Cool down rate of the temperature.')
args = parser.parse_args()
model = VGG().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
# pre-train the model
for i in range(args.pretrain_epochs):
trainer(model, optimizer, criterion, i)
evaluator(model)
config_list = [{'op_types': ['Conv2d'], 'total_sparsity': 0.8}]
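# 'total_sparsity' is the overall sparsity target across all matched Conv2d layers; the
# simulated annealing search decides how that budget is distributed among the layers,
# guided by the evaluator score.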
# The evaluator for 'SimulatedAnnealingPruner' must not be None.
pruner = SimulatedAnnealingPruner(model, config_list, pruning_algorithm=args.pruning_algo,
evaluator=evaluator, cool_down_rate=args.cool_down_rate, finetuner=finetuner)
pruner.compress()
_, model, masks, _, _ = pruner.get_best_result()
evaluator(model)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for supported slim pruning algorithms.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> speedup -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys
import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR
import nni
from nni.compression.pytorch import ModelSpeedup
from nni.compression.pytorch.utils import count_flops_params
from nni.compression.pytorch.pruning import SlimPruner
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
def trainer(model, optimizer, criterion):
global g_epoch
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
if batch_idx and batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
g_epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
g_epoch += 1
def evaluator(model):
model.eval()
correct = 0.0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
return optimizer, scheduler
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
parser.add_argument('--pretrain-epochs', type=int, default=20,
help='number of epochs to pretrain the model')
parser.add_argument('--fine-tune-epochs', type=int, default=20,
help='number of epochs to fine tune the model')
args = parser.parse_args()
print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
model = VGG().to(device)
optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
criterion = torch.nn.CrossEntropyLoss()
pre_best_acc = 0.0
best_state_dict = None
for i in range(args.pretrain_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
acc = evaluator(model)
if acc > pre_best_acc:
pre_best_acc = acc
best_state_dict = model.state_dict()
print("Best accuracy: {}".format(pre_best_acc))
model.load_state_dict(best_state_dict)
pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
g_epoch = 0
# Start to prune and speedup
print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
config_list = [{
'total_sparsity': 0.5,
'op_types': ['BatchNorm2d'],
'max_sparsity_per_layer': 0.9
}]
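# This config asks for an overall sparsity of 0.5 over the BatchNorm2d scaling factors while
# capping any single layer at 90% pruned (max_sparsity_per_layer); with mode='global' below,
# the scaling factors are ranked across all layers jointly rather than per layer.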
# make sure the optimizer class is wrapped with nni.trace before it is initialized
traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
pruner = SlimPruner(model, config_list, trainer, traced_optimizer, criterion, training_epochs=1, scale=0.0001, mode='global')
_, masks = pruner.compress()
pruner.show_pruned_weights()
pruner._unwrap_model()
ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()
print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
evaluator(model)
# The optimizer used by the pruner might have been patched, so it is recommended to create a new optimizer for the fine-tuning stage.
print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)
best_acc = 0.0
g_epoch = 0
for i in range(args.fine_tune_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
best_acc = max(evaluator(model), best_acc)
flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
print(f'Pretrained model FLOPs {pre_flops/1e6:.2f} M, #Params: {pre_params/1e6:.2f}M, Accuracy: {pre_best_acc: .2f}%')
print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}%')
from __future__ import annotations
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
import torch
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from torchmetrics.functional import accuracy
from torchvision import datasets, transforms
import nni
from nni.algorithms.compression.v2.pytorch import LightningEvaluator
import sys
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG
class SimpleLightningModel(pl.LightningModule):
def __init__(self):
super().__init__()
self.model = VGG()
self.criterion = torch.nn.CrossEntropyLoss()
def forward(self, x):
return self.model(x)
def training_step(self, batch, batch_idx):
x, y = batch
logits = self(x)
loss = self.criterion(logits, y)
self.log("train_loss", loss)
return loss
def evaluate(self, batch, stage=None):
x, y = batch
logits = self(x)
loss = self.criterion(logits, y)
preds = torch.argmax(logits, dim=1)
acc = accuracy(preds, y)
if stage:
self.log(f"default", loss, prog_bar=False)
self.log(f"{stage}_loss", loss, prog_bar=True)
self.log(f"{stage}_acc", acc, prog_bar=True)
def validation_step(self, batch, batch_idx):
self.evaluate(batch, "val")
def test_step(self, batch, batch_idx):
self.evaluate(batch, "test")
def configure_optimizers(self):
optimizer = nni.trace(torch.optim.Adam)(
self.parameters(),
lr=0.001
)
scheduler_dict = {
"scheduler": nni.trace(StepLR)(
optimizer,
step_size=1,
gamma=0.5
),
"interval": "epoch",
}
return {"optimizer": optimizer, "lr_scheduler": scheduler_dict}
class ImageNetDataModule(pl.LightningDataModule):
def __init__(self, data_dir: str = "./data"):
super().__init__()
self.data_dir = data_dir
def prepare_data(self):
# download
datasets.CIFAR10(self.data_dir, train=True, download=True)
datasets.CIFAR10(self.data_dir, train=False, download=True)
def setup(self, stage: str | None = None):
if stage == "fit" or stage is None:
self.cifar10_train_data = datasets.CIFAR10(root='data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
]))
self.cifar10_val_data = datasets.CIFAR10(root='./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
]))
if stage == "test" or stage is None:
self.cifar10_test_data = datasets.CIFAR10(root='./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
]))
if stage == "predict" or stage is None:
self.cifar10_predict_data = datasets.CIFAR10(root='./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
]))
def train_dataloader(self):
return DataLoader(self.cifar10_train_data, batch_size=128, shuffle=True)
def val_dataloader(self):
return DataLoader(self.cifar10_val_data, batch_size=128, shuffle=False)
def test_dataloader(self):
return DataLoader(self.cifar10_test_data, batch_size=128, shuffle=False)
def predict_dataloader(self):
return DataLoader(self.cifar10_predict_data, batch_size=128, shuffle=False)
# Train the model
pl_trainer = nni.trace(pl.Trainer)(
accelerator='auto',
devices=1,
max_epochs=3,
logger=TensorBoardLogger('./lightning_logs', name="vgg"),
)
pl_data = nni.trace(ImageNetDataModule)(data_dir='./data')
model = SimpleLightningModel()
pl_trainer.fit(model, pl_data)
metric = pl_trainer.test(model, pl_data)
print(f'The trained model accuracy: {metric}')
# create traced optimizer / lr_scheduler
optimizer = nni.trace(torch.optim.Adam)(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
lr_scheduler = nni.trace(StepLR)(optimizer, step_size=1, gamma=0.5)
dummy_input = torch.rand(4, 3, 224, 224)
# LightningEvaluator initialization
evaluator = LightningEvaluator(pl_trainer, pl_data)
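# The LightningEvaluator wraps the nni.trace-d Trainer and DataModule so the pruner can run
# training on its own while collecting first-order Taylor importance scores (training_steps=100
# below); note that the traced optimizer / lr_scheduler created just above are not passed to it.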
# apply pruning
from nni.compression.pytorch.pruning import TaylorFOWeightPruner
from nni.compression.pytorch.speedup import ModelSpeedup
pruner = TaylorFOWeightPruner(model, config_list=[{'total_sparsity': 0.5, 'op_types': ['Conv2d']}], evaluator=evaluator, training_steps=100)
_, masks = pruner.compress()
metric = pl_trainer.test(model, pl_data)
print(f'The masked model accuracy: {metric}')
pruner.show_pruned_weights()
pruner._unwrap_model()
ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]), masks_file=masks).speedup_model()
metric = pl_trainer.test(model, pl_data)
print(f'The speedup model accuracy: {metric}')
# finetune the speedup model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = torch.nn.CrossEntropyLoss()
lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.5)
pl_trainer = pl.Trainer(
accelerator='auto',
devices=1,
max_epochs=3,
logger=TensorBoardLogger('./lightning_logs', name="vgg"),
)
pl_trainer.fit(model, pl_data)
metric = pl_trainer.test(model, pl_data)
print(f'The speedup model after finetuning accuracy: {metric}')
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for supported TaylorFOWeight pruning algorithms.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys
import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR
import nni
from nni.compression.pytorch import ModelSpeedup
from nni.compression.pytorch.utils import count_flops_params
from nni.compression.pytorch.pruning import TaylorFOWeightPruner
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[1] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
def trainer(model, optimizer, criterion):
global g_epoch
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
if batch_idx and batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
g_epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
g_epoch += 1
def evaluator(model):
model.eval()
correct = 0.0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
return optimizer, scheduler
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
parser.add_argument('--pretrain-epochs', type=int, default=20,
help='number of epochs to pretrain the model')
parser.add_argument('--fine-tune-epochs', type=int, default=20,
help='number of epochs to fine tune the model')
args = parser.parse_args()
print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
model = VGG().to(device)
optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
criterion = torch.nn.CrossEntropyLoss()
pre_best_acc = 0.0
best_state_dict = None
for i in range(args.pretrain_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
acc = evaluator(model)
if acc > pre_best_acc:
pre_best_acc = acc
best_state_dict = model.state_dict()
print("Best accuracy: {}".format(pre_best_acc))
model.load_state_dict(best_state_dict)
pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
g_epoch = 0
# Start to prune and speedup
print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
config_list = [{
'total_sparsity': 0.5,
'op_types': ['Conv2d'],
}]
# make sure the optimizer class is wrapped with nni.trace before it is initialized
traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
pruner = TaylorFOWeightPruner(model, config_list, trainer, traced_optimizer, criterion, training_batches=20)
_, masks = pruner.compress()
pruner.show_pruned_weights()
pruner._unwrap_model()
ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()
print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
evaluator(model)
# The optimizer used by the pruner might have been patched, so it is recommended to create a new optimizer for the fine-tuning stage.
print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)
best_acc = 0.0
g_epoch = 0
for i in range(args.fine_tune_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
best_acc = max(evaluator(model), best_acc)
flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
print(f'Pretrained model FLOPs {pre_flops/1e6:.2f} M, #Params: {pre_params/1e6:.2f}M, Accuracy: {pre_best_acc: .2f}%')
print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}%')