Commit 1011377c authored by qianyj

the source code of NNI for DCU

parent abc22158
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import os
import sys
import torch
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import numpy as np
from nni.compression.pytorch.utils.counter import count_flops_params
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from mobilenet import MobileNet
from mobilenet_v2 import MobileNetV2
def create_model(model_type=None, n_classes=120, input_size=224, checkpoint=None, pretrained=False, width_mult=1.):
if model_type == 'mobilenet_v1':
model = MobileNet(n_class=n_classes, profile='normal')
elif model_type == 'mobilenet_v2':
model = MobileNetV2(n_class=n_classes, input_size=input_size, width_mult=width_mult)
elif model_type == 'mobilenet_v2_torchhub':
model = torch.hub.load('pytorch/vision:v0.8.1', 'mobilenet_v2', pretrained=pretrained)
# model = torch.hub.load('pytorch/vision:v0.10.0', 'mobilenet_v2', pretrained=pretrained)
feature_size = model.classifier[1].weight.data.size()[1]
replace_classifier = torch.nn.Linear(feature_size, n_classes)
model.classifier[1] = replace_classifier
elif model_type is None:
model = None
else:
raise RuntimeError('Unknown model_type.')
if checkpoint is not None:
model.load_state_dict(torch.load(checkpoint))
return model
def get_dataloader(dataset_type, data_path, batch_size=32, shuffle=True):
assert dataset_type in ['train', 'eval']
if dataset_type == 'train':
ds = TrainDataset(data_path)
else:
ds = EvalDataset(data_path)
return DataLoader(ds, batch_size, shuffle=shuffle)
class TrainDataset(Dataset):
def __init__(self, npy_dir):
self.root_dir = npy_dir
self.case_names = [self.root_dir + '/' + x for x in os.listdir(self.root_dir)]
transform_set = [transforms.Lambda(lambda x: x),
transforms.RandomRotation(30),
transforms.ColorJitter(),
transforms.RandomHorizontalFlip(p=1)]
self.transform = transforms.RandomChoice(transform_set)
def __len__(self):
return len(self.case_names)
def __getitem__(self, index):
instance = np.load(self.case_names[index], allow_pickle=True).item()
x = instance['input'].transpose(2, 0, 1) # (C, H, W)
x = torch.from_numpy(x).type(torch.float) # convert to Tensor to use torchvision.transforms
x = self.transform(x)
return x, instance['label']
class EvalDataset(Dataset):
def __init__(self, npy_dir):
self.root_dir = npy_dir
self.case_names = [self.root_dir + '/' + x for x in os.listdir(self.root_dir)]
def __len__(self):
return len(self.case_names)
def __getitem__(self, index):
instance = np.load(self.case_names[index], allow_pickle=True).item()
x = instance['input'].transpose(2, 0, 1)
x = torch.from_numpy(x).type(torch.float)
return x, instance['label']
def count_flops(model, log=None, device=None):
dummy_input = torch.rand([1, 3, 256, 256])
if device is not None:
dummy_input = dummy_input.to(device)
flops, params, results = count_flops_params(model, dummy_input)
print(f"FLOPs: {flops}, params: {params}")
if log is not None:
log.write(f"FLOPs: {flops}, params: {params}\n")
return flops, params
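# A minimal usage sketch of the helpers above (not part of the original file): build a
# model with create_model and report its FLOPs with count_flops. The commented data
# path is a placeholder for a directory of .npy cases as expected by EvalDataset.
if __name__ == '__main__':
    example_model = create_model(model_type='mobilenet_v2', n_classes=120)
    # eval_loader = get_dataloader('eval', './data/eval', batch_size=32, shuffle=False)
    count_flops(example_model)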
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for quick start of pruning.
In this example, we use level pruner to prune the LeNet on MNIST.
'''
import argparse
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import (Conv2D, Dense, Dropout, Flatten, MaxPool2D, BatchNormalization)
from nni.algorithms.compression.tensorflow.pruning import LevelPruner, SlimPruner
class LeNet(Model):
"""
LeNet-5 Model with customizable hyper-parameters
"""
def __init__(self, conv_size=3, hidden_size=32, dropout_rate=0.5):
"""
Initialize hyper-parameters.
Parameters
----------
conv_size : int
Kernel size of convolutional layers.
hidden_size : int
Dimensionality of last hidden layer.
dropout_rate : float
Dropout rate between two fully connected (dense) layers, to prevent co-adaptation.
"""
super().__init__()
self.conv1 = Conv2D(filters=32, kernel_size=conv_size, activation='relu')
self.pool1 = MaxPool2D(pool_size=2)
self.bn1 = BatchNormalization()
self.conv2 = Conv2D(filters=64, kernel_size=conv_size, activation='relu')
self.pool2 = MaxPool2D(pool_size=2)
self.bn2 = BatchNormalization()
self.flatten = Flatten()
self.fc1 = Dense(units=hidden_size, activation='relu')
self.dropout = Dropout(rate=dropout_rate)
self.fc2 = Dense(units=10, activation='softmax')
def call(self, x):
"""Override ``Model.call`` to build LeNet-5 model."""
x = self.conv1(x)
x = self.pool1(x)
x = self.bn1(x)
x = self.conv2(x)
x = self.pool2(x)
x = self.bn2(x)
x = self.flatten(x)
x = self.fc1(x)
x = self.dropout(x)
return self.fc2(x)
def get_dataset(dataset_name='mnist'):
assert dataset_name == 'mnist'
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x_train = x_train[..., tf.newaxis] / 255.0
x_test = x_test[..., tf.newaxis] / 255.0
return (x_train, y_train), (x_test, y_test)
# def create_model(model_name='naive'):
# assert model_name == 'naive'
# return tf.keras.Sequential([
# tf.keras.layers.Conv2D(filters=20, kernel_size=5),
# tf.keras.layers.BatchNormalization(),
# tf.keras.layers.ReLU(),
# tf.keras.layers.MaxPool2D(pool_size=2),
# tf.keras.layers.Conv2D(filters=20, kernel_size=5),
# tf.keras.layers.BatchNormalization(),
# tf.keras.layers.ReLU(),
# tf.keras.layers.MaxPool2D(pool_size=2),
# tf.keras.layers.Flatten(),
# tf.keras.layers.Dense(units=500),
# tf.keras.layers.ReLU(),
# tf.keras.layers.Dense(units=10),
# tf.keras.layers.Softmax()
# ])
def main(args):
train_set, test_set = get_dataset('mnist')
model = LeNet()
print('start training')
optimizer = tf.keras.optimizers.SGD(learning_rate=0.1, momentum=0.9, decay=1e-4)
if args.pruner_name == 'slim':
def slim_loss(y_true, y_pred):
loss_1 = tf.keras.losses.sparse_categorical_crossentropy(y_true=y_true, y_pred=y_pred)
weight_list = []
for layer in [model.bn1, model.bn2]:
weight_list.append([w for w in layer.weights if '/gamma:' in w.name][0].read_value())
loss_2 = 0.0001 * tf.reduce_sum([tf.reduce_sum(tf.abs(w)) for w in weight_list])
return loss_1 + loss_2
model.compile(
optimizer=optimizer,
loss=slim_loss,
metrics=['accuracy']
)
else:
model.compile(
optimizer=optimizer,
loss='sparse_categorical_crossentropy',
metrics=['accuracy']
)
model.fit(
train_set[0],
train_set[1],
batch_size=args.batch_size,
epochs=args.pretrain_epochs,
validation_data=test_set
)
print('start pruning')
optimizer_finetune = tf.keras.optimizers.SGD(learning_rate=0.001, momentum=0.9, decay=1e-4)
# create_pruner
if args.pruner_name == 'level':
prune_config = [{
'sparsity': args.sparsity,
'op_types': ['default'],
}]
pruner = LevelPruner(model, prune_config)
elif args.pruner_name == 'slim':
prune_config = [{
'sparsity': args.sparsity,
'op_types': ['BatchNormalization'],
}]
pruner = SlimPruner(model, prune_config)
model = pruner.compress()
model.compile(
optimizer=optimizer_finetune,
loss='sparse_categorical_crossentropy',
metrics=['accuracy'],
run_eagerly=True # NOTE: Important, model compression does not work in graph mode!
)
# fine-tuning
model.fit(
train_set[0],
train_set[1],
batch_size=args.batch_size,
epochs=args.prune_epochs,
validation_data=test_set
)
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument('--pruner_name', type=str, default='level', choices=['level', 'slim'])
parser.add_argument('--batch-size', type=int, default=256)
parser.add_argument('--pretrain_epochs', type=int, default=10)
parser.add_argument('--prune_epochs', type=int, default=10)
parser.add_argument('--sparsity', type=float, default=0.5)
args = parser.parse_args()
main(args)
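# A usage sketch (not in the original script): the entry point above can also be driven
# programmatically with an argparse.Namespace mirroring the CLI defaults, e.g. to run the
# slim pruner instead of the level pruner.
def run_slim_pruning_example():
    from argparse import Namespace
    main(Namespace(pruner_name='slim', batch_size=256,
                   pretrain_epochs=10, prune_epochs=10, sparsity=0.5))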
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for quick start of pruning.
In this example, we use level pruner to prune the LeNet on MNIST.
'''
import logging
import argparse
import torch
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
from nni.algorithms.compression.pytorch.pruning import LevelPruner
import sys
sys.path.append('../models')
from mnist.lenet import LeNet
_logger = logging.getLogger('mnist_example')
_logger.setLevel(logging.INFO)
def train(args, model, device, train_loader, optimizer, epoch):
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, target)
loss.backward()
optimizer.step()
if batch_idx % args.log_interval == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
if args.dry_run:
break
def test(model, device, test_loader):
model.eval()
test_loss = 0
correct = 0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
test_loss += F.nll_loss(output, target, reduction='sum').item()
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
test_loss /= len(test_loader.dataset)
acc = 100 * correct / len(test_loader.dataset)
print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset), acc))
return acc
def main(args):
torch.manual_seed(args.seed)
use_cuda = not args.no_cuda and torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
train_kwargs = {'batch_size': args.batch_size}
test_kwargs = {'batch_size': args.test_batch_size}
if use_cuda:
cuda_kwargs = {'num_workers': 1,
'pin_memory': True,
'shuffle': True}
train_kwargs.update(cuda_kwargs)
test_kwargs.update(cuda_kwargs)
transform=transforms.Compose([
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
])
dataset1 = datasets.MNIST('./data', train=True, download=True,
transform=transform)
dataset2 = datasets.MNIST('./data', train=False,
transform=transform)
train_loader = torch.utils.data.DataLoader(dataset1,**train_kwargs)
test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
model = LeNet().to(device)
optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
print('start pre-training')
scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
for epoch in range(1, args.epochs + 1):
train(args, model, device, train_loader, optimizer, epoch)
test(model, device, test_loader)
scheduler.step()
torch.save(model.state_dict(), "pretrain_mnist_lenet.pt")
print('start pruning')
optimizer_finetune = torch.optim.SGD(model.parameters(), lr=0.01)
# create pruner
prune_config = [{
'sparsity': args.sparsity,
'op_types': ['default'],
}]
pruner = LevelPruner(model, prune_config)
model = pruner.compress()
# fine-tuning
best_top1 = 0
for epoch in range(1, args.epochs + 1):
pruner.update_epoch(epoch)
train(args, model, device, train_loader, optimizer_finetune, epoch)
top1 = test(model, device, test_loader)
if top1 > best_top1:
best_top1 = top1
# Export the best model: 'model_path' stores the state_dict of the pruned model,
# and 'mask_path' stores the mask_dict of the pruned model.
pruner.export_model(model_path='pruned_mnist_lenet.pt', mask_path='mask_mnist_lenet.pt')
if __name__ == '__main__':
# Training settings
parser = argparse.ArgumentParser(description='PyTorch MNIST Example for model compression')
parser.add_argument('--batch-size', type=int, default=64, metavar='N',
help='input batch size for training (default: 64)')
parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
help='input batch size for testing (default: 1000)')
parser.add_argument('--epochs', type=int, default=10, metavar='N',
help='number of epochs to train (default: 10)')
parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
help='learning rate (default: 1.0)')
parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
help='Learning rate step gamma (default: 0.7)')
parser.add_argument('--no-cuda', action='store_true', default=False,
help='disables CUDA training')
parser.add_argument('--dry-run', action='store_true', default=False,
help='quickly check a single pass')
parser.add_argument('--seed', type=int, default=1, metavar='S',
help='random seed (default: 1)')
parser.add_argument('--log-interval', type=int, default=10, metavar='N',
help='how many batches to wait before logging training status')
parser.add_argument('--sparsity', type=float, default=0.5,
help='target overall sparsity')
args = parser.parse_args()
main(args)
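# A follow-up sketch (not in the original script): reload the exported state_dict and mask
# produced by pruner.export_model above. apply_compression_results is the same helper used
# by the speedup example that follows.
def load_pruned_lenet(device='cpu'):
    from nni.compression.pytorch import apply_compression_results
    model = LeNet().to(device)
    model.load_state_dict(torch.load('pruned_mnist_lenet.pt', map_location=device))
    apply_compression_results(model, 'mask_mnist_lenet.pt', device)
    return model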
import os
import sys
import argparse
import time
import torch
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG
from mnist.lenet import LeNet
from nni.compression.pytorch import apply_compression_results, ModelSpeedup
torch.manual_seed(0)
use_mask = True
use_speedup = True
compare_results = True
config = {
'apoz': {
'model_name': 'vgg16',
'input_shape': [64, 3, 32, 32],
'masks_file': './experiment_data/mask_vgg16_cifar10_apoz.pth'
},
'l1filter': {
'model_name': 'vgg16',
'input_shape': [64, 3, 32, 32],
'masks_file': './experiment_data/mask_vgg16_cifar10_l1filter.pth'
},
'fpgm': {
'model_name': 'vgg16',
'input_shape': [64, 3, 32, 32],
'masks_file': './experiment_data/mask_vgg16_cifar10_fpgm.pth'
},
'slim': {
'model_name': 'vgg19',
'input_shape': [64, 3, 32, 32],
'masks_file': './experiment_data/mask_vgg19_cifar10_slim.pth'
}
}
def model_inference(config):
masks_file = config['masks_file']
device = torch.device(
'cuda') if torch.cuda.is_available() else torch.device('cpu')
# device = torch.device(config['device'])
if config['model_name'] == 'vgg16':
model = VGG(depth=16)
elif config['model_name'] == 'vgg19':
model = VGG(depth=19)
elif config['model_name'] == 'lenet':
model = LeNet()
model.to(device)
model.eval()
dummy_input = torch.randn(config['input_shape']).to(device)
use_mask_out = use_speedup_out = None
# must run use_mask before use_speedup because use_speedup modifies the model
if use_mask:
apply_compression_results(model, masks_file, device)
start = time.time()
for _ in range(32):
use_mask_out = model(dummy_input)
print('elapsed time when use mask: ', time.time() - start)
if use_speedup:
m_speedup = ModelSpeedup(model, dummy_input, masks_file, device)
m_speedup.speedup_model()
start = time.time()
for _ in range(32):
use_speedup_out = model(dummy_input)
print('elapsed time when use speedup: ', time.time() - start)
if compare_results:
if torch.allclose(use_mask_out, use_speedup_out, atol=1e-07):
print('the outputs from use_mask and use_speedup are the same')
else:
raise RuntimeError('the outputs from use_mask and use_speedup are different')
if __name__ == '__main__':
parser = argparse.ArgumentParser("speedup")
parser.add_argument("--example_name", type=str, default="slim", help="the name of pruning example")
parser.add_argument("--masks_file", type=str, default=None, help="the path of the masks file")
args = parser.parse_args()
if args.example_name != 'all':
if args.masks_file is not None:
config[args.example_name]['masks_file'] = args.masks_file
if not os.path.exists(config[args.example_name]['masks_file']):
msg = '{} does not exist! Please specify masks_file correctly, ' \
'or use the default one generated by model_prune_torch.py'
raise RuntimeError(msg.format(config[args.example_name]['masks_file']))
model_inference(config[args.example_name])
else:
model_inference(config['fpgm'])
model_inference(config['slim'])
model_inference(config['l1filter'])
model_inference(config['apoz'])
import torch
from torchvision.models import mobilenet_v2
from nni.compression.pytorch import ModelSpeedup
from nni.algorithms.compression.pytorch.pruning import L1FilterPruner
model = mobilenet_v2(pretrained=True)
dummy_input = torch.rand(8, 3, 416, 416)
cfg_list = [{'op_types':['Conv2d'], 'sparsity':0.5}]
pruner = L1FilterPruner(model, cfg_list)
pruner.compress()
pruner.export_model('./model', './mask')
# you need to call _unwrap_model if you want to run speedup on the same model
pruner._unwrap_model()
# Speed up the model
ms = ModelSpeedup(model, dummy_input, './mask')
ms.speedup_model()
model(dummy_input)
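# Optional check (not in the original snippet): time the sped-up model on the same dummy
# input, mirroring the timing loop used by model_inference in the speedup example above.
import time
start = time.time()
for _ in range(32):
    model(dummy_input)
print('elapsed time after speedup: ', time.time() - start)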
import torch
from nanodet.model.arch import build_model
from nanodet.util import cfg, load_config
from nni.compression.pytorch import ModelSpeedup
from nni.algorithms.compression.pytorch.pruning import L1FilterPruner
"""
NanoDet model can be installed from https://github.com/RangiLyu/nanodet.git
"""
cfg_path = r"nanodet/config/nanodet-RepVGG-A0_416.yml"
load_config(cfg, cfg_path)
model = build_model(cfg.model).cpu()
dummy_input = torch.rand(8, 3, 416, 416)
op_names = []
# these three conv layers are followed by reshape-like functions
# that cannot be replaced, so we skip them;
# you can also find such layers with the `not_safe_to_prune` function
excludes = ['head.gfl_cls.0', 'head.gfl_cls.1', 'head.gfl_cls.2']
for name, module in model.named_modules():
if isinstance(module, torch.nn.Conv2d):
if name not in excludes:
op_names.append(name)
cfg_list = [{'op_types':['Conv2d'], 'sparsity':0.5, 'op_names':op_names}]
pruner = L1FilterPruner(model, cfg_list)
pruner.compress()
pruner.export_model('./model', './mask')
# you need to call _unwrap_model if you want to run speedup on the same model
pruner._unwrap_model()
# Speed up the NanoDet model
ms = ModelSpeedup(model, dummy_input, './mask')
ms.speedup_model()
model(dummy_input)
import torch
from pytorchyolo import models
from nni.compression.pytorch import ModelSpeedup
from nni.algorithms.compression.pytorch.pruning import L1FilterPruner, LevelPruner
from nni.compression.pytorch.utils import not_safe_to_prune
# The YOLOv3 code can be downloaded from https://github.com/eriklindernoren/PyTorch-YOLOv3.git
prefix = '/home/user/PyTorch-YOLOv3' # replace this path with yours
# Load the YOLO model
model = models.load_model(
"%s/config/yolov3.cfg" % prefix,
"%s/yolov3.weights" % prefix).cpu()
model.eval()
dummy_input = torch.rand(8, 3, 320, 320)
model(dummy_input)
# Generate the config list for the pruner,
# filtering out the layers that may not be safe to prune
not_safe = not_safe_to_prune(model, dummy_input)
cfg_list = []
for name, module in model.named_modules():
if name in not_safe:
continue
if isinstance(module, torch.nn.Conv2d):
cfg_list.append({'op_types':['Conv2d'], 'sparsity':0.6, 'op_names':[name]})
# Prune the model
pruner = L1FilterPruner(model, cfg_list)
pruner.compress()
pruner.export_model('./model', './mask')
pruner._unwrap_model()
# Speedup the model
ms = ModelSpeedup(model, dummy_input, './mask')
ms.speedup_model()
model(dummy_input)
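# Optional check (not part of the original snippet): report FLOPs and parameter count of
# the sped-up model with the same counter used in the other examples in this commit.
from nni.compression.pytorch.utils.counter import count_flops_params
flops, params, _ = count_flops_params(model, dummy_input)
print(f'Pruned YOLOv3 FLOPs: {flops / 1e6:.2f}M, #Params: {params / 1e6:.2f}M')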
#!/bin/bash
# Usage: ./run.sh gpu_id glue_task
export HIP_VISIBLE_DEVICES=$1
TASK_NAME=$2 # "cola", "sst2", "mrpc", "stsb", "qqp", "mnli", "qnli", "rte", "wnli"
PRETRAINED_MODEL="bert-base-uncased" # "distilbert-base-uncased", "roberta-base", "bert-base-cased", ...
# parameters for pruning
SPARSITY=0.5
RANKING_CRITERION=l1_weight # "l1_weight", "l2_weight", "l1_activation", "l2_activation", "taylorfo"
NUM_ITERATIONS=1 # 1 for one-shot pruning
EPOCHS_PER_ITERATION=1
# other training parameters, no need to change
MAX_LENGTH=128
BATCH_SIZE=32
LR=2e-5
N_EPOCHS=3
time=$(date "+%Y%m%d%H%M%S")
OUTDIR="models_${PRETRAINED_MODEL}_${TASK_NAME}_$time/"
TASK_LIST=("cola" "sst2" "mrpc" "stsb" "qqp" "mnli" "qnli" "rte" "wnli")
if [[ ${TASK_LIST[*]} =~ (^|[[:space:]])$TASK_NAME($|[[:space:]]) ]]; then
mkdir $OUTDIR
python transformer_pruning.py \
--sparsity $SPARSITY \
--ranking_criterion $RANKING_CRITERION \
--num_iterations $NUM_ITERATIONS \
--epochs_per_iteration $EPOCHS_PER_ITERATION \
--speed_up \
--model_name $PRETRAINED_MODEL \
--task_name $TASK_NAME \
--max_length $MAX_LENGTH \
--batch_size $BATCH_SIZE \
--learning_rate $LR \
--num_train_epochs $N_EPOCHS \
--output_dir $OUTDIR \
2>&1 | tee "$OUTDIR/output.log"
else
echo "Unsupported task $TASK_NAME."
fi
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
import argparse
import logging
import os
import torch
from torch.utils.data.dataloader import DataLoader
from tqdm.auto import tqdm
from nni.compression.pytorch.utils.counter import count_flops_params
from nni.algorithms.compression.pytorch.pruning import TransformerHeadPruner
import datasets
from datasets import load_dataset, load_metric
import transformers
from transformers import (
AdamW,
AutoConfig,
AutoModelForSequenceClassification,
AutoTokenizer,
DataCollatorWithPadding,
get_scheduler,
)
logger = logging.getLogger("bert_pruning_example")
def parse_args():
parser = argparse.ArgumentParser(
description="Example: prune a Huggingface transformer and finetune on GLUE tasks.")
parser.add_argument("--model_name", type=str, required=True,
help="Pretrained model architecture.")
parser.add_argument("--task_name", type=str, default=None,
help="The name of the GLUE task.",
choices=["cola", "mnli", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"])
parser.add_argument("--output_dir", type=str, default=None,
help="Where to store the model and mask.")
parser.add_argument("--sparsity", type=float, required=True,
help="Sparsity: proportion of heads to prune (should be between 0 and 1)")
parser.add_argument("--global_sort", action="store_true", default=False,
help="Rank the heads globally and prune the heads with lowest scores. If set to False, the "
"heads are only ranked within one layer")
parser.add_argument("--ranking_criterion", type=str, default="l1_weight",
choices=["l1_weight", "l2_weight",
"l1_activation", "l2_activation", "taylorfo"],
help="Criterion by which the attention heads are ranked.")
parser.add_argument("--num_iterations", type=int, default=1,
help="Number of pruning iterations (1 for one-shot pruning).")
parser.add_argument("--epochs_per_iteration", type=int, default=1,
help="Epochs to finetune before the next pruning iteration "
"(only effective if num_iterations > 1).")
parser.add_argument("--speed_up", action="store_true", default=False,
help="Whether to speed-up the pruned model")
# parameters for model training; no need to change them for running examples
parser.add_argument("--max_length", type=int, default=128,
help=("The maximum total input sequence length after tokenization. Sequences longer than this "
"will be truncated, sequences shorter will be padded if `--pad_to_max_lengh` is passed."))
parser.add_argument("--batch_size", type=int, default=8,
help="Batch size.")
parser.add_argument("--learning_rate", type=float, default=5e-5,
help="Initial learning rate.")
parser.add_argument("--num_train_epochs", type=int, default=3,
help="Total number of training epochs to perform.")
parser.add_argument("--lr_scheduler_type", default="linear",
choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant",
"constant_with_warmup"])
parser.add_argument("--num_warmup_steps", type=int, default=0,
help="Number of steps for the warmup in the lr scheduler.")
args = parser.parse_args()
if args.output_dir is not None:
os.makedirs(args.output_dir, exist_ok=True)
return args
def get_raw_dataset(task_name):
"""
Get a GLUE dataset using huggingface datasets.
"""
raw_dataset = load_dataset("glue", task_name)
is_regression = task_name == "stsb"
num_labels = 1 if is_regression else len(
raw_dataset["train"].features["label"].names)
return raw_dataset, is_regression, num_labels
def preprocess(args, tokenizer, raw_dataset):
"""
Tokenization and column renaming.
"""
assert args.task_name is not None
task_to_keys = {
"cola": ("sentence", None),
"mnli": ("premise", "hypothesis"),
"mrpc": ("sentence1", "sentence2"),
"qnli": ("question", "sentence"),
"qqp": ("question1", "question2"),
"rte": ("sentence1", "sentence2"),
"sst2": ("sentence", None),
"stsb": ("sentence1", "sentence2"),
"wnli": ("sentence1", "sentence2"),
}
sentence1_key, sentence2_key = task_to_keys[args.task_name]
def tokenize(data):
texts = (
(data[sentence1_key],) if sentence2_key is None else (
data[sentence1_key], data[sentence2_key])
)
result = tokenizer(*texts, padding=False,
max_length=args.max_length, truncation=True)
if "label" in data:
result["labels"] = data["label"]
return result
processed_datasets = raw_dataset.map(
tokenize, batched=True, remove_columns=raw_dataset["train"].column_names)
return processed_datasets
def get_dataloader_and_optimizer(args, tokenizer, model, train_dataset, eval_dataset):
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator,
batch_size=args.batch_size)
eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator,
batch_size=args.batch_size)
optimizer = AdamW(model.parameters(), lr=args.learning_rate)
return optimizer, train_dataloader, eval_dataloader, data_collator
def train_model(args, model, is_regression, train_dataloader, eval_dataloader, optimizer, lr_scheduler, metric, device):
"""
Train the model using train_dataloader and evaluate after every epoch using eval_dataloader.
This function is called before and after pruning for "pretraining" on the GLUE task and further "finetuning".
"""
train_steps = args.num_train_epochs * len(train_dataloader)
progress_bar = tqdm(range(train_steps), position=0, leave=True)
for epoch in range(args.num_train_epochs):
model.train()
for step, batch in enumerate(train_dataloader):
for field in batch.keys():
batch[field] = batch[field].to(device)
outputs = model(**batch)
outputs.loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.zero_grad()
progress_bar.update(1)
model.eval()
for step, batch in enumerate(eval_dataloader):
for field in batch.keys():
batch[field] = batch[field].to(device)
outputs = model(**batch)
predictions = outputs.logits.argmax(dim=-1) if not is_regression \
else outputs.logits.squeeze()
metric.add_batch(predictions=predictions, references=batch["labels"])
eval_metric = metric.compute()
logger.info(f"epoch {epoch}: {eval_metric}")
def trainer_helper(model, train_dataloader, optimizer, device):
"""
This function is used for to create a "trainer" that is passed to the pruner.
Finetune the model for 1 epoch. This function is called by the pruner during pruning iterations (or called to
calculate scores for pruning when ranking criterion is "taylorfo").
"""
logger.info("Training for 1 epoch...")
progress_bar = tqdm(range(len(train_dataloader)), position=0, leave=True)
train_epoch = 1
for epoch in range(train_epoch):
for step, batch in enumerate(train_dataloader):
for field in batch.keys():
batch[field] = batch[field].to(device)
outputs = model(**batch)
outputs.loss.backward()
optimizer.step()
optimizer.zero_grad()
progress_bar.update(1)
def forward_runner_helper(model, train_dataloader, device):
"""
This function is used for to create a "forward_runner" that is passed to the pruner.
The function just runs forward on the train set without updating the parameters.
This allows the pruner to collect data for activation-based pruning methods.
"""
logger.info("Running forward on the entire train set without updating parameters...")
progress_bar = tqdm(range(len(train_dataloader)), position=0, leave=True)
forward_epoch = 1
for epoch in range(forward_epoch):
for step, batch in enumerate(train_dataloader):
for field in batch.keys():
batch[field] = batch[field].to(device)
_ = model(**batch)
# note: no loss.backward or optimizer.step() is performed here
progress_bar.update(1)
def final_eval_for_mnli(args, model, processed_datasets, metric, data_collator):
"""
If the task is MNLI, perform a final evaluation on mismatched validation set
"""
eval_dataset = processed_datasets["validation_mismatched"]
eval_dataloader = DataLoader(
eval_dataset, collate_fn=data_collator, batch_size=args.batch_size
)
model.eval()
for step, batch in enumerate(eval_dataloader):
outputs = model(**batch)
predictions = outputs.logits.argmax(dim=-1)
metric.add_batch(
predictions=predictions,
references=batch["labels"],
)
eval_metric = metric.compute()
logger.info(f"mnli-mm: {eval_metric}")
def main():
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
args = parse_args()
#########################################################################
# Prepare model, tokenizer, dataset, optimizer, and the scheduler
logger.setLevel(logging.INFO)
datasets.utils.logging.set_verbosity_warning()
transformers.utils.logging.set_verbosity_info()
# Load dataset and tokenizer, and then preprocess the dataset
raw_dataset, is_regression, num_labels = get_raw_dataset(args.task_name)
tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True)
processed_datasets = preprocess(args, tokenizer, raw_dataset)
train_dataset = processed_datasets["train"]
eval_dataset = processed_datasets["validation_matched" if args.task_name ==
"mnli" else "validation"]
# Load pretrained model
config = AutoConfig.from_pretrained(
args.model_name, num_labels=num_labels, finetuning_task=args.task_name)
model = AutoModelForSequenceClassification.from_pretrained(
args.model_name, config=config)
model.to(device)
#########################################################################
# Finetune on the target GLUE task before pruning
optimizer, train_dataloader, eval_dataloader, data_collator = get_dataloader_and_optimizer(args, tokenizer,
model,
train_dataset,
eval_dataset)
train_steps = args.num_train_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps,
num_training_steps=train_steps)
metric = load_metric("glue", args.task_name)
logger.info("================= Finetuning before pruning =================")
train_model(args, model, is_regression, train_dataloader,
eval_dataloader, optimizer, lr_scheduler, metric, device)
if args.output_dir is not None:
torch.save(model.state_dict(), args.output_dir + "/model_before_pruning.pt")
if args.task_name == "mnli":
final_eval_for_mnli(args, model, processed_datasets, metric, data_collator)
#########################################################################
# Pruning
optimizer, train_dataloader, eval_dataloader, data_collator = get_dataloader_and_optimizer(args, tokenizer,
model,
train_dataset,
eval_dataset)
dummy_input = next(iter(train_dataloader))["input_ids"].to(device)
flops, params, results = count_flops_params(model, dummy_input)
print(f"Initial model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f}M")
# Here the criterion is computed inside the model, so callers can simply pass None as the criterion to the trainer.
def trainer(model, optimizer, criterion, epoch):
return trainer_helper(model, train_dataloader, optimizer, device)
def forward_runner(model):
return forward_runner_helper(model, train_dataloader, device)
# example: prune different layers with different sparsity
attention_name_groups = list(zip(["bert.encoder.layer.{}.attention.self.query".format(i) for i in range(12)],
["bert.encoder.layer.{}.attention.self.key".format(i) for i in range(12)],
["bert.encoder.layer.{}.attention.self.value".format(i) for i in range(12)],
["bert.encoder.layer.{}.attention.output.dense".format(i) for i in range(12)]))
kwargs = {"ranking_criterion": args.ranking_criterion,
"global_sort": args.global_sort,
"num_iterations": args.num_iterations,
"epochs_per_iteration": args.epochs_per_iteration,
"attention_name_groups": attention_name_groups,
"head_hidden_dim": 64,
"trainer": trainer,
"optimizer": optimizer,
"forward_runner": forward_runner}
config_list = [{
"sparsity": args.sparsity,
"op_types": ["Linear"],
"op_names": [x for layer in attention_name_groups[:6] for x in layer]
}, {
"sparsity": args.sparsity / 2,
"op_types": ["Linear"],
"op_names": [x for layer in attention_name_groups[6:] for x in layer]
}]
pruner = TransformerHeadPruner(model, config_list, **kwargs)
pruner.compress()
#########################################################################
# uncomment the following part to export the pruned model masks
# model_path = os.path.join(args.output_dir, "pruned_{}_{}.pth".format(args.model_name, args.task_name))
# mask_path = os.path.join(args.output_dir, "mask_{}_{}.pth".format(args.model_name, args.task_name))
# pruner.export_model(model_path=model_path, mask_path=mask_path)
#########################################################################
# Speedup
# Currently, speeding up Transformers through NNI ModelSpeedup is not supported because of shape inference issues.
# However, if you are using the transformers library, you can use the following workaround:
# The following code gets the head pruning decisions from the pruner and calls the _prune_heads() function
# implemented in models from the transformers library to speed up the model.
if args.speed_up:
speedup_rules = {}
for group_idx, group in enumerate(pruner.attention_name_groups):
# get the layer index
layer_idx = None
for part in group[0].split("."):
try:
layer_idx = int(part)
break
except:
continue
if layer_idx is not None:
speedup_rules[layer_idx] = pruner.pruned_heads[group_idx]
pruner._unwrap_model()
model.bert._prune_heads(speedup_rules)
print(model)
#########################################################################
# After pruning, finetune again on the target task
# Get the metric function
metric = load_metric("glue", args.task_name)
# re-initialize the optimizer and the scheduler
optimizer, _, _, data_collator = get_dataloader_and_optimizer(args, tokenizer, model, train_dataset,
eval_dataset)
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer, num_warmup_steps=args.num_warmup_steps,
num_training_steps=train_steps)
logger.info("================= Finetuning after Pruning =================")
train_model(args, model, is_regression, train_dataloader,
eval_dataloader, optimizer, lr_scheduler, metric, device)
if args.output_dir is not None:
torch.save(model.state_dict(), args.output_dir +
"/model_after_pruning.pt")
if args.task_name == "mnli":
final_eval_for_mnli(args, model, processed_datasets,
metric, data_collator)
flops, params, results = count_flops_params(model, dummy_input)
print(f"Final model FLOPs {flops / 1e6:.2f} M, #Params: {params / 1e6:.2f}M")
if __name__ == "__main__":
main()
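# Illustration only (not part of the original script): the speedup_rules dictionary passed
# to model.bert._prune_heads in main() maps an encoder layer index to the set of head
# indices pruned from that layer. A hypothetical example for a 12-layer BERT:
example_speedup_rules = {0: {2, 7}, 1: {5}, 11: {0, 3}}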
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for the supported ActivationAPoZRank and ActivationMeanRank pruning algorithms.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys
import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR
import nni
from nni.compression.pytorch import ModelSpeedup
from nni.compression.pytorch.utils.counter import count_flops_params
from nni.algorithms.compression.v2.pytorch.pruning.basic_pruner import ActivationAPoZRankPruner, ActivationMeanRankPruner
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
def trainer(model, optimizer, criterion):
global g_epoch
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
if batch_idx and batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
g_epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
g_epoch += 1
def evaluator(model):
model.eval()
correct = 0.0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
return optimizer, scheduler
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
parser.add_argument('--pruner', type=str, default='apoz',
choices=['apoz', 'mean'],
help='pruner to use')
parser.add_argument('--pretrain-epochs', type=int, default=20,
help='number of epochs to pretrain the model')
parser.add_argument('--fine-tune-epochs', type=int, default=20,
help='number of epochs to fine tune the model')
args = parser.parse_args()
print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
model = VGG().to(device)
optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
criterion = torch.nn.CrossEntropyLoss()
pre_best_acc = 0.0
best_state_dict = None
for i in range(args.pretrain_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
acc = evaluator(model)
if acc > pre_best_acc:
pre_best_acc = acc
best_state_dict = model.state_dict()
print("Best accuracy: {}".format(pre_best_acc))
model.load_state_dict(best_state_dict)
pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
g_epoch = 0
# Start to prune and speedup
print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
config_list = [{
'total_sparsity': 0.5,
'op_types': ['Conv2d'],
}]
# make sure you have used nni.trace to wrap the optimizer class before initializing it
traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
if 'apoz' in args.pruner:
pruner = ActivationAPoZRankPruner(model, config_list, trainer, traced_optimizer, criterion, training_batches=20)
else:
pruner = ActivationMeanRankPruner(model, config_list, trainer, traced_optimizer, criterion, training_batches=20)
_, masks = pruner.compress()
pruner.show_pruned_weights()
pruner._unwrap_model()
ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()
print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
evaluator(model)
# The optimizer used in the pruner might be patched, so we recommend creating a new optimizer for the fine-tuning stage.
print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)
best_acc = 0.0
g_epoch = 0
for i in range(args.fine_tune_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
best_acc = max(evaluator(model), best_acc)
flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
print(f'Pretrained model FLOPs {pre_flops/1e6:.2f} M, #Params: {pre_params/1e6:.2f}M, Accuracy: {pre_best_acc: .2f}%')
print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}%')
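# Possible follow-up (not in the original script): persist the fine-tuned, sped-up model,
# e.g. torch.save(model.state_dict(), 'pruned_vgg_finetuned.pt') (the file name is a placeholder).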
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for the supported ADMM pruning algorithm.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys
import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR
import nni
from nni.compression.pytorch.utils.counter import count_flops_params
from nni.algorithms.compression.v2.pytorch.pruning.basic_pruner import ADMMPruner
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
def trainer(model, optimizer, criterion):
global g_epoch
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
if batch_idx and batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
g_epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
g_epoch += 1
def evaluator(model):
model.eval()
correct = 0.0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
return optimizer, scheduler
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
parser.add_argument('--pretrain-epochs', type=int, default=20,
help='number of epochs to pretrain the model')
parser.add_argument('--fine-tune-epochs', type=int, default=20,
help='number of epochs to fine tune the model')
args = parser.parse_args()
print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
model = VGG().to(device)
optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
criterion = torch.nn.CrossEntropyLoss()
pre_best_acc = 0.0
best_state_dict = None
for i in range(args.pretrain_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
acc = evaluator(model)
if acc > pre_best_acc:
pre_best_acc = acc
best_state_dict = model.state_dict()
print("Best accuracy: {}".format(pre_best_acc))
model.load_state_dict(best_state_dict)
pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
g_epoch = 0
# Start to prune and speedup
print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
config_list = [{
'sparsity': 0.8,
'op_types': ['Conv2d'],
}, {
'sparsity': 0.92,
'op_types': ['Conv2d'],
}]
# make sure you have used nni.trace to wrap the optimizer class before initializing it
traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
pruner = ADMMPruner(model, config_list, trainer, traced_optimizer, criterion, iterations=2, training_epochs=2)
_, masks = pruner.compress()
pruner.show_pruned_weights()
# Fine-grained pruning does not require model speedup
print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER PRUNING ' + '=' * 50)
evaluator(model)
# The optimizer used in the pruner might be patched, so we recommend creating a new optimizer for the fine-tuning stage.
print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)
best_acc = 0.0
g_epoch = 0
for i in range(args.fine_tune_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
best_acc = max(evaluator(model), best_acc)
flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
print(f'Pretrained model FLOPs {pre_flops/1e6:.2f} M, #Params: {pre_params/1e6:.2f}M, Accuracy: {pre_best_acc: .2f}%')
print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}%')
import sys
from tqdm import tqdm
import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR
from nni.algorithms.compression.v2.pytorch.pruning import AMCPruner
from nni.compression.pytorch.utils.counter import count_flops_params
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
criterion = torch.nn.CrossEntropyLoss()
def trainer(model, optimizer, criterion, epoch):
model.train()
for data, target in tqdm(iterable=train_loader, desc='Epoch {}'.format(epoch)):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
def finetuner(model):
model.train()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
for data, target in tqdm(iterable=train_loader, desc='Epoch PFs'):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
def evaluator(model):
model.eval()
correct = 0
with torch.no_grad():
for data, target in tqdm(iterable=test_loader, desc='Test'):
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
if __name__ == '__main__':
# model = MobileNetV2(n_class=10).to(device)
model = VGG().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
scheduler = MultiStepLR(optimizer, milestones=[50, 75], gamma=0.1)
criterion = torch.nn.CrossEntropyLoss()
for i in range(100):
trainer(model, optimizer, criterion, i)
pre_best_acc = evaluator(model)
dummy_input = torch.rand(10, 3, 32, 32).to(device)
pre_flops, pre_params, _ = count_flops_params(model, dummy_input)
config_list = [{'op_types': ['Conv2d'], 'total_sparsity': 0.5, 'max_sparsity_per_layer': 0.8}]
# If you just want to keep the final result as the best result, pass None as the evaluator;
# otherwise the result with the highest score (given by the evaluator) will be kept as the best result.
ddpg_params = {'hidden1': 300, 'hidden2': 300, 'lr_c': 1e-3, 'lr_a': 1e-4, 'warmup': 100, 'discount': 1., 'bsize': 64,
'rmsize': 100, 'window_length': 1, 'tau': 0.01, 'init_delta': 0.5, 'delta_decay': 0.99, 'max_episode_length': 1e9, 'epsilon': 50000}
pruner = AMCPruner(400, model, config_list, dummy_input, evaluator, finetuner=finetuner, ddpg_params=ddpg_params, target='flops')
pruner.compress()
_, model, masks, best_acc, _ = pruner.get_best_result()
flops, params, _ = count_flops_params(model, dummy_input)
print(f'Pretrained model FLOPs {pre_flops/1e6:.2f} M, #Params: {pre_params/1e6:.2f}M, Accuracy: {pre_best_acc: .2f}%')
print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}%')
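# Possible follow-up (not in the original script): the best masks returned by
# get_best_result can be saved for a later ModelSpeedup pass, e.g.
#     torch.save(masks, './experiment_data/mask_vgg16_cifar10_amc.pth')
# (the path is a placeholder following the mask-file naming used in the speedup example above).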
import sys
from tqdm import tqdm
import torch
from torchvision import datasets, transforms
import nni
from nni.algorithms.compression.v2.pytorch.pruning import AutoCompressPruner
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
criterion = torch.nn.CrossEntropyLoss()
epoch = 0
def trainer(model, optimizer, criterion):
global epoch
model.train()
for data, target in tqdm(iterable=train_loader, desc='Total Epoch {}'.format(epoch)):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
epoch = epoch + 1
def finetuner(model):
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
trainer(model, optimizer, criterion)
def evaluator(model):
model.eval()
correct = 0
with torch.no_grad():
for data, target in tqdm(iterable=test_loader, desc='Test'):
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
if __name__ == '__main__':
model = VGG().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
# pre-train the model
for _ in range(10):
trainer(model, optimizer, criterion)
config_list = [{'op_types': ['Conv2d'], 'total_sparsity': 0.8}]
dummy_input = torch.rand(10, 3, 32, 32).to(device)
# make sure you have used nni.trace to wrap the optimizer class before initializing it
traced_optimizer = nni.trace(torch.optim.SGD)(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
admm_params = {
'trainer': trainer,
'traced_optimizer': traced_optimizer,
'criterion': criterion,
'iterations': 10,
'training_epochs': 1
}
sa_params = {
'evaluator': evaluator
}
pruner = AutoCompressPruner(model, config_list, 10, admm_params, sa_params, keep_intermediate_result=True, finetuner=finetuner)
pruner.compress()
_, model, masks, _, _ = pruner.get_best_result()
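# Possible follow-up (not in the original script): evaluate the best pruned model and
# report its size, e.g.
#     evaluator(model)
#     flops, params, _ = count_flops_params(model, dummy_input)
# (count_flops_params would need to be imported from nni.compression.pytorch.utils.counter,
# as in the other examples in this commit).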
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for the supported FPGM pruning algorithm.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate the real pruning. In order to obtain a real compressed model, model speedup is required.
'''
import argparse
import sys
import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR
from nni.compression.pytorch import ModelSpeedup
from nni.compression.pytorch.utils.counter import count_flops_params
from nni.algorithms.compression.v2.pytorch.pruning.basic_pruner import FPGMPruner
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
def trainer(model, optimizer, criterion):
global g_epoch
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
if batch_idx and batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
g_epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
g_epoch += 1
def evaluator(model):
model.eval()
correct = 0.0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
return optimizer, scheduler
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
parser.add_argument('--pretrain-epochs', type=int, default=20,
help='number of epochs to pretrain the model')
parser.add_argument('--fine-tune-epochs', type=int, default=20,
help='number of epochs to fine tune the model')
args = parser.parse_args()
print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
model = VGG().to(device)
optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
criterion = torch.nn.CrossEntropyLoss()
pre_best_acc = 0.0
best_state_dict = None
for i in range(args.pretrain_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
acc = evaluator(model)
if acc > pre_best_acc:
pre_best_acc = acc
best_state_dict = model.state_dict()
print("Best accuracy: {}".format(pre_best_acc))
model.load_state_dict(best_state_dict)
pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
g_epoch = 0
# Start to prune and speedup
print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
config_list = [{
'sparsity': 0.5,
'op_types': ['Conv2d']
}]
pruner = FPGMPruner(model, config_list)
_, masks = pruner.compress()
pruner.show_pruned_weights()
pruner._unwrap_model()
ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()
print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
evaluator(model)
# The optimizer used in the pruner might be patched, so we recommend creating a new optimizer for the fine-tuning stage.
print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)
best_acc = 0.0
for i in range(args.fine_tune_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
best_acc = max(evaluator(model), best_acc)
flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
print(f'Pretrained model FLOPs {pre_flops/1e6:.2f} M, #Params: {pre_params/1e6:.2f}M, Accuracy: {pre_best_acc: .2f}%')
print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}%')
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for the supported iterative pruning algorithms.
In this example, we show the end-to-end iterative pruning process: pre-training -> pruning -> fine-tuning.
'''
import sys
import argparse
from tqdm import tqdm
import torch
from torchvision import datasets, transforms
from nni.algorithms.compression.v2.pytorch.pruning import (
LinearPruner,
AGPPruner,
LotteryTicketPruner
)
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
criterion = torch.nn.CrossEntropyLoss()
def trainer(model, optimizer, criterion, epoch):
model.train()
for data, target in tqdm(iterable=train_loader, desc='Epoch {}'.format(epoch)):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
def finetuner(model):
model.train()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
for data, target in tqdm(iterable=train_loader, desc='Epoch PFs'):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
def evaluator(model):
model.eval()
correct = 0
with torch.no_grad():
for data, target in tqdm(iterable=test_loader, desc='Test'):
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PyTorch Iterative Example for model compression')
parser.add_argument('--pruner', type=str, default='linear',
choices=['linear', 'agp', 'lottery'],
help='pruner to use')
parser.add_argument('--pretrain-epochs', type=int, default=10,
help='number of epochs to pretrain the model')
parser.add_argument('--total-iteration', type=int, default=10,
help='number of iterations used to iteratively prune the model')
parser.add_argument('--pruning-algo', type=str, default='l1',
choices=['level', 'l1', 'l2', 'fpgm', 'slim', 'apoz',
'mean_activation', 'taylorfo', 'admm'],
help='algorithm used to score weights for pruning')
# Note: argparse's `type=bool` treats any non-empty string as True, so parse boolean flags explicitly.
parser.add_argument('--speed-up', type=lambda s: s.lower() in ('true', '1'), default=False,
help='whether to speed up the pruned model')
parser.add_argument('--reset-weight', type=lambda s: s.lower() in ('true', '1'), default=True,
help='whether to reset weights during each iteration')
args = parser.parse_args()
model = VGG().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
# pre-train the model
for i in range(args.pretrain_epochs):
trainer(model, optimizer, criterion, i)
evaluator(model)
config_list = [{'op_types': ['Conv2d'], 'sparsity': 0.8}]
dummy_input = torch.rand(10, 3, 32, 32).to(device)
# If you just want to keep the final result as the best result, pass None as the evaluator;
# otherwise the result with the highest score (given by the evaluator) is kept as the best result.
kw_args = {'pruning_algorithm': args.pruning_algo,
'total_iteration': args.total_iteration,
'evaluator': None,
'finetuner': finetuner}
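# When speedup is enabled, the iterative pruner also needs a dummy input to trace the model between iterations.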
if args.speed_up:
kw_args['speed_up'] = args.speed_up
kw_args['dummy_input'] = dummy_input
if args.pruner == 'linear':
iterative_pruner = LinearPruner
elif args.pruner == 'agp':
iterative_pruner = AGPPruner
elif args.pruner == 'lottery':
kw_args['reset_weight'] = args.reset_weight
iterative_pruner = LotteryTicketPruner
pruner = iterative_pruner(model, config_list, **kw_args)
pruner.compress()
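# get_best_result() returns the best iteration's task id, compact model, masks, score and config list.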
_, model, masks, _, _ = pruner.get_best_result()
evaluator(model)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for supported level pruning algorithm.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate real pruning. To obtain a truly compressed model, model speedup is required.
'''
import argparse
import sys
import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR
from nni.compression.pytorch.utils.counter import count_flops_params
from nni.algorithms.compression.v2.pytorch.pruning.basic_pruner import LevelPruner
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
def trainer(model, optimizer, criterion):
global g_epoch
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
if batch_idx and batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
g_epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
g_epoch += 1
def evaluator(model):
model.eval()
correct = 0.0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
return optimizer, scheduler
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
parser.add_argument('--pretrain-epochs', type=int, default=20,
help='number of epochs to pretrain the model')
parser.add_argument('--fine-tune-epochs', type=int, default=20,
help='number of epochs to fine tune the model')
args = parser.parse_args()
print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
model = VGG().to(device)
optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
criterion = torch.nn.CrossEntropyLoss()
pre_best_acc = 0.0
best_state_dict = None
for i in range(args.pretrain_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
acc = evaluator(model)
if acc > pre_best_acc:
pre_best_acc = acc
best_state_dict = model.state_dict()
print("Best accuracy: {}".format(pre_best_acc))
model.load_state_dict(best_state_dict)
pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
# Start to prune and speedup
print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
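# LevelPruner performs fine-grained, element-wise magnitude pruning; 'default' covers the weighted layer types NNI prunes by default (e.g. Conv2d and Linear).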
config_list = [{
'sparsity': 0.5,
'op_types': ['default']
}]
pruner = LevelPruner(model, config_list)
_, masks = pruner.compress()
pruner.show_pruned_weights()
# Fine-grained (element-wise) pruning does not require model speedup.
print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER PRUNING ' + '=' * 50)
evaluator(model)
# The optimizer used by the pruner may have been patched, so it is recommended to create a new optimizer for the fine-tuning stage.
print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)
best_acc = 0.0
g_epoch = 0
for i in range(args.fine_tune_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
best_acc = max(evaluator(model), best_acc)
flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
print(f'Pretrained model FLOPs {pre_flops/1e6:.2f} M, #Params: {pre_params/1e6:.2f}M, Accuracy: {pre_best_acc: .2f}%')
print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}%')
import functools
from tqdm import tqdm
import torch
from torch.optim import Adam
from torch.utils.data import DataLoader
from datasets import load_metric, load_dataset
from transformers import (
BertForSequenceClassification,
BertTokenizerFast,
DataCollatorWithPadding,
set_seed
)
import nni
from nni.algorithms.compression.v2.pytorch.pruning import MovementPruner
task_to_keys = {
"cola": ("sentence", None),
"mnli": ("premise", "hypothesis"),
"mrpc": ("sentence1", "sentence2"),
"qnli": ("question", "sentence"),
"qqp": ("question1", "question2"),
"rte": ("sentence1", "sentence2"),
"sst2": ("sentence", None),
"stsb": ("sentence1", "sentence2"),
"wnli": ("sentence1", "sentence2"),
}
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
gradient_accumulation_steps = 16
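# With the per-step batch size of 8 used below, accumulating gradients over 16 steps gives an effective batch size of 128.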
# A placeholder criterion: the Hugging Face model output already contains the loss.
def criterion(input, target):
return input.loss
def trainer(model, optimizer, criterion, train_dataloader):
model.train()
counter = 0
for batch in tqdm(train_dataloader):
counter += 1
batch.to(device)
optimizer.zero_grad()
outputs = model(**batch)
# The pruner may wrap the criterion (for example, loss = origin_loss + norm(weight)), so call criterion here to obtain the loss.
loss = criterion(outputs, None)
loss = loss / gradient_accumulation_steps
loss.backward()
if counter % gradient_accumulation_steps == 0 or counter == len(train_dataloader):
optimizer.step()
if counter % 16000 == 0:
print('Step {}: {}'.format(counter // gradient_accumulation_steps, evaluator(model, metric, is_regression, validate_dataloader)))
def evaluator(model, metric, is_regression, eval_dataloader):
model.eval()
for batch in tqdm(eval_dataloader):
batch.to(device)
outputs = model(**batch)
predictions = outputs.logits.argmax(dim=-1) if not is_regression else outputs.logits.squeeze()
metric.add_batch(
predictions=predictions,
references=batch["labels"],
)
return metric.compute()
if __name__ == '__main__':
task_name = 'mnli'
is_regression = False
num_labels = 1 if is_regression else (3 if task_name == 'mnli' else 2)
train_batch_size = 8
eval_batch_size = 8
set_seed(1024)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')
sentence1_key, sentence2_key = task_to_keys[task_name]
# used to preprocess the raw data
def preprocess_function(examples):
# Tokenize the texts
args = (
(examples[sentence1_key],) if sentence2_key is None else (examples[sentence1_key], examples[sentence2_key])
)
result = tokenizer(*args, padding=False, max_length=128, truncation=True)
if "label" in examples:
# In all cases, rename the column to labels because the model will expect that.
result["labels"] = examples["label"]
return result
raw_datasets = load_dataset('glue', task_name, cache_dir='./data')
processed_datasets = raw_datasets.map(preprocess_function, batched=True, remove_columns=raw_datasets["train"].column_names)
train_dataset = processed_datasets['train']
validate_dataset = processed_datasets['validation_matched' if task_name == "mnli" else 'validation']
data_collator = DataCollatorWithPadding(tokenizer)
train_dataloader = DataLoader(train_dataset, shuffle=True, collate_fn=data_collator, batch_size=train_batch_size)
validate_dataloader = DataLoader(validate_dataset, collate_fn=data_collator, batch_size=eval_batch_size)
metric = load_metric("glue", task_name)
model = BertForSequenceClassification.from_pretrained('bert-base-cased', num_labels=num_labels).to(device)
print('Initial: {}'.format(evaluator(model, metric, is_regression, validate_dataloader)))
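# Movement pruning targets the Linear layers inside bert.encoder and gradually masks 90% of their weights based on first-order (movement) importance scores accumulated during training.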
config_list = [{'op_types': ['Linear'], 'op_partial_names': ['bert.encoder'], 'sparsity': 0.9}]
p_trainer = functools.partial(trainer, train_dataloader=train_dataloader)
# Make sure the optimizer class is wrapped with nni.trace before it is instantiated.
traced_optimizer = nni.trace(Adam)(model.parameters(), lr=2e-5)
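# warm_up_step delays the start of the sparsity schedule, and cool_down_beginning_step is the step after which the sparsity stops increasing.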
pruner = MovementPruner(model, config_list, p_trainer, traced_optimizer, criterion, training_epochs=10,
warm_up_step=3000, cool_down_beginning_step=27000)
_, masks = pruner.compress()
pruner.show_pruned_weights()
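# Optionally persist the masks for later reuse, e.g. torch.save(masks, 'movement_masks.pth').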
print('Final: {}'.format(evaluator(model, metric, is_regression, validate_dataloader)))
optimizer = Adam(model.parameters(), lr=2e-5)
trainer(model, optimizer, criterion, train_dataloader)
print('After 1 epoch finetuning: {}'.format(evaluator(model, metric, is_regression, validate_dataloader)))
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
'''
NNI example for supported l1norm and l2norm pruning algorithms.
In this example, we show the end-to-end pruning process: pre-training -> pruning -> fine-tuning.
Note that pruners use masks to simulate real pruning. To obtain a truly compressed model, model speedup is required.
'''
import argparse
import sys
import torch
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import MultiStepLR
from nni.compression.pytorch import ModelSpeedup
from nni.compression.pytorch.utils.counter import count_flops_params
from nni.algorithms.compression.v2.pytorch.pruning.basic_pruner import L1NormPruner, L2NormPruner
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
g_epoch = 0
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
def trainer(model, optimizer, criterion):
global g_epoch
model.train()
for batch_idx, (data, target) in enumerate(train_loader):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
if batch_idx and batch_idx % 100 == 0:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
g_epoch, batch_idx * len(data), len(train_loader.dataset),
100. * batch_idx / len(train_loader), loss.item()))
g_epoch += 1
def evaluator(model):
model.eval()
correct = 0.0
with torch.no_grad():
for data, target in test_loader:
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
def optimizer_scheduler_generator(model, _lr=0.1, _momentum=0.9, _weight_decay=5e-4, total_epoch=160):
optimizer = torch.optim.SGD(model.parameters(), lr=_lr, momentum=_momentum, weight_decay=_weight_decay)
scheduler = MultiStepLR(optimizer, milestones=[int(total_epoch * 0.5), int(total_epoch * 0.75)], gamma=0.1)
return optimizer, scheduler
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='PyTorch Example for model compression')
parser.add_argument('--pruner', type=str, default='l1norm',
choices=['l1norm', 'l2norm'],
help='pruner to use')
parser.add_argument('--pretrain-epochs', type=int, default=20,
help='number of epochs to pretrain the model')
parser.add_argument('--fine-tune-epochs', type=int, default=20,
help='number of epochs to fine tune the model')
args = parser.parse_args()
print('\n' + '=' * 50 + ' START TO TRAIN THE MODEL ' + '=' * 50)
model = VGG().to(device)
optimizer, scheduler = optimizer_scheduler_generator(model, total_epoch=args.pretrain_epochs)
criterion = torch.nn.CrossEntropyLoss()
pre_best_acc = 0.0
best_state_dict = None
for i in range(args.pretrain_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
acc = evaluator(model)
if acc > pre_best_acc:
pre_best_acc = acc
best_state_dict = model.state_dict()
print("Best accuracy: {}".format(pre_best_acc))
model.load_state_dict(best_state_dict)
pre_flops, pre_params, _ = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
g_epoch = 0
# Start to prune and speedup
print('\n' + '=' * 50 + ' START TO PRUNE THE BEST ACCURACY PRETRAINED MODEL ' + '=' * 50)
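# Prune 50% of the filters in every Conv2d layer, ranking filters by the L1 or L2 norm of their weights.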
config_list = [{
'sparsity': 0.5,
'op_types': ['Conv2d']
}]
if 'l1' in args.pruner:
pruner = L1NormPruner(model, config_list)
else:
pruner = L2NormPruner(model, config_list)
_, masks = pruner.compress()
pruner.show_pruned_weights()
pruner._unwrap_model()
ModelSpeedup(model, dummy_input=torch.rand([10, 3, 32, 32]).to(device), masks_file=masks).speedup_model()
print('\n' + '=' * 50 + ' EVALUATE THE MODEL AFTER SPEEDUP ' + '=' * 50)
evaluator(model)
# The optimizer used by the pruner may have been patched, so it is recommended to create a new optimizer for the fine-tuning stage.
print('\n' + '=' * 50 + ' START TO FINE TUNE THE MODEL ' + '=' * 50)
optimizer, scheduler = optimizer_scheduler_generator(model, _lr=0.01, total_epoch=args.fine_tune_epochs)
best_acc = 0.0
for i in range(args.fine_tune_epochs):
trainer(model, optimizer, criterion)
scheduler.step()
best_acc = max(evaluator(model), best_acc)
flops, params, results = count_flops_params(model, torch.randn([128, 3, 32, 32]).to(device))
print(f'Pretrained model FLOPs {pre_flops/1e6:.2f} M, #Params: {pre_params/1e6:.2f}M, Accuracy: {pre_best_acc: .2f}%')
print(f'Finetuned model FLOPs {flops/1e6:.2f} M, #Params: {params/1e6:.2f}M, Accuracy: {best_acc: .2f}%')
import sys
from tqdm import tqdm
import torch
from torchvision import datasets, transforms
from nni.algorithms.compression.v2.pytorch.pruning import L1NormPruner
from nni.algorithms.compression.v2.pytorch.pruning.tools import AGPTaskGenerator
from nni.algorithms.compression.v2.pytorch.pruning.basic_scheduler import PruningScheduler
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
criterion = torch.nn.CrossEntropyLoss()
def trainer(model, optimizer, criterion, epoch):
model.train()
for data, target in tqdm(iterable=train_loader, desc='Epoch {}'.format(epoch)):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
def finetuner(model):
model.train()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
for data, target in tqdm(iterable=train_loader, desc='Epoch PFs'):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
def evaluator(model):
model.eval()
correct = 0
with torch.no_grad():
for data, target in tqdm(iterable=test_loader, desc='Test'):
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
if __name__ == '__main__':
model = VGG().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
# pre-train the model
for i in range(5):
trainer(model, optimizer, criterion, i)
# There is no need to pass the model and config_list to the pruner at initialization when a scheduler is used.
pruner = L1NormPruner(None, None)
# You can specify log_dir; all intermediate results and the best result will be saved under this folder.
# If you do not want to keep intermediate results, set `keep_intermediate_result=False`.
config_list = [{'op_types': ['Conv2d'], 'sparsity': 0.8}]
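# AGPTaskGenerator schedules 10 pruning iterations, gradually raising the Conv2d sparsity toward the 0.8 target following the AGP schedule.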
task_generator = AGPTaskGenerator(10, model, config_list, log_dir='.', keep_intermediate_result=True)
dummy_input = torch.rand(10, 3, 32, 32).to(device)
# If you just want to keep the final result as the best result, pass None as the evaluator;
# otherwise the result with the highest score (given by the evaluator) is kept as the best result.
# scheduler = PruningScheduler(pruner, task_generator, finetuner=finetuner, speed_up=True, dummy_input=dummy_input, evaluator=evaluator)
scheduler = PruningScheduler(pruner, task_generator, finetuner=finetuner, speed_up=True, dummy_input=dummy_input, evaluator=None, reset_weight=False)
scheduler.compress()
_, model, masks, _, _ = scheduler.get_best_result()
import sys
from tqdm import tqdm
import torch
from torchvision import datasets, transforms
from nni.algorithms.compression.v2.pytorch.pruning import L1NormPruner
from nni.compression.pytorch.speedup import ModelSpeedup
from pathlib import Path
sys.path.append(str(Path(__file__).absolute().parents[2] / 'models'))
from cifar10.vgg import VGG
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
normalize = transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
train_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=True, transform=transforms.Compose([
transforms.RandomHorizontalFlip(),
transforms.RandomCrop(32, 4),
transforms.ToTensor(),
normalize,
]), download=True),
batch_size=128, shuffle=True)
test_loader = torch.utils.data.DataLoader(
datasets.CIFAR10('./data', train=False, transform=transforms.Compose([
transforms.ToTensor(),
normalize,
])),
batch_size=128, shuffle=False)
criterion = torch.nn.CrossEntropyLoss()
def trainer(model, optimizer, criterion, epoch):
model.train()
for data, target in tqdm(iterable=train_loader, desc='Epoch {}'.format(epoch)):
data, target = data.to(device), target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output, target)
loss.backward()
optimizer.step()
def evaluator(model):
model.eval()
correct = 0
with torch.no_grad():
for data, target in tqdm(iterable=test_loader, desc='Test'):
data, target = data.to(device), target.to(device)
output = model(data)
pred = output.argmax(dim=1, keepdim=True)
correct += pred.eq(target.view_as(pred)).sum().item()
acc = 100 * correct / len(test_loader.dataset)
print('Accuracy: {}%\n'.format(acc))
return acc
if __name__ == '__main__':
model = VGG().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()
print('\nPre-train the model:')
for i in range(5):
trainer(model, optimizer, criterion, i)
evaluator(model)
config_list = [{'op_types': ['Conv2d'], 'sparsity': 0.8}]
pruner = L1NormPruner(model, config_list)
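# compress() returns the masked (wrapped) model and a dict of masks keyed by layer name; the masks only zero out weights, they do not shrink the model.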
_, masks = pruner.compress()
print('\nThe accuracy with masks:')
evaluator(model)
pruner._unwrap_model()
ModelSpeedup(model, dummy_input=torch.rand(10, 3, 32, 32).to(device), masks_file=masks).speedup_model()
print('\nThe accuracy after speed up:')
evaluator(model)
# A new optimizer is needed because modules in the model are replaced during speedup.
optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9, weight_decay=5e-4)
print('\nFinetune the model after speed up:')
for i in range(5):
trainer(model, optimizer, criterion, i)
evaluator(model)