"web/vscode:/vscode.git/clone" did not exist on "7c5fa7f4a2046c1f4bb77ed5e480918ffb8a10aa"
Commit dec49f1f authored by mashun1's avatar mashun1
Browse files

resnet50-qat

parents
*pyc*
data/
checkpoints*
nohup*
# ResNet50-QAT
This project applies quantization-aware training (QAT) to a ResNet50 model, exports the result to ONNX, and runs it with TensorRT.
## Paper
**Deep Residual Learning for Image Recognition**
* https://arxiv.org/pdf/1512.03385.pdf
## Model Architecture
ResNet50 combines residual connections with convolution, pooling, global average pooling, and a classification layer.
* **Residual connections:** ResNet introduces residual connections, direct cross-layer links that counter the vanishing-gradient problem in deep neural networks. They let information propagate backward through the network more easily, which makes much deeper models trainable.
* **Deep structure:** ResNet-50 stacks 50 convolutional and fully connected layers, organized into residual blocks; each block contains several convolutions plus an identity mapping. This depth lets the model learn more complex and abstract feature representations.
* **Convolution and pooling layers:** a sequence of convolution and pooling layers extracts features from the input image. Convolutions slide filters over the input to capture local patterns; pooling reduces the spatial resolution of the feature maps and, with it, the parameter count.
* **Global average pooling:** after the final convolution stage, a global average pooling layer averages each channel of the feature map, producing a fixed-size feature vector for the classifier.
* **Classification layer:** a fully connected layer maps the feature vector to the predefined class labels. One or more additional fully connected layers with activations are sometimes inserted before it to add nonlinearity.
![alt text](readme_imgs/image-1.png)
## Algorithm
ResNet introduces residual connections: direct cross-layer links that carry information past intermediate layers, alleviating the vanishing- and exploding-gradient problems during training and making much deeper networks trainable.
![alt text](readme_imgs/image-2.png)
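To make the residual idea concrete, here is a minimal illustrative block (not taken from this repo; the actual `Bottleneck` implementation lives in `models/resnet.py`):

```python
import torch
import torch.nn as nn

class ResidualBlock(nn.Module):
    """Minimal residual block: y = relu(F(x) + x), where F is two 3x3 convolutions."""
    def __init__(self, channels: int):
        super().__init__()
        self.conv1 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(channels)
        self.conv2 = nn.Conv2d(channels, channels, kernel_size=3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        return self.relu(out + x)  # the skip connection gives gradients a direct path back to x

# e.g. ResidualBlock(64)(torch.randn(1, 64, 56, 56)) keeps the shape (1, 64, 56, 56)
```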
## Environment Setup
### Anaconda (Option 1)
1. This project currently runs only on NVIDIA GPUs. Required environment:
```
python 3.9.18
torch 2.0.1
cuda 11
```
```bash
pip install -r requirements.txt
pip install --no-cache-dir --extra-index-url https://pypi.nvidia.com pytorch-quantization
```
2. TensorRT:
```bash
wget https://github.com/NVIDIA/TensorRT/archive/refs/tags/8.5.3.zip
unzip <downloaded-archive> -d <extract-path>
pip install <extract-path>/python/tensorrt-8.5.3.1-cp39-none-linux_x86_64.whl
ln -s <absolute-extract-path>/bin/trtexec /usr/local/bin/trtexec
```
Note: if you need `cu12` builds, uncomment the corresponding lines in `requirements.txt` and install them.
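A quick way to confirm the wheel installed correctly (an illustrative check, not part of the repo):

```python
# Prints the installed TensorRT version; expect 8.5.3.1 to match the wheel above.
import tensorrt as trt
print(trt.__version__)
```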
## Dataset
This project uses the CIFAR-10 dataset; it is downloaded and preprocessed automatically the first time `main.py` runs.
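For reference, this is how the training scripts build their loaders via `utils/data.py` (a sketch; note the training loader uses a `DistributedSampler`, so it must run under `torchrun` with a process group initialized):

```python
from utils.data import prepare_dataloader

# Test loader: sequential batches with a fixed batch size of 16.
test_loader, _ = prepare_dataloader("./data/cifar10", train=False)

# Training loader: requires torch.distributed to be initialized (as main.py does under torchrun).
# train_loader, sampler = prepare_dataloader("./data/cifar10", train=True, batch_size=512)
```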
## Training
```bash
# --epochs: number of training (or calibration fine-tuning) epochs
# --resume: resume from a saved checkpoint
# --qat: run calibration + quantization-aware training (do not pass this when training the base model)
CUDA_VISIBLE_DEVICES=0,1 torchrun --nnodes=1 --nproc_per_node=2 --rdzv_id=100 --rdzv_backend=c10d --rdzv_endpoint=localhost:29400 main.py --epochs=N --resume --qat --batch_size=N --lr=X --num_classes=10
```
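For orientation, here is a condensed sketch of what the `--qat` path in `main.py` does before fine-tuning: patch the layers, load the FP32 checkpoint, then calibrate the quantizers with the `utils/qat.py` helpers. (The real script calibrates on the training loader under `torchrun`; the test loader is used here so the sketch runs standalone.)

```python
import torch
from pytorch_quantization import quant_modules
from models import resnet50
from utils.data import prepare_dataloader
from utils.qat import collect_stats, compute_amax

quant_modules.initialize()  # replace torch layers with quantized equivalents
model = resnet50(num_classes=10, quantize=True)
model.load_state_dict(torch.load("./checkpoints/pretrained/pretrained_model.pth", map_location="cpu"))
model.cuda()

calib_loader, _ = prepare_dataloader("./data/cifar10", train=False)
with torch.no_grad():
    collect_stats(model, calib_loader, num_batches=2, device="cuda")           # gather activation statistics
    compute_amax(model, device="cuda", method="percentile", percentile=99.99)  # turn stats into quantization ranges
# ...then fine-tune for a few epochs exactly as in train_one_epoch().
```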
## Inference
```bash
# NVIDIA GPU inference
trtexec --onnx=/path/to/onnx --saveEngine=./checkpoints/qat/last.trt --int8
python eval.py --device=0

# DCU inference
python evaluate_migraphx.py --device=0
```
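After `trtexec` has produced `last.trt`, the engine can be exercised directly with the repo's `TrtModel` wrapper (a sketch mirroring `eval_trt()` in `eval.py`; the random input is only a shape check):

```python
import numpy as np
from utils.trt import TrtModel

engine = TrtModel("./checkpoints/qat/last.trt")
batch = np.random.rand(16, 3, 224, 224).astype(np.float32)  # the engine was exported with batch size 16
logits = engine(batch, batch_size=16)[0]                    # one host array per output binding
print(np.argmax(logits, axis=-1))                           # predicted class per image
```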
## Results
![alt text](readme_imgs/image-3.png)
### Accuracy
||Original model (A800)|QAT model (A800)|ONNX model (A800)|TensorRT model (A800)|MIGraphX model|
|:---|:---|:---|:---|:---|:---|
|Acc|0.9589|0.9584|0.9588|0.9584||
|Inference time|7.6061s|42.9348s|10.4021s|2.2839s||
## Application Scenarios
### Algorithm Category
`Image classification`
### Key Industries
`Manufacturing, transportation, cybersecurity`
## Source Repository and Issue Tracking
* https://developer.hpccube.com/codes/modelzoo/resnet50-qat_pytorch
## References
* https://docs.nvidia.com/deeplearning/tensorrt/pytorch-quantization-toolkit/docs/index.html
from pathlib import Path
import sys
parent_dir = Path(__file__).resolve().parent
sys.path.append(str(parent_dir))
from models import resnet50
from tqdm import tqdm
from utils.data import prepare_dataloader
from utils.trt import TrtModel
import time
import torch
import onnxruntime
import numpy as np
import pycuda.driver as cuda
from pytorch_quantization import quant_modules
from torch.utils.data import DataLoader, Dataset
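# NumpyDataLoader materializes the whole test set as float32 numpy batches so ONNX Runtime and TensorRT can consume them directly.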
class NumpyDataLoader:
def __init__(self, dataloader):
self.data = []
for data, label in dataloader:
self.data.append((data.numpy().astype(np.float32), label.numpy().astype(np.float32)))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return self.data[idx]
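# CacheDataLoader keeps the (tensor, label) batches in memory so the warmup and timed passes avoid dataloader overhead.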
class CacheDataLoader:
def __init__(self, dataloader):
self.data = []
for data, label in dataloader:
self.data.append((data, label))
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return self.data[idx]
def eval_onnx(ckpt_path, dataloader, device):
    """Evaluate the exported ONNX model with ONNX Runtime; returns (accuracy, elapsed seconds)."""
    sess_options = onnxruntime.SessionOptions()
    # Prefer CUDA when the onnxruntime build supports it; otherwise fall back to CPU.
    if onnxruntime.get_device() == "GPU":
        providers = ['CUDAExecutionProvider', 'CPUExecutionProvider']
    else:
        providers = ['CPUExecutionProvider']
    sess_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_EXTENDED
    session = onnxruntime.InferenceSession(
        ckpt_path, sess_options, providers=providers,
        provider_options=[{"device_id": device} if p == "CUDAExecutionProvider" else {} for p in providers])
input_name = session.get_inputs()[0].name
output_name = session.get_outputs()[0].name
correct, total = 0, 0
for it in range(2):
desc = "warmup"
if it == 1:
start_time = time.time()
desc = "eval onnx model"
for data, label in tqdm(dataloader, desc=desc, total=len(dataloader)):
output = session.run([output_name], {input_name: data})
predictions = np.argmax(output, axis=-1)[0]
correct += (label == predictions).sum()
total += len(label)
if it == 1:
end_time = time.time()
return correct / total, end_time - start_time
def eval_trt(ckpt_path, dataloader, device):
    """Evaluate the serialized TensorRT engine; returns (accuracy, elapsed seconds)."""
    cuda.init()
    device = cuda.Device(device)  # the active CUDA context itself comes from pycuda.autoinit, imported in utils.trt
    batch_size = 16  # must match the batch dimension the ONNX model/engine was exported with
    model = TrtModel(ckpt_path)
correct = 0
total = 0
desc = "warmup"
for it in range(2):
if it == 1:
desc = "eval trt model"
start_time = time.time()
for data, label in tqdm(dataloader, desc=desc, total=(len(dataloader))):
result = model(data, batch_size)
result = np.argmax(result, axis=-1)
total += label.shape[0]
correct += (label == result).sum()
if it == 1:
end_time = time.time()
return correct / total, end_time - start_time
@torch.no_grad()
def eval_original(ckpt_path, dataloader, num_classes, device):
    """Evaluate the FP32 PyTorch checkpoint; returns (accuracy, elapsed seconds)."""
    model = resnet50(num_classes=num_classes)
model.load_state_dict(torch.load(ckpt_path))
model.to(device)
model.eval()
total, correct = 0, 0
for it in range(2):
desc = "warmup"
if it == 1:
start_time = time.time()
desc = 'eval original pytorch model'
for data, label in tqdm(dataloader, desc=desc, total=len(dataloader)):
output = model(data.to(device))
_, predictions = torch.max(output, dim=-1)
correct += torch.sum(predictions==label.to(device)).item()
total += label.size(0)
if it == 1:
end_time = time.time()
return correct / total, end_time - start_time
@torch.no_grad()
def eval_qat(ckpt_path, dataloader, num_classes, device):
    """Evaluate the fake-quantized (QAT) checkpoint; returns (accuracy, elapsed seconds)."""
    quant_modules.initialize()  # patch torch layers with quantized equivalents before the model is built
    model = resnet50(num_classes=num_classes, quantize=True)
model.load_state_dict(torch.load(ckpt_path))
model.to(device)
model.eval()
total, correct = 0, 0
for it in range(2):
desc = "warmup"
if it == 1:
start_time = time.time()
desc = 'eval qat pytorch model'
for data, label in tqdm(dataloader, desc=desc, total=len(dataloader)):
output = model(data.to(device))
_, predictions = torch.max(output, dim=-1)
correct += torch.sum(predictions==label.to(device)).item()
total += label.size(0)
if it == 1:
end_time = time.time()
return correct / total, end_time - start_time
def main(args):
device = torch.device(f"cuda:{args.device}" if args.device != -1 else "cpu")
    test_dataloader, _ = prepare_dataloader("./data/cifar10", False)  # the test loader uses a fixed batch size of 16 (see utils/data.py)
numpy_dataloader = NumpyDataLoader(test_dataloader)
cache_dataloader = CacheDataLoader(test_dataloader)
# 测试pytorch模型
acc1, runtime1 = eval_original("./checkpoints/pretrained/pretrained_model.pth", cache_dataloader, args.num_classes, device)
acc2, runtime2 = eval_qat("./checkpoints/qat/pretrained_model.pth", cache_dataloader, args.num_classes, device)
acc_onnx, runtime_onnx = eval_onnx("./checkpoints/qat/pretrained_qat.onnx", numpy_dataloader, args.device)
acc_trt, runtime_trt = eval_trt("./checkpoints/qat/last.trt", numpy_dataloader, args.device)
print("==============================================================")
print(f"Original Model Acc: {acc1}, Inference Time: {runtime1:.4f}s")
print(f"Qat Model Acc: {acc2}, Inference Time: {runtime2:.4f}s")
print(f"Onnx Model Acc: {acc_onnx}, Inference Time: {runtime_onnx:.4f}s")
print(f"Trt Model Acc: {acc_trt}, Inference Time: {runtime_trt:.4f}s")
print("==============================================================")
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", type=int, default=16)
parser.add_argument("--device", type=int, default=-1)
parser.add_argument("--num_classes", type=int, default=10)
args = parser.parse_args()
main(args)
from pathlib import Path
import sys
parent_dir = Path(__file__).resolve().parent
sys.path.append(str(parent_dir))
from models import resnet50
# from torchvision.models import resnet50
import os
import torch
import torch.distributed as dist
from tqdm import tqdm
from utils.data import prepare_dataloader
from utils.qat import *
from torch.nn.parallel import DistributedDataParallel as DDP
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import quant_modules
def cleanup():
dist.destroy_process_group()
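# Builds model/optimizer/scheduler/loss for three modes: fresh pretraining, resumed pretraining, or QAT fine-tuning from the FP32 checkpoint.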
def prepare_training_obj(lr: float = 1e-3,
num_classes=10,
ckpt_root: str = '',
resume: bool = True,
qat: bool = True):
if qat:
model = resnet50(num_classes=num_classes, quantize=True)
else:
if not resume:
model = resnet50(pretrained=True)
fc_in = model.fc.in_features
model.fc = torch.nn.Linear(fc_in, num_classes)
else:
            model = resnet50(num_classes=num_classes)
if resume or qat:
model.load_state_dict(torch.load(os.path.join(ckpt_root, "pretrained_model.pth"), map_location="cpu"))
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-3)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
lr_scheduler.load_state_dict(torch.load(os.path.join(ckpt_root, "scheduler.pth")))
lr_scheduler.step()
else:
optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-3)
lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100)
loss_fc = torch.nn.CrossEntropyLoss()
return model, optimizer, lr_scheduler, loss_fc
def train_one_epoch(model,
optimizer,
lr_scheduler,
loss_fc,
dataloader,
device):
model.train()
epoch_loss = torch.zeros(1).to(device)
for it, (data, label) in enumerate(dataloader):
output = model(data.to(device))
loss = loss_fc(output, label.to(device))
optimizer.zero_grad()
loss.backward()
optimizer.step()
        epoch_loss += loss.detach() / label.size(0)  # detach so the autograd graph is not retained across iterations
lr_scheduler.step()
dist.reduce(epoch_loss, dst=0)
return epoch_loss
@torch.no_grad()
def evaluate(model,
dataloader,
device):
correct = 0
total = 0
model.eval()
for data, label in dataloader:
output = model(data.to(device))
_, predictions = torch.max(output, dim=-1)
correct += torch.sum(predictions.cpu()==label)
total += label.size(0)
return correct / total
def pretrain(args):
dist.init_process_group('nccl')
rank = dist.get_rank()
model, optimizer, lr_scheduler, loss_fc = prepare_training_obj(args.lr, ckpt_root="./checkpoints/pretrained", resume=args.resume, qat=args.qat)
device = torch.device(f"cuda:{rank}")
model.to(device)
ddp_model = DDP(model, device_ids=[rank])
train_dataloader, sampler = prepare_dataloader("./data/cifar10", True, args.batch_size)
if rank == 0:
test_dataloader, _ = prepare_dataloader("./data/cifar10", False)
for epoch in range(args.epochs):
if rank == 0:
train_dataloader = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{args.epochs}", position=0, leave=False)
dist.barrier()
sampler.set_epoch(epoch)
loss = train_one_epoch(ddp_model, optimizer, lr_scheduler, loss_fc, train_dataloader, device)
if dist.get_rank() == 0:
avg_loss = loss.item() / dist.get_world_size()
if (epoch + 1) % 5 == 0:
acc = evaluate(model, test_dataloader, device)
tqdm.write(f"Epoch: {epoch+1}, Avg Train Loss: {avg_loss:.4f}, Eval Acc: {acc}")
else:
tqdm.write(f"Epoch: {epoch+1}, Avg Train Loss: {avg_loss:.4f}")
if (epoch + 1) % 5 == 0:
# save checkpoints and lr.
ckpt_path = "./checkpoints/pretrained"
if not os.path.exists(ckpt_path):
os.makedirs(ckpt_path)
torch.save(model.state_dict(), os.path.join(ckpt_path, "pretrained_model.pth"))
torch.save(lr_scheduler.state_dict(), os.path.join(ckpt_path, "scheduler.pth"))
cleanup()
def qat(args):
dist.init_process_group('nccl')
rank = dist.get_rank()
quant_modules.initialize()
if args.resume:
model, optimizer, lr_scheduler, loss_fc = prepare_training_obj(args.lr, ckpt_root="./checkpoints/qat", resume=args.resume, qat=args.qat)
else:
model, optimizer, lr_scheduler, loss_fc = prepare_training_obj(args.lr, ckpt_root="./checkpoints/pretrained", resume=args.resume, qat=args.qat)
device = torch.device(f"cuda:{rank}")
model.to(device)
train_dataloader, sampler = prepare_dataloader("./data/cifar10", True, args.batch_size)
ddp_model = DDP(model, device_ids=[rank])
with torch.no_grad():
collect_stats(ddp_model, train_dataloader, num_batches=2, device=device)
compute_amax(ddp_model, device=device, method="percentile", percentile=99.99)
if rank == 0:
test_dataloader, _ = prepare_dataloader("./data/cifar10", False)
for epoch in range(args.epochs):
if rank == 0:
train_dataloader = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{args.epochs}", position=0, leave=False)
dist.barrier()
sampler.set_epoch(epoch)
loss = train_one_epoch(ddp_model, optimizer, lr_scheduler, loss_fc, train_dataloader, device)
if dist.get_rank() == 0:
avg_loss = loss.item() / dist.get_world_size()
if (epoch + 1) % 5 == 0:
acc = evaluate(model, test_dataloader, device)
tqdm.write(f"Epoch: {epoch+1}, Avg Train Loss: {avg_loss:.4f}, Eval Acc: {acc}")
else:
tqdm.write(f"Epoch: {epoch+1}, Avg Train Loss: {avg_loss:.4f}")
if (epoch + 1) % 5 == 0:
# save checkpoints and lr.
ckpt_path = "./checkpoints/qat"
if not os.path.exists(ckpt_path):
os.makedirs(ckpt_path)
torch.save(model.state_dict(), os.path.join(ckpt_path, "pretrained_model.pth"))
torch.save(lr_scheduler.state_dict(), os.path.join(ckpt_path, "scheduler.pth"))
if rank == 0:
        quant_nn.TensorQuantizer.use_fb_fake_quant = True  # export fake-quant as ONNX QuantizeLinear/DequantizeLinear nodes for TensorRT
model.eval()
with torch.no_grad():
jit_model = torch.jit.trace(model, torch.randn((16, 3, 224, 224)).to(device))
# torch.jit.save(jit_model, "./checkpoints/qat/pretrained_model.jit")
jit_model.eval()
torch.onnx.export(jit_model.to(device), torch.randn((16, 3, 224, 224)).to(device), "checkpoints/qat/pretrained_qat.onnx")
cleanup()
def main(args):
if args.qat:
qat(args)
else:
pretrain(args)
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--epochs", type=int, default=100)
parser.add_argument("--lr", type=float, default=1e-3)
parser.add_argument("--batch_size", type=int, default=512)
parser.add_argument("--num_classes", type=int, default=10)
parser.add_argument("--resume", action="store_true")
parser.add_argument("--qat", action="store_true")
args = parser.parse_args()
main(args)
# Unique model identifier
modelCode = 555
# Model name
modelName = resnet50-qat_pytorch
# Model description
modelDescription = resnet50-qat
# Application scenarios
appScenario = training, inference, image classification, manufacturing, transportation, cybersecurity
# Framework type
frameType = pytorch
#
# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import torch
from torch import Tensor
import torch.nn as nn
from torch.hub import load_state_dict_from_url
from typing import Type, Any, Callable, Union, List, Optional
from pytorch_quantization import quant_modules
from pytorch_quantization import nn as quant_nn
__all__ = [
'ResNet', 'resnet18', 'resnet34', 'resnet50', 'resnet101', 'resnet152', 'resnext50_32x4d', 'resnext101_32x8d',
'wide_resnet50_2', 'wide_resnet101_2'
]
model_urls = {
'resnet18': 'https://download.pytorch.org/models/resnet18-5c106cde.pth',
'resnet34': 'https://download.pytorch.org/models/resnet34-333f7ec4.pth',
'resnet50': 'https://download.pytorch.org/models/resnet50-19c8e357.pth',
'resnet101': 'https://download.pytorch.org/models/resnet101-5d3b4d8f.pth',
'resnet152': 'https://download.pytorch.org/models/resnet152-b121ed2d.pth',
'resnext50_32x4d': 'https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth',
'resnext101_32x8d': 'https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth',
'wide_resnet50_2': 'https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth',
'wide_resnet101_2': 'https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth',
}
def conv3x3(in_planes: int,
out_planes: int,
stride: int = 1,
groups: int = 1,
dilation: int = 1,
quantize: bool = False) -> nn.Conv2d:
"""3x3 convolution with padding"""
if quantize:
return quant_nn.QuantConv2d(in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=dilation,
groups=groups,
bias=False,
dilation=dilation)
else:
return nn.Conv2d(in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=dilation,
groups=groups,
bias=False,
dilation=dilation)
def conv1x1(in_planes: int, out_planes: int, stride: int = 1, quantize: bool = False) -> nn.Conv2d:
"""1x1 convolution"""
if quantize:
return quant_nn.QuantConv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
else:
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
class BasicBlock(nn.Module):
expansion: int = 1
def __init__(self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None,
quantize: bool = False) -> None:
super(BasicBlock, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
if groups != 1 or base_width != 64:
raise ValueError('BasicBlock only supports groups=1 and base_width=64')
if dilation > 1:
raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
# Both self.conv1 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv3x3(inplanes, planes, stride, quantize=quantize)
self.bn1 = norm_layer(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes, quantize=quantize)
self.bn2 = norm_layer(planes)
self.downsample = downsample
self.stride = stride
self._quantize = quantize
if self._quantize:
self.residual_quantizer = quant_nn.TensorQuantizer(quant_nn.QuantConv2d.default_quant_desc_input)
def forward(self, x: Tensor) -> Tensor:
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
identity = self.downsample(x)
if self._quantize:
out += self.residual_quantizer(identity)
else:
out += identity
out = self.relu(out)
return out
class Bottleneck(nn.Module):
# Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
# while original implementation places the stride at the first 1x1 convolution(self.conv1)
# according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
# This variant is also known as ResNet V1.5 and improves accuracy according to
# https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
expansion: int = 4
def __init__(self,
inplanes: int,
planes: int,
stride: int = 1,
downsample: Optional[nn.Module] = None,
groups: int = 1,
base_width: int = 64,
dilation: int = 1,
norm_layer: Optional[Callable[..., nn.Module]] = None,
quantize: bool = False) -> None:
super(Bottleneck, self).__init__()
if norm_layer is None:
norm_layer = nn.BatchNorm2d
width = int(planes * (base_width / 64.)) * groups
# Both self.conv2 and self.downsample layers downsample the input when stride != 1
self.conv1 = conv1x1(inplanes, width, quantize=quantize)
self.bn1 = norm_layer(width)
self.conv2 = conv3x3(width, width, stride, groups, dilation, quantize=quantize)
self.bn2 = norm_layer(width)
self.conv3 = conv1x1(width, planes * self.expansion, quantize=quantize)
self.bn3 = norm_layer(planes * self.expansion)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
self._quantize = quantize
if self._quantize:
self.residual_quantizer = quant_nn.TensorQuantizer(quant_nn.QuantConv2d.default_quant_desc_input)
def forward(self, x: Tensor) -> Tensor:
identity = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
identity = self.downsample(x)
if self._quantize:
out += self.residual_quantizer(identity)
else:
out += identity
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self,
block: Type[Union[BasicBlock, Bottleneck]],
layers: List[int],
quantize: bool = False,
num_classes: int = 1000,
zero_init_residual: bool = False,
groups: int = 1,
width_per_group: int = 64,
replace_stride_with_dilation: Optional[List[bool]] = None,
norm_layer: Optional[Callable[..., nn.Module]] = None) -> None:
super(ResNet, self).__init__()
self._quantize = quantize
if norm_layer is None:
norm_layer = nn.BatchNorm2d
self._norm_layer = norm_layer
self.inplanes = 64
self.dilation = 1
if replace_stride_with_dilation is None:
# each element in the tuple indicates if we should replace
# the 2x2 stride with a dilated convolution instead
replace_stride_with_dilation = [False, False, False]
if len(replace_stride_with_dilation) != 3:
raise ValueError("replace_stride_with_dilation should be None "
"or a 3-element tuple, got {}".format(replace_stride_with_dilation))
self.groups = groups
self.base_width = width_per_group
if quantize:
self.conv1 = quant_nn.QuantConv2d(3,
self.inplanes,
kernel_size=7,
stride=2,
padding=3,
bias=False)
else:
self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
self.bn1 = norm_layer(self.inplanes)
self.relu = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0], quantize=quantize)
self.layer2 = self._make_layer(block,
128,
layers[1],
stride=2,
dilate=replace_stride_with_dilation[0],
quantize=quantize)
self.layer3 = self._make_layer(block,
256,
layers[2],
stride=2,
dilate=replace_stride_with_dilation[1],
quantize=quantize)
self.layer4 = self._make_layer(block,
512,
layers[3],
stride=2,
dilate=replace_stride_with_dilation[2],
quantize=quantize)
self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
if quantize:
self.fc = quant_nn.QuantLinear(512 * block.expansion, num_classes)
else:
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
nn.init.constant_(m.weight, 1)
nn.init.constant_(m.bias, 0)
# Zero-initialize the last BN in each residual branch,
# so that the residual branch starts with zeros, and each residual block behaves like an identity.
# This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
if zero_init_residual:
for m in self.modules():
if isinstance(m, Bottleneck):
nn.init.constant_(m.bn3.weight, 0) # type: ignore[arg-type]
elif isinstance(m, BasicBlock):
nn.init.constant_(m.bn2.weight, 0) # type: ignore[arg-type]
def _make_layer(self,
block: Type[Union[BasicBlock, Bottleneck]],
planes: int,
blocks: int,
stride: int = 1,
dilate: bool = False,
quantize: bool = False) -> nn.Sequential:
norm_layer = self._norm_layer
downsample = None
previous_dilation = self.dilation
if dilate:
self.dilation *= stride
stride = 1
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
conv1x1(self.inplanes, planes * block.expansion, stride, quantize=quantize),
norm_layer(planes * block.expansion),
)
layers = []
layers.append(
block(self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation,
norm_layer, self._quantize))
self.inplanes = planes * block.expansion
for _ in range(1, blocks):
layers.append(
block(self.inplanes,
planes,
groups=self.groups,
base_width=self.base_width,
dilation=self.dilation,
norm_layer=norm_layer,
quantize=quantize))
return nn.Sequential(*layers)
def _forward_impl(self, x: Tensor) -> Tensor:
# See note [TorchScript super()]
x = self.conv1(x)
x = self.bn1(x)
x = self.relu(x)
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.fc(x)
return x
def forward(self, x: Tensor) -> Tensor:
return self._forward_impl(x)
def _resnet(arch: str, block: Type[Union[BasicBlock, Bottleneck]], layers: List[int], pretrained: bool, progress: bool,
quantize: bool, **kwargs: Any) -> ResNet:
model = ResNet(block, layers, quantize, **kwargs)
if pretrained:
state_dict = load_state_dict_from_url(model_urls[arch], progress=progress)
model.load_state_dict(state_dict)
return model
def resnet18(pretrained: bool = False, progress: bool = True, quantize: bool = False, **kwargs: Any) -> ResNet:
r"""ResNet-18 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet18', BasicBlock, [2, 2, 2, 2], pretrained, progress, quantize, **kwargs)
def resnet34(pretrained: bool = False, progress: bool = True, quantize: bool = False, **kwargs: Any) -> ResNet:
r"""ResNet-34 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet34', BasicBlock, [3, 4, 6, 3], pretrained, progress, quantize, **kwargs)
def resnet50(pretrained: bool = False, progress: bool = True, quantize: bool = False, **kwargs: Any) -> ResNet:
r"""ResNet-50 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet50', Bottleneck, [3, 4, 6, 3], pretrained, progress, quantize, **kwargs)
def resnet101(pretrained: bool = False, progress: bool = True, quantize: bool = False, **kwargs: Any) -> ResNet:
r"""ResNet-101 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet101', Bottleneck, [3, 4, 23, 3], pretrained, progress, quantize, **kwargs)
def resnet152(pretrained: bool = False, progress: bool = True, quantize: bool = False, **kwargs: Any) -> ResNet:
r"""ResNet-152 model from
`"Deep Residual Learning for Image Recognition" <https://arxiv.org/pdf/1512.03385.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
return _resnet('resnet152', Bottleneck, [3, 8, 36, 3], pretrained, progress, quantize, **kwargs)
def resnext50_32x4d(pretrained: bool = False, progress: bool = True, quantize: bool = False, **kwargs: Any) -> ResNet:
r"""ResNeXt-50 32x4d model from
`"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
kwargs['groups'] = 32
kwargs['width_per_group'] = 4
return _resnet('resnext50_32x4d', Bottleneck, [3, 4, 6, 3], pretrained, progress, quantize, **kwargs)
def resnext101_32x8d(pretrained: bool = False, progress: bool = True, quantize: bool = False, **kwargs: Any) -> ResNet:
r"""ResNeXt-101 32x8d model from
`"Aggregated Residual Transformation for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`_.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
kwargs['groups'] = 32
kwargs['width_per_group'] = 8
return _resnet('resnext101_32x8d', Bottleneck, [3, 4, 23, 3], pretrained, progress, quantize, **kwargs)
def wide_resnet50_2(pretrained: bool = False, progress: bool = True, quantize: bool = False, **kwargs: Any) -> ResNet:
r"""Wide ResNet-50-2 model from
`"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_.
The model is the same as ResNet except for the bottleneck number of channels
which is twice larger in every block. The number of channels in outer 1x1
convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
channels, and in Wide ResNet-50-2 has 2048-1024-2048.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
kwargs['width_per_group'] = 64 * 2
return _resnet('wide_resnet50_2', Bottleneck, [3, 4, 6, 3], pretrained, progress, quantize, **kwargs)
def wide_resnet101_2(pretrained: bool = False, progress: bool = True, quantize: bool = False, **kwargs: Any) -> ResNet:
r"""Wide ResNet-101-2 model from
`"Wide Residual Networks" <https://arxiv.org/pdf/1605.07146.pdf>`_.
The model is the same as ResNet except for the bottleneck number of channels
which is twice larger in every block. The number of channels in outer 1x1
convolutions is the same, e.g. last block in ResNet-50 has 2048-512-2048
channels, and in Wide ResNet-50-2 has 2048-1024-2048.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
progress (bool): If True, displays a progress bar of the download to stderr
"""
kwargs['width_per_group'] = 64 * 2
return _resnet('wide_resnet101_2', Bottleneck, [3, 4, 23, 3], pretrained, progress, quantize, **kwargs)
import torchvision.transforms.transforms as T
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
def prepare_dataloader(data_root,
train=True,
batch_size = 512):
if train:
train_transform = T.Compose([
T.Resize((224, 224)),
T.RandomHorizontalFlip(p=0.5),
T.ToTensor(),
T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])
train_dataset = CIFAR10(data_root, train=True, transform=train_transform, download=True)
sampler = DistributedSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, shuffle=False, batch_size=batch_size, sampler=sampler)
return train_dataloader, sampler
else:
test_transform = T.Compose([
T.Resize((224, 224)),
T.ToTensor(),
T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
])
test_dataset = CIFAR10(data_root, train=False, transform=test_transform, download=True)
        test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=16)  # fixed at 16 to match the exported engine's batch dimension
return test_dataloader, None
from pytorch_quantization import nn as quant_nn
from pytorch_quantization import quant_modules
from pytorch_quantization import calib
from tqdm import tqdm
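# Runs a few batches through the model with quantization disabled and calibration enabled,
# so each TensorQuantizer records activation statistics.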
def collect_stats(model, data_loader, num_batches, device):
# Enable calibrators
for name, module in model.named_modules():
if isinstance(module, quant_nn.TensorQuantizer):
if module._calibrator is not None:
module.disable_quant()
module.enable_calib()
else:
module.disable()
for i, (image, _) in tqdm(enumerate(data_loader), total=num_batches):
model(image.to(device))
if i >= num_batches:
break
# Disable calibrators
for name, module in model.named_modules():
if isinstance(module, quant_nn.TensorQuantizer):
if module._calibrator is not None:
module.enable_quant()
module.disable_calib()
else:
module.enable()
def compute_amax(model, device, **kwargs):
# Load calib result
for name, module in model.named_modules():
if isinstance(module, quant_nn.TensorQuantizer):
if module._calibrator is not None:
if isinstance(module._calibrator, calib.MaxCalibrator):
module.load_calib_amax()
else:
module.load_calib_amax(**kwargs)
model.to(device)
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt
import numpy as np
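# Pairs a pagelocked host buffer with its device allocation so asynchronous H2D/D2H copies stay valid.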
class HostDeviceMem(object):
def __init__(self, host_mem, device_mem):
self.host = host_mem
self.device = device_mem
def __str__(self):
return "Host:\n" + str(self.host) + "\nDevice:\n" + str(self.device)
def __repr__(self):
return self.__str__()
class TrtModel:
def __init__(self,
engine_path,
max_batch_size=1,
dtype=np.float32):
self.engine_path = engine_path
self.dtype = dtype
self.logger = trt.Logger(trt.Logger.ERROR)
self.runtime = trt.Runtime(self.logger)
self.engine = self.load_engine(self.runtime, self.engine_path)
self.max_batch_size = max_batch_size
self.inputs, self.outputs, self.bindings, self.stream = self.allocate_buffers()
self.context = self.engine.create_execution_context()
@staticmethod
def load_engine(trt_runtime, engine_path):
trt.init_libnvinfer_plugins(None, "")
with open(engine_path, 'rb') as f:
engine_data = f.read()
engine = trt_runtime.deserialize_cuda_engine(engine_data)
return engine
def allocate_buffers(self):
inputs = []
outputs = []
bindings = []
stream = cuda.Stream()
for binding in self.engine:
size = trt.volume(self.engine.get_binding_shape(binding)) * self.max_batch_size
host_mem = cuda.pagelocked_empty(size, self.dtype)
device_mem = cuda.mem_alloc(host_mem.nbytes)
bindings.append(int(device_mem))
if self.engine.binding_is_input(binding):
inputs.append(HostDeviceMem(host_mem, device_mem))
else:
outputs.append(HostDeviceMem(host_mem, device_mem))
return inputs, outputs, bindings, stream
def __call__(self,
x,
batch_size=2):
        # Copy the batch into pagelocked host memory, then into device memory.
        x = x.astype(self.dtype)
        np.copyto(self.inputs[0].host, x.ravel())
        for inp in self.inputs:
            cuda.memcpy_htod_async(inp.device, inp.host, self.stream)
        # Engines built from ONNX use explicit batch, so execute_async_v2 (which takes no batch_size argument) is required.
        self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
for out in self.outputs:
cuda.memcpy_dtoh_async(out.host, out.device, self.stream)
self.stream.synchronize()
return [out.host.reshape(batch_size,-1) for out in self.outputs]