# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from __future__ import absolute_import from __future__ import division from __future__ import print_function import numpy as np import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register from ..bbox_utils import bbox_iou __all__ = ['YOLOv5Loss'] @register class YOLOv5Loss(nn.Layer): __shared__ = ['num_classes'] def __init__(self, num_classes=80, downsample_ratios=[8, 16, 32], balance=[4.0, 1.0, 0.4], box_weight=0.05, obj_weight=1.0, cls_weght=0.5, bias=0.5, anchor_t=4.0, label_smooth_eps=0.): super(YOLOv5Loss, self).__init__() self.num_classes = num_classes self.balance = balance self.na = 3 # not len(anchors) self.gr = 1.0 self.BCEcls = nn.BCEWithLogitsLoss(reduction="mean") self.BCEobj = nn.BCEWithLogitsLoss(reduction="mean") self.loss_weights = { 'box': box_weight, 'obj': obj_weight, 'cls': cls_weght, } eps = label_smooth_eps if label_smooth_eps > 0 else 0. self.cls_pos_label = 1.0 - 0.5 * eps self.cls_neg_label = 0.5 * eps self.downsample_ratios = downsample_ratios self.bias = bias # named 'g' in torch yolov5 self.off = np.array( [ [0, 0], [1, 0], [0, 1], [-1, 0], [0, -1], # j,k,l,m ], dtype=np.float32) * bias # offsets self.anchor_t = anchor_t self.to_static = False def build_targets(self, outputs, targets, anchors): if 0: # collate_batch True # targets['gt_class'] [bs, max_gt_nums, 1] # targets['gt_bbox'] [bs, max_gt_nums, 4] # targets['pad_gt_mask'] [bs, max_gt_nums, 1] gt_nums = targets['pad_gt_mask'].sum(1).squeeze(-1).numpy() nt = int(sum(gt_nums)) anchors = anchors.numpy() na = anchors.shape[1] # not len(anchors) tcls, tbox, indices, anch = [], [], [], [] gain = np.ones(7, dtype=np.float32) # normalized to gridspace gain ai = np.tile( np.arange( na, dtype=np.float32).reshape(na, 1), [1, nt]) batch_size = outputs[0].shape[0] gt_labels = [] for idx in range(batch_size): gt_num = int(gt_nums[idx]) if gt_num == 0: continue gt_bbox = targets['gt_bbox'][idx][:gt_num].numpy() gt_class = targets['gt_class'][idx][:gt_num].numpy() * 1.0 img_idx = np.repeat(np.array([[idx]]), gt_num, axis=0) gt_labels.append( np.concatenate((img_idx, gt_class, gt_bbox), -1)) else: gt_nums = [len(bbox) for bbox in targets['gt_bbox']] nt = int(sum(gt_nums)) anchors = anchors.numpy() na = anchors.shape[1] # not len(anchors) tcls, tbox, indices, anch = [], [], [], [] gain = np.ones(7, dtype=np.float32) # normalized to gridspace gain ai = np.tile( np.arange( na, dtype=np.float32).reshape(na, 1), [1, nt]) batch_size = outputs[0].shape[0] gt_labels = [] for idx in range(batch_size): gt_num = gt_nums[idx] if gt_num == 0: continue gt_bbox = targets['gt_bbox'][idx][:gt_num] gt_class = targets['gt_class'][idx][:gt_num] * 1.0 img_idx = np.repeat(np.array([[idx]]), gt_num, axis=0) gt_labels.append( np.concatenate((img_idx, gt_class, gt_bbox), -1)) if (len(gt_labels)): gt_labels = np.concatenate(gt_labels) else: gt_labels = np.zeros([0, 6]) targets_labels = np.concatenate((np.tile( np.expand_dims(gt_labels, 0), [na, 1, 1]), ai[:, :, None]), 2) g = self.bias # 0.5 for i in range(len(anchors)): anchor = np.array(anchors[i]) / self.downsample_ratios[i] gain[2:6] = np.array( outputs[i].shape, dtype=np.float32)[[3, 2, 3, 2]] # xyxy gain # Match targets_labels to t = targets_labels * gain if nt: # Matches r = t[:, :, 4:6] / anchor[:, None] j = np.maximum(r, 1 / r).max(2) < self.anchor_t t = t[j] # filter # Offsets gxy = t[:, 2:4] # grid xy gxi = gain[[2, 3]] - gxy # inverse j, k = ((gxy % 1 < g) & (gxy > 1)).T l, m = ((gxi % 1 < g) & (gxi > 1)).T j = np.stack((np.ones_like(j), j, k, l, m)) t = np.tile(t, [5, 1, 1])[j] offsets = (np.zeros_like(gxy)[None] + self.off[:, None])[j] else: t = targets_labels[0] offsets = 0 # Define b, c = t[:, :2].astype(np.int64).T # image, class gxy = t[:, 2:4] # grid xy gwh = t[:, 4:6] # grid wh gij = (gxy - offsets).astype(np.int64) gi, gj = gij.T # grid xy indices # Append a = t[:, 6].astype(np.int64) # anchor indices gj, gi = gj.clip(0, gain[3] - 1), gi.clip(0, gain[2] - 1) indices.append( (paddle.to_tensor(b), paddle.to_tensor(a), paddle.to_tensor(gj, 'int64'), paddle.to_tensor(gi, 'int64'))) tbox.append( paddle.to_tensor( np.concatenate((gxy - gij, gwh), 1), dtype=paddle.float32)) anch.append(paddle.to_tensor(anchor[a])) tcls.append(paddle.to_tensor(c)) return tcls, tbox, indices, anch def yolov5_loss(self, pi, t_cls, t_box, t_indices, t_anchor, balance): loss = dict() b, a, gj, gi = t_indices # image, anchor, gridy, gridx n = b.shape[0] # number of targets tobj = paddle.zeros_like(pi[:, :, :, :, 4]) loss_box = paddle.to_tensor([0.]) loss_cls = paddle.to_tensor([0.]) if n: mask = paddle.stack([b, a, gj, gi], 1) ps = pi.gather_nd(mask) # Regression pxy = F.sigmoid(ps[:, :2]) * 2 - 0.5 pwh = (F.sigmoid(ps[:, 2:4]) * 2)**2 * t_anchor pbox = paddle.concat((pxy, pwh), 1) iou = bbox_iou(pbox.T, t_box.T, x1y1x2y2=False, ciou=True) loss_box = (1.0 - iou).mean() # Objectness score_iou = paddle.cast(iou.detach().clip(0), tobj.dtype) # with paddle.no_grad(): # x = paddle.gather_nd(tobj, mask) # tobj = paddle.scatter_nd_add( # tobj, mask, (1.0 - self.gr) + self.gr * score_iou - x) with paddle.no_grad(): tobj[b, a, gj, gi] = (1.0 - self.gr ) + self.gr * score_iou # iou ratio # Classification if self.num_classes > 1: # cls loss (only if multiple classes) # t = paddle.full_like(ps[:, 5:], self.cls_neg_label) # t[range(n), t_cls] = self.cls_pos_label # loss_cls = self.BCEcls(ps[:, 5:], t) t = paddle.full_like(ps[:, 5:], self.cls_neg_label) if not self.to_static: t = paddle.put_along_axis( t, t_cls.unsqueeze(-1), values=self.cls_pos_label, axis=1) else: for i in range(n): t[i, t_cls[i]] = self.cls_pos_label loss_cls = self.BCEcls(ps[:, 5:], t) obji = self.BCEobj(pi[:, :, :, :, 4], tobj) # [bs, 3, h, w] loss_obj = obji * balance loss['loss_box'] = loss_box * self.loss_weights['box'] loss['loss_obj'] = loss_obj * self.loss_weights['obj'] loss['loss_cls'] = loss_cls * self.loss_weights['cls'] return loss def forward(self, inputs, targets, anchors): yolo_losses = dict() if not self.to_static: tcls, tbox, indices, anch = self.build_targets(inputs, targets, anchors) else: tcls, tbox, indices, anch = self.build_targets_paddle( inputs, targets, anchors) for i, (p_det, balance) in enumerate(zip(inputs, self.balance)): t_cls = tcls[i] t_box = tbox[i] t_anchor = anch[i] t_indices = indices[i] bs, ch, h, w = p_det.shape pi = p_det.reshape( (bs, self.na, int(ch / self.na), h, w)).transpose( (0, 1, 3, 4, 2)) yolo_loss = self.yolov5_loss(pi, t_cls, t_box, t_indices, t_anchor, balance) for k, v in yolo_loss.items(): if k in yolo_losses: yolo_losses[k] += v else: yolo_losses[k] = v batch_size = inputs[0].shape[0] num_gpus = targets.get('num_gpus', 8) loss = 0 for k, v in yolo_losses.items(): yolo_losses[k] = v * batch_size * num_gpus loss += yolo_losses[k] yolo_losses['loss'] = loss return yolo_losses def build_targets_paddle(self, outputs, targets, anchors): # targets['gt_class'] [bs, max_gt_nums, 1] # targets['gt_bbox'] [bs, max_gt_nums, 4] # targets['pad_gt_mask'] [bs, max_gt_nums, 1] gt_nums = [len(bbox) for bbox in targets['gt_bbox']] nt = int(sum(gt_nums)) anchors = anchors na = anchors.shape[1] # not len(anchors) tcls, tbox, indices, anch = [], [], [], [] gain = paddle.ones( [7], dtype=paddle.float32) # normalized to gridspace gain ai = paddle.tile( paddle.arange( na, dtype=paddle.float32).reshape([na, 1]), [1, nt]) batch_size = outputs[0].shape[0] gt_labels = [] for i, ( gt_num, gt_bboxs, gt_classes ) in enumerate(zip(gt_nums, targets['gt_bbox'], targets['gt_class'])): if gt_num == 0: continue gt_bbox = gt_bboxs[:gt_num].astype('float32') gt_class = (gt_classes[:gt_num] * 1.0).astype('float32') img_idx = paddle.repeat_interleave( paddle.to_tensor([i]), gt_num, axis=0)[None, :].astype('float32').T gt_labels.append( paddle.concat( (img_idx, gt_class, gt_bbox), axis=-1)) if (len(gt_labels)): gt_labels = paddle.concat(gt_labels) else: gt_labels = paddle.zeros([0, 6], dtype=paddle.float32) targets_labels = paddle.concat((paddle.tile( paddle.unsqueeze(gt_labels, 0), [na, 1, 1]), ai[:, :, None]), 2) g = self.bias # 0.5 for i in range(len(anchors)): anchor = anchors[i] / self.downsample_ratios[i] gain[2:6] = paddle.to_tensor( outputs[i].shape, dtype=paddle.float32)[[3, 2, 3, 2]] # xyxy gain # Match targets_labels to t = targets_labels * gain if nt: # Matches r = t[:, :, 4:6] / anchor[:, None] j = paddle.maximum(r, 1 / r).max(2) < self.anchor_t t = paddle.flatten(t, 0, 1) j = paddle.flatten(j.astype(paddle.int32), 0, 1).astype(paddle.bool) t = t[j] # filter # Offsets gxy = t[:, 2:4] # grid xy gxi = gain[[2, 3]] - gxy # inverse j, k = ((gxy % 1 < g) & (gxy > 1)).T.astype(paddle.int64) l, m = ((gxi % 1 < g) & (gxi > 1)).T.astype(paddle.int64) j = paddle.flatten( paddle.stack((paddle.ones_like(j), j, k, l, m)), 0, 1).astype(paddle.bool) t = paddle.flatten(paddle.tile(t, [5, 1, 1]), 0, 1) t = t[j] offsets = paddle.zeros_like(gxy)[None, :] + paddle.to_tensor( self.off)[:, None] offsets = paddle.flatten(offsets, 0, 1)[j] else: t = targets_labels[0] offsets = 0 # Define b, c = t[:, :2].astype(paddle.int64).T # image, class gxy = t[:, 2:4] # grid xy gwh = t[:, 4:6] # grid wh gij = (gxy - offsets).astype(paddle.int64) gi, gj = gij.T # grid xy indices # Append a = t[:, 6].astype(paddle.int64) # anchor indices gj, gi = gj.clip(0, gain[3] - 1), gi.clip(0, gain[2] - 1) indices.append( (b, a, gj.astype(paddle.int64), gi.astype(paddle.int64))) tbox.append( paddle.concat((gxy - gij, gwh), 1).astype(paddle.float32)) anch.append(anchor[a]) tcls.append(c) return tcls, tbox, indices, anch