# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ..bbox_utils import bbox_iou

__all__ = ['YOLOv7Loss']


@register
class YOLOv7Loss(nn.Layer):
    """
    YOLOv7 training loss: CIoU box loss + objectness BCE + classification BCE,
    with a dynamic-k, cost-based label assignment and optional extra terms
    for auxiliary heads (weighted by 0.25).

    this code is based on https://github.com/WongKinYiu/yolov7
    """
    __shared__ = ['num_classes', 'use_aux']

    # Unit grid offsets: the cell itself followed by the four directions
    # j, k, l, m used when picking neighbouring cells as extra positives.
    # They are scaled by the per-method bias `g` before use.
    _UNIT_OFFSETS = ((0, 0), (1, 0), (0, 1), (-1, 0), (0, -1))

    def __init__(self,
                 num_classes=80,
                 downsample_ratios=[8, 16, 32],
                 balance=[4.0, 1.0, 0.4],
                 box_weight=0.05,
                 cls_weght=0.3,
                 obj_weight=0.7,
                 bias=0.5,
                 anchor_t=4.0,
                 label_smooth_eps=0.,
                 use_aux=False):
        """
        Args:
            num_classes (int): number of classes; the classification term is
                only computed when this is greater than 1.
            downsample_ratios (list): stride of every output level. Repeated
                once more when `use_aux` is True.
            balance (list): per-level weight of the objectness loss. Repeated
                once more when `use_aux` is True.
            box_weight (float): weight of the box loss.
            cls_weght (float): weight of the classification loss (the
                parameter name is kept as is, callers/configs refer to it).
            obj_weight (float): weight of the objectness loss.
            bias (float): offset bias ('g' in torch yolov5/yolov7) used by
                `find_3_positive`.
            anchor_t (float): upper bound of the target/anchor size ratio for
                a target to be matched to an anchor.
            label_smooth_eps (float): label smoothing epsilon; non-positive
                values disable smoothing.
            use_aux (bool): whether `head_outs` also carries auxiliary heads.
        """
        super(YOLOv7Loss, self).__init__()
        self.num_classes = num_classes
        self.use_aux = use_aux
        # Copy the (possibly shared default) lists; with aux heads the lists
        # are doubled, exactly as `balance * 2` did before.
        self.balance = list(balance) * 2 if use_aux else list(balance)
        self.downsample_ratios = (list(downsample_ratios) * 2
                                  if use_aux else list(downsample_ratios))
        self.na = 3  # len(anchors[0]) not len(anchors)
        self.gr = 1.0

        self.BCEcls = nn.BCEWithLogitsLoss(
            pos_weight=paddle.to_tensor([1.0]), reduction="mean")
        self.BCEobj = nn.BCEWithLogitsLoss(
            pos_weight=paddle.to_tensor([1.0]), reduction="mean")

        self.loss_weights = {
            'box': box_weight,
            'obj': obj_weight,
            'cls': cls_weght,
        }

        eps = label_smooth_eps if label_smooth_eps > 0 else 0.
        self.cls_pos_label = 1.0 - 0.5 * eps
        self.cls_neg_label = 0.5 * eps

        self.bias = bias  # named 'g' in torch yolov5/yolov7
        # Kept as a public attribute: unit offsets scaled by `bias`.
        self.off = np.array(self._UNIT_OFFSETS, dtype=np.float32) * bias
        self.anchor_t = anchor_t

    def forward(self, head_outs, gt_targets, anchors):
        """
        Compute the YOLOv7 losses of one batch.

        Args:
            head_outs (list): head feature maps, each [b, c, h, w]. When
                `use_aux` is True the auxiliary maps follow the lead maps,
                i.e. `head_outs[i + nl]` is the aux map of level i.
            gt_targets (dict): reads 'gt_bbox' and 'gt_class' (one entry per
                image), 'image' (only its shape is used) and the optional
                'num_gpus' (defaults to 8).
                NOTE(review): boxes are presumably normalized
                [cx, cy, w, h] - confirm against the data pipeline.
            anchors: per-level anchors; must provide `.numpy()`.

        Returns:
            dict: 'loss_box', 'loss_cls', 'loss_obj' and their sum 'loss'.
        """
        self.nl = len(anchors)

        # 1.split head_outs feature from [b,c,h,w] to [b,na,h,w,c//na]
        num_maps = self.nl * 2 if self.use_aux else self.nl
        inputs = []
        for i in range(num_maps):
            feat = head_outs[i]
            n_img, _, h, w = feat.shape
            inputs.append(
                feat.reshape((n_img, self.na, -1, h, w)).transpose(
                    (0, 1, 3, 4, 2)))

        # 2.generate targets_labels [nt, 6] from gt_targets(dict)
        anchors = anchors.numpy()
        targets_labels = self._collect_targets(gt_targets,
                                               head_outs[0].shape[0])

        # 3.build targets (aux assignment first, it needs the raw anchors)
        batch_images = gt_targets['image']  # just get shape
        lead_inputs = inputs[:self.nl]
        if self.use_aux:
            (bs_aux, as_aux_, gjs_aux, gis_aux, targets_aux,
             anchors_aux) = self.build_targets2(lead_inputs, targets_labels,
                                                anchors, batch_images)
        bs, as_, gjs, gis, targets, anchors = self.build_targets(
            lead_inputs, targets_labels, anchors, batch_images)
        # [w, h, w, h] of every level, used to scale boxes to grid units.
        # Lead and aux maps share these gains.
        pre_gen_gains = [
            paddle.to_tensor(pp.shape, 'float32')[[3, 2, 3, 2]]
            for pp in lead_inputs
        ]

        # Losses
        lcls, lbox = paddle.zeros([1]), paddle.zeros([1])
        lobj = paddle.zeros([1])  # single class will always be tensor([0.])
        for i in range(self.nl):
            pi = inputs[i]
            box_i, cls_i, tobj = self._head_loss(
                pi, bs[i], as_[i], gjs[i], gis[i], anchors[i], targets[i],
                pre_gen_gains[i], eps=1e-7)
            if box_i is not None:
                lbox += box_i
            if cls_i is not None:
                lcls += cls_i

            if self.use_aux:
                pi_aux = inputs[i + self.nl]
                box_aux, cls_aux, tobj_aux = self._head_loss(
                    pi_aux, bs_aux[i], as_aux_[i], gjs_aux[i], gis_aux[i],
                    anchors_aux[i], targets_aux[i], pre_gen_gains[i])
                if box_aux is not None:
                    lbox += 0.25 * box_aux
                if cls_aux is not None:
                    lcls += 0.25 * cls_aux

            obji = self.BCEobj(pi[:, :, :, :, 4], tobj)
            lobj += obji * self.balance[i]  # obj loss
            if self.use_aux:
                obji_aux = self.BCEobj(pi_aux[:, :, :, :, 4], tobj_aux)
                lobj += 0.25 * obji_aux * self.balance[i]  # obj_aux loss

        batch_size = head_outs[0].shape[0]
        num_gpus = gt_targets.get('num_gpus', 8)
        yolo_losses = dict()
        yolo_losses['loss_box'] = lbox * self.loss_weights[
            'box'] * batch_size * num_gpus
        yolo_losses['loss_cls'] = lcls * self.loss_weights[
            'cls'] * batch_size * num_gpus
        yolo_losses['loss_obj'] = lobj * self.loss_weights[
            'obj'] * batch_size * num_gpus
        loss_all = yolo_losses['loss_box'] + yolo_losses[
            'loss_obj'] + yolo_losses['loss_cls']
        yolo_losses['loss'] = loss_all
        return yolo_losses

    @staticmethod
    def _collect_targets(gt_targets, batch_size):
        """Stack per-image ground truths into one [nt, 6] numpy array with
        rows (image index, class, box[4]); returns shape [0, 6] if empty."""
        gt_nums = [len(bbox) for bbox in gt_targets['gt_bbox']]
        rows = []
        for idx in range(batch_size):
            gt_num = int(gt_nums[idx])
            if gt_num == 0:
                continue
            gt_bbox = gt_targets['gt_bbox'][idx][:gt_num].reshape([-1, 4])
            gt_class = gt_targets['gt_class'][idx][:gt_num].reshape(
                [-1, 1]) * 1.0
            img_idx = np.repeat(np.array([[idx]]), gt_num, axis=0)
            rows.append(np.concatenate((img_idx, gt_class, gt_bbox), -1))
        if rows:
            return np.concatenate(rows)
        return np.zeros([0, 6])

    def _head_loss(self, pi, b, a, gj, gi, anchor, target, gain,
                   **iou_kwargs):
        """Box/cls loss and objectness target of one feature map.

        Returns (box_loss, cls_loss, tobj). `box_loss` is None when the map
        has no assigned target; `cls_loss` is None in that case too, or when
        `num_classes` <= 1. Extra keyword arguments go to `bbox_iou`.
        """
        tobj = paddle.zeros_like(pi[..., 0])
        n = b.shape[0]  # number of targets
        if not n:
            return None, None, tobj

        ps = pi[b, a, gj, gi]  # numpy index
        if len(ps.shape) == 1:  # Note: when only one sample
            ps = ps.unsqueeze(0)

        # Regression: decode predictions relative to their grid cell and
        # move the targets into the same cell-relative grid units.
        tensor_grid = paddle.to_tensor(np.stack([gi, gj], 1), 'float32')
        tensor_anch = paddle.to_tensor(anchor, 'float32')
        tensor_box = paddle.to_tensor(target[:, 2:6], 'float32')
        pxy = F.sigmoid(ps[:, :2]) * 2. - 0.5
        pwh = (F.sigmoid(ps[:, 2:4]) * 2)**2 * tensor_anch
        pbox = paddle.concat([pxy, pwh], 1)  # predicted box
        selected_tbox = tensor_box * gain
        selected_tbox[:, :2] -= tensor_grid
        iou = bbox_iou(
            pbox.T, selected_tbox.T, x1y1x2y2=False, ciou=True, **iou_kwargs)
        box_loss = (1.0 - iou).mean()

        # Objectness target: the (detached, non-negative) IoU blended by gr
        score_iou = paddle.cast(iou.detach().clip(0), tobj.dtype)
        with paddle.no_grad():
            # numpy index
            tobj[b, a, gj, gi] = (1.0 - self.gr) + self.gr * score_iou

        # Classification
        cls_loss = None
        if self.num_classes > 1:  # cls loss (only if multiple classes)
            selected_tcls = target[:, 1].astype(np.int64)
            t = paddle.full_like(ps[:, 5:], self.cls_neg_label)
            t[range(n), selected_tcls] = self.cls_pos_label
            cls_loss = self.BCEcls(ps[:, 5:], t)
        return box_loss, cls_loss, tobj

    def build_targets(self, p, targets, anchors, batch_images):
        """
        Label assignment for the lead heads: candidates come from
        `find_3_positive`, IoUs from `box_iou_normalization`, top-10 IoUs
        define the dynamic k.

        Returns six lists with one numpy array per level: image indices,
        anchor indices, grid rows, grid columns, matched targets, anchors.
        Levels without any match hold empty arrays.
        """
        indices, anch = self.find_3_positive(p, targets, anchors)
        # numpy indices,anch for fast assign
        return self._assign_targets(
            p,
            targets,
            indices,
            anch,
            batch_images,
            normalize_iou=True,
            iou_log_eps=1e-5,
            min_topk=10)

    def build_targets2(self, p, targets, anchors, batch_images):
        """
        Label assignment for the auxiliary heads: candidates come from
        `find_5_positive`, IoUs from `box_iou`, top-20 IoUs define the
        dynamic k. Same return layout as `build_targets`.
        """
        indices, anch = self.find_5_positive(p, targets, anchors)
        # numpy indices,anch for fast assign
        return self._assign_targets(
            p,
            targets,
            indices,
            anch,
            batch_images,
            normalize_iou=False,
            iou_log_eps=1e-8,
            min_topk=20)  # diff, 10 in build_targets()

    def _assign_targets(self, p, targets, indices, anch, batch_images,
                        normalize_iou, iou_log_eps, min_topk):
        """Per image, pick for every ground truth its dynamic-k cheapest
        candidates (cost = cls loss + 3 * IoU loss) and resolve candidates
        claimed by several ground truths in favour of the cheapest one."""
        nl = len(p)
        matching_bs = [[] for _ in p]
        matching_as = [[] for _ in p]
        matching_gjs = [[] for _ in p]
        matching_gis = [[] for _ in p]
        matching_targets = [[] for _ in p]
        matching_anchs = [[] for _ in p]

        for batch_idx in range(p[0].shape[0]):
            b_idx = targets[:, 0] == batch_idx
            if b_idx.sum() == 0:
                continue
            this_target = targets[b_idx]
            # scale target boxes by the image size (dimension 1 of the image)
            txywh = this_target[:, 2:6] * batch_images[batch_idx].shape[1]
            txyxy = xywh2xyxy(paddle.to_tensor(txywh, 'float32'))  # tensor op

            pxyxys, p_cls, p_obj = [], [], []
            from_which_layer = []
            all_b, all_a, all_gj, all_gi = [], [], [], []
            all_anch = []

            for i, pi in enumerate(p):
                b, a, gj, gi = indices[i]
                idx = (b == batch_idx)
                if idx.sum() == 0:
                    continue
                b, a, gj, gi = b[idx], a[idx], gj[idx], gi[idx]
                all_b.append(b)
                all_a.append(a)
                all_gj.append(gj)
                all_gi.append(gi)
                all_anch.append(anch[i][idx])
                from_which_layer.append(np.ones([len(b)]) * i)

                fg_pred = pi[b, a, gj, gi]  # numpy index
                if len(fg_pred.shape) == 1:  # Note: when only one sample
                    fg_pred = fg_pred.unsqueeze(0)
                p_obj.append(fg_pred[:, 4:5])
                p_cls.append(fg_pred[:, 5:])

                # decode candidate boxes to image scale
                tensor_grid = paddle.to_tensor(np.stack([gi, gj], 1), 'float32')
                pxy = (F.sigmoid(fg_pred[:, :2]) * 2. - 0.5 + tensor_grid
                       ) * self.downsample_ratios[i]
                tensor_anch = paddle.to_tensor(anch[i][idx], 'float32')
                pwh = (F.sigmoid(fg_pred[:, 2:4]) *
                       2)**2 * tensor_anch * self.downsample_ratios[i]
                pxywh = paddle.concat([pxy, pwh], -1)
                pxyxys.append(xywh2xyxy(pxywh))  # tensor op

            if not pxyxys:  # Note: no candidate on any level
                continue
            pxyxys = paddle.concat(pxyxys, 0)
            p_obj = paddle.concat(p_obj, 0)
            p_cls = paddle.concat(p_cls, 0)

            from_which_layer = np.concatenate(from_which_layer, 0)
            all_b = np.concatenate(all_b, 0)
            all_a = np.concatenate(all_a, 0)
            all_gj = np.concatenate(all_gj, 0)
            all_gi = np.concatenate(all_gi, 0)
            all_anch = np.concatenate(all_anch, 0)

            # [N, 4] [M, 4] to get [N, M] ious
            if normalize_iou:
                _, h, w = batch_images[batch_idx].shape
                pairwise_ious = box_iou_normalization(txyxy, pxyxys, h,
                                                      w)  # tensor op
            else:
                pairwise_ious = box_iou(txyxy, pxyxys)  # tensor op

            pairwise_iou_loss = -paddle.log(pairwise_ious + iou_log_eps)

            # dynamic k of a ground truth: integer part of the sum of its
            # top IoUs, at least 1
            topk_ious, _ = paddle.topk(pairwise_ious,
                                       min(min_topk, pairwise_ious.shape[1]), 1)
            dynamic_ks = paddle.clip(topk_ious.sum(1).cast('int'), min=1)

            gt_cls_per_image = (paddle.tile(
                F.one_hot(
                    paddle.to_tensor(this_target[:, 1], 'int64'),
                    self.num_classes).unsqueeze(1), [1, pxyxys.shape[0], 1]))

            num_gt = this_target.shape[0]
            cls_preds_ = (
                F.sigmoid(paddle.tile(p_cls.unsqueeze(0), [num_gt, 1, 1])) *
                F.sigmoid(paddle.tile(p_obj.unsqueeze(0), [num_gt, 1, 1])))

            y = cls_preds_.sqrt_()
            pairwise_cls_loss = F.binary_cross_entropy_with_logits(
                paddle.log(y / (1 - y) + 1e-5),
                gt_cls_per_image,
                reduction="none").sum(-1)
            del cls_preds_

            cost = (pairwise_cls_loss + 3.0 * pairwise_iou_loss)

            matching_matrix = np.zeros(cost.shape)
            for gt_idx in range(num_gt):
                _, pos_idx = paddle.topk(
                    cost[gt_idx], k=dynamic_ks[gt_idx].item(), largest=False)
                matching_matrix[gt_idx, pos_idx.numpy()] = 1.0
            del topk_ious, dynamic_ks, pos_idx

            # a candidate claimed by several ground truths keeps only the
            # one with the lowest cost
            anchor_matching_gt = matching_matrix.sum(0)
            if (anchor_matching_gt > 1).sum() > 0:
                cost_argmin = np.argmin(cost.numpy()[:, anchor_matching_gt > 1],
                                        0)
                matching_matrix[:, anchor_matching_gt > 1] *= 0.0
                matching_matrix[cost_argmin, anchor_matching_gt > 1] = 1.0
            fg_mask_inboxes = matching_matrix.sum(0) > 0.0
            matched_gt_inds = matching_matrix[:, fg_mask_inboxes].argmax(0)

            from_which_layer = from_which_layer[fg_mask_inboxes]
            all_b = all_b[fg_mask_inboxes]
            all_a = all_a[fg_mask_inboxes]
            all_gj = all_gj[fg_mask_inboxes]
            all_gi = all_gi[fg_mask_inboxes]
            all_anch = all_anch[fg_mask_inboxes]

            this_target = this_target[matched_gt_inds]

            for i in range(nl):
                layer_idx = from_which_layer == i
                matching_bs[i].append(all_b[layer_idx])
                matching_as[i].append(all_a[layer_idx])
                matching_gjs[i].append(all_gj[layer_idx])
                matching_gis[i].append(all_gi[layer_idx])
                matching_targets[i].append(
                    this_target[layer_idx])  # this_ not all_
                matching_anchs[i].append(all_anch[layer_idx])

        for i in range(nl):
            if matching_targets[i]:
                matching_bs[i] = np.concatenate(matching_bs[i], 0)
                matching_as[i] = np.concatenate(matching_as[i], 0)
                matching_gjs[i] = np.concatenate(matching_gjs[i], 0)
                matching_gis[i] = np.concatenate(matching_gis[i], 0)
                matching_targets[i] = np.concatenate(matching_targets[i], 0)
                matching_anchs[i] = np.concatenate(matching_anchs[i], 0)
            else:
                # Nothing matched in the whole batch (e.g. no ground truth):
                # hand back empty arrays so that callers can still read
                # `.shape[0]` instead of failing on a plain list.
                matching_bs[i] = np.zeros([0], dtype=np.int64)
                matching_as[i] = np.zeros([0], dtype=np.int64)
                matching_gjs[i] = np.zeros([0], dtype=np.int64)
                matching_gis[i] = np.zeros([0], dtype=np.int64)
                matching_targets[i] = np.zeros(
                    [0, targets.shape[1]], dtype=targets.dtype)
                matching_anchs[i] = np.zeros([0, 2], dtype=np.float32)

        return matching_bs, matching_as, matching_gjs, matching_gis, matching_targets, matching_anchs

    def find_3_positive(self, outputs, targets, all_anchors):
        """
        Candidate positives for the lead heads: the cell holding the target
        centre plus the neighbouring cells selected with bias `self.bias`.

        Returns:
            indices (list): per level a tuple (b, a, gj, gi) of int64 numpy
                arrays (image index, anchor index, grid row, grid column).
            anch (list): per level the anchors (in grid units) of the
                candidates.
        """
        g = self.bias  # 0.5
        return self._find_positive(outputs, targets, all_anchors, g, self.off)

    def find_5_positive(self, outputs, targets, all_anchors):
        """
        Candidate positives for the auxiliary heads. Same as
        `find_3_positive` but with bias 1.0, so the fractional test always
        passes and every in-range neighbour is selected.
        """
        g = 1.0  # Note: diff, not self.bias(0.5) in find_3_positive()
        # The offsets have to be scaled by this g, not by self.bias:
        # half-cell offsets would only re-select the centre cell instead of
        # reaching the neighbouring cells.
        off = np.array(self._UNIT_OFFSETS, dtype=np.float32) * g
        return self._find_positive(outputs, targets, all_anchors, g, off)

    def _find_positive(self, outputs, targets, all_anchors, g, off):
        """Shared candidate search: filter targets by anchor size ratio, then
        replicate each one to the neighbouring cells chosen by bias `g`,
        shifting the cell index by the matching row of `off`."""
        na, nt = self.na, targets.shape[0]  # number of anchors, targets
        indices, anch = [], []
        gain = np.ones(7, dtype=np.float32)  # normalized to gridspace gain
        # append the anchor index as 7th column: [na, nt, 7]
        ai = np.tile(np.arange(na, dtype=np.float32).reshape(na, 1), [1, nt])
        targets_labels = np.concatenate((np.tile(
            np.expand_dims(targets, 0), [na, 1, 1]), ai[:, :, None]), 2)

        for i in range(len(all_anchors)):
            anchors = np.array(all_anchors[i]) / self.downsample_ratios[i]
            gain[2:6] = np.array(
                outputs[i].shape, dtype=np.float32)[[3, 2, 3, 2]]  # xyxy gain

            # Match targets_labels to anchors
            t = targets_labels * gain
            if nt:
                # Matches
                r = t[:, :, 4:6] / anchors[:, None]  # wh ratio
                keep = np.maximum(r, 1. / r).max(2) < self.anchor_t
                t = t[keep]  # filter

                # Offsets
                gxy = t[:, 2:4]  # grid xy
                gxi = gain[[2, 3]] - gxy  # inverse
                j, k = ((gxy % 1. < g) & (gxy > 1.)).T
                l, m = ((gxi % 1. < g) & (gxi > 1.)).T
                select = np.stack([np.ones_like(j), j, k, l, m])
                t = np.tile(t, [5, 1, 1])[select]
                offsets = (np.zeros_like(gxy)[None] + off[:, None])[select]
            else:
                t = targets_labels[0]
                offsets = 0

            # Define
            b = t[:, 0].astype(np.int64)  # image indices
            gxy = t[:, 2:4]  # grid xy
            gij = (gxy - offsets).astype(np.int64)
            gi, gj = gij.T  # grid xy indices

            # Append
            a = t[:, 6].astype(np.int64)  # anchor indices
            gj = gj.clip(0, gain[3] - 1).astype(np.int64)
            gi = gi.clip(0, gain[2] - 1).astype(np.int64)
            indices.append((b, a, gj, gi))
            anch.append(anchors[a])  # anchors
        # return numpy rather than tensor
        return indices, anch


def xywh2xyxy(x):
    """
    Convert boxes from [x, y, w, h] (centre, size) to [x1, y1, x2, y2]
    (corners). Tensor op; works on a copy, the input is left untouched.
    """
    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2
    corners = x.clone()
    corners[:, 0] = x[:, 0] - half_w  # left
    corners[:, 1] = x[:, 1] - half_h  # top
    corners[:, 2] = x[:, 0] + half_w  # right
    corners[:, 3] = x[:, 1] + half_h  # bottom
    return corners


def box_iou(box1, box2):
    """
    Pairwise IoU of two box sets in [x1, y1, x2, y2] format:
    [N, 4] and [M, 4] give an [N, M] matrix. paddle Tensor op
    """
    cols1, cols2 = box1.T, box2.T
    area1 = (cols1[2] - cols1[0]) * (cols1[3] - cols1[1])
    area2 = (cols2[2] - cols2[0]) * (cols2[3] - cols2[1])

    # broadcast box1 against box2: [N, 1, 2] with [M, 2] -> [N, M, 2]
    lower_right = paddle.minimum(box1[:, None, 2:], box2[:, 2:])
    upper_left = paddle.maximum(box1[:, None, :2], box2[:, :2])
    # negative extents mean no overlap, so clip them to zero
    inter = (lower_right - upper_left).clip(0).prod(2)

    union = area1[:, None] + area2 - inter
    return inter / union


def box_iou_normalization(box1, box2, h, w):
    """
    Pairwise IoU of two box sets in [x1, y1, x2, y2] format, [N, 4] and
    [M, 4] give [N, M], with every area divided by h * w before the ratio
    is taken. paddle Tensor op

    Note: the x extent is divided by h and the y extent by w, for both the
    box areas and the intersection, so every term carries the same
    1 / (h * w) factor.
    """

    def scaled_area(boxes):
        cols = boxes.T
        return (cols[2] - cols[0]) / h * (cols[3] - cols[1]) / w

    area1 = scaled_area(box1)
    area2 = scaled_area(box2)

    # broadcast box1 against box2: [N, 1, 2] with [M, 2] -> [N, M, 2]
    expanded = paddle.unsqueeze(box1, 1)
    lower_right = paddle.minimum(expanded[:, :, 2:], box2[:, 2:])
    upper_left = paddle.maximum(expanded[:, :, :2], box2[:, :2])

    overlap = (lower_right - upper_left).clip(min=0)
    overlap[:, :, 0] = overlap[:, :, 0] / h
    overlap[:, :, 1] = overlap[:, :, 1] / w
    inter = overlap.prod(2)

    union = area1[:, None] + area2 - inter
    return inter / union