# Copyright (c) Tencent Inc. All rights reserved.
from typing import List, Optional
import torch
import torch.nn as nn
from torch import Tensor
import torch.nn.functional as F
from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule, Linear
from mmdet.utils import ConfigType, OptConfigType, OptMultiConfig
from mmengine.model import BaseModule
from mmyolo.registry import MODELS
from mmyolo.models.layers import CSPLayerWithTwoConv
@MODELS.register_module()
class MaxSigmoidAttnBlock(BaseModule):
"""Max Sigmoid attention block."""
def __init__(self,
in_channels: int,
out_channels: int,
guide_channels: int,
embed_channels: int,
kernel_size: int = 3,
padding: int = 1,
num_heads: int = 1,
use_depthwise: bool = False,
with_scale: bool = False,
conv_cfg: OptConfigType = None,
norm_cfg: ConfigType = dict(type='BN',
momentum=0.03,
eps=0.001),
init_cfg: OptMultiConfig = None,
use_einsum: bool = True) -> None:
super().__init__(init_cfg=init_cfg)
conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
assert (out_channels % num_heads == 0 and
embed_channels % num_heads == 0), \
'out_channels and embed_channels should be divisible by num_heads.'
self.num_heads = num_heads
self.head_channels = out_channels // num_heads
self.use_einsum = use_einsum
self.embed_conv = ConvModule(
in_channels,
embed_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=None) if embed_channels != in_channels else None
self.guide_fc = Linear(guide_channels, embed_channels)
self.bias = nn.Parameter(torch.zeros(num_heads))
if with_scale:
self.scale = nn.Parameter(torch.ones(1, num_heads, 1, 1))
else:
self.scale = 1.0
self.project_conv = conv(in_channels,
out_channels,
kernel_size,
stride=1,
padding=padding,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=None)
def forward(self, x: Tensor, guide: Tensor) -> Tensor:
"""Forward process."""
B, _, H, W = x.shape
guide = self.guide_fc(guide)
guide = guide.reshape(B, -1, self.num_heads, self.head_channels)
embed = self.embed_conv(x) if self.embed_conv is not None else x
embed = embed.reshape(B, self.num_heads, self.head_channels, H, W)
if self.use_einsum:
attn_weight = torch.einsum('bmchw,bnmc->bmhwn', embed, guide)
else:
batch, m, channel, height, width = embed.shape
_, n, _, _ = guide.shape
embed = embed.permute(0, 1, 3, 4, 2)
embed = embed.reshape(batch, m, -1, channel)
guide = guide.permute(0, 2, 3, 1)
attn_weight = torch.matmul(embed, guide)
attn_weight = attn_weight.reshape(batch, m, height, width, n)
attn_weight = attn_weight.max(dim=-1)[0]
attn_weight = attn_weight / (self.head_channels**0.5)
attn_weight = attn_weight + self.bias[None, :, None, None]
attn_weight = attn_weight.sigmoid() * self.scale
x = self.project_conv(x)
x = x.reshape(B, self.num_heads, -1, H, W)
x = x * attn_weight.unsqueeze(2)
x = x.reshape(B, -1, H, W)
return x
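# A minimal usage sketch (not part of the original file): the channel sizes
# below are assumed for illustration. The block gates BxCxHxW image features
# with BxNxD guide (e.g. text) embeddings via a per-head max-sigmoid weight.
def _demo_max_sigmoid_attn() -> None:
    attn = MaxSigmoidAttnBlock(in_channels=64,
                               out_channels=64,
                               guide_channels=512,
                               embed_channels=64,
                               num_heads=2)
    x = torch.randn(2, 64, 32, 32)   # BxCxHxW image features
    guide = torch.randn(2, 10, 512)  # BxNxD guide embeddings
    out = attn(x, guide)
    assert out.shape == (2, 64, 32, 32)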
@MODELS.register_module()
class RepMatrixMaxSigmoidAttnBlock(BaseModule):
"""Max Sigmoid attention block."""
def __init__(self,
in_channels: int,
out_channels: int,
embed_channels: int,
guide_channels: int,
kernel_size: int = 3,
padding: int = 1,
num_heads: int = 1,
use_depthwise: bool = False,
with_scale: bool = False,
conv_cfg: OptConfigType = None,
norm_cfg: ConfigType = dict(type='BN',
momentum=0.03,
eps=0.001),
init_cfg: OptMultiConfig = None,
use_einsum: bool = True) -> None:
super().__init__(init_cfg=init_cfg)
conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
assert (out_channels % num_heads == 0 and
embed_channels % num_heads == 0), \
'out_channels and embed_channels should be divisible by num_heads.'
self.num_heads = num_heads
self.head_channels = out_channels // num_heads
self.use_einsum = use_einsum
self.embed_conv = ConvModule(
in_channels,
embed_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=None) if embed_channels != in_channels else None
self.bias = nn.Parameter(torch.zeros(num_heads))
self.guide_weight = nn.Parameter(
torch.zeros(guide_channels, embed_channels // num_heads,
num_heads))
self.project_conv = conv(in_channels,
out_channels,
kernel_size,
stride=1,
padding=padding,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=None)
    def forward(self, x: Tensor, txt_feats: Optional[Tensor] = None) -> Tensor:
        """Forward process. `txt_feats` is unused: the guide has been
        folded into `self.guide_weight`."""
B, _, H, W = x.shape
embed = self.embed_conv(x) if self.embed_conv is not None else x
embed = embed.reshape(B, self.num_heads, self.head_channels, H, W)
        batch, m, channel, height, width = embed.shape
        n, _, _ = self.guide_weight.shape
        # equivalent to a per-head split 1x1 conv; see RepConvMaxSigmoidAttnBlock
        embed = embed.permute(0, 1, 3, 4, 2)
        embed = embed.reshape(batch, m, -1, channel)
        # (n, c, m) -> (m, c, n) so each head maps its channels to n logits
        attn_weight = torch.matmul(embed, self.guide_weight.permute(2, 1, 0))
        attn_weight = attn_weight.reshape(batch, m, height, width, n)
attn_weight = attn_weight.max(dim=-1)[0]
attn_weight = attn_weight / (self.head_channels**0.5)
attn_weight = attn_weight + self.bias[None, :, None, None]
attn_weight = attn_weight.sigmoid()
x = self.project_conv(x)
x = x.reshape(B, self.num_heads, -1, H, W)
x = x * attn_weight.unsqueeze(2)
x = x.reshape(B, -1, H, W)
return x
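# Reparameterization sketch (an assumed deployment flow, not from the
# original): per-prompt embeddings are folded into `guide_weight` offline,
# after which forward() runs text-free. Sizes are illustrative.
def _demo_rep_matrix_attn() -> None:
    attn = RepMatrixMaxSigmoidAttnBlock(in_channels=64,
                                        out_channels=64,
                                        embed_channels=64,
                                        guide_channels=10,  # number of prompts
                                        num_heads=2)
    with torch.no_grad():
        # (guide_channels, head_channels, num_heads), precomputed offline
        attn.guide_weight.copy_(torch.randn(10, 32, 2))
    out = attn(torch.randn(2, 64, 32, 32))
    assert out.shape == (2, 64, 32, 32)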
@MODELS.register_module()
class RepConvMaxSigmoidAttnBlock(BaseModule):
"""Max Sigmoid attention block."""
def __init__(self,
in_channels: int,
out_channels: int,
embed_channels: int,
guide_channels: int,
kernel_size: int = 3,
padding: int = 1,
num_heads: int = 1,
use_depthwise: bool = False,
with_scale: bool = False,
conv_cfg: OptConfigType = None,
norm_cfg: ConfigType = dict(type='BN',
momentum=0.03,
eps=0.001),
init_cfg: OptMultiConfig = None,
use_einsum: bool = True) -> None:
super().__init__(init_cfg=init_cfg)
conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
assert (out_channels % num_heads == 0 and
embed_channels % num_heads == 0), \
'out_channels and embed_channels should be divisible by num_heads.'
self.num_heads = num_heads
self.head_channels = out_channels // num_heads
self.use_einsum = use_einsum
self.embed_conv = ConvModule(
in_channels,
embed_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=None) if embed_channels != in_channels else None
        self.bias = nn.Parameter(torch.zeros(num_heads))
        self.split_channels = embed_channels // num_heads
self.guide_convs = nn.ModuleList(
nn.Conv2d(self.split_channels, guide_channels, 1, bias=False)
for _ in range(num_heads))
self.project_conv = conv(in_channels,
out_channels,
kernel_size,
stride=1,
padding=padding,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=None)
    def forward(self, x: Tensor, txt_feats: Optional[Tensor] = None) -> Tensor:
        """Forward process. `txt_feats` is unused: the guide has been
        folded into `self.guide_convs`."""
B, C, H, W = x.shape
embed = self.embed_conv(x) if self.embed_conv is not None else x
embed = list(embed.split(self.split_channels, 1))
        # Bx(M*N)xHxW (M: num_heads, N: guide_channels)
attn_weight = torch.cat(
[conv(x) for conv, x in zip(self.guide_convs, embed)], dim=1)
        # BxMxNxHxW
        attn_weight = attn_weight.view(B, self.num_heads, -1, H, W)
        # BxMxNxHxW -> BxMxHxW
        attn_weight = attn_weight.max(dim=2)[0] / (self.head_channels**0.5)
        attn_weight = (attn_weight + self.bias.view(1, -1, 1, 1)).sigmoid()
        # BxMx1xHxW
        attn_weight = attn_weight[:, :, None]
x = self.project_conv(x)
        # BxMxC'xHxW (C' = out_channels per head)
x = x.view(B, self.num_heads, -1, H, W)
x = x * attn_weight
x = x.view(B, -1, H, W)
return x
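# Folding sketch (assumed workflow, not from the original): offline prompt
# embeddings of shape (num_prompts, embed_channels) are split per head and
# copied into the 1x1 guide convs, making inference text-free.
def _demo_rep_conv_attn() -> None:
    attn = RepConvMaxSigmoidAttnBlock(in_channels=64,
                                      out_channels=64,
                                      embed_channels=64,
                                      guide_channels=10,  # number of prompts
                                      num_heads=2)
    embeddings = torch.randn(10, 64)  # (prompts, embed_channels), assumed
    with torch.no_grad():
        for head, conv in enumerate(attn.guide_convs):
            chunk = embeddings[:, head * 32:(head + 1) * 32]  # per-head slice
            conv.weight.copy_(chunk[:, :, None, None])
    out = attn(torch.randn(2, 64, 32, 32))
    assert out.shape == (2, 64, 32, 32)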
@MODELS.register_module()
class MaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
"""Sigmoid-attention based CSP layer with two convolution layers."""
def __init__(
self,
in_channels: int,
out_channels: int,
guide_channels: int,
embed_channels: int,
num_heads: int = 1,
expand_ratio: float = 0.5,
num_blocks: int = 1,
with_scale: bool = False,
add_identity: bool = True, # shortcut
conv_cfg: OptConfigType = None,
norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
init_cfg: OptMultiConfig = None,
use_einsum: bool = True) -> None:
super().__init__(in_channels=in_channels,
out_channels=out_channels,
expand_ratio=expand_ratio,
num_blocks=num_blocks,
add_identity=add_identity,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
init_cfg=init_cfg)
self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels,
out_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
self.attn_block = MaxSigmoidAttnBlock(self.mid_channels,
self.mid_channels,
guide_channels=guide_channels,
embed_channels=embed_channels,
num_heads=num_heads,
with_scale=with_scale,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
use_einsum=use_einsum)
def forward(self, x: Tensor, guide: Tensor) -> Tensor:
"""Forward process."""
x_main = self.main_conv(x)
x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1))
x_main.extend(blocks(x_main[-1]) for blocks in self.blocks)
x_main.append(self.attn_block(x_main[-1], guide))
return self.final_conv(torch.cat(x_main, 1))
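# Usage sketch (sizes assumed): with expand_ratio=0.5 the layer splits the
# stem into two mid-channel branches, runs the CSP blocks, appends the
# guided attention branch, and fuses everything with a 1x1 conv.
def _demo_max_sigmoid_csp_layer() -> None:
    layer = MaxSigmoidCSPLayerWithTwoConv(in_channels=128,
                                          out_channels=128,
                                          guide_channels=512,
                                          embed_channels=64,
                                          num_heads=2,
                                          num_blocks=1)
    out = layer(torch.randn(2, 128, 40, 40), torch.randn(2, 10, 512))
    assert out.shape == (2, 128, 40, 40)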
@MODELS.register_module()
class RepMaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
"""Sigmoid-attention based CSP layer with two convolution layers."""
def __init__(
self,
in_channels: int,
out_channels: int,
guide_channels: int,
embed_channels: int,
num_heads: int = 1,
expand_ratio: float = 0.5,
num_blocks: int = 1,
with_scale: bool = False,
add_identity: bool = True, # shortcut
conv_cfg: OptConfigType = None,
norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
init_cfg: OptMultiConfig = None,
use_einsum: bool = True) -> None:
super().__init__(in_channels=in_channels,
out_channels=out_channels,
expand_ratio=expand_ratio,
num_blocks=num_blocks,
add_identity=add_identity,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
init_cfg=init_cfg)
self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels,
out_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
self.attn_block = RepMatrixMaxSigmoidAttnBlock(
self.mid_channels,
self.mid_channels,
embed_channels=embed_channels,
guide_channels=guide_channels,
num_heads=num_heads,
with_scale=with_scale,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
use_einsum=use_einsum)
def forward(self, x: Tensor, guide: Tensor) -> Tensor:
"""Forward process."""
x_main = self.main_conv(x)
x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1))
x_main.extend(blocks(x_main[-1]) for blocks in self.blocks)
x_main.append(self.attn_block(x_main[-1], guide))
return self.final_conv(torch.cat(x_main, 1))
@MODELS.register_module()
class RepConvMaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
"""Sigmoid-attention based CSP layer with two convolution layers."""
def __init__(
self,
in_channels: int,
out_channels: int,
guide_channels: int,
embed_channels: int,
num_heads: int = 1,
expand_ratio: float = 0.5,
num_blocks: int = 1,
with_scale: bool = False,
add_identity: bool = True, # shortcut
conv_cfg: OptConfigType = None,
norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
init_cfg: OptMultiConfig = None,
use_einsum: bool = True) -> None:
super().__init__(in_channels=in_channels,
out_channels=out_channels,
expand_ratio=expand_ratio,
num_blocks=num_blocks,
add_identity=add_identity,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
init_cfg=init_cfg)
self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels,
out_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
self.attn_block = RepConvMaxSigmoidAttnBlock(
self.mid_channels,
self.mid_channels,
embed_channels=embed_channels,
guide_channels=guide_channels,
num_heads=num_heads,
with_scale=with_scale,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
use_einsum=use_einsum)
def forward(self, x: Tensor, guide: Tensor) -> Tensor:
"""Forward process."""
x_main = self.main_conv(x)
x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1))
x_main.extend(blocks(x_main[-1]) for blocks in self.blocks)
x_main.append(self.attn_block(x_main[-1], guide))
return self.final_conv(torch.cat(x_main, 1))
@MODELS.register_module()
class ImagePoolingAttentionModule(nn.Module):
def __init__(self,
image_channels: List[int],
text_channels: int,
embed_channels: int,
with_scale: bool = False,
num_feats: int = 3,
num_heads: int = 8,
pool_size: int = 3,
use_einsum: bool = True):
super().__init__()
self.text_channels = text_channels
self.embed_channels = embed_channels
self.num_heads = num_heads
self.num_feats = num_feats
self.head_channels = embed_channels // num_heads
self.pool_size = pool_size
self.use_einsum = use_einsum
if with_scale:
self.scale = nn.Parameter(torch.tensor([0.]), requires_grad=True)
else:
self.scale = 1.0
self.projections = nn.ModuleList([
ConvModule(in_channels, embed_channels, 1, act_cfg=None)
for in_channels in image_channels
])
self.query = nn.Sequential(nn.LayerNorm(text_channels),
Linear(text_channels, embed_channels))
self.key = nn.Sequential(nn.LayerNorm(embed_channels),
Linear(embed_channels, embed_channels))
self.value = nn.Sequential(nn.LayerNorm(embed_channels),
Linear(embed_channels, embed_channels))
self.proj = Linear(embed_channels, text_channels)
self.image_pools = nn.ModuleList([
nn.AdaptiveMaxPool2d((pool_size, pool_size))
for _ in range(num_feats)
])
def forward(self, text_features, image_features):
B = image_features[0].shape[0]
assert len(image_features) == self.num_feats
num_patches = self.pool_size**2
mlvl_image_features = [
pool(proj(x)).view(B, -1, num_patches)
for (x, proj, pool
) in zip(image_features, self.projections, self.image_pools)
]
mlvl_image_features = torch.cat(mlvl_image_features,
dim=-1).transpose(1, 2)
q = self.query(text_features)
k = self.key(mlvl_image_features)
v = self.value(mlvl_image_features)
q = q.reshape(B, -1, self.num_heads, self.head_channels)
k = k.reshape(B, -1, self.num_heads, self.head_channels)
v = v.reshape(B, -1, self.num_heads, self.head_channels)
if self.use_einsum:
attn_weight = torch.einsum('bnmc,bkmc->bmnk', q, k)
else:
q = q.permute(0, 2, 1, 3)
k = k.permute(0, 2, 3, 1)
attn_weight = torch.matmul(q, k)
attn_weight = attn_weight / (self.head_channels**0.5)
attn_weight = F.softmax(attn_weight, dim=-1)
if self.use_einsum:
x = torch.einsum('bmnk,bkmc->bnmc', attn_weight, v)
else:
v = v.permute(0, 2, 1, 3)
x = torch.matmul(attn_weight, v)
x = x.permute(0, 2, 1, 3)
x = self.proj(x.reshape(B, -1, self.embed_channels))
return x * self.scale + text_features
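# Usage sketch (sizes assumed): pools each pyramid level to pool_size^2
# patches, attends from the text tokens to the pooled image patches, and
# returns residually updated text features of the same shape.
def _demo_image_pooling_attn() -> None:
    ipa = ImagePoolingAttentionModule(image_channels=[128, 256, 512],
                                      text_channels=512,
                                      embed_channels=256,
                                      num_feats=3,
                                      num_heads=8,
                                      pool_size=3)
    txt = torch.randn(2, 32, 512)  # BxLxD text features
    imgs = [torch.randn(2, c, s, s)
            for c, s in ((128, 80), (256, 40), (512, 20))]
    out = ipa(txt, imgs)
    assert out.shape == (2, 32, 512)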
@MODELS.register_module()
class VanillaSigmoidBlock(BaseModule):
"""Sigmoid attention block."""
def __init__(self,
in_channels: int,
out_channels: int,
guide_channels: int,
embed_channels: int,
kernel_size: int = 3,
padding: int = 1,
num_heads: int = 1,
use_depthwise: bool = False,
with_scale: bool = False,
conv_cfg: OptConfigType = None,
norm_cfg: ConfigType = dict(type='BN',
momentum=0.03,
eps=0.001),
init_cfg: OptMultiConfig = None) -> None:
super().__init__(init_cfg=init_cfg)
conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule
assert (out_channels % num_heads == 0 and
embed_channels % num_heads == 0), \
'out_channels and embed_channels should be divisible by num_heads.'
self.num_heads = num_heads
self.head_channels = out_channels // num_heads
self.project_conv = conv(in_channels,
out_channels,
kernel_size,
stride=1,
padding=padding,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=None)
def forward(self, x: Tensor, guide: Tensor) -> Tensor:
"""Forward process."""
        x = self.project_conv(x)
        # sigmoid gating (x = x * x.sigmoid()) is intentionally removed
        return x
@MODELS.register_module()
class EfficientCSPLayerWithTwoConv(CSPLayerWithTwoConv):
"""Sigmoid-attention based CSP layer with two convolution layers."""
def __init__(
self,
in_channels: int,
out_channels: int,
guide_channels: int,
embed_channels: int,
num_heads: int = 1,
expand_ratio: float = 0.5,
num_blocks: int = 1,
with_scale: bool = False,
add_identity: bool = True, # shortcut
conv_cfg: OptConfigType = None,
norm_cfg: ConfigType = dict(type='BN', momentum=0.03, eps=0.001),
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
init_cfg: OptMultiConfig = None) -> None:
super().__init__(in_channels=in_channels,
out_channels=out_channels,
expand_ratio=expand_ratio,
num_blocks=num_blocks,
add_identity=add_identity,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
init_cfg=init_cfg)
self.final_conv = ConvModule((3 + num_blocks) * self.mid_channels,
out_channels,
1,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg)
self.attn_block = VanillaSigmoidBlock(self.mid_channels,
self.mid_channels,
guide_channels=guide_channels,
embed_channels=embed_channels,
num_heads=num_heads,
with_scale=with_scale,
conv_cfg=conv_cfg,
norm_cfg=norm_cfg)
def forward(self, x: Tensor, guide: Tensor) -> Tensor:
"""Forward process."""
x_main = self.main_conv(x)
x_main = list(x_main.split((self.mid_channels, self.mid_channels), 1))
x_main.extend(blocks(x_main[-1]) for blocks in self.blocks)
x_main.append(self.attn_block(x_main[-1], guide))
return self.final_conv(torch.cat(x_main, 1))
# Copyright (c) Tencent Inc. All rights reserved.
from .dynamic_loss import CoVMSELoss
__all__ = ['CoVMSELoss']
# Copyright (c) Tencent Inc. All rights reserved.
from typing import Optional
import torch
import torch.nn as nn
from torch import Tensor
from mmdet.models.losses.mse_loss import mse_loss
from mmyolo.registry import MODELS
@MODELS.register_module()
class CoVMSELoss(nn.Module):
def __init__(self,
dim: int = 0,
reduction: str = 'mean',
loss_weight: float = 1.0,
eps: float = 1e-6) -> None:
super().__init__()
self.dim = dim
self.reduction = reduction
self.loss_weight = loss_weight
self.eps = eps
def forward(self,
pred: Tensor,
weight: Optional[Tensor] = None,
avg_factor: Optional[int] = None,
reduction_override: Optional[str] = None) -> Tensor:
"""Forward function of loss."""
assert reduction_override in (None, 'none', 'mean', 'sum')
reduction = (
reduction_override if reduction_override else self.reduction)
cov = pred.std(self.dim) / pred.mean(self.dim).clamp(min=self.eps)
target = torch.zeros_like(cov)
loss = self.loss_weight * mse_loss(
cov, target, weight, reduction=reduction, avg_factor=avg_factor)
return loss
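# Usage sketch (values assumed): the loss drives the coefficient of
# variation (std / mean) of `pred` along `dim` towards zero, encouraging
# uniform predictions along that dimension.
def _demo_cov_mse_loss() -> None:
    loss_fn = CoVMSELoss(dim=1)
    pred = torch.rand(4, 16) + 0.5  # keep the mean safely above eps
    loss = loss_fn(pred)
    assert loss.ndim == 0  # scalar under the default 'mean' reduction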
# Copyright (c) Tencent Inc. All rights reserved.
from .yolo_world_pafpn import YOLOWorldPAFPN, YOLOWorldDualPAFPN
__all__ = ['YOLOWorldPAFPN', 'YOLOWorldDualPAFPN']
# Copyright (c) Tencent Inc. All rights reserved.
import copy
from typing import List, Optional, Union
import torch
import torch.nn as nn
from torch import Tensor
from mmdet.utils import ConfigType, OptMultiConfig
from mmyolo.registry import MODELS
from mmyolo.models.utils import make_divisible, make_round
from mmyolo.models.necks.yolov8_pafpn import YOLOv8PAFPN
@MODELS.register_module()
class YOLOWorldPAFPN(YOLOv8PAFPN):
"""Path Aggregation Network used in YOLO World
Following YOLOv8 PAFPN, including text to image fusion
"""
def __init__(self,
in_channels: List[int],
out_channels: Union[List[int], int],
guide_channels: int,
embed_channels: List[int],
num_heads: List[int],
deepen_factor: float = 1.0,
widen_factor: float = 1.0,
num_csp_blocks: int = 3,
freeze_all: bool = False,
block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'),
norm_cfg: ConfigType = dict(type='BN',
momentum=0.03,
eps=0.001),
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
init_cfg: OptMultiConfig = None) -> None:
self.guide_channels = guide_channels
self.embed_channels = embed_channels
self.num_heads = num_heads
self.block_cfg = block_cfg
super().__init__(in_channels=in_channels,
out_channels=out_channels,
deepen_factor=deepen_factor,
widen_factor=widen_factor,
num_csp_blocks=num_csp_blocks,
freeze_all=freeze_all,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
init_cfg=init_cfg)
def build_top_down_layer(self, idx: int) -> nn.Module:
"""build top down layer.
Args:
idx (int): layer idx.
Returns:
nn.Module: The top down layer.
"""
block_cfg = copy.deepcopy(self.block_cfg)
block_cfg.update(
dict(in_channels=make_divisible(
(self.in_channels[idx - 1] + self.in_channels[idx]),
self.widen_factor),
out_channels=make_divisible(self.out_channels[idx - 1],
self.widen_factor),
guide_channels=self.guide_channels,
embed_channels=make_round(self.embed_channels[idx - 1],
self.widen_factor),
num_heads=make_round(self.num_heads[idx - 1],
self.widen_factor),
num_blocks=make_round(self.num_csp_blocks,
self.deepen_factor),
add_identity=False,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg))
return MODELS.build(block_cfg)
def build_bottom_up_layer(self, idx: int) -> nn.Module:
"""build bottom up layer.
Args:
idx (int): layer idx.
Returns:
nn.Module: The bottom up layer.
"""
block_cfg = copy.deepcopy(self.block_cfg)
block_cfg.update(
dict(in_channels=make_divisible(
(self.out_channels[idx] + self.out_channels[idx + 1]),
self.widen_factor),
out_channels=make_divisible(self.out_channels[idx + 1],
self.widen_factor),
guide_channels=self.guide_channels,
embed_channels=make_round(self.embed_channels[idx + 1],
self.widen_factor),
num_heads=make_round(self.num_heads[idx + 1],
self.widen_factor),
num_blocks=make_round(self.num_csp_blocks,
self.deepen_factor),
add_identity=False,
norm_cfg=self.norm_cfg,
act_cfg=self.act_cfg))
return MODELS.build(block_cfg)
    def forward(self,
                img_feats: List[Tensor],
                txt_feats: Optional[Tensor] = None) -> tuple:
        """Forward function.

        Args:
            img_feats (List[Tensor]): multi-level image features.
            txt_feats (Tensor): text features of shape BxLxD.
        """
assert len(img_feats) == len(self.in_channels)
# reduce layers
reduce_outs = []
for idx in range(len(self.in_channels)):
reduce_outs.append(self.reduce_layers[idx](img_feats[idx]))
# top-down path
inner_outs = [reduce_outs[-1]]
for idx in range(len(self.in_channels) - 1, 0, -1):
feat_high = inner_outs[0]
feat_low = reduce_outs[idx - 1]
upsample_feat = self.upsample_layers[len(self.in_channels) - 1 -
idx](feat_high)
if self.upsample_feats_cat_first:
top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1)
else:
top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1)
inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx](
top_down_layer_inputs, txt_feats)
inner_outs.insert(0, inner_out)
# bottom-up path
outs = [inner_outs[0]]
for idx in range(len(self.in_channels) - 1):
feat_low = outs[-1]
feat_high = inner_outs[idx + 1]
downsample_feat = self.downsample_layers[idx](feat_low)
out = self.bottom_up_layers[idx](torch.cat(
[downsample_feat, feat_high], 1), txt_feats)
outs.append(out)
# out_layers
results = []
for idx in range(len(self.in_channels)):
results.append(self.out_layers[idx](outs[idx]))
return tuple(results)
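# Usage sketch (channel sizes assumed): the default CSPLayerWithTwoConv
# block_cfg does not accept guide inputs, so a guided block such as
# MaxSigmoidCSPLayerWithTwoConv is configured here.
def _demo_yolo_world_pafpn() -> None:
    neck = YOLOWorldPAFPN(
        in_channels=[256, 512, 1024],
        out_channels=[256, 512, 1024],
        guide_channels=512,
        embed_channels=[128, 256, 512],
        num_heads=[4, 8, 16],
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'))
    imgs = [torch.randn(2, c, s, s)
            for c, s in ((256, 80), (512, 40), (1024, 20))]
    txt = torch.randn(2, 32, 512)  # BxLxD text features
    outs = neck(imgs, txt)
    assert len(outs) == 3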
@MODELS.register_module()
class YOLOWorldDualPAFPN(YOLOWorldPAFPN):
"""Path Aggregation Network used in YOLO World v8."""
def __init__(self,
in_channels: List[int],
out_channels: Union[List[int], int],
guide_channels: int,
embed_channels: List[int],
num_heads: List[int],
deepen_factor: float = 1.0,
widen_factor: float = 1.0,
num_csp_blocks: int = 3,
freeze_all: bool = False,
                 text_enhancer: ConfigType = dict(
type='ImagePoolingAttentionModule',
embed_channels=256,
num_heads=8,
pool_size=3),
block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'),
norm_cfg: ConfigType = dict(type='BN',
momentum=0.03,
eps=0.001),
act_cfg: ConfigType = dict(type='SiLU', inplace=True),
init_cfg: OptMultiConfig = None) -> None:
super().__init__(in_channels=in_channels,
out_channels=out_channels,
guide_channels=guide_channels,
embed_channels=embed_channels,
num_heads=num_heads,
deepen_factor=deepen_factor,
widen_factor=widen_factor,
num_csp_blocks=num_csp_blocks,
freeze_all=freeze_all,
block_cfg=block_cfg,
norm_cfg=norm_cfg,
act_cfg=act_cfg,
init_cfg=init_cfg)
        text_enhancer.update(
            dict(
                image_channels=[int(x * widen_factor) for x in out_channels],
                text_channels=guide_channels,
                num_feats=len(out_channels),
            ))
        self.text_enhancer = MODELS.build(text_enhancer)
def forward(self, img_feats: List[Tensor], txt_feats: Tensor) -> tuple:
"""Forward function."""
assert len(img_feats) == len(self.in_channels)
# reduce layers
reduce_outs = []
for idx in range(len(self.in_channels)):
reduce_outs.append(self.reduce_layers[idx](img_feats[idx]))
# top-down path
inner_outs = [reduce_outs[-1]]
for idx in range(len(self.in_channels) - 1, 0, -1):
feat_high = inner_outs[0]
feat_low = reduce_outs[idx - 1]
upsample_feat = self.upsample_layers[len(self.in_channels) - 1 -
idx](feat_high)
if self.upsample_feats_cat_first:
top_down_layer_inputs = torch.cat([upsample_feat, feat_low], 1)
else:
top_down_layer_inputs = torch.cat([feat_low, upsample_feat], 1)
inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx](
top_down_layer_inputs, txt_feats)
inner_outs.insert(0, inner_out)
txt_feats = self.text_enhancer(txt_feats, inner_outs)
# bottom-up path
outs = [inner_outs[0]]
for idx in range(len(self.in_channels) - 1):
feat_low = outs[-1]
feat_high = inner_outs[idx + 1]
downsample_feat = self.downsample_layers[idx](feat_low)
out = self.bottom_up_layers[idx](torch.cat(
[downsample_feat, feat_high], 1), txt_feats)
outs.append(out)
# out_layers
results = []
for idx in range(len(self.in_channels)):
results.append(self.out_layers[idx](outs[idx]))
return tuple(results)
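# Usage sketch (sizes assumed): the dual neck additionally refines the text
# features with ImagePoolingAttentionModule between the top-down and
# bottom-up paths.
def _demo_yolo_world_dual_pafpn() -> None:
    neck = YOLOWorldDualPAFPN(
        in_channels=[256, 512, 1024],
        out_channels=[256, 512, 1024],
        guide_channels=512,
        embed_channels=[128, 256, 512],
        num_heads=[4, 8, 16],
        block_cfg=dict(type='MaxSigmoidCSPLayerWithTwoConv'))
    imgs = [torch.randn(2, c, s, s)
            for c, s in ((256, 80), (512, 40), (1024, 20))]
    outs = neck(imgs, torch.randn(2, 32, 512))
    assert len(outs) == 3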
# Copyright (c) Tencent Inc. All rights reserved.
from yolo_world import __version__
def __version_info() -> tuple:
"""Parse a version string into a tuple.
Returns:
tuple[int | str]: The version info, e.g., "1.3.0" is parsed into
(1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1').
"""
version_info = []
for x in __version__.split('.'):
if x.isdigit():
version_info.append(int(x))
elif x.find('rc') != -1:
patch_version = x.split('rc')
version_info.append(int(patch_version[0]))
version_info.append(f'rc{patch_version[1]}')
return tuple(version_info)
version_info = __version_info()
__all__ = ['__version__', 'version_info']
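# A quick check of the parsing rule documented above (standalone
# re-statement for illustration): '1.3.0' -> (1, 3, 0) and
# '2.0.0rc1' -> (2, 0, 0, 'rc1').
def _demo_version_info() -> None:
    def parse(version: str) -> tuple:
        info = []
        for x in version.split('.'):
            if x.isdigit():
                info.append(int(x))
            elif x.find('rc') != -1:
                patch, rc = x.split('rc')
                info.append(int(patch))
                info.append(f'rc{rc}')
        return tuple(info)
    assert parse('1.3.0') == (1, 3, 0)
    assert parse('2.0.0rc1') == (2, 0, 0, 'rc1')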