Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
YOLO-World_pytorch
Commits
e9cee049
Commit
e9cee049
authored
May 31, 2024
by
luopl
Browse files
Initial commit
parents
Pipeline
#1056
canceled with stages
Changes
166
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
905 additions
and
0 deletions
+905
-0
yolo_world/models/layers/yolo_bricks.py
yolo_world/models/layers/yolo_bricks.py
+601
-0
yolo_world/models/losses/__init__.py
yolo_world/models/losses/__init__.py
+4
-0
yolo_world/models/losses/dynamic_loss.py
yolo_world/models/losses/dynamic_loss.py
+38
-0
yolo_world/models/necks/__init__.py
yolo_world/models/necks/__init__.py
+4
-0
yolo_world/models/necks/yolo_world_pafpn.py
yolo_world/models/necks/yolo_world_pafpn.py
+235
-0
yolo_world/version.py
yolo_world/version.py
+23
-0
No files found.
yolo_world/models/layers/yolo_bricks.py
0 → 100644
View file @
e9cee049
# Copyright (c) Tencent Inc. All rights reserved.
from
typing
import
List
import
torch
import
torch.nn
as
nn
from
torch
import
Tensor
import
torch.nn.functional
as
F
from
mmcv.cnn
import
ConvModule
,
DepthwiseSeparableConvModule
,
Linear
from
mmdet.utils
import
ConfigType
,
OptConfigType
,
OptMultiConfig
from
mmengine.model
import
BaseModule
from
mmyolo.registry
import
MODELS
from
mmyolo.models.layers
import
CSPLayerWithTwoConv
@MODELS.register_module()
class MaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block.

    Builds a per-head spatial attention map by matching embedded image
    features against projected text ``guide`` features, takes the max over
    guide entries, squashes it with a sigmoid, and uses it to gate the
    projected image features (text-to-image fusion).
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        """Initialize the block.

        Args:
            in_channels (int): Channels of the input image features.
            out_channels (int): Channels of the gated output features.
            guide_channels (int): Channels of the text guide features.
            embed_channels (int): Width of the matching embedding space.
            kernel_size (int): Kernel size of ``project_conv``.
            padding (int): Padding of ``project_conv``.
            num_heads (int): Number of attention heads; must divide both
                ``out_channels`` and ``embed_channels``.
            use_depthwise (bool): Use a depthwise-separable projection conv.
            with_scale (bool): Learn a per-head scale on the attention map.
            conv_cfg / norm_cfg / init_cfg: Standard mm-series configs.
            use_einsum (bool): Use ``torch.einsum`` for the similarity;
                otherwise an equivalent permute/matmul path (e.g. for
                export backends without einsum support).
        """
        super().__init__(init_cfg=init_cfg)
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule

        assert (out_channels % num_heads == 0
                and embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        self.use_einsum = use_einsum

        # A 1x1 embedding conv is only needed when widths differ; otherwise
        # the raw input doubles as the embedding.
        self.embed_conv = ConvModule(
            in_channels,
            embed_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None) if embed_channels != in_channels else None
        # Projects text guide features into the shared embedding space.
        self.guide_fc = Linear(guide_channels, embed_channels)
        # Per-head additive bias applied to the attention logits.
        self.bias = nn.Parameter(torch.zeros(num_heads))
        if with_scale:
            # Learnable per-head multiplicative scale on the attention map.
            self.scale = nn.Parameter(torch.ones(1, num_heads, 1, 1))
        else:
            self.scale = 1.0

        self.project_conv = conv(in_channels,
                                 out_channels,
                                 kernel_size,
                                 stride=1,
                                 padding=padding,
                                 conv_cfg=conv_cfg,
                                 norm_cfg=norm_cfg,
                                 act_cfg=None)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Forward process.

        Args:
            x (Tensor): Image features of shape (B, in_channels, H, W).
            guide (Tensor): Text guide features; projected by ``guide_fc``
                and reshaped to (B, N, num_heads, head_channels), so it is
                assumed to be (B, N, guide_channels) — TODO confirm with
                callers.

        Returns:
            Tensor: Gated features of shape (B, out_channels, H, W).
        """
        B, _, H, W = x.shape

        guide = self.guide_fc(guide)
        guide = guide.reshape(B, -1, self.num_heads, self.head_channels)
        embed = self.embed_conv(x) if self.embed_conv is not None else x
        embed = embed.reshape(B, self.num_heads, self.head_channels, H, W)

        if self.use_einsum:
            # (B, M, C, H, W) x (B, N, M, C) -> (B, M, H, W, N):
            # per-head dot product between every pixel and every guide entry.
            attn_weight = torch.einsum('bmchw,bnmc->bmhwn', embed, guide)
        else:
            # Equivalent matmul path for backends without einsum.
            batch, m, channel, height, width = embed.shape
            _, n, _, _ = guide.shape
            embed = embed.permute(0, 1, 3, 4, 2)
            embed = embed.reshape(batch, m, -1, channel)
            guide = guide.permute(0, 2, 3, 1)
            attn_weight = torch.matmul(embed, guide)
            attn_weight = attn_weight.reshape(batch, m, height, width, n)

        # Max over guide entries, scaled-dot normalization, bias, sigmoid.
        attn_weight = attn_weight.max(dim=-1)[0]
        attn_weight = attn_weight / (self.head_channels**0.5)
        attn_weight = attn_weight + self.bias[None, :, None, None]
        attn_weight = attn_weight.sigmoid() * self.scale

        # Gate the projected features per head, then flatten heads back.
        x = self.project_conv(x)
        x = x.reshape(B, self.num_heads, -1, H, W)
        x = x * attn_weight.unsqueeze(2)
        x = x.reshape(B, -1, H, W)
        return x
@MODELS.register_module()
class RepMatrixMaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block (reparameterized-matrix variant).

    Variant of :class:`MaxSigmoidAttnBlock` whose guidance is a learned
    parameter matrix (``guide_weight``) instead of features projected from
    the text input; accordingly ``forward`` never reads its ``txt_feats``
    argument.

    NOTE(review): ``guide_weight`` is created with 3 dimensions, but
    ``forward`` unpacks 4 values from its shape, which would raise
    ``ValueError`` if this path were executed; ``with_scale`` and
    ``use_einsum`` are accepted but never used. This block appears
    experimental/unused — verify before relying on it.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 embed_channels: int,
                 guide_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        """Initialize the block; see class docstring for semantics."""
        super().__init__(init_cfg=init_cfg)
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule

        assert (out_channels % num_heads == 0
                and embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        self.use_einsum = use_einsum  # NOTE(review): stored but unused below.

        self.embed_conv = ConvModule(
            in_channels,
            embed_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None) if embed_channels != in_channels else None
        # Per-head additive bias applied to the attention logits.
        self.bias = nn.Parameter(torch.zeros(num_heads))
        # Learned guidance matrix replacing the text projection.
        self.guide_weight = nn.Parameter(
            torch.zeros(guide_channels, embed_channels // num_heads,
                        num_heads))
        self.project_conv = conv(in_channels,
                                 out_channels,
                                 kernel_size,
                                 stride=1,
                                 padding=padding,
                                 conv_cfg=conv_cfg,
                                 norm_cfg=norm_cfg,
                                 act_cfg=None)

    def forward(self, x: Tensor, txt_feats: Tensor = None) -> Tensor:
        """Forward process.

        Args:
            x (Tensor): Image features of shape (B, in_channels, H, W).
            txt_feats (Tensor, optional): Accepted for interface parity but
                not used — guidance comes from ``self.guide_weight``.

        Returns:
            Tensor: Gated features of shape (B, out_channels, H, W).
        """
        B, _, H, W = x.shape

        embed = self.embed_conv(x) if self.embed_conv is not None else x
        embed = embed.reshape(B, self.num_heads, self.head_channels, H, W)

        batch, m, channel, height, width = embed.shape
        # NOTE(review): guide_weight has 3 dims but 4 values are unpacked
        # here — would raise ValueError at runtime; confirm intended shape.
        _, n, _, _ = self.guide_weight.shape
        # can be formulated to split conv
        embed = embed.permute(0, 1, 3, 4, 2)
        embed = embed.reshape(batch, m, -1, channel)
        attn_weight = torch.matmul(embed, self.guide_weight)
        attn_weight = attn_weight.reshape(batch, m, height, width, n)

        # Max over guide entries, scaled-dot normalization, bias, sigmoid.
        attn_weight = attn_weight.max(dim=-1)[0]
        attn_weight = attn_weight / (self.head_channels**0.5)
        attn_weight = attn_weight + self.bias[None, :, None, None]
        attn_weight = attn_weight.sigmoid()

        # Gate the projected features per head, then flatten heads back.
        x = self.project_conv(x)
        x = x.reshape(B, self.num_heads, -1, H, W)
        x = x * attn_weight.unsqueeze(2)
        x = x.reshape(B, -1, H, W)
        return x
@MODELS.register_module()
class RepConvMaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block (reparameterized-conv variant).

    Variant of :class:`MaxSigmoidAttnBlock` in which the guidance is baked
    into per-head 1x1 convolutions (``guide_convs``) rather than projected
    from a text input, so ``forward`` never reads its ``txt_feats``
    argument.

    NOTE(review): ``with_scale`` and ``use_einsum`` are accepted but never
    used by this class.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 embed_channels: int,
                 guide_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        """Initialize the block; see class docstring for semantics."""
        super().__init__(init_cfg=init_cfg)
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule

        assert (out_channels % num_heads == 0
                and embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        self.use_einsum = use_einsum  # NOTE(review): stored but unused.

        self.embed_conv = ConvModule(
            in_channels,
            embed_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None) if embed_channels != in_channels else None
        # Per-head additive bias applied to the attention logits.
        self.bias = nn.Parameter(torch.zeros(num_heads))
        self.num_heads = num_heads  # NOTE(review): duplicate assignment.
        self.split_channels = embed_channels // num_heads
        # One 1x1 conv per head maps that head's embedding slice to
        # ``guide_channels`` attention logits per pixel.
        self.guide_convs = nn.ModuleList(
            nn.Conv2d(self.split_channels, guide_channels, 1, bias=False)
            for _ in range(num_heads))
        self.project_conv = conv(in_channels,
                                 out_channels,
                                 kernel_size,
                                 stride=1,
                                 padding=padding,
                                 conv_cfg=conv_cfg,
                                 norm_cfg=norm_cfg,
                                 act_cfg=None)

    def forward(self, x: Tensor, txt_feats: Tensor = None) -> Tensor:
        """Forward process.

        Args:
            x (Tensor): Image features of shape (B, in_channels, H, W).
            txt_feats (Tensor, optional): Accepted for interface parity but
                not used — guidance lives in ``self.guide_convs`` weights.

        Returns:
            Tensor: Gated features of shape (B, out_channels, H, W).
        """
        B, C, H, W = x.shape

        embed = self.embed_conv(x) if self.embed_conv is not None else x
        # Split the embedding into one slice per head.
        embed = list(embed.split(self.split_channels, 1))
        # Bx(MxN)xHxW (M: heads, N: guide_channels)
        attn_weight = torch.cat(
            [conv(x) for conv, x in zip(self.guide_convs, embed)], dim=1)
        # BxMxNxHxW
        attn_weight = attn_weight.view(B, self.num_heads, -1, H, W)
        # attn_weight = torch.stack(
        #     [conv(x) for conv, x in zip(self.guide_convs, embed)])
        # BxMxNxHxW -> BxMxHxW: max over guide entries + scaled-dot norm.
        attn_weight = attn_weight.max(dim=2)[0] / (self.head_channels**0.5)
        attn_weight = (attn_weight + self.bias.view(1, -1, 1, 1)).sigmoid()
        # .transpose(0, 1)
        # BxMx1xHxW
        attn_weight = attn_weight[:, :, None]
        x = self.project_conv(x)
        # BxMxcxHxW (M * c == out_channels)
        x = x.view(B, self.num_heads, -1, H, W)
        x = x * attn_weight
        x = x.view(B, -1, H, W)
        return x
@MODELS.register_module()
class MaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers.

    Extends the YOLOv8 CSP layer with a :class:`MaxSigmoidAttnBlock`
    branch whose output is concatenated before the final 1x1 conv.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 num_heads: int = 1,
                 expand_ratio: float = 0.5,
                 num_blocks: int = 1,
                 with_scale: bool = False,
                 add_identity: bool = True,  # shortcut
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # 2 split halves + num_blocks bottleneck outputs + 1 attention branch.
        fuse_channels = (3 + num_blocks) * self.mid_channels
        self.final_conv = ConvModule(fuse_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)
        self.attn_block = MaxSigmoidAttnBlock(self.mid_channels,
                                              self.mid_channels,
                                              guide_channels=guide_channels,
                                              embed_channels=embed_channels,
                                              num_heads=num_heads,
                                              with_scale=with_scale,
                                              conv_cfg=conv_cfg,
                                              norm_cfg=norm_cfg,
                                              use_einsum=use_einsum)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Split, run CSP bottlenecks, append attention branch, fuse."""
        feats = list(
            self.main_conv(x).split((self.mid_channels, self.mid_channels),
                                    1))
        for bottleneck in self.blocks:
            feats.append(bottleneck(feats[-1]))
        feats.append(self.attn_block(feats[-1], guide))
        return self.final_conv(torch.cat(feats, 1))
@MODELS.register_module()
class RepMaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers.

    Same layout as :class:`MaxSigmoidCSPLayerWithTwoConv`, but the attention
    branch is a :class:`RepMatrixMaxSigmoidAttnBlock`.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 num_heads: int = 1,
                 expand_ratio: float = 0.5,
                 num_blocks: int = 1,
                 with_scale: bool = False,
                 add_identity: bool = True,  # shortcut
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # 2 split halves + num_blocks bottleneck outputs + 1 attention branch.
        fuse_channels = (3 + num_blocks) * self.mid_channels
        self.final_conv = ConvModule(fuse_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)
        self.attn_block = RepMatrixMaxSigmoidAttnBlock(
            self.mid_channels,
            self.mid_channels,
            embed_channels=embed_channels,
            guide_channels=guide_channels,
            num_heads=num_heads,
            with_scale=with_scale,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            use_einsum=use_einsum)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Split, run CSP bottlenecks, append attention branch, fuse."""
        feats = list(
            self.main_conv(x).split((self.mid_channels, self.mid_channels),
                                    1))
        for bottleneck in self.blocks:
            feats.append(bottleneck(feats[-1]))
        feats.append(self.attn_block(feats[-1], guide))
        return self.final_conv(torch.cat(feats, 1))
@MODELS.register_module()
class RepConvMaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers.

    Same layout as :class:`MaxSigmoidCSPLayerWithTwoConv`, but the attention
    branch is a :class:`RepConvMaxSigmoidAttnBlock`.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 num_heads: int = 1,
                 expand_ratio: float = 0.5,
                 num_blocks: int = 1,
                 with_scale: bool = False,
                 add_identity: bool = True,  # shortcut
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # 2 split halves + num_blocks bottleneck outputs + 1 attention branch.
        fuse_channels = (3 + num_blocks) * self.mid_channels
        self.final_conv = ConvModule(fuse_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)
        self.attn_block = RepConvMaxSigmoidAttnBlock(
            self.mid_channels,
            self.mid_channels,
            embed_channels=embed_channels,
            guide_channels=guide_channels,
            num_heads=num_heads,
            with_scale=with_scale,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            use_einsum=use_einsum)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Split, run CSP bottlenecks, append attention branch, fuse."""
        feats = list(
            self.main_conv(x).split((self.mid_channels, self.mid_channels),
                                    1))
        for bottleneck in self.blocks:
            feats.append(bottleneck(feats[-1]))
        feats.append(self.attn_block(feats[-1], guide))
        return self.final_conv(torch.cat(feats, 1))
@MODELS.register_module()
class ImagePoolingAttentionModule(nn.Module):
    """Multi-head attention that updates text features from pooled images.

    Each image level is projected to ``embed_channels``, max-pooled to a
    fixed ``pool_size x pool_size`` grid, and used as keys/values; the text
    features are the queries. The attended result is projected back to
    ``text_channels`` and added residually to the input text features.
    """

    def __init__(self,
                 image_channels: List[int],
                 text_channels: int,
                 embed_channels: int,
                 with_scale: bool = False,
                 num_feats: int = 3,
                 num_heads: int = 8,
                 pool_size: int = 3,
                 use_einsum: bool = True):
        """Initialize the module.

        Args:
            image_channels (List[int]): Channels of each image feature level.
            text_channels (int): Channels of the text features.
            embed_channels (int): Attention embedding width; assumed to be
                divisible by ``num_heads`` — TODO confirm (not asserted).
            with_scale (bool): Learn a scalar gate on the residual update
                (initialized to 0, i.e. identity at start of training).
            num_feats (int): Number of image feature levels expected.
            num_heads (int): Number of attention heads.
            pool_size (int): Output side length of the adaptive max pools.
            use_einsum (bool): Use ``torch.einsum``; otherwise an equivalent
                permute/matmul path.
        """
        super().__init__()

        self.text_channels = text_channels
        self.embed_channels = embed_channels
        self.num_heads = num_heads
        self.num_feats = num_feats
        self.head_channels = embed_channels // num_heads
        self.pool_size = pool_size
        self.use_einsum = use_einsum
        if with_scale:
            self.scale = nn.Parameter(torch.tensor([0.]), requires_grad=True)
        else:
            self.scale = 1.0
        # 1x1 projections bringing every image level to embed_channels.
        self.projections = nn.ModuleList([
            ConvModule(in_channels, embed_channels, 1, act_cfg=None)
            for in_channels in image_channels
        ])
        # Pre-norm + linear projections for Q (text) and K/V (image).
        self.query = nn.Sequential(nn.LayerNorm(text_channels),
                                   Linear(text_channels, embed_channels))
        self.key = nn.Sequential(nn.LayerNorm(embed_channels),
                                 Linear(embed_channels, embed_channels))
        self.value = nn.Sequential(nn.LayerNorm(embed_channels),
                                   Linear(embed_channels, embed_channels))
        self.proj = Linear(embed_channels, text_channels)
        # One fixed-size max pool per image level.
        self.image_pools = nn.ModuleList([
            nn.AdaptiveMaxPool2d((pool_size, pool_size))
            for _ in range(num_feats)
        ])

    def forward(self, text_features, image_features):
        """Attend text features over pooled multi-level image features.

        Args:
            text_features (Tensor): Shape (B, N, text_channels).
            image_features (List[Tensor]): ``num_feats`` maps, level ``i``
                of shape (B, image_channels[i], H_i, W_i).

        Returns:
            Tensor: Updated text features, same shape as ``text_features``.
        """
        B = image_features[0].shape[0]
        assert len(image_features) == self.num_feats
        num_patches = self.pool_size**2
        # Project each level, pool to pool_size^2 patches, flatten spatial.
        mlvl_image_features = [
            pool(proj(x)).view(B, -1, num_patches)
            for (x, proj, pool) in zip(image_features, self.projections,
                                       self.image_pools)
        ]
        # (B, num_feats * num_patches, embed_channels)
        mlvl_image_features = torch.cat(mlvl_image_features,
                                        dim=-1).transpose(1, 2)
        q = self.query(text_features)
        k = self.key(mlvl_image_features)
        v = self.value(mlvl_image_features)

        q = q.reshape(B, -1, self.num_heads, self.head_channels)
        k = k.reshape(B, -1, self.num_heads, self.head_channels)
        v = v.reshape(B, -1, self.num_heads, self.head_channels)
        if self.use_einsum:
            # (B, N, M, C) x (B, K, M, C) -> (B, M, N, K)
            attn_weight = torch.einsum('bnmc,bkmc->bmnk', q, k)
        else:
            q = q.permute(0, 2, 1, 3)
            k = k.permute(0, 2, 3, 1)
            attn_weight = torch.matmul(q, k)

        # Scaled-dot softmax over the image patches.
        attn_weight = attn_weight / (self.head_channels**0.5)
        attn_weight = F.softmax(attn_weight, dim=-1)
        if self.use_einsum:
            # (B, M, N, K) x (B, K, M, C) -> (B, N, M, C)
            x = torch.einsum('bmnk,bkmc->bnmc', attn_weight, v)
        else:
            v = v.permute(0, 2, 1, 3)
            x = torch.matmul(attn_weight, v)
            x = x.permute(0, 2, 1, 3)
        # Merge heads, project back to text width, residual (gated) add.
        x = self.proj(x.reshape(B, -1, self.embed_channels))
        return x * self.scale + text_features
@MODELS.register_module()
class VanillaSigmoidBlock(BaseModule):
    """Sigmoid attention block (attention deliberately disabled).

    Keeps the interface of the attention blocks but only applies
    ``project_conv``; the sigmoid gating was intentionally removed, and the
    guide/embedding arguments are accepted for compatibility only.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(init_cfg=init_cfg)
        conv_layer = DepthwiseSeparableConvModule if use_depthwise \
            else ConvModule

        assert (out_channels % num_heads == 0
                and embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads

        self.project_conv = conv_layer(in_channels,
                                       out_channels,
                                       kernel_size,
                                       stride=1,
                                       padding=padding,
                                       conv_cfg=conv_cfg,
                                       norm_cfg=norm_cfg,
                                       act_cfg=None)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Project the input; ``guide`` is ignored (sigmoid gate removed)."""
        # remove sigmoid
        # x = x * x.sigmoid()
        return self.project_conv(x)
@MODELS.register_module()
class EfficientCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers.

    Efficient variant whose extra branch is a :class:`VanillaSigmoidBlock`
    (a plain projection — no text attention at runtime).
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 num_heads: int = 1,
                 expand_ratio: float = 0.5,
                 num_blocks: int = 1,
                 with_scale: bool = False,
                 add_identity: bool = True,  # shortcut
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # 2 split halves + num_blocks bottleneck outputs + 1 attention branch.
        fuse_channels = (3 + num_blocks) * self.mid_channels
        self.final_conv = ConvModule(fuse_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)
        self.attn_block = VanillaSigmoidBlock(self.mid_channels,
                                              self.mid_channels,
                                              guide_channels=guide_channels,
                                              embed_channels=embed_channels,
                                              num_heads=num_heads,
                                              with_scale=with_scale,
                                              conv_cfg=conv_cfg,
                                              norm_cfg=norm_cfg)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Split, run CSP bottlenecks, append projection branch, fuse."""
        feats = list(
            self.main_conv(x).split((self.mid_channels, self.mid_channels),
                                    1))
        for bottleneck in self.blocks:
            feats.append(bottleneck(feats[-1]))
        feats.append(self.attn_block(feats[-1], guide))
        return self.final_conv(torch.cat(feats, 1))
yolo_world/models/losses/__init__.py
0 → 100644
View file @
e9cee049
# Copyright (c) Tencent Inc. All rights reserved.
from
.dynamic_loss
import
CoVMSELoss
__all__
=
[
'CoVMSELoss'
]
yolo_world/models/losses/dynamic_loss.py
0 → 100644
View file @
e9cee049
# Copyright (c) Tencent Inc. All rights reserved.
from
typing
import
Optional
import
torch
import
torch.nn
as
nn
from
torch
import
Tensor
from
mmdet.models.losses.mse_loss
import
mse_loss
from
mmyolo.registry
import
MODELS
@MODELS.register_module()
class CoVMSELoss(nn.Module):
    """MSE loss on the coefficient of variation (std / mean) of ``pred``.

    Pushes the coefficient of variation along ``dim`` toward zero, i.e.
    encourages the predictions to be uniform along that dimension.
    """

    def __init__(self,
                 dim: int = 0,
                 reduction: str = 'mean',
                 loss_weight: float = 1.0,
                 eps: float = 1e-6) -> None:
        super().__init__()
        # Dimension over which std/mean are computed.
        self.dim = dim
        self.reduction = reduction
        self.loss_weight = loss_weight
        # Lower bound on the mean to avoid division by ~0.
        self.eps = eps

    def forward(self,
                pred: Tensor,
                weight: Optional[Tensor] = None,
                avg_factor: Optional[int] = None,
                reduction_override: Optional[str] = None) -> Tensor:
        """Forward function of loss.

        Args:
            pred (Tensor): Predictions.
            weight (Tensor, optional): Element-wise loss weights.
            avg_factor (int, optional): Normalization factor for 'mean'.
            reduction_override (str, optional): Overrides ``self.reduction``.

        Returns:
            Tensor: Weighted MSE between the coefficient of variation and 0.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = reduction_override if reduction_override \
            else self.reduction
        # NOTE: clamp is applied to the mean itself, so a negative mean is
        # clamped up to eps (original behavior, preserved).
        safe_mean = pred.mean(self.dim).clamp(min=self.eps)
        cov = pred.std(self.dim) / safe_mean
        zero_target = torch.zeros_like(cov)
        return self.loss_weight * mse_loss(cov,
                                           zero_target,
                                           weight,
                                           reduction=reduction,
                                           avg_factor=avg_factor)
yolo_world/models/necks/__init__.py
0 → 100644
View file @
e9cee049
# Copyright (c) Tencent Inc. All rights reserved.
from
.yolo_world_pafpn
import
YOLOWorldPAFPN
,
YOLOWorldDualPAFPN
__all__
=
[
'YOLOWorldPAFPN'
,
'YOLOWorldDualPAFPN'
]
yolo_world/models/necks/yolo_world_pafpn.py
0 → 100644
View file @
e9cee049
# Copyright (c) Tencent Inc. All rights reserved.
import
copy
from
typing
import
List
,
Union
import
torch
import
torch.nn
as
nn
from
torch
import
Tensor
from
mmdet.utils
import
ConfigType
,
OptMultiConfig
from
mmyolo.registry
import
MODELS
from
mmyolo.models.utils
import
make_divisible
,
make_round
from
mmyolo.models.necks.yolov8_pafpn
import
YOLOv8PAFPN
@MODELS.register_module()
class YOLOWorldPAFPN(YOLOv8PAFPN):
    """Path Aggregation Network used in YOLO World.

    Following YOLOv8 PAFPN, including text to image fusion: the top-down and
    bottom-up CSP layers are built from ``block_cfg`` (attention-capable
    blocks that also take text features) instead of plain CSP layers.
    """

    def __init__(self,
                 in_channels: List[int],
                 out_channels: Union[List[int], int],
                 guide_channels: int,
                 embed_channels: List[int],
                 num_heads: List[int],
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 num_csp_blocks: int = 3,
                 freeze_all: bool = False,
                 block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'),
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None) -> None:
        """Initialize the neck.

        Args:
            in_channels (List[int]): Channels of each input feature level.
            out_channels (Union[List[int], int]): Output channels.
            guide_channels (int): Channels of the text guide features.
            embed_channels (List[int]): Attention embedding width per level.
            num_heads (List[int]): Attention heads per level.
            deepen_factor / widen_factor (float): Depth/width multipliers.
            num_csp_blocks (int): CSP bottlenecks per fusion layer.
            freeze_all (bool): Freeze all parameters.
            block_cfg (ConfigType): Config of the fusion block to build for
                top-down/bottom-up layers.
            norm_cfg / act_cfg / init_cfg: Standard mm-series configs.
        """
        # These must be set before super().__init__, which calls the
        # build_*_layer hooks below during construction.
        self.guide_channels = guide_channels
        self.embed_channels = embed_channels
        self.num_heads = num_heads
        self.block_cfg = block_cfg
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         deepen_factor=deepen_factor,
                         widen_factor=widen_factor,
                         num_csp_blocks=num_csp_blocks,
                         freeze_all=freeze_all,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)

    def build_top_down_layer(self, idx: int) -> nn.Module:
        """build top down layer.

        Args:
            idx (int): layer idx.

        Returns:
            nn.Module: The top down layer.
        """
        # Copy so per-layer updates don't leak into the shared config.
        block_cfg = copy.deepcopy(self.block_cfg)
        block_cfg.update(
            dict(in_channels=make_divisible(
                (self.in_channels[idx - 1] + self.in_channels[idx]),
                self.widen_factor),
                 out_channels=make_divisible(self.out_channels[idx - 1],
                                             self.widen_factor),
                 guide_channels=self.guide_channels,
                 embed_channels=make_round(self.embed_channels[idx - 1],
                                           self.widen_factor),
                 num_heads=make_round(self.num_heads[idx - 1],
                                      self.widen_factor),
                 num_blocks=make_round(self.num_csp_blocks,
                                       self.deepen_factor),
                 add_identity=False,
                 norm_cfg=self.norm_cfg,
                 act_cfg=self.act_cfg))
        return MODELS.build(block_cfg)

    def build_bottom_up_layer(self, idx: int) -> nn.Module:
        """build bottom up layer.

        Args:
            idx (int): layer idx.

        Returns:
            nn.Module: The bottom up layer.
        """
        # Copy so per-layer updates don't leak into the shared config.
        block_cfg = copy.deepcopy(self.block_cfg)
        block_cfg.update(
            dict(in_channels=make_divisible(
                (self.out_channels[idx] + self.out_channels[idx + 1]),
                self.widen_factor),
                 out_channels=make_divisible(self.out_channels[idx + 1],
                                             self.widen_factor),
                 guide_channels=self.guide_channels,
                 embed_channels=make_round(self.embed_channels[idx + 1],
                                           self.widen_factor),
                 num_heads=make_round(self.num_heads[idx + 1],
                                      self.widen_factor),
                 num_blocks=make_round(self.num_csp_blocks,
                                       self.deepen_factor),
                 add_identity=False,
                 norm_cfg=self.norm_cfg,
                 act_cfg=self.act_cfg))
        return MODELS.build(block_cfg)

    def forward(self, img_feats: List[Tensor], txt_feats: Tensor = None) -> tuple:
        """Forward function.

        Args:
            img_feats (List[Tensor]): Multi-level image features, one per
                entry of ``self.in_channels``.
            txt_feats (Tensor): Text features (BxLxD) passed to every
                top-down/bottom-up fusion layer.

        Returns:
            tuple: Output feature maps, one per input level.
        """
        assert len(img_feats) == len(self.in_channels)
        # reduce layers
        reduce_outs = []
        for idx in range(len(self.in_channels)):
            reduce_outs.append(self.reduce_layers[idx](img_feats[idx]))

        # top-down path: iterate from the deepest level upward, fusing the
        # upsampled higher-level feature with the next lower level.
        inner_outs = [reduce_outs[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = reduce_outs[idx - 1]
            upsample_feat = self.upsample_layers[len(self.in_channels) - 1 -
                                                 idx](feat_high)
            if self.upsample_feats_cat_first:
                top_down_layer_inputs = torch.cat([upsample_feat, feat_low],
                                                  1)
            else:
                top_down_layer_inputs = torch.cat([feat_low, upsample_feat],
                                                  1)
            inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx](
                top_down_layer_inputs, txt_feats)
            inner_outs.insert(0, inner_out)

        # bottom-up path: downsample and fuse back down the pyramid.
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsample_layers[idx](feat_low)
            out = self.bottom_up_layers[idx](torch.cat(
                [downsample_feat, feat_high], 1), txt_feats)
            outs.append(out)

        # out_layers
        results = []
        for idx in range(len(self.in_channels)):
            results.append(self.out_layers[idx](outs[idx]))
        return tuple(results)
@MODELS.register_module()
class YOLOWorldDualPAFPN(YOLOWorldPAFPN):
    """Path Aggregation Network used in YOLO World v8.

    Extends :class:`YOLOWorldPAFPN` with an image-pooling text enhancer:
    after the top-down path, the text features are updated from the
    multi-level image features before the bottom-up path consumes them.
    """

    def __init__(self,
                 in_channels: List[int],
                 out_channels: Union[List[int], int],
                 guide_channels: int,
                 embed_channels: List[int],
                 num_heads: List[int],
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 num_csp_blocks: int = 3,
                 freeze_all: bool = False,
                 # NOTE: keyword kept as ``text_enhancder`` (sic) for
                 # backward compatibility with existing configs.
                 text_enhancder: ConfigType = dict(
                     type='ImagePoolingAttentionModule',
                     embed_channels=256,
                     num_heads=8,
                     pool_size=3),
                 block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'),
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None) -> None:
        """Initialize the neck; see :class:`YOLOWorldPAFPN` for shared args.

        Args:
            text_enhancder (ConfigType): Config of the text enhancer module;
                ``image_channels``, ``text_channels`` and ``num_feats`` are
                filled in automatically from the neck configuration.
        """
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         guide_channels=guide_channels,
                         embed_channels=embed_channels,
                         num_heads=num_heads,
                         deepen_factor=deepen_factor,
                         widen_factor=widen_factor,
                         num_csp_blocks=num_csp_blocks,
                         freeze_all=freeze_all,
                         block_cfg=block_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # Fix: work on a copy — the original code mutated the caller's dict
        # (and the shared mutable default argument) in place via ``update``.
        enhancer_cfg = copy.deepcopy(text_enhancder)
        enhancer_cfg.update(
            dict(
                image_channels=[int(x * widen_factor) for x in out_channels],
                text_channels=guide_channels,
                num_feats=len(out_channels),
            ))
        # Fix: removed a leftover debug ``print`` of the enhancer config.
        self.text_enhancer = MODELS.build(enhancer_cfg)

    def forward(self, img_feats: List[Tensor], txt_feats: Tensor) -> tuple:
        """Forward function.

        Same as :class:`YOLOWorldPAFPN.forward`, except that ``txt_feats``
        is refreshed by ``self.text_enhancer`` between the top-down and
        bottom-up paths.
        """
        assert len(img_feats) == len(self.in_channels)
        # reduce layers
        reduce_outs = []
        for idx in range(len(self.in_channels)):
            reduce_outs.append(self.reduce_layers[idx](img_feats[idx]))

        # top-down path
        inner_outs = [reduce_outs[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = reduce_outs[idx - 1]
            upsample_feat = self.upsample_layers[len(self.in_channels) - 1 -
                                                 idx](feat_high)
            if self.upsample_feats_cat_first:
                top_down_layer_inputs = torch.cat([upsample_feat, feat_low],
                                                  1)
            else:
                top_down_layer_inputs = torch.cat([feat_low, upsample_feat],
                                                  1)
            inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx](
                top_down_layer_inputs, txt_feats)
            inner_outs.insert(0, inner_out)

        # Update text features from the fused image pyramid before the
        # bottom-up path uses them.
        txt_feats = self.text_enhancer(txt_feats, inner_outs)

        # bottom-up path
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsample_layers[idx](feat_low)
            out = self.bottom_up_layers[idx](torch.cat(
                [downsample_feat, feat_high], 1), txt_feats)
            outs.append(out)

        # out_layers
        results = []
        for idx in range(len(self.in_channels)):
            results.append(self.out_layers[idx](outs[idx]))
        return tuple(results)
yolo_world/version.py
0 → 100644
View file @
e9cee049
# Copyright (c) Tencent Inc. All rights reserved.
from
yolo_world
import
__version__
def __version_info() -> tuple:
    """Parse a version string into a tuple.

    Returns:
        tuple[int | str]: The version info, e.g., "1.3.0" is parsed into
        (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1').
    """
    parts = []
    for segment in __version__.split('.'):
        if segment.isdigit():
            # Plain numeric component, e.g. "1" or "13".
            parts.append(int(segment))
        elif 'rc' in segment:
            # Release-candidate component, e.g. "0rc1" -> 0, 'rc1'.
            pieces = segment.split('rc')
            parts.append(int(pieces[0]))
            parts.append(f'rc{pieces[1]}')
    return tuple(parts)


version_info = __version_info()

__all__ = ['__version__', 'version_info']
Prev
1
…
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment