Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
YOLO-World_pytorch
Commits
e9cee049
Commit
e9cee049
authored
May 31, 2024
by
luopl
Browse files
Initial commit
parents
Pipeline
#1056
canceled with stages
Changes
166
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
905 additions
and
0 deletions
+905
-0
yolo_world/models/layers/yolo_bricks.py
yolo_world/models/layers/yolo_bricks.py
+601
-0
yolo_world/models/losses/__init__.py
yolo_world/models/losses/__init__.py
+4
-0
yolo_world/models/losses/dynamic_loss.py
yolo_world/models/losses/dynamic_loss.py
+38
-0
yolo_world/models/necks/__init__.py
yolo_world/models/necks/__init__.py
+4
-0
yolo_world/models/necks/yolo_world_pafpn.py
yolo_world/models/necks/yolo_world_pafpn.py
+235
-0
yolo_world/version.py
yolo_world/version.py
+23
-0
No files found.
yolo_world/models/layers/yolo_bricks.py
0 → 100644
View file @
e9cee049
# Copyright (c) Tencent Inc. All rights reserved.
from
typing
import
List
import
torch
import
torch.nn
as
nn
from
torch
import
Tensor
import
torch.nn.functional
as
F
from
mmcv.cnn
import
ConvModule
,
DepthwiseSeparableConvModule
,
Linear
from
mmdet.utils
import
ConfigType
,
OptConfigType
,
OptMultiConfig
from
mmengine.model
import
BaseModule
from
mmyolo.registry
import
MODELS
from
mmyolo.models.layers
import
CSPLayerWithTwoConv
@MODELS.register_module()
class MaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block.

    Builds a per-head spatial attention map by matching embedded image
    features against projected text ``guide`` features, takes the max over
    guide entries, squashes it with a sigmoid, and uses it to gate the
    projected image features (text-to-image fusion).
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        """Initialize the block.

        Args:
            in_channels (int): Channels of the input image features.
            out_channels (int): Channels of the gated output features.
            guide_channels (int): Channels of the text guide features.
            embed_channels (int): Width of the matching embedding space.
            kernel_size (int): Kernel size of ``project_conv``.
            padding (int): Padding of ``project_conv``.
            num_heads (int): Number of attention heads; must divide both
                ``out_channels`` and ``embed_channels``.
            use_depthwise (bool): Use a depthwise-separable projection conv.
            with_scale (bool): Learn a per-head scale on the attention map.
            conv_cfg / norm_cfg / init_cfg: Standard mm-series configs.
            use_einsum (bool): Use ``torch.einsum`` for the similarity;
                otherwise an equivalent permute/matmul path (e.g. for
                export backends without einsum support).
        """
        super().__init__(init_cfg=init_cfg)
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule

        assert (out_channels % num_heads == 0
                and embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        self.use_einsum = use_einsum

        # A 1x1 embedding conv is only needed when widths differ; otherwise
        # the raw input doubles as the embedding.
        self.embed_conv = ConvModule(
            in_channels,
            embed_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None) if embed_channels != in_channels else None
        # Projects text guide features into the shared embedding space.
        self.guide_fc = Linear(guide_channels, embed_channels)
        # Per-head additive bias applied to the attention logits.
        self.bias = nn.Parameter(torch.zeros(num_heads))
        if with_scale:
            # Learnable per-head multiplicative scale on the attention map.
            self.scale = nn.Parameter(torch.ones(1, num_heads, 1, 1))
        else:
            self.scale = 1.0

        self.project_conv = conv(in_channels,
                                 out_channels,
                                 kernel_size,
                                 stride=1,
                                 padding=padding,
                                 conv_cfg=conv_cfg,
                                 norm_cfg=norm_cfg,
                                 act_cfg=None)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Forward process.

        Args:
            x (Tensor): Image features of shape (B, in_channels, H, W).
            guide (Tensor): Text guide features; projected by ``guide_fc``
                and reshaped to (B, N, num_heads, head_channels), so it is
                assumed to be (B, N, guide_channels) — TODO confirm with
                callers.

        Returns:
            Tensor: Gated features of shape (B, out_channels, H, W).
        """
        B, _, H, W = x.shape

        guide = self.guide_fc(guide)
        guide = guide.reshape(B, -1, self.num_heads, self.head_channels)
        embed = self.embed_conv(x) if self.embed_conv is not None else x
        embed = embed.reshape(B, self.num_heads, self.head_channels, H, W)

        if self.use_einsum:
            # (B, M, C, H, W) x (B, N, M, C) -> (B, M, H, W, N):
            # per-head dot product between every pixel and every guide entry.
            attn_weight = torch.einsum('bmchw,bnmc->bmhwn', embed, guide)
        else:
            # Equivalent matmul path for backends without einsum.
            batch, m, channel, height, width = embed.shape
            _, n, _, _ = guide.shape
            embed = embed.permute(0, 1, 3, 4, 2)
            embed = embed.reshape(batch, m, -1, channel)
            guide = guide.permute(0, 2, 3, 1)
            attn_weight = torch.matmul(embed, guide)
            attn_weight = attn_weight.reshape(batch, m, height, width, n)

        # Max over guide entries, scaled-dot normalization, bias, sigmoid.
        attn_weight = attn_weight.max(dim=-1)[0]
        attn_weight = attn_weight / (self.head_channels**0.5)
        attn_weight = attn_weight + self.bias[None, :, None, None]
        attn_weight = attn_weight.sigmoid() * self.scale

        # Gate the projected features per head, then flatten heads back.
        x = self.project_conv(x)
        x = x.reshape(B, self.num_heads, -1, H, W)
        x = x * attn_weight.unsqueeze(2)
        x = x.reshape(B, -1, H, W)
        return x
@MODELS.register_module()
class RepMatrixMaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block (reparameterized-matrix variant).

    Variant of :class:`MaxSigmoidAttnBlock` whose guidance is a learned
    parameter matrix (``guide_weight``) instead of features projected from
    the text input; accordingly ``forward`` never reads its ``txt_feats``
    argument.

    NOTE(review): ``guide_weight`` is created with 3 dimensions, but
    ``forward`` unpacks 4 values from its shape, which would raise
    ``ValueError`` if this path were executed; ``with_scale`` and
    ``use_einsum`` are accepted but never used. This block appears
    experimental/unused — verify before relying on it.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 embed_channels: int,
                 guide_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        """Initialize the block; see class docstring for semantics."""
        super().__init__(init_cfg=init_cfg)
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule

        assert (out_channels % num_heads == 0
                and embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        self.use_einsum = use_einsum  # NOTE(review): stored but unused below.

        self.embed_conv = ConvModule(
            in_channels,
            embed_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None) if embed_channels != in_channels else None
        # Per-head additive bias applied to the attention logits.
        self.bias = nn.Parameter(torch.zeros(num_heads))
        # Learned guidance matrix replacing the text projection.
        self.guide_weight = nn.Parameter(
            torch.zeros(guide_channels, embed_channels // num_heads,
                        num_heads))
        self.project_conv = conv(in_channels,
                                 out_channels,
                                 kernel_size,
                                 stride=1,
                                 padding=padding,
                                 conv_cfg=conv_cfg,
                                 norm_cfg=norm_cfg,
                                 act_cfg=None)

    def forward(self, x: Tensor, txt_feats: Tensor = None) -> Tensor:
        """Forward process.

        Args:
            x (Tensor): Image features of shape (B, in_channels, H, W).
            txt_feats (Tensor, optional): Accepted for interface parity but
                not used — guidance comes from ``self.guide_weight``.

        Returns:
            Tensor: Gated features of shape (B, out_channels, H, W).
        """
        B, _, H, W = x.shape

        embed = self.embed_conv(x) if self.embed_conv is not None else x
        embed = embed.reshape(B, self.num_heads, self.head_channels, H, W)

        batch, m, channel, height, width = embed.shape
        # NOTE(review): guide_weight has 3 dims but 4 values are unpacked
        # here — would raise ValueError at runtime; confirm intended shape.
        _, n, _, _ = self.guide_weight.shape
        # can be formulated to split conv
        embed = embed.permute(0, 1, 3, 4, 2)
        embed = embed.reshape(batch, m, -1, channel)
        attn_weight = torch.matmul(embed, self.guide_weight)
        attn_weight = attn_weight.reshape(batch, m, height, width, n)

        # Max over guide entries, scaled-dot normalization, bias, sigmoid.
        attn_weight = attn_weight.max(dim=-1)[0]
        attn_weight = attn_weight / (self.head_channels**0.5)
        attn_weight = attn_weight + self.bias[None, :, None, None]
        attn_weight = attn_weight.sigmoid()

        # Gate the projected features per head, then flatten heads back.
        x = self.project_conv(x)
        x = x.reshape(B, self.num_heads, -1, H, W)
        x = x * attn_weight.unsqueeze(2)
        x = x.reshape(B, -1, H, W)
        return x
@MODELS.register_module()
class RepConvMaxSigmoidAttnBlock(BaseModule):
    """Max Sigmoid attention block (reparameterized-conv variant).

    Variant of :class:`MaxSigmoidAttnBlock` in which the guidance is baked
    into per-head 1x1 convolutions (``guide_convs``) rather than projected
    from a text input, so ``forward`` never reads its ``txt_feats``
    argument.

    NOTE(review): ``with_scale`` and ``use_einsum`` are accepted but never
    used by this class.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 embed_channels: int,
                 guide_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        """Initialize the block; see class docstring for semantics."""
        super().__init__(init_cfg=init_cfg)
        conv = DepthwiseSeparableConvModule if use_depthwise else ConvModule

        assert (out_channels % num_heads == 0
                and embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads
        self.use_einsum = use_einsum  # NOTE(review): stored but unused.

        self.embed_conv = ConvModule(
            in_channels,
            embed_channels,
            1,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            act_cfg=None) if embed_channels != in_channels else None
        # Per-head additive bias applied to the attention logits.
        self.bias = nn.Parameter(torch.zeros(num_heads))
        self.num_heads = num_heads  # NOTE(review): duplicate assignment.
        self.split_channels = embed_channels // num_heads
        # One 1x1 conv per head maps that head's embedding slice to
        # ``guide_channels`` attention logits per pixel.
        self.guide_convs = nn.ModuleList(
            nn.Conv2d(self.split_channels, guide_channels, 1, bias=False)
            for _ in range(num_heads))
        self.project_conv = conv(in_channels,
                                 out_channels,
                                 kernel_size,
                                 stride=1,
                                 padding=padding,
                                 conv_cfg=conv_cfg,
                                 norm_cfg=norm_cfg,
                                 act_cfg=None)

    def forward(self, x: Tensor, txt_feats: Tensor = None) -> Tensor:
        """Forward process.

        Args:
            x (Tensor): Image features of shape (B, in_channels, H, W).
            txt_feats (Tensor, optional): Accepted for interface parity but
                not used — guidance lives in ``self.guide_convs`` weights.

        Returns:
            Tensor: Gated features of shape (B, out_channels, H, W).
        """
        B, C, H, W = x.shape

        embed = self.embed_conv(x) if self.embed_conv is not None else x
        # Split the embedding into one slice per head.
        embed = list(embed.split(self.split_channels, 1))
        # Bx(MxN)xHxW (M: heads, N: guide_channels)
        attn_weight = torch.cat(
            [conv(x) for conv, x in zip(self.guide_convs, embed)], dim=1)
        # BxMxNxHxW
        attn_weight = attn_weight.view(B, self.num_heads, -1, H, W)
        # attn_weight = torch.stack(
        #     [conv(x) for conv, x in zip(self.guide_convs, embed)])
        # BxMxNxHxW -> BxMxHxW: max over guide entries + scaled-dot norm.
        attn_weight = attn_weight.max(dim=2)[0] / (self.head_channels**0.5)
        attn_weight = (attn_weight + self.bias.view(1, -1, 1, 1)).sigmoid()
        # .transpose(0, 1)
        # BxMx1xHxW
        attn_weight = attn_weight[:, :, None]
        x = self.project_conv(x)
        # BxMxcxHxW (M * c == out_channels)
        x = x.view(B, self.num_heads, -1, H, W)
        x = x * attn_weight
        x = x.view(B, -1, H, W)
        return x
@MODELS.register_module()
class MaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers.

    Extends the YOLOv8 CSP layer with a :class:`MaxSigmoidAttnBlock`
    branch whose output is concatenated before the final 1x1 conv.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 num_heads: int = 1,
                 expand_ratio: float = 0.5,
                 num_blocks: int = 1,
                 with_scale: bool = False,
                 add_identity: bool = True,  # shortcut
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # 2 split halves + num_blocks bottleneck outputs + 1 attention branch.
        fuse_channels = (3 + num_blocks) * self.mid_channels
        self.final_conv = ConvModule(fuse_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)
        self.attn_block = MaxSigmoidAttnBlock(self.mid_channels,
                                              self.mid_channels,
                                              guide_channels=guide_channels,
                                              embed_channels=embed_channels,
                                              num_heads=num_heads,
                                              with_scale=with_scale,
                                              conv_cfg=conv_cfg,
                                              norm_cfg=norm_cfg,
                                              use_einsum=use_einsum)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Split, run CSP bottlenecks, append attention branch, fuse."""
        feats = list(
            self.main_conv(x).split((self.mid_channels, self.mid_channels),
                                    1))
        for bottleneck in self.blocks:
            feats.append(bottleneck(feats[-1]))
        feats.append(self.attn_block(feats[-1], guide))
        return self.final_conv(torch.cat(feats, 1))
@MODELS.register_module()
class RepMaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers.

    Same layout as :class:`MaxSigmoidCSPLayerWithTwoConv`, but the attention
    branch is a :class:`RepMatrixMaxSigmoidAttnBlock`.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 num_heads: int = 1,
                 expand_ratio: float = 0.5,
                 num_blocks: int = 1,
                 with_scale: bool = False,
                 add_identity: bool = True,  # shortcut
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # 2 split halves + num_blocks bottleneck outputs + 1 attention branch.
        fuse_channels = (3 + num_blocks) * self.mid_channels
        self.final_conv = ConvModule(fuse_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)
        self.attn_block = RepMatrixMaxSigmoidAttnBlock(
            self.mid_channels,
            self.mid_channels,
            embed_channels=embed_channels,
            guide_channels=guide_channels,
            num_heads=num_heads,
            with_scale=with_scale,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            use_einsum=use_einsum)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Split, run CSP bottlenecks, append attention branch, fuse."""
        feats = list(
            self.main_conv(x).split((self.mid_channels, self.mid_channels),
                                    1))
        for bottleneck in self.blocks:
            feats.append(bottleneck(feats[-1]))
        feats.append(self.attn_block(feats[-1], guide))
        return self.final_conv(torch.cat(feats, 1))
@MODELS.register_module()
class RepConvMaxSigmoidCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers.

    Same layout as :class:`MaxSigmoidCSPLayerWithTwoConv`, but the attention
    branch is a :class:`RepConvMaxSigmoidAttnBlock`.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 num_heads: int = 1,
                 expand_ratio: float = 0.5,
                 num_blocks: int = 1,
                 with_scale: bool = False,
                 add_identity: bool = True,  # shortcut
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None,
                 use_einsum: bool = True) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # 2 split halves + num_blocks bottleneck outputs + 1 attention branch.
        fuse_channels = (3 + num_blocks) * self.mid_channels
        self.final_conv = ConvModule(fuse_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)
        self.attn_block = RepConvMaxSigmoidAttnBlock(
            self.mid_channels,
            self.mid_channels,
            embed_channels=embed_channels,
            guide_channels=guide_channels,
            num_heads=num_heads,
            with_scale=with_scale,
            conv_cfg=conv_cfg,
            norm_cfg=norm_cfg,
            use_einsum=use_einsum)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Split, run CSP bottlenecks, append attention branch, fuse."""
        feats = list(
            self.main_conv(x).split((self.mid_channels, self.mid_channels),
                                    1))
        for bottleneck in self.blocks:
            feats.append(bottleneck(feats[-1]))
        feats.append(self.attn_block(feats[-1], guide))
        return self.final_conv(torch.cat(feats, 1))
@MODELS.register_module()
class ImagePoolingAttentionModule(nn.Module):
    """Multi-head attention that updates text features from pooled images.

    Each image level is projected to ``embed_channels``, max-pooled to a
    fixed ``pool_size x pool_size`` grid, and used as keys/values; the text
    features are the queries. The attended result is projected back to
    ``text_channels`` and added residually to the input text features.
    """

    def __init__(self,
                 image_channels: List[int],
                 text_channels: int,
                 embed_channels: int,
                 with_scale: bool = False,
                 num_feats: int = 3,
                 num_heads: int = 8,
                 pool_size: int = 3,
                 use_einsum: bool = True):
        """Initialize the module.

        Args:
            image_channels (List[int]): Channels of each image feature level.
            text_channels (int): Channels of the text features.
            embed_channels (int): Attention embedding width; assumed to be
                divisible by ``num_heads`` — TODO confirm (not asserted).
            with_scale (bool): Learn a scalar gate on the residual update
                (initialized to 0, i.e. identity at start of training).
            num_feats (int): Number of image feature levels expected.
            num_heads (int): Number of attention heads.
            pool_size (int): Output side length of the adaptive max pools.
            use_einsum (bool): Use ``torch.einsum``; otherwise an equivalent
                permute/matmul path.
        """
        super().__init__()

        self.text_channels = text_channels
        self.embed_channels = embed_channels
        self.num_heads = num_heads
        self.num_feats = num_feats
        self.head_channels = embed_channels // num_heads
        self.pool_size = pool_size
        self.use_einsum = use_einsum
        if with_scale:
            self.scale = nn.Parameter(torch.tensor([0.]), requires_grad=True)
        else:
            self.scale = 1.0
        # 1x1 projections bringing every image level to embed_channels.
        self.projections = nn.ModuleList([
            ConvModule(in_channels, embed_channels, 1, act_cfg=None)
            for in_channels in image_channels
        ])
        # Pre-norm + linear projections for Q (text) and K/V (image).
        self.query = nn.Sequential(nn.LayerNorm(text_channels),
                                   Linear(text_channels, embed_channels))
        self.key = nn.Sequential(nn.LayerNorm(embed_channels),
                                 Linear(embed_channels, embed_channels))
        self.value = nn.Sequential(nn.LayerNorm(embed_channels),
                                   Linear(embed_channels, embed_channels))
        self.proj = Linear(embed_channels, text_channels)
        # One fixed-size max pool per image level.
        self.image_pools = nn.ModuleList([
            nn.AdaptiveMaxPool2d((pool_size, pool_size))
            for _ in range(num_feats)
        ])

    def forward(self, text_features, image_features):
        """Attend text features over pooled multi-level image features.

        Args:
            text_features (Tensor): Shape (B, N, text_channels).
            image_features (List[Tensor]): ``num_feats`` maps, level ``i``
                of shape (B, image_channels[i], H_i, W_i).

        Returns:
            Tensor: Updated text features, same shape as ``text_features``.
        """
        B = image_features[0].shape[0]
        assert len(image_features) == self.num_feats
        num_patches = self.pool_size**2
        # Project each level, pool to pool_size^2 patches, flatten spatial.
        mlvl_image_features = [
            pool(proj(x)).view(B, -1, num_patches)
            for (x, proj, pool) in zip(image_features, self.projections,
                                       self.image_pools)
        ]
        # (B, num_feats * num_patches, embed_channels)
        mlvl_image_features = torch.cat(mlvl_image_features,
                                        dim=-1).transpose(1, 2)
        q = self.query(text_features)
        k = self.key(mlvl_image_features)
        v = self.value(mlvl_image_features)

        q = q.reshape(B, -1, self.num_heads, self.head_channels)
        k = k.reshape(B, -1, self.num_heads, self.head_channels)
        v = v.reshape(B, -1, self.num_heads, self.head_channels)
        if self.use_einsum:
            # (B, N, M, C) x (B, K, M, C) -> (B, M, N, K)
            attn_weight = torch.einsum('bnmc,bkmc->bmnk', q, k)
        else:
            q = q.permute(0, 2, 1, 3)
            k = k.permute(0, 2, 3, 1)
            attn_weight = torch.matmul(q, k)

        # Scaled-dot softmax over the image patches.
        attn_weight = attn_weight / (self.head_channels**0.5)
        attn_weight = F.softmax(attn_weight, dim=-1)
        if self.use_einsum:
            # (B, M, N, K) x (B, K, M, C) -> (B, N, M, C)
            x = torch.einsum('bmnk,bkmc->bnmc', attn_weight, v)
        else:
            v = v.permute(0, 2, 1, 3)
            x = torch.matmul(attn_weight, v)
            x = x.permute(0, 2, 1, 3)
        # Merge heads, project back to text width, residual (gated) add.
        x = self.proj(x.reshape(B, -1, self.embed_channels))
        return x * self.scale + text_features
@MODELS.register_module()
class VanillaSigmoidBlock(BaseModule):
    """Sigmoid attention block (attention deliberately disabled).

    Keeps the interface of the attention blocks but only applies
    ``project_conv``; the sigmoid gating was intentionally removed, and the
    guide/embedding arguments are accepted for compatibility only.
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 kernel_size: int = 3,
                 padding: int = 1,
                 num_heads: int = 1,
                 use_depthwise: bool = False,
                 with_scale: bool = False,
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(init_cfg=init_cfg)
        conv_layer = DepthwiseSeparableConvModule if use_depthwise \
            else ConvModule

        assert (out_channels % num_heads == 0
                and embed_channels % num_heads == 0), \
            'out_channels and embed_channels should be divisible by num_heads.'
        self.num_heads = num_heads
        self.head_channels = out_channels // num_heads

        self.project_conv = conv_layer(in_channels,
                                       out_channels,
                                       kernel_size,
                                       stride=1,
                                       padding=padding,
                                       conv_cfg=conv_cfg,
                                       norm_cfg=norm_cfg,
                                       act_cfg=None)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Project the input; ``guide`` is ignored (sigmoid gate removed)."""
        # remove sigmoid
        # x = x * x.sigmoid()
        return self.project_conv(x)
@MODELS.register_module()
class EfficientCSPLayerWithTwoConv(CSPLayerWithTwoConv):
    """Sigmoid-attention based CSP layer with two convolution layers.

    Efficient variant whose extra branch is a :class:`VanillaSigmoidBlock`
    (a plain projection — no text attention at runtime).
    """

    def __init__(self,
                 in_channels: int,
                 out_channels: int,
                 guide_channels: int,
                 embed_channels: int,
                 num_heads: int = 1,
                 expand_ratio: float = 0.5,
                 num_blocks: int = 1,
                 with_scale: bool = False,
                 add_identity: bool = True,  # shortcut
                 conv_cfg: OptConfigType = None,
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None) -> None:
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         expand_ratio=expand_ratio,
                         num_blocks=num_blocks,
                         add_identity=add_identity,
                         conv_cfg=conv_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # 2 split halves + num_blocks bottleneck outputs + 1 attention branch.
        fuse_channels = (3 + num_blocks) * self.mid_channels
        self.final_conv = ConvModule(fuse_channels,
                                     out_channels,
                                     1,
                                     conv_cfg=conv_cfg,
                                     norm_cfg=norm_cfg,
                                     act_cfg=act_cfg)
        self.attn_block = VanillaSigmoidBlock(self.mid_channels,
                                              self.mid_channels,
                                              guide_channels=guide_channels,
                                              embed_channels=embed_channels,
                                              num_heads=num_heads,
                                              with_scale=with_scale,
                                              conv_cfg=conv_cfg,
                                              norm_cfg=norm_cfg)

    def forward(self, x: Tensor, guide: Tensor) -> Tensor:
        """Split, run CSP bottlenecks, append projection branch, fuse."""
        feats = list(
            self.main_conv(x).split((self.mid_channels, self.mid_channels),
                                    1))
        for bottleneck in self.blocks:
            feats.append(bottleneck(feats[-1]))
        feats.append(self.attn_block(feats[-1], guide))
        return self.final_conv(torch.cat(feats, 1))
yolo_world/models/losses/__init__.py
0 → 100644
View file @
e9cee049
# Copyright (c) Tencent Inc. All rights reserved.
from
.dynamic_loss
import
CoVMSELoss
__all__
=
[
'CoVMSELoss'
]
yolo_world/models/losses/dynamic_loss.py
0 → 100644
View file @
e9cee049
# Copyright (c) Tencent Inc. All rights reserved.
from
typing
import
Optional
import
torch
import
torch.nn
as
nn
from
torch
import
Tensor
from
mmdet.models.losses.mse_loss
import
mse_loss
from
mmyolo.registry
import
MODELS
@MODELS.register_module()
class CoVMSELoss(nn.Module):
    """MSE loss on the coefficient of variation (std / mean) of ``pred``.

    Pushes the coefficient of variation along ``dim`` toward zero, i.e.
    encourages the predictions to be uniform along that dimension.
    """

    def __init__(self,
                 dim: int = 0,
                 reduction: str = 'mean',
                 loss_weight: float = 1.0,
                 eps: float = 1e-6) -> None:
        super().__init__()
        # Dimension over which std/mean are computed.
        self.dim = dim
        self.reduction = reduction
        self.loss_weight = loss_weight
        # Lower bound on the mean to avoid division by ~0.
        self.eps = eps

    def forward(self,
                pred: Tensor,
                weight: Optional[Tensor] = None,
                avg_factor: Optional[int] = None,
                reduction_override: Optional[str] = None) -> Tensor:
        """Forward function of loss.

        Args:
            pred (Tensor): Predictions.
            weight (Tensor, optional): Element-wise loss weights.
            avg_factor (int, optional): Normalization factor for 'mean'.
            reduction_override (str, optional): Overrides ``self.reduction``.

        Returns:
            Tensor: Weighted MSE between the coefficient of variation and 0.
        """
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = reduction_override if reduction_override \
            else self.reduction
        # NOTE: clamp is applied to the mean itself, so a negative mean is
        # clamped up to eps (original behavior, preserved).
        safe_mean = pred.mean(self.dim).clamp(min=self.eps)
        cov = pred.std(self.dim) / safe_mean
        zero_target = torch.zeros_like(cov)
        return self.loss_weight * mse_loss(cov,
                                           zero_target,
                                           weight,
                                           reduction=reduction,
                                           avg_factor=avg_factor)
yolo_world/models/necks/__init__.py
0 → 100644
View file @
e9cee049
# Copyright (c) Tencent Inc. All rights reserved.
from
.yolo_world_pafpn
import
YOLOWorldPAFPN
,
YOLOWorldDualPAFPN
__all__
=
[
'YOLOWorldPAFPN'
,
'YOLOWorldDualPAFPN'
]
yolo_world/models/necks/yolo_world_pafpn.py
0 → 100644
View file @
e9cee049
# Copyright (c) Tencent Inc. All rights reserved.
import
copy
from
typing
import
List
,
Union
import
torch
import
torch.nn
as
nn
from
torch
import
Tensor
from
mmdet.utils
import
ConfigType
,
OptMultiConfig
from
mmyolo.registry
import
MODELS
from
mmyolo.models.utils
import
make_divisible
,
make_round
from
mmyolo.models.necks.yolov8_pafpn
import
YOLOv8PAFPN
@MODELS.register_module()
class YOLOWorldPAFPN(YOLOv8PAFPN):
    """Path Aggregation Network used in YOLO World.

    Following YOLOv8 PAFPN, including text to image fusion: the top-down and
    bottom-up CSP layers are built from ``block_cfg`` (attention-capable
    blocks that also take text features) instead of plain CSP layers.
    """

    def __init__(self,
                 in_channels: List[int],
                 out_channels: Union[List[int], int],
                 guide_channels: int,
                 embed_channels: List[int],
                 num_heads: List[int],
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 num_csp_blocks: int = 3,
                 freeze_all: bool = False,
                 block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'),
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None) -> None:
        """Initialize the neck.

        Args:
            in_channels (List[int]): Channels of each input feature level.
            out_channels (Union[List[int], int]): Output channels.
            guide_channels (int): Channels of the text guide features.
            embed_channels (List[int]): Attention embedding width per level.
            num_heads (List[int]): Attention heads per level.
            deepen_factor / widen_factor (float): Depth/width multipliers.
            num_csp_blocks (int): CSP bottlenecks per fusion layer.
            freeze_all (bool): Freeze all parameters.
            block_cfg (ConfigType): Config of the fusion block to build for
                top-down/bottom-up layers.
            norm_cfg / act_cfg / init_cfg: Standard mm-series configs.
        """
        # These must be set before super().__init__, which calls the
        # build_*_layer hooks below during construction.
        self.guide_channels = guide_channels
        self.embed_channels = embed_channels
        self.num_heads = num_heads
        self.block_cfg = block_cfg
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         deepen_factor=deepen_factor,
                         widen_factor=widen_factor,
                         num_csp_blocks=num_csp_blocks,
                         freeze_all=freeze_all,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)

    def build_top_down_layer(self, idx: int) -> nn.Module:
        """build top down layer.

        Args:
            idx (int): layer idx.

        Returns:
            nn.Module: The top down layer.
        """
        # Copy so per-layer updates don't leak into the shared config.
        block_cfg = copy.deepcopy(self.block_cfg)
        block_cfg.update(
            dict(in_channels=make_divisible(
                (self.in_channels[idx - 1] + self.in_channels[idx]),
                self.widen_factor),
                 out_channels=make_divisible(self.out_channels[idx - 1],
                                             self.widen_factor),
                 guide_channels=self.guide_channels,
                 embed_channels=make_round(self.embed_channels[idx - 1],
                                           self.widen_factor),
                 num_heads=make_round(self.num_heads[idx - 1],
                                      self.widen_factor),
                 num_blocks=make_round(self.num_csp_blocks,
                                       self.deepen_factor),
                 add_identity=False,
                 norm_cfg=self.norm_cfg,
                 act_cfg=self.act_cfg))
        return MODELS.build(block_cfg)

    def build_bottom_up_layer(self, idx: int) -> nn.Module:
        """build bottom up layer.

        Args:
            idx (int): layer idx.

        Returns:
            nn.Module: The bottom up layer.
        """
        # Copy so per-layer updates don't leak into the shared config.
        block_cfg = copy.deepcopy(self.block_cfg)
        block_cfg.update(
            dict(in_channels=make_divisible(
                (self.out_channels[idx] + self.out_channels[idx + 1]),
                self.widen_factor),
                 out_channels=make_divisible(self.out_channels[idx + 1],
                                             self.widen_factor),
                 guide_channels=self.guide_channels,
                 embed_channels=make_round(self.embed_channels[idx + 1],
                                           self.widen_factor),
                 num_heads=make_round(self.num_heads[idx + 1],
                                      self.widen_factor),
                 num_blocks=make_round(self.num_csp_blocks,
                                       self.deepen_factor),
                 add_identity=False,
                 norm_cfg=self.norm_cfg,
                 act_cfg=self.act_cfg))
        return MODELS.build(block_cfg)

    def forward(self, img_feats: List[Tensor], txt_feats: Tensor = None) -> tuple:
        """Forward function.

        Args:
            img_feats (List[Tensor]): Multi-level image features, one per
                entry of ``self.in_channels``.
            txt_feats (Tensor): Text features (BxLxD) passed to every
                top-down/bottom-up fusion layer.

        Returns:
            tuple: Output feature maps, one per input level.
        """
        assert len(img_feats) == len(self.in_channels)
        # reduce layers
        reduce_outs = []
        for idx in range(len(self.in_channels)):
            reduce_outs.append(self.reduce_layers[idx](img_feats[idx]))

        # top-down path: iterate from the deepest level upward, fusing the
        # upsampled higher-level feature with the next lower level.
        inner_outs = [reduce_outs[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = reduce_outs[idx - 1]
            upsample_feat = self.upsample_layers[len(self.in_channels) - 1 -
                                                 idx](feat_high)
            if self.upsample_feats_cat_first:
                top_down_layer_inputs = torch.cat([upsample_feat, feat_low],
                                                  1)
            else:
                top_down_layer_inputs = torch.cat([feat_low, upsample_feat],
                                                  1)
            inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx](
                top_down_layer_inputs, txt_feats)
            inner_outs.insert(0, inner_out)

        # bottom-up path: downsample and fuse back down the pyramid.
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsample_layers[idx](feat_low)
            out = self.bottom_up_layers[idx](torch.cat(
                [downsample_feat, feat_high], 1), txt_feats)
            outs.append(out)

        # out_layers
        results = []
        for idx in range(len(self.in_channels)):
            results.append(self.out_layers[idx](outs[idx]))
        return tuple(results)
@MODELS.register_module()
class YOLOWorldDualPAFPN(YOLOWorldPAFPN):
    """Path Aggregation Network used in YOLO World v8.

    Extends :class:`YOLOWorldPAFPN` with an image-pooling text enhancer:
    after the top-down path, the text features are updated from the
    multi-level image features before the bottom-up path consumes them.
    """

    def __init__(self,
                 in_channels: List[int],
                 out_channels: Union[List[int], int],
                 guide_channels: int,
                 embed_channels: List[int],
                 num_heads: List[int],
                 deepen_factor: float = 1.0,
                 widen_factor: float = 1.0,
                 num_csp_blocks: int = 3,
                 freeze_all: bool = False,
                 # NOTE: keyword kept as ``text_enhancder`` (sic) for
                 # backward compatibility with existing configs.
                 text_enhancder: ConfigType = dict(
                     type='ImagePoolingAttentionModule',
                     embed_channels=256,
                     num_heads=8,
                     pool_size=3),
                 block_cfg: ConfigType = dict(type='CSPLayerWithTwoConv'),
                 norm_cfg: ConfigType = dict(type='BN',
                                             momentum=0.03,
                                             eps=0.001),
                 act_cfg: ConfigType = dict(type='SiLU', inplace=True),
                 init_cfg: OptMultiConfig = None) -> None:
        """Initialize the neck; see :class:`YOLOWorldPAFPN` for shared args.

        Args:
            text_enhancder (ConfigType): Config of the text enhancer module;
                ``image_channels``, ``text_channels`` and ``num_feats`` are
                filled in automatically from the neck configuration.
        """
        super().__init__(in_channels=in_channels,
                         out_channels=out_channels,
                         guide_channels=guide_channels,
                         embed_channels=embed_channels,
                         num_heads=num_heads,
                         deepen_factor=deepen_factor,
                         widen_factor=widen_factor,
                         num_csp_blocks=num_csp_blocks,
                         freeze_all=freeze_all,
                         block_cfg=block_cfg,
                         norm_cfg=norm_cfg,
                         act_cfg=act_cfg,
                         init_cfg=init_cfg)
        # Fix: work on a copy — the original code mutated the caller's dict
        # (and the shared mutable default argument) in place via ``update``.
        enhancer_cfg = copy.deepcopy(text_enhancder)
        enhancer_cfg.update(
            dict(
                image_channels=[int(x * widen_factor) for x in out_channels],
                text_channels=guide_channels,
                num_feats=len(out_channels),
            ))
        # Fix: removed a leftover debug ``print`` of the enhancer config.
        self.text_enhancer = MODELS.build(enhancer_cfg)

    def forward(self, img_feats: List[Tensor], txt_feats: Tensor) -> tuple:
        """Forward function.

        Same as :class:`YOLOWorldPAFPN.forward`, except that ``txt_feats``
        is refreshed by ``self.text_enhancer`` between the top-down and
        bottom-up paths.
        """
        assert len(img_feats) == len(self.in_channels)
        # reduce layers
        reduce_outs = []
        for idx in range(len(self.in_channels)):
            reduce_outs.append(self.reduce_layers[idx](img_feats[idx]))

        # top-down path
        inner_outs = [reduce_outs[-1]]
        for idx in range(len(self.in_channels) - 1, 0, -1):
            feat_high = inner_outs[0]
            feat_low = reduce_outs[idx - 1]
            upsample_feat = self.upsample_layers[len(self.in_channels) - 1 -
                                                 idx](feat_high)
            if self.upsample_feats_cat_first:
                top_down_layer_inputs = torch.cat([upsample_feat, feat_low],
                                                  1)
            else:
                top_down_layer_inputs = torch.cat([feat_low, upsample_feat],
                                                  1)
            inner_out = self.top_down_layers[len(self.in_channels) - 1 - idx](
                top_down_layer_inputs, txt_feats)
            inner_outs.insert(0, inner_out)

        # Update text features from the fused image pyramid before the
        # bottom-up path uses them.
        txt_feats = self.text_enhancer(txt_feats, inner_outs)

        # bottom-up path
        outs = [inner_outs[0]]
        for idx in range(len(self.in_channels) - 1):
            feat_low = outs[-1]
            feat_high = inner_outs[idx + 1]
            downsample_feat = self.downsample_layers[idx](feat_low)
            out = self.bottom_up_layers[idx](torch.cat(
                [downsample_feat, feat_high], 1), txt_feats)
            outs.append(out)

        # out_layers
        results = []
        for idx in range(len(self.in_channels)):
            results.append(self.out_layers[idx](outs[idx]))
        return tuple(results)
yolo_world/version.py
0 → 100644
View file @
e9cee049
# Copyright (c) Tencent Inc. All rights reserved.
from
yolo_world
import
__version__
def __version_info() -> tuple:
    """Parse a version string into a tuple.

    Returns:
        tuple[int | str]: The version info, e.g., "1.3.0" is parsed into
        (1, 3, 0), and "2.0.0rc1" is parsed into (2, 0, 0, 'rc1').
    """
    parts = []
    for segment in __version__.split('.'):
        if segment.isdigit():
            # Plain numeric component, e.g. "1" or "13".
            parts.append(int(segment))
        elif 'rc' in segment:
            # Release-candidate component, e.g. "0rc1" -> 0, 'rc1'.
            pieces = segment.split('rc')
            parts.append(int(pieces[0]))
            parts.append(f'rc{pieces[1]}')
    return tuple(parts)


version_info = __version_info()

__all__ = ['__version__', 'version_info']
Prev
1
…
5
6
7
8
9
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment